diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h index 7aadc9d32e..f97b4303aa 100644 --- a/libswscale/ops_backend.h +++ b/libswscale/ops_backend.h @@ -102,17 +102,21 @@ typedef struct SwsOpIter { (pixel_t *) iter->out[2], (pixel_t *) iter->out[3], __VA_ARGS__) /* Helper macros to declare continuation functions */ -#define DECL_IMPL(NAME) \ - static void fn(NAME)(SwsOpIter *restrict iter, \ - const SwsOpImpl *restrict impl, \ - block_t x, block_t y, \ - block_t z, block_t w) +#define DECL_IMPL(FUNC, NAME, ...) \ + static void av_flatten fn(NAME)(SwsOpIter *restrict iter, \ + const SwsOpImpl *restrict impl, \ + void *restrict x, void *restrict y, \ + void *restrict z, void *restrict w) \ + { \ + CALL(FUNC, __VA_ARGS__); \ + } -/* Helper macro to call into the next continuation with a given type */ -#define CONTINUE(TYPE, ...) \ +/* Helper macro to call into the next continuation */ +#define CONTINUE(X, Y, Z, W) \ ((void (*)(SwsOpIter *, const SwsOpImpl *, \ - TYPE x, TYPE y, TYPE z, TYPE w)) impl->cont) \ - (iter, &impl[1], __VA_ARGS__) + void *restrict, void *restrict, \ + void *restrict, void *restrict)) impl->cont) \ + (iter, &impl[1], (X), (Y), (Z), (W)) /* Helper macros for common op setup code */ #define DECL_SETUP(NAME, PARAMS, OUT) \ @@ -139,10 +143,7 @@ static inline int ff_setup_memdup(const void *c, size_t size, SwsImplResult *out DECL_FUNC(NAME, const bool X, const bool Y, const bool Z, const bool W) #define WRAP_PATTERN(FUNC, X, Y, Z, W, ...) \ - DECL_IMPL(FUNC##_##X##Y##Z##W) \ - { \ - CALL(FUNC, X, Y, Z, W); \ - } \ + DECL_IMPL(FUNC, FUNC##_##X##Y##Z##W, X, Y, Z, W) \ \ DECL_ENTRY(FUNC##_##X##Y##Z##W, \ .unused = { !X, !Y, !Z, !W }, \ diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c index 3817a437e5..0763a08b1c 100644 --- a/libswscale/ops_tmpl_common.c +++ b/libswscale/ops_tmpl_common.c @@ -41,7 +41,7 @@ DECL_PATTERN(convert_uint##N) wu[i] = w[i]; \ } \ \ - CONTINUE(u##N##block_t, xu, yu, zu, wu); \ + CONTINUE(xu, yu, zu, wu); \ } \ \ WRAP_COMMON_PATTERNS(convert_uint##N, \ @@ -75,14 +75,11 @@ DECL_PATTERN(clear) w[i] = impl->priv.px[3]; } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } #define WRAP_CLEAR(X, Y, Z, W) \ -DECL_IMPL(clear##_##X##Y##Z##W) \ -{ \ - CALL(clear, X, Y, Z, W); \ -} \ +DECL_IMPL(clear, clear##_##X##Y##Z##W, X, Y, Z, W) \ \ DECL_ENTRY(clear##_##X##Y##Z##W, \ .setup = ff_sws_setup_clear, \ @@ -119,7 +116,7 @@ DECL_PATTERN(min) w[i] = FFMIN(w[i], impl->priv.px[3]); } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } DECL_PATTERN(max) @@ -136,7 +133,7 @@ DECL_PATTERN(max) w[i] = FFMAX(w[i], impl->priv.px[3]); } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } WRAP_COMMON_PATTERNS(min, @@ -167,7 +164,7 @@ DECL_PATTERN(scale) w[i] *= scale; } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } WRAP_COMMON_PATTERNS(scale, @@ -239,7 +236,7 @@ DECL_READ(filter_v, const int elems) for (int i = 0; i < elems; i++) iter->in[i] += sizeof(block_t); - CONTINUE(f32block_t, xs, ys, zs, ws); + CONTINUE(xs, ys, zs, ws); } DECL_SETUP(setup_filter_h, params, out) @@ -292,11 +289,14 @@ DECL_READ(filter_h, const int elems) weights += filter_size; } - CONTINUE(f32block_t, xs, ys, zs, ws); + CONTINUE(xs, ys, zs, ws); } #define WRAP_FILTER(FUNC, DIR, ELEMS, SUFFIX) \ -DECL_IMPL(FUNC##ELEMS##SUFFIX) \ +static av_flatten void fn(FUNC##ELEMS##SUFFIX)(SwsOpIter *restrict iter, \ + const SwsOpImpl *restrict impl, \ + void *restrict x, void *restrict y,\ + void *restrict z, void *restrict w)\ { \ CALL_READ(FUNC##SUFFIX, ELEMS); \ } \ @@ -337,7 +337,7 @@ static void fn(process)(const SwsOpExec *exec, const void *priv, for (iter->y = y_start; iter->y < y_end; iter->y++) { for (int block = bx_start; block < bx_end; block++) { iter->x = block * SWS_BLOCK_SIZE; - CONTINUE(block_t, (void *) x, (void *) y, (void *) z, (void *) w); + CONTINUE(x, y, z, w); } const int y_bump = exec->in_bump_y ? exec->in_bump_y[iter->y] : 0; diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c index 0d00714ff4..687b08b99b 100644 --- a/libswscale/ops_tmpl_float.c +++ b/libswscale/ops_tmpl_float.c @@ -98,14 +98,11 @@ DECL_FUNC(dither, const int size_log2) DITHER_COMP(z, 2) DITHER_COMP(w, 3) - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } #define WRAP_DITHER(N) \ -DECL_IMPL(dither##N) \ -{ \ - CALL(dither, N); \ -} \ +DECL_IMPL(dither, dither##N, N) \ \ DECL_ENTRY(dither##N, \ .op = SWS_OP_DITHER, \ @@ -185,14 +182,11 @@ DECL_FUNC(linear_mask, const uint32_t mask) w[i] += (mask & SWS_MASK(3, 3)) ? c.m[3][3] * ww : ww; } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } #define WRAP_LINEAR(NAME, MASK) \ -DECL_IMPL(linear_##NAME) \ -{ \ - CALL(linear_mask, MASK); \ -} \ +DECL_IMPL(linear_mask, linear_##NAME, MASK) \ \ DECL_ENTRY(linear_##NAME, \ .op = SWS_OP_LINEAR, \ diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c index fea0430799..baa3526029 100644 --- a/libswscale/ops_tmpl_int.c +++ b/libswscale/ops_tmpl_int.c @@ -71,7 +71,7 @@ DECL_READ(read_planar, const int elems) w[i] = in3[i]; } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } DECL_READ(read_packed, const int elems) @@ -87,7 +87,7 @@ DECL_READ(read_packed, const int elems) w[i] = in0[elems * i + 3]; } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } DECL_WRITE(write_planar, const int elems) @@ -119,7 +119,10 @@ DECL_WRITE(write_packed, const int elems) } #define WRAP_READ(FUNC, ELEMS, FRAC, PACKED) \ -DECL_IMPL(FUNC##ELEMS) \ +static av_flatten void fn(FUNC##ELEMS)(SwsOpIter *restrict iter, \ + const SwsOpImpl *restrict impl, \ + void *restrict x, void *restrict y, \ + void *restrict z, void *restrict w) \ { \ CALL_READ(FUNC, ELEMS); \ for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) \ @@ -144,7 +147,10 @@ WRAP_READ(read_packed, 3, 0, true) WRAP_READ(read_packed, 4, 0, true) #define WRAP_WRITE(FUNC, ELEMS, FRAC, PACKED) \ -DECL_IMPL(FUNC##ELEMS) \ +static av_flatten void fn(FUNC##ELEMS)(SwsOpIter *restrict iter, \ + const SwsOpImpl *restrict impl, \ + void *restrict x, void *restrict y, \ + void *restrict z, void *restrict w) \ { \ CALL_WRITE(FUNC, ELEMS); \ for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) \ @@ -178,7 +184,7 @@ DECL_READ(read_nibbles, const int elems) x[i + 1] = val & 0xF; /* low nibble */ } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } DECL_READ(read_bits, const int elems) @@ -196,7 +202,7 @@ DECL_READ(read_bits, const int elems) x[i + 7] = (val >> 0) & 1; } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } WRAP_READ(read_nibbles, 1, 1, false) @@ -243,7 +249,7 @@ DECL_PATTERN(swap_bytes) w[i] = SWAP_BYTES(w[i]); } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } WRAP_COMMON_PATTERNS(swap_bytes, .op = SWS_OP_SWAP_BYTES); @@ -266,7 +272,7 @@ DECL_PATTERN(expand16) w16[i] = w[i] << 8 | w[i]; } - CONTINUE(u16block_t, x16, y16, z16, w16); + CONTINUE(x16, y16, z16, w16); } WRAP_COMMON_PATTERNS(expand16, @@ -287,7 +293,7 @@ DECL_PATTERN(expand32) w32[i] = (uint32_t)w[i] << 24 | w[i] << 16 | w[i] << 8 | w[i]; } - CONTINUE(u32block_t, x32, y32, z32, w32); + CONTINUE(x32, y32, z32, w32); } WRAP_COMMON_PATTERNS(expand32, @@ -297,44 +303,48 @@ WRAP_COMMON_PATTERNS(expand32, ); #endif +DECL_FUNC(pack, const int bits0, const int bits1, const int bits2, const int bits3) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + x[i] = x[i] << (bits1 + bits2 + bits3); + if (bits1) + x[i] |= y[i] << (bits2 + bits3); + if (bits2) + x[i] |= z[i] << bits3; + if (bits3) + x[i] |= w[i]; + } + + CONTINUE(x, y, z, w); +} + +DECL_FUNC(unpack, const int bits0, const int bits1, const int bits2, const int bits3) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + const pixel_t val = x[i]; + x[i] = val >> (bits1 + bits2 + bits3); + if (bits1) + y[i] = (val >> (bits2 + bits3)) & ((1 << bits1) - 1); + if (bits2) + z[i] = (val >> bits3) & ((1 << bits2) - 1); + if (bits3) + w[i] = val & ((1 << bits3) - 1); + } + + CONTINUE(x, y, z, w); +} + #define WRAP_PACK_UNPACK(X, Y, Z, W) \ -inline DECL_IMPL(pack_##X##Y##Z##W) \ -{ \ - SWS_LOOP \ - for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \ - x[i] = x[i] << (Y+Z+W); \ - if (Y) \ - x[i] |= y[i] << (Z+W); \ - if (Z) \ - x[i] |= z[i] << W; \ - if (W) \ - x[i] |= w[i]; \ - } \ - \ - CONTINUE(block_t, x, y, z, w); \ -} \ +DECL_IMPL(pack, pack_##X##Y##Z##W, X, Y, Z, W) \ \ DECL_ENTRY(pack_##X##Y##Z##W, \ .op = SWS_OP_PACK, \ .pack.pattern = { X, Y, Z, W }, \ ); \ \ -inline DECL_IMPL(unpack_##X##Y##Z##W) \ -{ \ - SWS_LOOP \ - for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \ - const pixel_t val = x[i]; \ - x[i] = val >> (Y+Z+W); \ - if (Y) \ - y[i] = (val >> (Z+W)) & ((1 << Y) - 1); \ - if (Z) \ - z[i] = (val >> W) & ((1 << Z) - 1); \ - if (W) \ - w[i] = val & ((1 << W) - 1); \ - } \ - \ - CONTINUE(block_t, x, y, z, w); \ -} \ +DECL_IMPL(unpack, unpack_##X##Y##Z##W, X, Y, Z, W) \ \ DECL_ENTRY(unpack_##X##Y##Z##W, \ .op = SWS_OP_UNPACK, \ @@ -363,7 +373,7 @@ DECL_PATTERN(lshift) w[i] <<= amount; } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } DECL_PATTERN(rshift) @@ -378,7 +388,7 @@ DECL_PATTERN(rshift) w[i] >>= amount; } - CONTINUE(block_t, x, y, z, w); + CONTINUE(x, y, z, w); } WRAP_COMMON_PATTERNS(lshift, @@ -406,7 +416,7 @@ DECL_PATTERN(convert_float) wf[i] = w[i]; } - CONTINUE(f32block_t, xf, yf, zf, wf); + CONTINUE(xf, yf, zf, wf); } WRAP_COMMON_PATTERNS(convert_float, @@ -422,9 +432,10 @@ WRAP_COMMON_PATTERNS(convert_float, static void \ fn(swizzle_##X##Y##Z##W)(SwsOpIter *restrict iter, \ const SwsOpImpl *restrict impl, \ - block_t c0, block_t c1, block_t c2, block_t c3) \ + void *restrict c0, void *restrict c1, \ + void *restrict c2, void *restrict c3) \ { \ - CONTINUE(block_t, c##X, c##Y, c##Z, c##W); \ + CONTINUE(c##X, c##Y, c##Z, c##W); \ } \ \ DECL_ENTRY(swizzle_##X##Y##Z##W, \ @@ -453,18 +464,18 @@ DECL_SWIZZLE(0, 3, 2, 1) /* Broadcast luma -> rgb (only used for y(a) -> rgb(a)) */ #define DECL_EXPAND_LUMA(X, W, T0, T1) \ -static void \ -fn(expand_luma_##X##W)(SwsOpIter *restrict iter, \ - const SwsOpImpl *restrict impl, \ - block_t c0, block_t c1, block_t c2, block_t c3) \ +DECL_FUNC(expand_luma_##X##W##_impl, \ + block_t c0, block_t c1, block_t c2, block_t c3) \ { \ SWS_LOOP \ for (int i = 0; i < SWS_BLOCK_SIZE; i++) \ T0[i] = T1[i] = c0[i]; \ \ - CONTINUE(block_t, c##X, T0, T1, c##W); \ + CONTINUE(c##X, T0, T1, c##W); \ } \ \ +DECL_IMPL(expand_luma_##X##W##_impl, expand_luma_##X##W, x, y, z, w) \ + \ DECL_ENTRY(expand_luma_##X##W, \ .op = SWS_OP_SWIZZLE, \ .swizzle.in = { X, 0, 0, W }, \