mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 12:50:49 +08:00
swscale/aarch64: add NEON sws_ops backend
This commit pieces together the previous few commits to implement the NEON backend for sws_ops. In essence, a tool which runs on the target (sws_ops_aarch64) is used to enumerate all the functions that the backend needs to implement. The list it generates is stored in the repository (ops_entries.c). The list from above is used at build time by a code generator tool (ops_asmgen) to implement all the sws_ops functions the NEON backend supports, and generate a lookup function in C to retrieve the assembly function pointers. At runtime, the NEON backend fetches the function pointers to the assembly functions and chains them together in a continuation-passing style design, similar to the x86 backend. The following speedup is observed from legacy swscale to NEON: A520: Overall speedup=3.780x faster, min=0.137x max=91.928x A720: Overall speedup=4.129x faster, min=0.234x max=92.424x And the following from the C sws_ops implementation to NEON: A520: Overall speedup=5.513x faster, min=0.927x max=14.169x A720: Overall speedup=4.786x faster, min=0.585x max=20.157x The slowdowns from legacy to NEON are the same for C/x86. Mostly low bit-depth conversions that did not perform dithering in legacy. The 0.585x outlier from C to NEON is gbrpf32le -> gbrapf32le, which is mostly memcpy with the C implementation. All other conversions are better. Sponsored-by: Sovereign Tech Fund Signed-off-by: Ramiro Polla <ramiro.polla@gmail.com>
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -26,6 +26,8 @@
|
|||||||
*.spv
|
*.spv
|
||||||
*.spv.c
|
*.spv.c
|
||||||
*.spv.gz
|
*.spv.gz
|
||||||
|
*.gen.c
|
||||||
|
*.gen.S
|
||||||
*.ptx
|
*.ptx
|
||||||
*.ptx.c
|
*.ptx.c
|
||||||
*.ptx.gz
|
*.ptx.gz
|
||||||
|
|||||||
@@ -197,7 +197,7 @@ endif
|
|||||||
clean::
|
clean::
|
||||||
$(RM) $(BIN2CEXE) $(CLEANSUFFIXES:%=ffbuild/%)
|
$(RM) $(BIN2CEXE) $(CLEANSUFFIXES:%=ffbuild/%)
|
||||||
|
|
||||||
%.c %.h %.pc %.ver %.version: TAG = GEN
|
%.c %.h %.S %.pc %.ver %.version: TAG = GEN
|
||||||
|
|
||||||
# Dummy rule to stop make trying to rebuild removed or renamed headers
|
# Dummy rule to stop make trying to rebuild removed or renamed headers
|
||||||
%.h %_template.c:
|
%.h %_template.c:
|
||||||
@@ -266,7 +266,7 @@ $(TOOLOBJS): | tools
|
|||||||
|
|
||||||
OUTDIRS := $(OUTDIRS) $(dir $(OBJS) $(HOBJS) $(HOSTOBJS) $(SHLIBOBJS) $(STLIBOBJS) $(TESTOBJS))
|
OUTDIRS := $(OUTDIRS) $(dir $(OBJS) $(HOBJS) $(HOSTOBJS) $(SHLIBOBJS) $(STLIBOBJS) $(TESTOBJS))
|
||||||
|
|
||||||
CLEANSUFFIXES = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.spv *.spv.gz *.spv.c *.ver *.version *.html.gz *.html.c *.css.min.gz *.css.min *.css.c *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb
|
CLEANSUFFIXES = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.spv *.spv.gz *.spv.c *.gen.c *.gen.S *.ver *.version *.html.gz *.html.c *.css.min.gz *.css.min *.css.c *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb
|
||||||
LIBSUFFIXES = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a
|
LIBSUFFIXES = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a
|
||||||
|
|
||||||
define RULES
|
define RULES
|
||||||
|
|||||||
@@ -11,4 +11,21 @@ NEON-OBJS += aarch64/hscale.o \
|
|||||||
aarch64/xyz2rgb_neon.o \
|
aarch64/xyz2rgb_neon.o \
|
||||||
aarch64/yuv2rgb_neon.o \
|
aarch64/yuv2rgb_neon.o \
|
||||||
|
|
||||||
|
NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops.o
|
||||||
|
NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops_neon.gen.o
|
||||||
|
NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops_lookup.gen.o
|
||||||
|
|
||||||
|
$(SUBDIR)aarch64/ops_neon.gen.S: $(SUBDIR)aarch64/ops_asmgen$(HOSTEXESUF)
|
||||||
|
$(M)$< -ops > $@.tmp
|
||||||
|
$(CP) $@.tmp $@
|
||||||
|
$(RM) $@.tmp
|
||||||
|
|
||||||
|
$(SUBDIR)aarch64/ops_lookup.gen.c: $(SUBDIR)aarch64/ops_asmgen$(HOSTEXESUF)
|
||||||
|
$(M)$< -lookup > $@.tmp
|
||||||
|
$(CP) $@.tmp $@
|
||||||
|
$(RM) $@.tmp
|
||||||
|
|
||||||
|
clean::
|
||||||
|
$(RM) $(CLEANSUFFIXES:%=libswscale/aarch64/%)
|
||||||
|
|
||||||
HOSTPROGS = aarch64/ops_asmgen
|
HOSTPROGS = aarch64/ops_asmgen
|
||||||
|
|||||||
257
libswscale/aarch64/ops.c
Normal file
257
libswscale/aarch64/ops.c
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2026 Ramiro Polla
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "../ops_chain.h"
|
||||||
|
|
||||||
|
#include "libavutil/avassert.h"
|
||||||
|
#include "libavutil/avstring.h"
|
||||||
|
#include "libavutil/tree.h"
|
||||||
|
|
||||||
|
#include "ops_lookup.h"
|
||||||
|
|
||||||
|
#include "ops_impl_conv.c"
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
/**
 * Per-compilation scratch state of the AArch64 backend. An instance lives
 * on the stack of aarch64_compile() and is filled in by aarch64_optimize().
 */
typedef struct SwsAArch64BackendContext {
|
||||||
|
/* Context passed to aarch64_compile(); stored there before optimizing. */
SwsContext *sws;
|
||||||
|
/* Block size picked by aarch64_optimize() (8 or 16), later copied into the compiled op. */
int block_size;
|
||||||
|
} SwsAArch64BackendContext;
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
static int aarch64_setup_linear(const SwsAArch64OpImplParams *p,
|
||||||
|
const SwsOp *op, SwsImplResult *res)
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Compute number of full vector registers needed to pack all non-zero
|
||||||
|
* coefficients.
|
||||||
|
*/
|
||||||
|
const int num_vregs = linear_num_vregs(p);
|
||||||
|
av_assert0(num_vregs <= 4);
|
||||||
|
float *coeffs = av_malloc(num_vregs * 4 * sizeof(float));
|
||||||
|
if (!coeffs)
|
||||||
|
return AVERROR(ENOMEM);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copy non-zero coefficients, reordered to match SwsAArch64LinearOpMask.
|
||||||
|
* The coefficients are packed in sequential order. The same order must
|
||||||
|
* be followed in asmgen_op_linear().
|
||||||
|
*/
|
||||||
|
int i_coeff = 0;
|
||||||
|
LOOP_LINEAR_MASK(p, i, j) {
|
||||||
|
const int jj = linear_index_to_sws_op(j);
|
||||||
|
coeffs[i_coeff++] = (float) op->lin.m[i][jj].num / op->lin.m[i][jj].den;
|
||||||
|
}
|
||||||
|
|
||||||
|
res->priv.ptr = coeffs;
|
||||||
|
res->free = ff_op_priv_free;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
static int aarch64_setup_dither(const SwsAArch64OpImplParams *p,
|
||||||
|
const SwsOp *op, SwsImplResult *res)
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* The input dither matrix is (1 << size_log2)² pixels large. It is
|
||||||
|
* periodic, so the x and y offsets should be masked to fit inside
|
||||||
|
* (1 << size_log2).
|
||||||
|
* The width of the matrix is assumed to be at least 8, which matches
|
||||||
|
* the maximum block_size for aarch64 asmgen when f32 operations
|
||||||
|
* (i.e., dithering) are used. This guarantees that the x offset is
|
||||||
|
* aligned and that reading block_size elements does not extend past
|
||||||
|
* the end of the row. The x offset doesn't change between components,
|
||||||
|
* so it is only required to be masked once.
|
||||||
|
* The y offset, on the other hand, may change per component, and
|
||||||
|
* would therefore need to be masked for every y_offset value. To
|
||||||
|
* simplify the execution, we over-allocate the number of rows of
|
||||||
|
* the output dither matrix by the largest y_offset value. This way,
|
||||||
|
* we only need to mask y offset once, and can safely increment the
|
||||||
|
* dither matrix pointer by fixed offsets for every y_offset change.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Find the largest y_offset value. */
|
||||||
|
const int size = 1 << op->dither.size_log2;
|
||||||
|
const int8_t *off = op->dither.y_offset;
|
||||||
|
int max_offset = 0;
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
if (off[i] >= 0)
|
||||||
|
max_offset = FFMAX(max_offset, off[i] & (size - 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Allocate (size + max_offset) rows to allow over-reading the matrix. */
|
||||||
|
const int stride = size * sizeof(float);
|
||||||
|
const int num_rows = size + max_offset;
|
||||||
|
float *matrix = av_malloc(num_rows * stride);
|
||||||
|
if (!matrix)
|
||||||
|
return AVERROR(ENOMEM);
|
||||||
|
|
||||||
|
for (int i = 0; i < size * size; i++)
|
||||||
|
matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
|
||||||
|
|
||||||
|
memcpy(&matrix[size * size], matrix, max_offset * stride);
|
||||||
|
|
||||||
|
res->priv.ptr = matrix;
|
||||||
|
res->free = ff_op_priv_free;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
static int aarch64_setup(SwsOpList *ops, int block_size, int n,
|
||||||
|
const SwsAArch64OpImplParams *p, SwsImplResult *out)
|
||||||
|
{
|
||||||
|
SwsOp *op = &ops->ops[n];
|
||||||
|
switch (op->op) {
|
||||||
|
case SWS_OP_READ:
|
||||||
|
/* Negative shift values to perform right shift using ushl. */
|
||||||
|
if (op->rw.frac == 3) {
|
||||||
|
out->priv = (SwsOpPriv) {
|
||||||
|
.u8 = {
|
||||||
|
-7, -6, -5, -4, -3, -2, -1, 0,
|
||||||
|
-7, -6, -5, -4, -3, -2, -1, 0,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case SWS_OP_WRITE:
|
||||||
|
/* Shift values for ushl. */
|
||||||
|
if (op->rw.frac == 3) {
|
||||||
|
out->priv = (SwsOpPriv) {
|
||||||
|
.u8 = {
|
||||||
|
7, 6, 5, 4, 3, 2, 1, 0,
|
||||||
|
7, 6, 5, 4, 3, 2, 1, 0,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case SWS_OP_CLEAR:
|
||||||
|
case SWS_OP_MIN:
|
||||||
|
case SWS_OP_MAX:
|
||||||
|
ff_sws_setup_q4(&(const SwsImplParams) { .op = op }, out);
|
||||||
|
break;
|
||||||
|
case SWS_OP_SCALE:
|
||||||
|
ff_sws_setup_q(&(const SwsImplParams) { .op = op }, out);
|
||||||
|
break;
|
||||||
|
case SWS_OP_LINEAR:
|
||||||
|
return aarch64_setup_linear(p, op, out);
|
||||||
|
case SWS_OP_DITHER:
|
||||||
|
return aarch64_setup_dither(p, op, out);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
/**
 * Backend-specific preparation of the operation list.
 *
 * No rewriting of `ops` takes place yet; the only effect is selecting the
 * block size stored in `bctx`.
 *
 * @param bctx backend context whose block_size is set
 * @param ops  operation list about to be compiled
 * @return always 0
 */
static int aarch64_optimize(SwsAArch64BackendContext *bctx, SwsOpList *ops)
{
    /*
     * Keep the widest-precision section within two full vector registers:
     * 8 elements per block when the largest element size is 4, otherwise 16.
     */
    if (ff_sws_op_list_max_size(ops) == 4)
        bctx->block_size = 8;
    else
        bctx->block_size = 16;

    return 0;
}
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
/**
 * Compile an operation list into a chain of NEON kernels.
 *
 * One kernel is looked up per operation and appended to a chain together
 * with its private data. The chain is closed by the "process return"
 * function, and the matching "process" function becomes the entry point
 * stored in `out`.
 *
 * @param ctx context the operations are compiled for
 * @param ops operation list to compile; only a shallow on-stack copy is used
 * @param out receives the compiled operation on success
 * @return 0 on success; AVERROR(ENOTSUP) if NEON is unavailable or a
 *         required function is missing; AVERROR(ENOMEM) on allocation
 *         failure; otherwise the negative error code of the failing helper
 */
static int aarch64_compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
{
    SwsAArch64BackendContext bctx;
    int ret;

    const int cpu_flags = av_get_cpu_flags();
    if (!(cpu_flags & AV_CPU_FLAG_NEON))
        return AVERROR(ENOTSUP);

    /* Make on-stack copy of `ops` to iterate over */
    SwsOpList rest = *ops;
    bctx.sws = ctx;
    ret = aarch64_optimize(&bctx, &rest);
    if (ret < 0)
        return ret;

    SwsOpChain *chain = ff_sws_op_chain_alloc();
    if (!chain)
        return AVERROR(ENOMEM);
    chain->cpu_flags = AV_CPU_FLAG_NEON;

    *out = (SwsCompiledOp) {
        .priv        = chain,
        .slice_align = 1,
        .free        = ff_sws_op_chain_free_cb,
        .block_size  = bctx.block_size,
    };

    /* Look up kernel functions. */
    for (int i = 0; i < rest.num_ops; i++) {
        SwsAArch64OpImplParams params = { 0 };
        ret = convert_to_aarch64_impl(ctx, &rest, i, bctx.block_size, &params);
        if (ret < 0)
            goto error;
        SwsFuncPtr func = ff_sws_aarch64_lookup(&params);
        if (!func) {
            ret = AVERROR(ENOTSUP);
            goto error;
        }
        SwsImplResult res = { 0 };
        ret = aarch64_setup(&rest, bctx.block_size, i, &params, &res);
        if (ret < 0)
            goto error;
        ret = ff_sws_op_chain_append(chain, func, res.free, &res.priv);
        if (ret < 0) {
            /*
             * The chain did not take ownership of the private data, so
             * release it here; freeing the chain alone would leak it.
             */
            if (res.free)
                res.free(&res.priv);
            goto error;
        }
    }

    /* Look up process/process_return functions. */
    const SwsOp *read  = ff_sws_op_list_input(&rest);
    const SwsOp *write = ff_sws_op_list_output(&rest);
    const int read_planes  = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
    const int write_planes = write->rw.packed ? 1 : write->rw.elems;
    /* One mask entry per plane touched by either the read or the write. */
    SwsAArch64OpMask mask = 0;
    for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
        MASK_SET(mask, i, 1);

    SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS,        .mask = mask };
    SwsAArch64OpImplParams return_params  = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask };
    SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params);
    SwsFuncPtr return_func  = ff_sws_aarch64_lookup(&return_params);
    if (!process_func || !return_func) {
        ret = AVERROR(ENOTSUP);
        goto error;
    }

    /* The return function terminates the chain and carries no private data. */
    ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 });
    if (ret < 0)
        goto error;

    out->func      = (SwsOpFunc) process_func;
    out->cpu_flags = chain->cpu_flags;

error:
    /* Also reached on success; the chain is only released on failure. */
    if (ret < 0)
        ff_sws_op_chain_free(chain);
    return ret;
}
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
/**
 * Backend descriptor for the AArch64 NEON implementation. It is listed in
 * ff_sws_op_backends[] when ARCH_AARCH64 && HAVE_NEON.
 */
const SwsOpBackend backend_aarch64 = {
|
||||||
|
.name = "aarch64",
|
||||||
|
/* Entry point that turns an operation list into a kernel chain. */
.compile = aarch64_compile,
|
||||||
|
/* NOTE(review): presumably marks this as a non-hardware backend - confirm. */
.hw_format = AV_PIX_FMT_NONE,
|
||||||
|
};
|
||||||
@@ -143,6 +143,12 @@ static inline int linear_num_vregs(const SwsAArch64OpImplParams *params)
|
|||||||
return (count + 3) / 4;
|
return (count + 3) / 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Translate a column index in the backend's linear-mask order into the
 * column index used by the SwsOp linear matrix.
 *
 * In mask order the offset comes first (index 0) and the four coefficient
 * columns follow; in the SwsOp matrix the coefficients occupy columns 0..3
 * and the offset is column 4.
 *
 * @param idx column index in mask order, expected to be in 0..4
 * @return the corresponding SwsOp matrix column
 */
static inline int linear_index_to_sws_op(int idx)
{
    if (idx == 0)
        return 4;       /* offset column */
    return idx - 1;     /* coefficient columns shift down by one */
}
|
||||||
|
|
||||||
static inline int linear_index_is_offset(int idx)
|
static inline int linear_index_is_offset(int idx)
|
||||||
{
|
{
|
||||||
return (idx == 0);
|
return (idx == 0);
|
||||||
|
|||||||
30
libswscale/aarch64/ops_lookup.h
Normal file
30
libswscale/aarch64/ops_lookup.h
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2026 Ramiro Polla
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SWSCALE_AARCH64_OPS_LOOKUP_H
|
||||||
|
#define SWSCALE_AARCH64_OPS_LOOKUP_H
|
||||||
|
|
||||||
|
#include "libswscale/ops_chain.h"
|
||||||
|
#include "libswscale/aarch64/ops_impl.h"
|
||||||
|
|
||||||
|
/**
 * Look up the exported function pointer for the given parameters.
 *
 * @param p parameters describing the requested implementation
 * @return the function pointer for `p`; the NEON backend treats a NULL
 *         result as "not supported" (presumably returned when no entry
 *         matches - confirm against the generated lookup code)
 */
|
||||||
|
SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p);
|
||||||
|
|
||||||
|
#endif /* SWSCALE_AARCH64_OPS_LOOKUP_H */
|
||||||
@@ -32,12 +32,15 @@
|
|||||||
|
|
||||||
extern const SwsOpBackend backend_c;
|
extern const SwsOpBackend backend_c;
|
||||||
extern const SwsOpBackend backend_murder;
|
extern const SwsOpBackend backend_murder;
|
||||||
|
extern const SwsOpBackend backend_aarch64;
|
||||||
extern const SwsOpBackend backend_x86;
|
extern const SwsOpBackend backend_x86;
|
||||||
extern const SwsOpBackend backend_vulkan;
|
extern const SwsOpBackend backend_vulkan;
|
||||||
|
|
||||||
const SwsOpBackend * const ff_sws_op_backends[] = {
|
const SwsOpBackend * const ff_sws_op_backends[] = {
|
||||||
&backend_murder,
|
&backend_murder,
|
||||||
#if ARCH_X86_64 && HAVE_X86ASM
|
#if ARCH_AARCH64 && HAVE_NEON
|
||||||
|
&backend_aarch64,
|
||||||
|
#elif ARCH_X86_64 && HAVE_X86ASM
|
||||||
&backend_x86,
|
&backend_x86,
|
||||||
#endif
|
#endif
|
||||||
&backend_c,
|
&backend_c,
|
||||||
|
|||||||
Reference in New Issue
Block a user