mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-21 05:11:59 +08:00
This commit pieces together the previous few commits to implement the NEON backend for sws_ops. In essence, a tool which runs on the target (sws_ops_aarch64) is used to enumerate all the functions that the backend needs to implement. The list it generates is stored in the repository (ops_entries.c). The list from above is used at build time by a code generator tool (ops_asmgen) to implement all the sws_ops functions the NEON backend supports, and generate a lookup function in C to retrieve the assembly function pointers. At runtime, the NEON backend fetches the function pointers to the assembly functions and chains them together in a continuation-passing style design, similar to the x86 backend. The following speedup is observed from legacy swscale to NEON: A520: Overall speedup=3.780x faster, min=0.137x max=91.928x A720: Overall speedup=4.129x faster, min=0.234x max=92.424x And the following from the C sws_ops implementation to NEON: A520: Overall speedup=5.513x faster, min=0.927x max=14.169x A720: Overall speedup=4.786x faster, min=0.585x max=20.157x The slowdowns from legacy to NEON are the same for C/x86. Mostly low bit-depth conversions that did not perform dithering in legacy. The 0.585x outlier from C to NEON is gbrpf32le -> gbrapf32le, which is mostly memcpy with the C implementation. All other conversions are better. Sponsored-by: Sovereign Tech Fund Signed-off-by: Ramiro Polla <ramiro.polla@gmail.com>
32 lines
1.0 KiB
Makefile
32 lines
1.0 KiB
Makefile
# aarch64 objects appended to the OBJS list, one entry per line.
OBJS += \
    aarch64/rgb2rgb.o \
    aarch64/swscale.o \
    aarch64/swscale_unscaled.o
|
|
|
|
# Objects appended to the NEON-OBJS list, one entry per line.
# NOTE(review): presumably these are only linked when NEON support is
# enabled; how NEON-OBJS is consumed is not visible in this file.
NEON-OBJS += \
    aarch64/hscale.o \
    aarch64/input.o \
    aarch64/output.o \
    aarch64/range_convert_neon.o \
    aarch64/rgb2rgb_neon.o \
    aarch64/swscale_unscaled_neon.o \
    aarch64/xyz2rgb_neon.o \
    aarch64/yuv2rgb_neon.o
|
|
|
|
# Objects for the sws_ops NEON backend. The variable name is computed from
# CONFIG_UNSTABLE, so the list that receives them depends on that setting.
# NOTE(review): the two *.gen.o entries presumably come from the generated
# ops_neon.gen.S / ops_lookup.gen.c sources -- confirm against the
# project's pattern rules.
NEON-OBJS-$(CONFIG_UNSTABLE) += \
    aarch64/ops.o \
    aarch64/ops_neon.gen.o \
    aarch64/ops_lookup.gen.o
|
|
|
|
# Generate the NEON assembly source by running the ops_asmgen program with
# the -ops flag. Its stdout is captured in a temporary file next to the
# target so that a failing generator never leaves a partial target behind.
# The temporary file is then renamed onto the target: a rename within one
# directory is atomic, whereas copy-then-remove could leave a truncated,
# newer-looking target if the copy failed part-way.
# NOTE(review): $(M) is defined elsewhere; presumably it prints a short
# build message before running the command -- confirm.
$(SUBDIR)aarch64/ops_neon.gen.S: $(SUBDIR)aarch64/ops_asmgen$(HOSTEXESUF)
	$(M)$< -ops > $@.tmp
	mv -f $@.tmp $@
|
|
|
|
# Generate the C lookup source by running the ops_asmgen program with the
# -lookup flag. Its stdout is captured in a temporary file next to the
# target so that a failing generator never leaves a partial target behind.
# The temporary file is then renamed onto the target: a rename within one
# directory is atomic, whereas copy-then-remove could leave a truncated,
# newer-looking target if the copy failed part-way.
# NOTE(review): $(M) is defined elsewhere; presumably it prints a short
# build message before running the command -- confirm.
$(SUBDIR)aarch64/ops_lookup.gen.c: $(SUBDIR)aarch64/ops_asmgen$(HOSTEXESUF)
	$(M)$< -lookup > $@.tmp
	mv -f $@.tmp $@
|
|
|
|
# Extra clean step for this directory: remove libswscale/aarch64/<entry> for
# every entry of CLEANSUFFIXES (defined elsewhere). Declared as a
# double-colon rule so it can coexist with other independent clean:: rules.
# The directory is spelled out literally; recipe text is expanded only when
# the recipe runs, so a variable set at parse time may hold a different
# value by then.
clean::
	$(RM) $(addprefix libswscale/aarch64/,$(CLEANSUFFIXES))
|
|
|
|
# Program(s) this directory contributes via HOSTPROGS. ops_asmgen is the
# generator that the ops_neon.gen.S and ops_lookup.gen.c rules in this file
# depend on (there with $(SUBDIR) prepended and $(HOSTEXESUF) appended).
# NOTE(review): plain `=` replaces any earlier HOSTPROGS value, unlike the
# `+=` used for the object lists in this file -- confirm that is intended.
HOSTPROGS = aarch64/ops_asmgen
|