diff --git a/libavutil/aarch64/crc.S b/libavutil/aarch64/crc.S index 892c7ff229..6ff109aa71 100644 --- a/libavutil/aarch64/crc.S +++ b/libavutil/aarch64/crc.S @@ -1,5 +1,6 @@ /* * Copyright (c) 2025 Zhao Zhili + * Copyright (c) 2026 Shreesh Adiga <16567adigashreesh@gmail.com> * * This file is part of FFmpeg. * @@ -67,3 +68,267 @@ endfunc DISABLE_ARM_CRC #endif + +#if HAVE_PMULL && HAVE_EOR3 +ENABLE_PMULL +ENABLE_EOR3 + +const reverse_shuffle, align=4 + .byte 15, 14, 13, 12 + .byte 11, 10, 9, 8 + .byte 7, 6, 5, 4 + .byte 3, 2, 1, 0 +endconst + +const partial_bytes_shuf_tab, align=4 + .byte 255, 254, 253, 252 + .byte 251, 250, 249, 248 + .byte 247, 246, 245, 244 + .byte 243, 242, 241, 240 + .byte 0, 1, 2, 3 + .byte 4, 5, 6, 7 + .byte 8, 9, 10, 11 + .byte 12, 13, 14, 15 +endconst + +// performs Vfold = pmull(Vfold, Vconst) xor pmull2(Vfold, Vconst) xor Vdata; Vtemp is overwritten with the pmull product of the .1d operands +.macro FOLD_SINGLE Vfold, Vconst, Vdata, Vtemp + pmull \Vtemp\().1q, \Vconst\().1d, \Vfold\().1d + pmull2 \Vfold\().1q, \Vconst\().2d, \Vfold\().2d + eor3 \Vfold\().16b, \Vfold\().16b, \Vdata\().16b, \Vtemp\().16b +.endm + +// assume Vfold is v16 and v0 is filled with 0 +// uses v17 as temp; both variants rotate Vconst by 8 bytes (EXT #8) inside the macro, so Vconst no longer holds its loaded value afterwards +.macro FOLD_128_TO_64 le, Vconst +.if ! \le + fmov d17, d16 + pmull2 v16.1q, v16.2d, \Vconst\().2d + ext v17.16b, v0.16b, v17.16b, #12 + eor v16.16b, v17.16b, v16.16b + ext \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8 + pmull2 v17.1q, \Vconst\().2d, v16.2d + eor v16.16b, v16.16b, v17.16b +.else + ext v17.16b, v16.16b, v0.16b, #8 + pmull v16.1q, v16.1d, \Vconst\().1d + eor v16.16b, v16.16b, v17.16b + ext v17.16b, v0.16b, v16.16b, #12 + ext \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8 + pmull v17.1q, \Vconst\().1d, v17.1d + eor v16.16b, v16.16b, v17.16b +.endif +.endm + +// assume Vfold is v16 and v0 is filled with 0 +// uses v17 as temp; writes the 32-bit result to w0 (from s16 followed by REV when le is 0, from v16.s[2] when le is 1); the le = 1 variant also rotates Vconst by 8 bytes (EXT #8) +.macro FOLD_64_TO_32 le, Vconst +.if ! 
\le + pmull v17.1q, v16.1d, \Vconst\().1d + pmull2 v17.1q, v17.2d, \Vconst\().2d + eor v16.16b, v16.16b, v17.16b + fmov w0, s16 + rev w0, w0 +.else + mov v16.s[0], wzr + pmull v17.1q, v16.1d, \Vconst\().1d + eor v17.16b, v17.16b, v16.16b + ext \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8 + pmull v17.1q, v17.1d, \Vconst\().1d + eor v16.16b, v16.16b, v17.16b + mov w0, v16.s[2] +.endif +.endm + +#define CTX_OFFSET 4 + +// ff_crc[_le]_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length); the resulting crc is returned in w0 +// x0 - ctx; pre-computed fold constants, see crc_init_aarch64 for details on the structure. The constants are read in 16-byte pairs at byte offsets CTX_OFFSET + 0, 16, 32 and 48 from x0; CTX_OFFSET skips the leading ctx[0] word (assuming a 4-byte AVCRC - confirm against libavutil/crc.h) +// x1 - crc initial value (copied into lane 0 of v1; x1 itself is reused as a temp in the partial block path) +// x2 - buffer pointer (post-incremented as full blocks are consumed) +// x3 - length of buffer in bytes +// +// Additional registers used: +// x10 - buffer + length; points to end of buffer +// x9, x11-x15 used as temp registers as needed +// +// v0 - zero register to perform vector shift during reduction using EXT +// v1 - crc input from x1 +// v2 - holds the reverse shuffle values 15, 14 ... 0 to reverse vector using TBL (only loaded in the le = 0 variant) +// v3 - holds the constant values used in PMULL/PMULL2 loaded via x0 ctx +// v4-v7 - used as temp for PMULL/PMULL2 output +// v16 - holds the primary fold register later reduced to 32 bits +// Registers v16-v19 are used in parallel 4x fold loop +// Registers v20-v23 are used to load the 4 next data chunks +// Registers v17-v21 are used as temp elsewhere; inputs shorter than 16 bytes also use a 16-byte scratch area pushed on the stack +.macro crc_fn_template le +.if \le +function ff_crc_le_neon_pmull, export=1 +.else +function ff_crc_neon_pmull, export=1 + movrel x9, reverse_shuffle + ldr q2, [x9] +.endif + add x10, x2, x3 + movi v0.2d, #0 + mov v1.16b, v0.16b + mov v1.s[0], w1 + cmp x3, #64 + b.lo 8f // less than 64 bytes + ld1 {v16.16b-v19.16b}, [x2], #64 + eor v16.16b, v16.16b, v1.16b +.if ! 
\le + tbl v16.16b, { v16.16b }, v2.16b + tbl v17.16b, { v17.16b }, v2.16b + tbl v18.16b, { v18.16b }, v2.16b + tbl v19.16b, { v19.16b }, v2.16b +.endif + sub x11, x10, x2 + cmp x11, #64 + b.lo 2f // reduce 4x to 1x + ldr q3, [x0, #(CTX_OFFSET + 0)] + +1: // fold 4x loop: multiply v16-v19 by the constants in v3 and xor in the next 64 input bytes (v20-v23) + ld1 {v20.16b-v23.16b}, [x2], #64 +.if ! \le + tbl v20.16b, { v20.16b }, v2.16b + tbl v21.16b, { v21.16b }, v2.16b + tbl v22.16b, { v22.16b }, v2.16b + tbl v23.16b, { v23.16b }, v2.16b +.endif + pmull v4.1q, v16.1d, v3.1d + pmull v5.1q, v17.1d, v3.1d + pmull v6.1q, v18.1d, v3.1d + pmull v7.1q, v19.1d, v3.1d + pmull2 v16.1q, v16.2d, v3.2d + pmull2 v17.1q, v17.2d, v3.2d + pmull2 v18.1q, v18.2d, v3.2d + pmull2 v19.1q, v19.2d, v3.2d + eor3 v16.16b, v16.16b, v4.16b, v20.16b + eor3 v17.16b, v17.16b, v5.16b, v21.16b + eor3 v18.16b, v18.16b, v6.16b, v22.16b + eor3 v19.16b, v19.16b, v7.16b, v23.16b + sub x11, x10, x2 // x11 = bytes still unread + cmp x11, #64 + b.hs 1b // fold 4x loop + +2: // reduce 4x to 1x: three FOLD_SINGLE steps xor v17, v18 and v19 into the folded v16, using the constants at CTX_OFFSET + 16 + ldr q3, [x0, #(CTX_OFFSET + 16)] + FOLD_SINGLE v16, v3, v17, v4 + FOLD_SINGLE v16, v3, v18, v4 + FOLD_SINGLE v16, v3, v19, v4 + +3: // fold 1x pre: x11 = bytes still unread (x10 - x2); v3 must already hold the constants from CTX_OFFSET + 16 + sub x11, x10, x2 + cmp x11, #16 + b.lo 5f // partial_block + +4: // fold 1x loop: consumes one 16-byte block per iteration + ldr q17, [x2], #16 +.if ! \le + tbl v17.16b, { v17.16b }, v2.16b +.endif + FOLD_SINGLE v16, v3, v17, v4 + sub x11, x10, x2 + cmp x11, #16 + b.hs 4b // fold 1x loop + +5: // partial_block: x11 (less than 16) bytes remain; skipped entirely when x2 has reached x10 + cmp x10, x2 + b.eq 6f // reduce 128 to 64 + ldr q17, [x10, #-16] // final 16 bytes of the buffer; this re-reads bytes that were already folded + movrel x1, partial_bytes_shuf_tab + ldr q18, [x1, x11] // 16 TBL indices taken from the table at byte offset x11 +.if ! \le + tbl v16.16b, { v16.16b }, v2.16b +.endif + mvn v19.16b, v18.16b + tbl v20.16b, { v16.16b }, v19.16b + cmge v21.16b, v18.16b, #0 // mask: all ones in bytes whose signed index is non-negative + bsl v21.16b, v17.16b, v20.16b // v21 = v17 where the mask is set, v20 elsewhere + tbl v16.16b, { v16.16b }, v18.16b +.if ! 
\le + tbl v16.16b, { v16.16b }, v2.16b + tbl v21.16b, { v21.16b }, v2.16b +.endif + FOLD_SINGLE v16, v3, v21, v4 + +6: // reduce 128 to 64: reloads v3 with the constants at CTX_OFFSET + 32 + ldr q3, [x0, #(CTX_OFFSET + 32)] + FOLD_128_TO_64 \le, v3 + +7: // reduce 64 to 32: reloads v3 with the constants at CTX_OFFSET + 48; FOLD_64_TO_32 leaves the result in w0 + ldr q3, [x0, #(CTX_OFFSET + 48)] + FOLD_64_TO_32 \le, v3 + ret + +8: // less than 64 bytes: with at least 16 bytes, seed v16 from the first 16 bytes xor crc and join the 1x fold path + cmp x3, #16 + b.lo 9f // less than 16 bytes + ldr q16, [x2], #16 + eor v16.16b, v16.16b, v1.16b +.if ! \le + tbl v16.16b, { v16.16b }, v2.16b +.endif + ldr q3, [x0, #(CTX_OFFSET + 16)] + b 3b // fold 1x pre + +9: // less than 16 bytes: x15 = partial_bytes_shuf_tab + length; 16 zero bytes are pushed on the stack and x9 = sp + length + movrel x15, partial_bytes_shuf_tab + add x15, x15, x3 + str q0, [sp, #-16]! + add x9, sp, x3 + // memcpy 0 to 15 bytes from src to stack and load it to v16 with zero extension; each size class copies a head piece and a possibly overlapping tail piece, so no loop is needed + tbz x3, #3, 10f + ldr x11, [x2] + ldr x12, [x10, #-8] + str x11, [sp] + str x12, [x9, #-8] + b 12f +10: // bit 3 of the length is clear: 0 to 7 bytes + tbz x3, #2, 11f + ldr w11, [x2] + ldr w12, [x10, #-4] + str w11, [sp] + str w12, [x9, #-4] + b 12f +11: // bits 3 and 2 of the length are clear: 0 to 3 bytes; copies the first byte, the byte at index length / 2 and the last byte + cbz x3, 12f + lsr x11, x3, #1 + ldrb w12, [x2] + ldrb w14, [x10, #-1] + ldrb w13, [x2, x11] + strb w12, [sp] + strb w13, [sp, x11] + strb w14, [x9, #-1] +12: // reload the zero-padded block into v16, pop the scratch area, then xor in the crc + ldr q16, [sp], #16 + eor v16.16b, v16.16b, v1.16b + cmp x3, #5 + b.lo 13f // less than 5 bytes + ldr q17, [x15] + tbl v16.16b, { v16.16b }, v17.16b +.if ! \le + tbl v16.16b, { v16.16b }, v2.16b +.endif + b 6b // reduce 128 to 64 + +13: // less than 5 bytes: shuffle with a variant-specific table offset (x15 + 12 or x15 + 8) and skip the 128 to 64 step +.if ! 
\le + ldr q17, [x15, #12] + tbl v16.16b, { v16.16b }, v17.16b + rev64 v16.16b, v16.16b +.else + ldr q17, [x15, #8] + tbl v16.16b, { v16.16b }, v17.16b +.endif + b 7b // reduce 64 to 32 +endfunc +.endm +// instantiate both entry points: ff_crc_neon_pmull (le = 0) and ff_crc_le_neon_pmull (le = 1) +crc_fn_template 0 +crc_fn_template 1 + +DISABLE_PMULL +DISABLE_EOR3 +#endif diff --git a/libavutil/aarch64/crc.h b/libavutil/aarch64/crc.h index 08efdd28f3..e31625606a 100644 --- a/libavutil/aarch64/crc.h +++ b/libavutil/aarch64/crc.h @@ -29,26 +29,165 @@ #include "libavutil/avassert.h" #include "libavutil/cpu.h" #include "libavutil/crc.h" +#include "libavutil/intreadwrite.h" +#if HAVE_ARM_CRC FF_VISIBILITY_PUSH_HIDDEN uint32_t ff_crc32_aarch64(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length); FF_VISIBILITY_POP_HIDDEN +#endif + +#if HAVE_PMULL && HAVE_EOR3 +#include "libavutil/crc_internal.h" + +FF_VISIBILITY_PUSH_HIDDEN +uint32_t ff_crc_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, + size_t length); +uint32_t ff_crc_le_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, + size_t length); +FF_VISIBILITY_POP_HIDDEN +// Implementation tag kept in ctx[0]: crc_init_aarch64() and the crc_table_pmull rows store PMULL_BE or PMULL_LE there, and ff_crc_aarch64() switches on it. +enum { + CRC_C = 0, + PMULL_BE, + PMULL_LE, +}; +// Precomputed contexts, one row per AVCRCId: word 0 is the PMULL_BE or PMULL_LE tag, words 1 to 16 are eight 64-bit constants written as (low, high) 32-bit pairs. The rows appear to mirror the block crc_init_aarch64() writes after ctx[0] (e.g. the last pair of the BE rows matches poly_ | (1ULL << 32)) - NOTE(review): the pair order presumably assumes a little-endian target; confirm. +static const AVCRC crc_table_pmull[AV_CRC_MAX][17] = { + [AV_CRC_8_ATM] = { + PMULL_BE, + 0xbc000000, 0x0, 0x32000000, 0x0, + 0x94000000, 0x0, 0xc4000000, 0x0, + 0x62000000, 0x0, 0x79000000, 0x0, + 0x07156a16, 0x1, 0x07000000, 0x1, + }, + [AV_CRC_8_EBU] = { + PMULL_BE, + 0xf3000000, 0x0, 0xb5000000, 0x0, + 0x0d000000, 0x0, 0xfc000000, 0x0, + 0x6a000000, 0x0, 0x65000000, 0x0, + 0x1c4b8192, 0x1, 0x1d000000, 0x1, + }, + [AV_CRC_16_ANSI] = { + PMULL_BE, + 0x807d0000, 0x0, 0xf9e30000, 0x0, + 0xff830000, 0x0, 0xf9130000, 0x0, + 0x807b0000, 0x0, 0x86630000, 0x0, + 0xfffbffe7, 0x1, 0x80050000, 0x1, + }, + [AV_CRC_16_CCITT] = { + PMULL_BE, + 0x59b00000, 0x0, 0x60190000, 0x0, + 0x45630000, 0x0, 0xd5f60000, 0x0, + 0xaa510000, 0x0, 0xeb230000, 0x0, + 0x11303471, 0x1, 0x10210000, 0x1, + }, + [AV_CRC_24_IEEE] = { + PMULL_BE, + 0x467d2400, 0x0, 0x1f428700, 0x0, + 
0x64e4d700, 0x0, 0x2c8c9d00, 0x0, + 0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0, + 0xf845fe24, 0x1, 0x864cfb00, 0x1, + }, + [AV_CRC_32_IEEE] = { + PMULL_BE, + 0xe6228b11, 0x0, 0x8833794c, 0x0, + 0xe8a45605, 0x0, 0xc5b9cd4c, 0x0, + 0x490d678d, 0x0, 0xf200aa66, 0x0, + 0x04d101df, 0x1, 0x04c11db7, 0x1, + }, + [AV_CRC_32_IEEE_LE] = { + PMULL_LE, + 0x54442bd4, 0x1, 0xc6e41596, 0x1, + 0x751997d0, 0x1, 0xccaa009e, 0x0, + 0xccaa009e, 0x0, 0x63cd6124, 0x1, + 0xf7011640, 0x1, 0xdb710641, 0x1, + }, + [AV_CRC_16_ANSI_LE] = { + PMULL_LE, + 0x1b0c2, 0x0, 0x0000bffa, 0x0, + 0x1d0c2, 0x0, 0x00018cc2, 0x0, + 0x00018cc2, 0x0, 0x1bc02, 0x0, + 0xcfffbffe, 0x1, 0x14003, 0x0, + }, +}; + + +static inline void crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) +{ + uint64_t poly_; + if (le) { + // convert the reversed representation to regular form + poly = reverse(poly, bits) >> 1; + } + // convert to 32 degree polynomial (left-align the bits-wide poly within 32 bits) + poly_ = ((uint64_t)poly) << (32 - bits); + // Layout written below: ctx[0] = tag, then eight 64-bit constants at byte offsets 0 to 56 from ctx + 1 (64 bytes in total). crc.S loads them in 16-byte pairs: offset 0 for the 4x fold loop, 16 for the 1x fold, 32 for the 128 to 64 step and 48 for the 64 to 32 step. ctx_size is not used in this function. + uint64_t div; + uint8_t *dst = (uint8_t*)(ctx + 1); + if (le) { + ctx[0] = PMULL_LE; + AV_WN64(dst + 0, xnmodp(4 * 128 + 32, poly_, 32, &div, le)); + AV_WN64(dst + 8, xnmodp(4 * 128 - 32, poly_, 32, &div, le)); + uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le); + AV_WN64(dst + 16, xnmodp(128 + 32, poly_, 32, &div, le)); + AV_WN64(dst + 24, tmp); + AV_WN64(dst + 32, tmp); + AV_WN64(dst + 40, xnmodp(64, poly_, 32, &div, le)); + AV_WN64(dst + 48, div); + AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32)); + } else { + ctx[0] = PMULL_BE; + AV_WN64(dst + 0, xnmodp(4 * 128, poly_, 32, &div, le)); + AV_WN64(dst + 8, xnmodp(4 * 128 + 64, poly_, 32, &div, le)); + AV_WN64(dst + 16, xnmodp(128, poly_, 32, &div, le)); + AV_WN64(dst + 24, xnmodp(128 + 64, poly_, 32, &div, le)); + AV_WN64(dst + 32, xnmodp(64, poly_, 32, &div, le)); + AV_WN64(dst + 48, div); // presumably the value xnmodp(64, ...) just left in div; keep this store ahead of the xnmodp(96, ...) call below, which is passed &div again + AV_WN64(dst + 40, xnmodp(96, poly_, 32, &div, le)); + AV_WN64(dst + 56, poly_ | (1ULL << 32)); + } +} +#endif +// Returns 1 after crc_init_aarch64() has filled ctx, which happens only when built with PMULL and EOR3 support and the CPU flags report both; returns 0 otherwise. av_crc_init() returns early on a nonzero result and otherwise continues with its own setup. +static inline av_cold int 
ff_crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) +{ +#if HAVE_PMULL && HAVE_EOR3 + int cpu_flags = av_get_cpu_flags(); + + if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) { + crc_init_aarch64(ctx, le, bits, poly, ctx_size); + return 1; + } +#endif + return 0; +} static inline uint32_t ff_crc_aarch64(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length) { -#if HAVE_ARM_CRC - av_assert2(ctx[0] == AV_CRC_32_IEEE_LE + 1); - return ff_crc32_aarch64(ctx, crc, buffer, length); -#else - av_unreachable("AARCH64 has only AV_CRC_32_IEEE_LE arch-specific CRC code"); - return 0; + switch (ctx[0]) { +#if HAVE_PMULL && HAVE_EOR3 + case PMULL_BE: return ff_crc_neon_pmull(ctx, crc, buffer, length); + case PMULL_LE: return ff_crc_le_neon_pmull(ctx, crc, buffer, length); #endif +#if HAVE_ARM_CRC + case (AV_CRC_32_IEEE_LE + 1): return ff_crc32_aarch64(ctx, crc, buffer, length); +#endif + default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE and AV_CRC_32_IEEE_LE arch-specific CRC code"); + } + return 0; } static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id) { + int cpu_flags = av_get_cpu_flags(); +#if HAVE_PMULL && HAVE_EOR3 + if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) { + return crc_table_pmull[crc_id]; + } +#endif #if HAVE_ARM_CRC static const AVCRC crc32_ieee_le_ctx[] = { AV_CRC_32_IEEE_LE + 1 @@ -57,7 +196,6 @@ static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id) if (crc_id != AV_CRC_32_IEEE_LE) return NULL; - int cpu_flags = av_get_cpu_flags(); if (have_arm_crc(cpu_flags)) { return crc32_ieee_le_ctx; } diff --git a/libavutil/crc.c b/libavutil/crc.c index c8ccaf8162..335448f8db 100644 --- a/libavutil/crc.c +++ b/libavutil/crc.c @@ -357,6 +357,10 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) int done = ff_crc_init_x86(ctx, le, bits, poly, ctx_size); if (done) return 0; +#elif ARCH_AARCH64 + int done = ff_crc_init_aarch64(ctx, le, bits, poly, ctx_size); + if 
(done) + return 0; #endif for (i = 0; i < 256; i++) {