avutil/crc: add aarch64 NEON PMULL+EOR3 SIMD implementation for av_crc

Implement the clmul algorithm for aarch64 using the PMULL and EOR3 instructions.
The logic and structure are the same as the x86 clmul implementation, with
a slight rearrangement of the constants to suit the PMULL and PMULL2 instructions.

Benchmarked on Android (Termux) on a MediaTek Dimensity 9400 SoC:

./tests/checkasm/checkasm --test=crc --bench --runs=12
benchmarking with native FFmpeg timers
nop: 0.2
checkasm: SVE 128 bits, using random seed 2502847808
checkasm: bench runs 4096 (1 << 12)
CRC:
 - crc.crc [OK]
PMULL:
 - crc.crc [OK]
checkasm: all 10 tests passed
crc_8_ATM_c:                                            26.0 ( 1.00x)
crc_8_ATM_pmull_eor3:                                    0.7 (37.17x)
crc_8_EBU_c:                                            46.4 ( 1.00x)
crc_8_EBU_pmull_eor3:                                    1.5 (31.47x)
crc_16_ANSI_c:                                          36.3 ( 1.00x)
crc_16_ANSI_pmull_eor3:                                  1.1 (31.70x)
crc_16_ANSI_LE_c:                                       90.9 ( 1.00x)
crc_16_ANSI_LE_pmull_eor3:                               2.8 (32.30x)
crc_16_CCITT_c:                                        118.0 ( 1.00x)
crc_16_CCITT_pmull_eor3:                                 3.7 (32.00x)
crc_24_IEEE_c:                                           1.6 ( 1.00x)
crc_24_IEEE_pmull_eor3:                                  0.1 (12.19x)
crc_32_IEEE_c:                                          45.2 ( 1.00x)
crc_32_IEEE_pmull_eor3:                                  1.4 (31.39x)
crc_32_IEEE_LE_c:                                       49.1 ( 1.00x)
crc_32_IEEE_LE_crc:                                      2.5 (19.51x)
crc_32_IEEE_LE_pmull_eor3:                               1.5 (32.84x)
crc_custom_polynomial_c:                                45.3 ( 1.00x)
crc_custom_polynomial_pmull_eor3:                        1.3 (35.16x)
This commit is contained in:
Shreesh Adiga
2026-02-05 18:49:21 +05:30
committed by Martin Storsjö
parent 952e588600
commit 5085432f8b
3 changed files with 414 additions and 7 deletions

View File

@@ -1,5 +1,6 @@
/*
* Copyright (c) 2025 Zhao Zhili <quinkblack@foxmail.com>
* Copyright (c) 2026 Shreesh Adiga <16567adigashreesh@gmail.com>
*
* This file is part of FFmpeg.
*
@@ -67,3 +68,267 @@ endfunc
DISABLE_ARM_CRC
#endif
#if HAVE_PMULL && HAVE_EOR3
ENABLE_PMULL
ENABLE_EOR3
// TBL index vector 15, 14, ..., 0: used as the index operand of TBL it
// reverses the byte order of a 16-byte vector.
// Loaded into v2 by the non-le variant of the CRC function.
const reverse_shuffle, align=4
.byte 15, 14, 13, 12
.byte 11, 10, 9, 8
.byte 7, 6, 5, 4
.byte 3, 2, 1, 0
endconst
// 32-byte sliding TBL index table used for partial (less than 16 byte) blocks.
// The first 16 entries (255..240) are out of range for a one-register TBL and
// therefore select zero; they also have the top bit set, which the partial
// block code tests with a signed compare. The last 16 entries (0..15) are the
// identity permutation.
// The CRC function loads 16 bytes starting at a byte offset derived from the
// number of bytes being handled, so the loaded window mixes zeroing indices
// with identity indices and shifts the data by whole bytes.
const partial_bytes_shuf_tab, align=4
.byte 255, 254, 253, 252
.byte 251, 250, 249, 248
.byte 247, 246, 245, 244
.byte 243, 242, 241, 240
.byte 0, 1, 2, 3
.byte 4, 5, 6, 7
.byte 8, 9, 10, 11
.byte 12, 13, 14, 15
endconst
// performs Vfold = pmull(Vfold, Vconst) xor pmull2(Vfold, Vconst) xor Vdata
//
// Vfold  - 128-bit accumulator, updated in place
// Vconst - pair of 64-bit fold constants (low half pairs with the low half of
//          Vfold, high half with the high half); not modified
// Vdata  - next 128-bit data block to absorb; not modified
// Vtemp  - scratch register, clobbered
.macro FOLD_SINGLE Vfold, Vconst, Vdata, Vtemp
// low 64 bits of Vfold times low 64 bits of Vconst
pmull \Vtemp\().1q, \Vconst\().1d, \Vfold\().1d
// high 64 bits of Vfold times high 64 bits of Vconst
pmull2 \Vfold\().1q, \Vconst\().2d, \Vfold\().2d
// three-way xor combines both products with the incoming data
eor3 \Vfold\().16b, \Vfold\().16b, \Vdata\().16b, \Vtemp\().16b
.endm
// Reduces the 128-bit fold value in v16 towards 64 bits using the two 64-bit
// constants in Vconst.
// assume Vfold is v16 and v0 is filled with 0
// uses v17 as temp
// le     - assemble-time flag: 0 selects the non-le sequence, nonzero the le one
// Vconst - constant pair; NOTE: its two 64-bit halves are swapped by this macro
//          (EXT #8 with itself), so it must be reloaded before any other use
.macro FOLD_128_TO_64 le, Vconst
.if ! \le
// v17 = low 64 bits of v16, upper 64 bits cleared
fmov d17, d16
// v16 = high half of v16 times high half of Vconst
pmull2 v16.1q, v16.2d, \Vconst\().2d
// shift v17 up by 4 bytes, filling with zero bytes taken from v0
ext v17.16b, v0.16b, v17.16b, #12
eor v16.16b, v17.16b, v16.16b
// swap the halves of Vconst so the former low constant is reachable by pmull2
ext \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8
pmull2 v17.1q, \Vconst\().2d, v16.2d
eor v16.16b, v16.16b, v17.16b
.else
// v17 = high 64 bits of v16 moved to the low half, upper half zero
ext v17.16b, v16.16b, v0.16b, #8
// v16 = low half of v16 times low half of Vconst
pmull v16.1q, v16.1d, \Vconst\().1d
eor v16.16b, v16.16b, v17.16b
// v17 = v16 shifted up by 4 bytes, filling with zero bytes taken from v0
ext v17.16b, v0.16b, v16.16b, #12
// swap the halves of Vconst so the former high constant is reachable by pmull
ext \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8
pmull v17.1q, \Vconst\().1d, v17.1d
eor v16.16b, v16.16b, v17.16b
.endif
.endm
// Final reduction of the value in v16 to the 32-bit CRC, returned in w0.
// The two constants loaded for this step are the quotient and the polynomial
// written by the C init code, so this is presumably a Barrett-style
// reduction - TODO confirm.
// assume Vfold is v16 (v0 is not referenced by this macro)
// uses v17 as temp
// le     - assemble-time flag: 0 selects the non-le sequence, nonzero the le one
// Vconst - constant pair; the le sequence swaps its two 64-bit halves
.macro FOLD_64_TO_32 le, Vconst
.if ! \le
// multiply the low 64 bits by the low constant ...
pmull v17.1q, v16.1d, \Vconst\().1d
// ... then the high half of that product by the high constant
pmull2 v17.1q, v17.2d, \Vconst\().2d
eor v16.16b, v16.16b, v17.16b
// result is the low 32 bits, byte-swapped before returning
fmov w0, s16
rev w0, w0
.else
// clear the lowest 32-bit lane before the multiply
mov v16.s[0], wzr
pmull v17.1q, v16.1d, \Vconst\().1d
eor v17.16b, v17.16b, v16.16b
// bring the former high constant into the low half for the second pmull
ext \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8
pmull v17.1q, v17.1d, \Vconst\().1d
eor v16.16b, v16.16b, v17.16b
// result is 32-bit lane 2 (bits 64..95) of v16
mov w0, v16.s[2]
.endif
.endm
// Byte offset of the first fold constant inside the context. The C init code
// stores the implementation tag in ctx[0] and writes the constants starting
// at ctx + 1.
#define CTX_OFFSET 4
// ff_crc[_le]_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length)
// x0 - ctx; pre-computed fold constants, see crc_init_aarch64 for details on the structure.
//      Four 16-byte constant pairs are loaded from it:
//        CTX_OFFSET + 0  : 4x fold loop
//        CTX_OFFSET + 16 : 1x fold
//        CTX_OFFSET + 32 : 128 to 64 reduction
//        CTX_OFFSET + 48 : 64 to 32 reduction
// x1 - crc initial value
// x2 - buffer pointer
// x3 - length of buffer
// Returns the updated crc in w0.
//
// Additional registers used:
// x10 - buffer + length; points to end of buffer
// x9, x11-x15 used as temp registers as needed
// x1 is reused as a table pointer in the partial block path, after the
// initial crc has been copied to v1
//
// v0 - zero register to perform vector shift during reduction using EXT
// v1 - crc input from x1
// v2 - holds the reverse shuffle values 15, 14 ... 0 to reverse vector using TBL
// v3 - holds the constant values used in PMULL/PMULL2 loaded via x0 ctx
// v4-v7 - used as temp for PMULL/PMULL2 output
// v16 - holds the primary fold register later reduced to 32 bits
// Registers v16-v19 are used in parallel 4x fold loop
// Registers v20-23 are used to load the 4 next data chunks
// Registers v17-v21 are used as temp elsewhere
//
// The short input path (less than 16 bytes) temporarily pushes a 16-byte
// scratch buffer on the stack and pops it again before continuing.
//
// le - assemble-time flag: nonzero emits ff_crc_le_neon_pmull, zero emits
//      ff_crc_neon_pmull, which additionally byte-reverses every loaded block.
.macro crc_fn_template le
.if \le
function ff_crc_le_neon_pmull, export=1
.else
function ff_crc_neon_pmull, export=1
// only the non-le variant needs the byte reversal indices in v2
movrel x9, reverse_shuffle
ldr q2, [x9]
.endif
// x10 = one past the last input byte
add x10, x2, x3
// v0 = 0, v1 = initial crc in the lowest 32-bit lane, zero elsewhere
movi v0.2d, #0
mov v1.16b, v0.16b
mov v1.s[0], w1
cmp x3, #64
b.lo 8f // less than 64 bytes
// load the first 64 bytes and mix the initial crc into the first block
// (done on the bytes as loaded, before any byte reversal)
ld1 {v16.16b-v19.16b}, [x2], #64
eor v16.16b, v16.16b, v1.16b
.if ! \le
tbl v16.16b, { v16.16b }, v2.16b
tbl v17.16b, { v17.16b }, v2.16b
tbl v18.16b, { v18.16b }, v2.16b
tbl v19.16b, { v19.16b }, v2.16b
.endif
// x11 = bytes remaining; skip the 4x loop unless another 64 bytes are available
sub x11, x10, x2
cmp x11, #64
b.lo 2f // reduce 4x to 1x
ldr q3, [x0, #(CTX_OFFSET + 0)]
1: // fold 4x loop
// four independent accumulators v16-v19 each absorb one of the next four blocks
ld1 {v20.16b-v23.16b}, [x2], #64
.if ! \le
tbl v20.16b, { v20.16b }, v2.16b
tbl v21.16b, { v21.16b }, v2.16b
tbl v22.16b, { v22.16b }, v2.16b
tbl v23.16b, { v23.16b }, v2.16b
.endif
pmull v4.1q, v16.1d, v3.1d
pmull v5.1q, v17.1d, v3.1d
pmull v6.1q, v18.1d, v3.1d
pmull v7.1q, v19.1d, v3.1d
pmull2 v16.1q, v16.2d, v3.2d
pmull2 v17.1q, v17.2d, v3.2d
pmull2 v18.1q, v18.2d, v3.2d
pmull2 v19.1q, v19.2d, v3.2d
eor3 v16.16b, v16.16b, v4.16b, v20.16b
eor3 v17.16b, v17.16b, v5.16b, v21.16b
eor3 v18.16b, v18.16b, v6.16b, v22.16b
eor3 v19.16b, v19.16b, v7.16b, v23.16b
sub x11, x10, x2
cmp x11, #64
b.hs 1b // fold 4x loop
2: // reduce 4x to 1x
// switch to the 1x fold constants and fold v17, v18, v19 into v16 in turn
ldr q3, [x0, #(CTX_OFFSET + 16)]
FOLD_SINGLE v16, v3, v17, v4
FOLD_SINGLE v16, v3, v18, v4
FOLD_SINGLE v16, v3, v19, v4
3: // fold 1x pre
// entered with v3 already holding the 1x fold constants
sub x11, x10, x2
cmp x11, #16
b.lo 5f // partial_block
4: // fold 1x loop
ldr q17, [x2], #16
.if ! \le
tbl v17.16b, { v17.16b }, v2.16b
.endif
FOLD_SINGLE v16, v3, v17, v4
sub x11, x10, x2
cmp x11, #16
b.hs 4b // fold 1x loop
5: // partial_block
// x11 = number of bytes left (0..15); nothing to do when the end was reached
cmp x10, x2
b.eq 6f // reduce 128 to 64
// load the final 16 bytes of the buffer; this overlaps bytes already consumed,
// only its last x11 bytes are new
ldr q17, [x10, #-16]
movrel x1, partial_bytes_shuf_tab
// v18 = (16 - x11) zeroing indices followed by identity indices 0 .. x11-1
ldr q18, [x1, x11]
.if ! \le
// undo the byte reversal so the shuffles below work in load order
tbl v16.16b, { v16.16b }, v2.16b
.endif
// the inverted indices select bytes x11..15 of v16 into the low positions
// and zero the rest
mvn v19.16b, v18.16b
tbl v20.16b, { v16.16b }, v19.16b
// mask = lanes whose index in v18 is non-negative, i.e. the last x11 lanes
cmge v21.16b, v18.16b, #0
// v21 = shifted accumulator bytes in the low lanes, new tail bytes in the masked lanes
bsl v21.16b, v17.16b, v20.16b
// v16 = its first x11 bytes moved to the top lanes, zero below
tbl v16.16b, { v16.16b }, v18.16b
.if ! \le
tbl v16.16b, { v16.16b }, v2.16b
tbl v21.16b, { v21.16b }, v2.16b
.endif
FOLD_SINGLE v16, v3, v21, v4
6: // reduce 128 to 64
ldr q3, [x0, #(CTX_OFFSET + 32)]
FOLD_128_TO_64 \le, v3
7: // reduce 64 to 32
ldr q3, [x0, #(CTX_OFFSET + 48)]
FOLD_64_TO_32 \le, v3
ret
8: // less than 64 bytes
cmp x3, #16
b.lo 9f // less than 16 bytes
// 16..63 bytes: seed v16 with the first block xor crc, then reuse the 1x fold path
ldr q16, [x2], #16
eor v16.16b, v16.16b, v1.16b
.if ! \le
tbl v16.16b, { v16.16b }, v2.16b
.endif
ldr q3, [x0, #(CTX_OFFSET + 16)]
b 3b // fold 1x pre
9: // less than 16 bytes
// x15 = shuffle table window for this length
movrel x15, partial_bytes_shuf_tab
add x15, x15, x3
// push a zeroed 16-byte scratch buffer; x9 = scratch + length
str q0, [sp, #-16]!
add x9, sp, x3
// memcpy 0 to 15 bytes from src to stack and load it to v16 with zero extension
// Each size class copies a leading and a trailing chunk, which may overlap,
// so no byte loop is needed.
// bit 3 set: 8..15 bytes, copy first and last 8 bytes
tbz x3, #3, 10f
ldr x11, [x2]
ldr x12, [x10, #-8]
str x11, [sp]
str x12, [x9, #-8]
b 12f
10:
// bit 2 set: 4..7 bytes, copy first and last 4 bytes
tbz x3, #2, 11f
ldr w11, [x2]
ldr w12, [x10, #-4]
str w11, [sp]
str w12, [x9, #-4]
b 12f
11:
// 0..3 bytes: nothing to copy for 0, otherwise copy first, middle and last byte
cbz x3, 12f
lsr x11, x3, #1
ldrb w12, [x2]
ldrb w14, [x10, #-1]
ldrb w13, [x2, x11]
strb w12, [sp]
strb w13, [sp, x11]
strb w14, [x9, #-1]
12:
// pop the scratch buffer into v16 and mix in the initial crc
ldr q16, [sp], #16
eor v16.16b, v16.16b, v1.16b
cmp x3, #5
b.lo 13f // less than 5 bytes
// 5..15 bytes: shift the data to the top of the vector, zero below
ldr q17, [x15]
tbl v16.16b, { v16.16b }, v17.16b
.if ! \le
tbl v16.16b, { v16.16b }, v2.16b
.endif
b 6b // reduce 128 to 64
13: // less than 5 bytes
// 0..4 bytes: position the data with a different table window and skip
// straight to the last reduction step
.if ! \le
ldr q17, [x15, #12]
tbl v16.16b, { v16.16b }, v17.16b
// reverse the bytes within each 64-bit half
rev64 v16.16b, v16.16b
.else
ldr q17, [x15, #8]
tbl v16.16b, { v16.16b }, v17.16b
.endif
b 7b // reduce 64 to 32
endfunc
.endm
crc_fn_template 0
crc_fn_template 1
DISABLE_PMULL
DISABLE_EOR3
#endif

View File

@@ -29,26 +29,165 @@
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/crc.h"
#include "libavutil/intreadwrite.h"
#if HAVE_ARM_CRC
FF_VISIBILITY_PUSH_HIDDEN
uint32_t ff_crc32_aarch64(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
size_t length);
FF_VISIBILITY_POP_HIDDEN
#endif
#if HAVE_PMULL && HAVE_EOR3
#include "libavutil/crc_internal.h"
FF_VISIBILITY_PUSH_HIDDEN
uint32_t ff_crc_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
size_t length);
uint32_t ff_crc_le_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
size_t length);
FF_VISIBILITY_POP_HIDDEN
/*
 * Implementation tag stored in ctx[0] of a PMULL CRC context.
 * ff_crc_aarch64() switches on this value to select the assembly routine.
 */
enum {
CRC_C = 0, // not referenced in the visible code; presumably marks the plain C path - TODO confirm
PMULL_BE, // context for ff_crc_neon_pmull
PMULL_LE, // context for ff_crc_le_neon_pmull
};
/*
 * Precomputed PMULL contexts for the standard CRC ids, indexed by AVCRCId and
 * returned directly by ff_crc_get_table_aarch64() when PMULL and EOR3 are
 * available.
 *
 * Each row uses the same layout that crc_init_aarch64() writes at runtime:
 *   [0]      implementation tag (PMULL_BE or PMULL_LE)
 *   [1..16]  eight 64-bit constants, each spelled as two entries, low word
 *            first (assumes 32-bit AVCRC entries on a little-endian target
 *            - TODO confirm)
 * The assembly reads the constants as four 16-byte pairs, in row order:
 * 4x fold, 1x fold, 128 to 64 reduction, 64 to 32 reduction.
 *
 * NOTE(review): the values are presumably what crc_init_aarch64() would
 * compute for each polynomial; that cannot be verified from this file alone.
 */
static const AVCRC crc_table_pmull[AV_CRC_MAX][17] = {
[AV_CRC_8_ATM] = {
PMULL_BE,
0xbc000000, 0x0, 0x32000000, 0x0,
0x94000000, 0x0, 0xc4000000, 0x0,
0x62000000, 0x0, 0x79000000, 0x0,
0x07156a16, 0x1, 0x07000000, 0x1,
},
[AV_CRC_8_EBU] = {
PMULL_BE,
0xf3000000, 0x0, 0xb5000000, 0x0,
0x0d000000, 0x0, 0xfc000000, 0x0,
0x6a000000, 0x0, 0x65000000, 0x0,
0x1c4b8192, 0x1, 0x1d000000, 0x1,
},
[AV_CRC_16_ANSI] = {
PMULL_BE,
0x807d0000, 0x0, 0xf9e30000, 0x0,
0xff830000, 0x0, 0xf9130000, 0x0,
0x807b0000, 0x0, 0x86630000, 0x0,
0xfffbffe7, 0x1, 0x80050000, 0x1,
},
[AV_CRC_16_CCITT] = {
PMULL_BE,
0x59b00000, 0x0, 0x60190000, 0x0,
0x45630000, 0x0, 0xd5f60000, 0x0,
0xaa510000, 0x0, 0xeb230000, 0x0,
0x11303471, 0x1, 0x10210000, 0x1,
},
[AV_CRC_24_IEEE] = {
PMULL_BE,
0x467d2400, 0x0, 0x1f428700, 0x0,
0x64e4d700, 0x0, 0x2c8c9d00, 0x0,
0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
0xf845fe24, 0x1, 0x864cfb00, 0x1,
},
[AV_CRC_32_IEEE] = {
PMULL_BE,
0xe6228b11, 0x0, 0x8833794c, 0x0,
0xe8a45605, 0x0, 0xc5b9cd4c, 0x0,
0x490d678d, 0x0, 0xf200aa66, 0x0,
0x04d101df, 0x1, 0x04c11db7, 0x1,
},
[AV_CRC_32_IEEE_LE] = {
PMULL_LE,
0x54442bd4, 0x1, 0xc6e41596, 0x1,
0x751997d0, 0x1, 0xccaa009e, 0x0,
0xccaa009e, 0x0, 0x63cd6124, 0x1,
0xf7011640, 0x1, 0xdb710641, 0x1,
},
[AV_CRC_16_ANSI_LE] = {
PMULL_LE,
0x1b0c2, 0x0, 0x0000bffa, 0x0,
0x1d0c2, 0x0, 0x00018cc2, 0x0,
0x00018cc2, 0x0, 0x1bc02, 0x0,
0xcfffbffe, 0x1, 0x14003, 0x0,
},
};
/*
 * Fill ctx with the constants consumed by the PMULL assembly routines for an
 * arbitrary polynomial.
 *
 * ctx[0] receives the implementation tag (PMULL_LE or PMULL_BE); eight 64-bit
 * constants follow, starting at ctx + 1, in the order the assembly loads
 * them: 4x fold pair, 1x fold pair, 128 to 64 pair, 64 to 32 pair.
 *
 * ctx      - context to initialise
 * le       - nonzero for a bit-reversed (little-endian) CRC
 * bits     - degree of the polynomial
 * poly     - generator polynomial, in reversed form when le is set
 * ctx_size - not used by this function
 */
static inline void crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
{
    uint64_t consts[8];
    uint64_t quotient;
    uint64_t poly32;
    uint8_t *out = (uint8_t *)(ctx + 1);

    // convert the reversed representation to regular form
    if (le)
        poly = reverse(poly, bits) >> 1;

    // convert to 32 degree polynomial
    poly32 = ((uint64_t)poly) << (32 - bits);

    if (le) {
        ctx[0] = PMULL_LE;
        consts[0] = xnmodp(4 * 128 + 32, poly32, 32, &quotient, le);
        consts[1] = xnmodp(4 * 128 - 32, poly32, 32, &quotient, le);
        // the x^(128-32) remainder is shared by the 1x fold and 128 to 64 pairs
        consts[3] = xnmodp(128 - 32, poly32, 32, &quotient, le);
        consts[2] = xnmodp(128 + 32, poly32, 32, &quotient, le);
        consts[4] = consts[3];
        consts[5] = xnmodp(64, poly32, 32, &quotient, le);
        // quotient reported by the xnmodp(64, ...) call just above
        consts[6] = quotient;
        consts[7] = reverse(poly32 | (1ULL << 32), 32);
    } else {
        ctx[0] = PMULL_BE;
        consts[0] = xnmodp(4 * 128, poly32, 32, &quotient, le);
        consts[1] = xnmodp(4 * 128 + 64, poly32, 32, &quotient, le);
        consts[2] = xnmodp(128, poly32, 32, &quotient, le);
        consts[3] = xnmodp(128 + 64, poly32, 32, &quotient, le);
        consts[4] = xnmodp(64, poly32, 32, &quotient, le);
        // capture the quotient of the xnmodp(64, ...) call before the next
        // call overwrites it
        consts[6] = quotient;
        consts[5] = xnmodp(96, poly32, 32, &quotient, le);
        consts[7] = poly32 | (1ULL << 32);
    }

    // store all eight constants back to back in native byte order
    for (int i = 0; i < 8; i++)
        AV_WN64(out + 8 * i, consts[i]);
}
#endif
/*
 * Arch hook for av_crc_init(): build a PMULL context for a custom polynomial
 * when the running CPU supports both PMULL and EOR3.
 *
 * Returns 1 when ctx was initialised here, 0 when the caller has to fall back
 * to its generic initialisation.
 */
static inline av_cold int ff_crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
{
#if HAVE_PMULL && HAVE_EOR3
    const int flags = av_get_cpu_flags();

    // both extensions are required by the assembly
    if (!have_pmull(flags) || !have_eor3(flags))
        return 0;

    crc_init_aarch64(ctx, le, bits, poly, ctx_size);
    return 1;
#else
    return 0;
#endif
}
/*
 * Run the arch-specific CRC routine selected by the tag in ctx[0].
 *
 * ctx    - context whose first entry is PMULL_BE, PMULL_LE or
 *          AV_CRC_32_IEEE_LE + 1 (the tag used by the ARM CRC32 context)
 * crc    - initial CRC value
 * buffer - input data
 * length - number of bytes in buffer
 *
 * Returns the updated CRC. Any other tag is a caller bug and is marked
 * unreachable.
 *
 * The leftover pre-PMULL body (an unterminated "#if HAVE_ARM_CRC ... #else"
 * that returned ff_crc32_aarch64() unconditionally) has been removed; it left
 * the conditional unbalanced and made the switch below unreachable.
 */
static inline uint32_t ff_crc_aarch64(const AVCRC *ctx, uint32_t crc,
                                      const uint8_t *buffer, size_t length)
{
    switch (ctx[0]) {
#if HAVE_PMULL && HAVE_EOR3
    case PMULL_BE: return ff_crc_neon_pmull(ctx, crc, buffer, length);
    case PMULL_LE: return ff_crc_le_neon_pmull(ctx, crc, buffer, length);
#endif
#if HAVE_ARM_CRC
    case (AV_CRC_32_IEEE_LE + 1): return ff_crc32_aarch64(ctx, crc, buffer, length);
#endif
    default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE and AV_CRC_32_IEEE_LE arch-specific CRC code");
    }
    return 0;
}
static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_PMULL && HAVE_EOR3
if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) {
return crc_table_pmull[crc_id];
}
#endif
#if HAVE_ARM_CRC
static const AVCRC crc32_ieee_le_ctx[] = {
AV_CRC_32_IEEE_LE + 1
@@ -57,7 +196,6 @@ static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id)
if (crc_id != AV_CRC_32_IEEE_LE)
return NULL;
int cpu_flags = av_get_cpu_flags();
if (have_arm_crc(cpu_flags)) {
return crc32_ieee_le_ctx;
}

View File

@@ -357,6 +357,10 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
int done = ff_crc_init_x86(ctx, le, bits, poly, ctx_size);
if (done)
return 0;
#elif ARCH_AARCH64
int done = ff_crc_init_aarch64(ctx, le, bits, poly, ctx_size);
if (done)
return 0;
#endif
for (i = 0; i < 256; i++) {