avutil/crc: add aarch64 NEON PMULL+EOR3 SIMD implementation for av_crc

Implemented the clmul algorithm for aarch64 using the PMULL and EOR3 instructions. The logic and structure are the same as the x86 clmul implementation, with a slight rearrangement of the constants to suit the PMULL and PMULL2 instructions.

Benchmarking in Android (Termux) on a MediaTek Dimensity 9400 SoC:

./tests/checkasm/checkasm --test=crc --bench --runs=12
benchmarking with native FFmpeg timers
nop: 0.2
checkasm: SVE 128 bits, using random seed 2502847808
checkasm: bench runs 4096 (1 << 12)
CRC:
 - crc.crc [OK]
PMULL:
 - crc.crc [OK]
checkasm: all 10 tests passed
crc_8_ATM_c: 26.0 ( 1.00x)
crc_8_ATM_pmull_eor3: 0.7 (37.17x)
crc_8_EBU_c: 46.4 ( 1.00x)
crc_8_EBU_pmull_eor3: 1.5 (31.47x)
crc_16_ANSI_c: 36.3 ( 1.00x)
crc_16_ANSI_pmull_eor3: 1.1 (31.70x)
crc_16_ANSI_LE_c: 90.9 ( 1.00x)
crc_16_ANSI_LE_pmull_eor3: 2.8 (32.30x)
crc_16_CCITT_c: 118.0 ( 1.00x)
crc_16_CCITT_pmull_eor3: 3.7 (32.00x)
crc_24_IEEE_c: 1.6 ( 1.00x)
crc_24_IEEE_pmull_eor3: 0.1 (12.19x)
crc_32_IEEE_c: 45.2 ( 1.00x)
crc_32_IEEE_pmull_eor3: 1.4 (31.39x)
crc_32_IEEE_LE_c: 49.1 ( 1.00x)
crc_32_IEEE_LE_crc: 2.5 (19.51x)
crc_32_IEEE_LE_pmull_eor3: 1.5 (32.84x)
crc_custom_polynomial_c: 45.3 ( 1.00x)
crc_custom_polynomial_pmull_eor3: 1.3 (35.16x)
2026-04-20 21:00:41 +08:00 · 2026-02-05 18:49:21 +05:30
parent 952e588600
commit 5085432f8b
3 changed files with 414 additions and 7 deletions
--- a/libavutil/aarch64/crc.S
+++ b/libavutil/aarch64/crc.S
@@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2025 Zhao Zhili <quinkblack@foxmail.com>
+ * Copyright (c) 2026 Shreesh Adiga <16567adigashreesh@gmail.com>
 *
 * This file is part of FFmpeg.
 *
@@ -67,3 +68,267 @@ endfunc

 DISABLE_ARM_CRC
 #endif
+
+#if HAVE_PMULL && HAVE_EOR3
+ENABLE_PMULL
+ENABLE_EOR3
+
+const reverse_shuffle, align=4
+        .byte  15, 14, 13, 12 // TBL index vector 15..0: used to reverse the byte order of a 16-byte vector
+        .byte  11, 10,  9,  8
+        .byte   7,  6,  5,  4
+        .byte   3,  2,  1,  0
+endconst
+
+const partial_bytes_shuf_tab, align=4
+        .byte  255, 254, 253, 252 // first 16 entries (255..240): out of range for a one-register TBL (result byte 0), top bit set
+        .byte  251, 250, 249, 248
+        .byte  247, 246, 245, 244
+        .byte  243, 242, 241, 240 // the code loads a 16-byte window from this table at a byte offset to build its shuffle
+        .byte   0,  1,  2,  3 // last 16 entries: identity permutation 0..15
+        .byte   4,  5,  6,  7
+        .byte   8,  9, 10, 11
+        .byte  12, 13, 14, 15
+endconst
+
+// FOLD_SINGLE: Vfold = pmull(Vconst, Vfold) xor pmull2(Vconst, Vfold) xor Vdata; Vtemp is clobbered
+.macro FOLD_SINGLE Vfold, Vconst, Vdata, Vtemp
+        pmull           \Vtemp\().1q, \Vconst\().1d, \Vfold\().1d // carry-less product of the low 64-bit halves
+        pmull2          \Vfold\().1q, \Vconst\().2d, \Vfold\().2d // carry-less product of the high 64-bit halves
+        eor3            \Vfold\().16b, \Vfold\().16b, \Vdata\().16b, \Vtemp\().16b // three-way xor of both products and the data
+.endm
+
+// FOLD_128_TO_64: assumes the fold value is in v16 and v0 is filled with 0
+// uses v17 as temp; the two 64-bit halves of Vconst are swapped in place
+.macro FOLD_128_TO_64 le, Vconst
+.if ! \le
+        fmov            d17, d16 // v17 = low 64 bits of v16, upper 64 bits cleared
+        pmull2          v16.1q, v16.2d, \Vconst\().2d // v16 = clmul(high(v16), high(Vconst))
+        ext             v17.16b, v0.16b, v17.16b, #12 // move v17 up by 4 bytes, zero-filling from v0
+        eor             v16.16b, v17.16b, v16.16b
+        ext             \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8 // swap the 64-bit halves of Vconst
+        pmull2          v17.1q, \Vconst\().2d, v16.2d // v17 = clmul(high(swapped Vconst), high(v16))
+        eor             v16.16b, v16.16b, v17.16b
+.else
+        ext             v17.16b, v16.16b, v0.16b, #8 // v17 = high 64 bits of v16 moved to the low half, upper half zero
+        pmull           v16.1q, v16.1d, \Vconst\().1d // v16 = clmul(low(v16), low(Vconst))
+        eor             v16.16b, v16.16b, v17.16b
+        ext             v17.16b, v0.16b, v16.16b, #12 // v17 = v16 moved up by 4 bytes, zero-filling from v0
+        ext             \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8 // swap the 64-bit halves of Vconst
+        pmull           v17.1q, \Vconst\().1d, v17.1d // v17 = clmul(low(swapped Vconst), low(v17))
+        eor             v16.16b, v16.16b, v17.16b
+.endif
+.endm
+
+// FOLD_64_TO_32: assumes the fold value is in v16 and v0 is filled with 0
+// uses v17 as temp; leaves the 32-bit result in w0 (the le variant also swaps the halves of Vconst)
+.macro FOLD_64_TO_32 le, Vconst
+.if ! \le
+        pmull           v17.1q, v16.1d, \Vconst\().1d // v17 = clmul(low(v16), low(Vconst))
+        pmull2          v17.1q, v17.2d, \Vconst\().2d // v17 = clmul(high(v17), high(Vconst))
+        eor             v16.16b, v16.16b, v17.16b
+        fmov            w0, s16 // take the low 32 bits of v16
+        rev             w0, w0 // byte-swap the 32-bit result
+.else
+        mov             v16.s[0], wzr // clear the lowest 32-bit lane of v16
+        pmull           v17.1q, v16.1d, \Vconst\().1d // v17 = clmul(low(v16), low(Vconst))
+        eor             v17.16b, v17.16b, v16.16b
+        ext             \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8 // swap the 64-bit halves of Vconst
+        pmull           v17.1q, v17.1d, \Vconst\().1d // v17 = clmul(low(v17), low(swapped Vconst))
+        eor             v16.16b, v16.16b, v17.16b
+        mov             w0, v16.s[2] // result is taken from 32-bit lane 2
+.endif
+.endm
+
+#define CTX_OFFSET 4 // byte offset of the fold constants in ctx; skips ctx[0], the tag the C side stores first (assumes a 4-byte AVCRC)
+
+// uint32_t ff_crc[_le]_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length)
+// x0 - ctx; pre-computed fold constants read at CTX_OFFSET, see crc_init_aarch64 for details on the structure. Result is returned in w0.
+// x1 - crc initial value; copied into lane 0 of v1, x1 is later reused as a temp
+// x2 - buffer pointer; advanced as data is consumed
+// x3 - length of buffer
+//
+// Additional registers used:
+// x10 - buffer + length; points to end of buffer
+// x9 and x11-x15 used as temp registers as needed
+//
+// v0 - zero register to perform vector shift during reduction using EXT
+// v1 - crc input from x1
+// v2 - holds the reverse shuffle values 15, 14 ... 0 to reverse vector using TBL (loaded only in the non-le variant)
+// v3 - holds the constant values used in PMULL/PMULL2 loaded via x0 ctx
+// v4-v7 - used as temp for PMULL/PMULL2 output
+// v16 - holds the primary fold register later reduced to 32 bits
+// Registers v16-v19 are used in parallel 4x fold loop
+// Registers v20-23 are used to load the 4 next data chunks
+// Registers v17-v21 are used as temp elsewhere
+.macro crc_fn_template le
+.if \le
+function ff_crc_le_neon_pmull, export=1
+.else
+function ff_crc_neon_pmull, export=1
+        movrel          x9, reverse_shuffle
+        ldr             q2, [x9]
+.endif
+        add             x10, x2, x3 // x10 = end of buffer
+        movi            v0.2d, #0
+        mov             v1.16b, v0.16b
+        mov             v1.s[0], w1 // v1 = crc in lane 0, all other lanes zero
+        cmp             x3, #64
+        b.lo            8f // less than 64 bytes
+        ld1             {v16.16b-v19.16b}, [x2], #64
+        eor             v16.16b, v16.16b, v1.16b // xor the initial crc into the first 16 bytes
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+        tbl             v17.16b, { v17.16b }, v2.16b
+        tbl             v18.16b, { v18.16b }, v2.16b
+        tbl             v19.16b, { v19.16b }, v2.16b
+.endif
+        sub             x11, x10, x2 // x11 = bytes remaining
+        cmp             x11, #64
+        b.lo            2f // reduce 4x to 1x
+        ldr             q3, [x0, #(CTX_OFFSET + 0)] // constants used by the 4x fold loop
+
+1: // fold 4x loop
+        ld1             {v20.16b-v23.16b}, [x2], #64
+.if ! \le
+        tbl             v20.16b, { v20.16b }, v2.16b
+        tbl             v21.16b, { v21.16b }, v2.16b
+        tbl             v22.16b, { v22.16b }, v2.16b
+        tbl             v23.16b, { v23.16b }, v2.16b
+.endif
+        pmull           v4.1q, v16.1d, v3.1d
+        pmull           v5.1q, v17.1d, v3.1d
+        pmull           v6.1q, v18.1d, v3.1d
+        pmull           v7.1q, v19.1d, v3.1d
+        pmull2          v16.1q, v16.2d, v3.2d
+        pmull2          v17.1q, v17.2d, v3.2d
+        pmull2          v18.1q, v18.2d, v3.2d
+        pmull2          v19.1q, v19.2d, v3.2d
+        eor3            v16.16b, v16.16b, v4.16b, v20.16b
+        eor3            v17.16b, v17.16b, v5.16b, v21.16b
+        eor3            v18.16b, v18.16b, v6.16b, v22.16b
+        eor3            v19.16b, v19.16b, v7.16b, v23.16b
+        sub             x11, x10, x2
+        cmp             x11, #64
+        b.hs            1b // fold 4x loop
+
+2: // reduce 4x to 1x
+        ldr             q3, [x0, #(CTX_OFFSET + 16)] // constants used for single-vector folding
+        FOLD_SINGLE     v16, v3, v17, v4
+        FOLD_SINGLE     v16, v3, v18, v4
+        FOLD_SINGLE     v16, v3, v19, v4
+
+3: // fold 1x pre
+        sub             x11, x10, x2
+        cmp             x11, #16
+        b.lo            5f // partial_block
+
+4: // fold 1x loop
+        ldr             q17, [x2], #16
+.if ! \le
+        tbl             v17.16b, { v17.16b }, v2.16b
+.endif
+        FOLD_SINGLE     v16, v3, v17, v4
+        sub             x11, x10, x2
+        cmp             x11, #16
+        b.hs            4b // fold 1x loop
+
+5: // partial_block
+        cmp             x10, x2
+        b.eq            6f // reduce 128 to 64
+        ldr             q17, [x10, #-16] // last 16 bytes of the buffer
+        movrel          x1, partial_bytes_shuf_tab
+        ldr             q18, [x1, x11] // 16 shuffle indices starting at table offset x11 (bytes remaining)
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+.endif
+        mvn             v19.16b, v18.16b // bitwise inverse of the shuffle indices
+        tbl             v20.16b, { v16.16b }, v19.16b
+        cmge            v21.16b, v18.16b, #0 // mask: all ones where the index is non-negative as a signed byte
+        bsl             v21.16b, v17.16b, v20.16b // take v17 where the mask is set, v20 elsewhere
+        tbl             v16.16b, { v16.16b }, v18.16b
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+        tbl             v21.16b, { v21.16b }, v2.16b
+.endif
+        FOLD_SINGLE     v16, v3, v21, v4
+
+6: // reduce 128 to 64
+        ldr             q3, [x0, #(CTX_OFFSET + 32)] // constants consumed by FOLD_128_TO_64
+        FOLD_128_TO_64  \le, v3
+
+7: // reduce 64 to 32
+        ldr             q3, [x0, #(CTX_OFFSET + 48)] // constants consumed by FOLD_64_TO_32
+        FOLD_64_TO_32   \le, v3
+        ret
+
+8: // less than 64 bytes
+        cmp             x3, #16
+        b.lo            9f // less than 16 bytes
+        ldr             q16, [x2], #16
+        eor             v16.16b, v16.16b, v1.16b // xor the initial crc into the first 16 bytes
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+.endif
+        ldr             q3, [x0, #(CTX_OFFSET + 16)]
+        b               3b // fold 1x pre
+
+9: // less than 16 bytes
+        movrel          x15, partial_bytes_shuf_tab
+        add             x15, x15, x3 // x15 = table address + length
+        str             q0, [sp, #-16]! // push 16 zero bytes as a staging buffer
+        add             x9, sp, x3 // x9 = end of the staged data
+        // memcpy 0 to 15 bytes from src to stack and load it to v16 with zero extension
+        tbz             x3, #3, 10f // bit 3 clear: fewer than 8 bytes
+        ldr             x11, [x2] // 8..15 bytes: copy the first and the last 8 bytes (they may overlap)
+        ldr             x12, [x10, #-8]
+        str             x11, [sp]
+        str             x12, [x9, #-8]
+        b               12f
+10:
+        tbz             x3, #2, 11f // bit 2 clear: fewer than 4 bytes
+        ldr             w11, [x2] // 4..7 bytes: copy the first and the last 4 bytes (they may overlap)
+        ldr             w12, [x10, #-4]
+        str             w11, [sp]
+        str             w12, [x9, #-4]
+        b               12f
+11:
+        cbz             x3, 12f // nothing to copy for an empty buffer
+        lsr             x11, x3, #1 // 1..3 bytes: copy the first, the middle (length / 2) and the last byte
+        ldrb            w12, [x2]
+        ldrb            w14, [x10, #-1]
+        ldrb            w13, [x2, x11]
+        strb            w12, [sp]
+        strb            w13, [sp, x11]
+        strb            w14, [x9, #-1]
+12:
+        ldr             q16, [sp], #16 // load the staged bytes and release the stack slot
+        eor             v16.16b, v16.16b, v1.16b
+        cmp             x3, #5
+        b.lo            13f // less than 5 bytes
+        ldr             q17, [x15]
+        tbl             v16.16b, { v16.16b }, v17.16b
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+.endif
+        b               6b // reduce 128 to 64
+
+13: // less than 5 bytes
+.if ! \le
+        ldr             q17, [x15, #12]
+        tbl             v16.16b, { v16.16b }, v17.16b
+        rev64           v16.16b, v16.16b
+.else
+        ldr             q17, [x15, #8]
+        tbl             v16.16b, { v16.16b }, v17.16b
+.endif
+        b               7b // reduce 64 to 32
+endfunc
+.endm
+
+crc_fn_template 0 // le = 0: emits ff_crc_neon_pmull
+crc_fn_template 1 // le = 1: emits ff_crc_le_neon_pmull
+
+DISABLE_PMULL
+DISABLE_EOR3
+#endif
--- a/libavutil/aarch64/crc.h
+++ b/libavutil/aarch64/crc.h
@@ -29,26 +29,165 @@
 #include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
 #include "libavutil/crc.h"
+#include "libavutil/intreadwrite.h"

+#if HAVE_ARM_CRC
 FF_VISIBILITY_PUSH_HIDDEN
 uint32_t ff_crc32_aarch64(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
                          size_t length);
 FF_VISIBILITY_POP_HIDDEN
+#endif
+
+#if HAVE_PMULL && HAVE_EOR3
+#include "libavutil/crc_internal.h"
+
+FF_VISIBILITY_PUSH_HIDDEN
+uint32_t ff_crc_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
+                           size_t length);
+uint32_t ff_crc_le_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
+                              size_t length);
+FF_VISIBILITY_POP_HIDDEN
+
+enum {
+    CRC_C    = 0, // NOTE(review): not referenced in this view; presumably marks a context handled by the generic C code - confirm
+    PMULL_BE,     // ctx[0] tag that ff_crc_aarch64 dispatches to ff_crc_neon_pmull
+    PMULL_LE,     // ctx[0] tag that ff_crc_aarch64 dispatches to ff_crc_le_neon_pmull
+};
+/* Pre-computed PMULL contexts indexed by AVCRCId: [0] is the PMULL_BE/PMULL_LE tag, [1..16] are the 64 bytes of fold constants that the assembly loads as four 128-bit vectors. */
+static const AVCRC crc_table_pmull[AV_CRC_MAX][17] = {
+    [AV_CRC_8_ATM] = {
+        PMULL_BE,
+        0xbc000000, 0x0, 0x32000000, 0x0,
+        0x94000000, 0x0, 0xc4000000, 0x0,
+        0x62000000, 0x0, 0x79000000, 0x0,
+        0x07156a16, 0x1, 0x07000000, 0x1,
+    },
+    [AV_CRC_8_EBU] = {
+        PMULL_BE,
+        0xf3000000, 0x0, 0xb5000000, 0x0,
+        0x0d000000, 0x0, 0xfc000000, 0x0,
+        0x6a000000, 0x0, 0x65000000, 0x0,
+        0x1c4b8192, 0x1, 0x1d000000, 0x1,
+    },
+    [AV_CRC_16_ANSI] = {
+        PMULL_BE,
+        0x807d0000, 0x0, 0xf9e30000, 0x0,
+        0xff830000, 0x0, 0xf9130000, 0x0,
+        0x807b0000, 0x0, 0x86630000, 0x0,
+        0xfffbffe7, 0x1, 0x80050000, 0x1,
+    },
+    [AV_CRC_16_CCITT] = {
+        PMULL_BE,
+        0x59b00000, 0x0, 0x60190000, 0x0,
+        0x45630000, 0x0, 0xd5f60000, 0x0,
+        0xaa510000, 0x0, 0xeb230000, 0x0,
+        0x11303471, 0x1, 0x10210000, 0x1,
+    },
+    [AV_CRC_24_IEEE] = {
+        PMULL_BE,
+        0x467d2400, 0x0, 0x1f428700, 0x0,
+        0x64e4d700, 0x0, 0x2c8c9d00, 0x0,
+        0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
+        0xf845fe24, 0x1, 0x864cfb00, 0x1,
+    },
+    [AV_CRC_32_IEEE] = {
+        PMULL_BE,
+        0xe6228b11, 0x0, 0x8833794c, 0x0,
+        0xe8a45605, 0x0, 0xc5b9cd4c, 0x0,
+        0x490d678d, 0x0, 0xf200aa66, 0x0,
+        0x04d101df, 0x1, 0x04c11db7, 0x1,
+    },
+    [AV_CRC_32_IEEE_LE] = {
+        PMULL_LE,
+        0x54442bd4, 0x1, 0xc6e41596, 0x1,
+        0x751997d0, 0x1, 0xccaa009e, 0x0,
+        0xccaa009e, 0x0, 0x63cd6124, 0x1,
+        0xf7011640, 0x1, 0xdb710641, 0x1,
+    },
+    [AV_CRC_16_ANSI_LE] = {
+        PMULL_LE,
+        0x1b0c2, 0x0, 0x0000bffa, 0x0,
+        0x1d0c2, 0x0, 0x00018cc2, 0x0,
+        0x00018cc2, 0x0, 0x1bc02, 0x0,
+        0xcfffbffe, 0x1, 0x14003, 0x0,
+    },
+};
+
+/* Fill ctx for the PMULL code with a runtime polynomial: ctx[0] gets the PMULL_LE/PMULL_BE tag, followed by eight 64-bit constants (64 bytes). ctx_size is not used here. */
+static inline void crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
+{
+    uint64_t poly_;
+    if (le) {
+        // poly is given in reversed representation when le is set; convert it to regular form
+        poly = reverse(poly, bits) >> 1;
+    }
+    // shift the polynomial up so it is handled as a degree-32 polynomial
+    poly_ = ((uint64_t)poly) << (32 - bits);
+
+    uint64_t div; // output parameter written by every xnmodp() call
+    uint8_t *dst = (uint8_t*)(ctx + 1); // constants start right after the tag in ctx[0]
+    if (le) {
+        ctx[0] = PMULL_LE;
+        AV_WN64(dst +  0, xnmodp(4 * 128 + 32, poly_, 32, &div, le)); // offsets 0..15: read by the 4x fold loop
+        AV_WN64(dst +  8, xnmodp(4 * 128 - 32, poly_, 32, &div, le));
+        uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le); // computed once, stored at offsets 24 and 32
+        AV_WN64(dst + 16, xnmodp(128 + 32, poly_, 32, &div, le)); // offsets 16..31: read for single-vector folding
+        AV_WN64(dst + 24, tmp);
+        AV_WN64(dst + 32, tmp); // offsets 32..47: read for the 128-to-64 reduction
+        AV_WN64(dst + 40, xnmodp(64, poly_, 32, &div, le));
+        AV_WN64(dst + 48, div); // value left in div by the xnmodp(64, ...) call above; offsets 48..63 feed the 64-to-32 reduction
+        AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32));
+    } else {
+        ctx[0] = PMULL_BE;
+        AV_WN64(dst +  0, xnmodp(4 * 128, poly_, 32, &div, le)); // offsets 0..15: read by the 4x fold loop
+        AV_WN64(dst +  8, xnmodp(4 * 128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 16, xnmodp(128, poly_, 32, &div, le)); // offsets 16..31: read for single-vector folding
+        AV_WN64(dst + 24, xnmodp(128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 32, xnmodp(64, poly_, 32, &div, le)); // offsets 32..47: read for the 128-to-64 reduction
+        AV_WN64(dst + 48, div); // stored out of order on purpose: div comes from the xnmodp(64, ...) call above; NOTE(review): the next xnmodp() presumably overwrites it
+        AV_WN64(dst + 40, xnmodp(96, poly_, 32, &div, le));
+        AV_WN64(dst + 56, poly_ | (1ULL << 32)); // polynomial with bit 32 set
+    }
+}
+#endif
+/* Set up ctx for the PMULL+EOR3 code when available. Returns 1 if ctx was initialized here, 0 if the caller has to build its own table. */
+static inline av_cold int ff_crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
+{
+#if HAVE_PMULL && HAVE_EOR3
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) { // both features must be reported at run time
+        crc_init_aarch64(ctx, le, bits, poly, ctx_size);
+        return 1;
+    }
+#endif
+    return 0; // not handled here (features missing at build time or at run time)
+}

 static inline uint32_t ff_crc_aarch64(const AVCRC *ctx, uint32_t crc,
                                       const uint8_t *buffer, size_t length)
 {
-#if HAVE_ARM_CRC
-    av_assert2(ctx[0] == AV_CRC_32_IEEE_LE + 1);
-    return ff_crc32_aarch64(ctx, crc, buffer, length);
-#else
-    av_unreachable("AARCH64 has only AV_CRC_32_IEEE_LE arch-specific CRC code");
-    return 0;
+    switch (ctx[0]) { // ctx[0] is the implementation tag stored at the start of the context
+#if HAVE_PMULL && HAVE_EOR3
+    case PMULL_BE: return ff_crc_neon_pmull(ctx, crc, buffer, length); // tag written by crc_init_aarch64 / crc_table_pmull
+    case PMULL_LE: return ff_crc_le_neon_pmull(ctx, crc, buffer, length);
 #endif
+#if HAVE_ARM_CRC
+    case (AV_CRC_32_IEEE_LE + 1): return ff_crc32_aarch64(ctx, crc, buffer, length); // tag used by crc32_ieee_le_ctx
+#endif
+    default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE and AV_CRC_32_IEEE_LE arch-specific CRC code");
+    }
+    return 0; // reached only if the default case falls out of the switch
 }

 static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id)
 {
+    int cpu_flags = av_get_cpu_flags();
+#if HAVE_PMULL && HAVE_EOR3
+    if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) {
+        return crc_table_pmull[crc_id];
+    }
+#endif
 #if HAVE_ARM_CRC
    static const AVCRC crc32_ieee_le_ctx[] = {
        AV_CRC_32_IEEE_LE + 1
@@ -57,7 +196,6 @@ static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id)
    if (crc_id != AV_CRC_32_IEEE_LE)
        return NULL;

-    int cpu_flags = av_get_cpu_flags();
    if (have_arm_crc(cpu_flags)) {
        return crc32_ieee_le_ctx;
    }
--- a/libavutil/crc.c
+++ b/libavutil/crc.c
@@ -357,6 +357,10 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
    int done = ff_crc_init_x86(ctx, le, bits, poly, ctx_size);
    if (done)
        return 0;
+#elif ARCH_AARCH64
+    int done = ff_crc_init_aarch64(ctx, le, bits, poly, ctx_size);
+    if (done)
+        return 0;
 #endif

    for (i = 0; i < 256; i++) {