Files
FFmpeg/libavutil/aarch64/crc.h
Shreesh Adiga 5085432f8b avutil/crc: add aarch64 NEON PMULL+EOR3 SIMD implementation for av_crc
Implement the clmul algorithm for aarch64 using the PMULL and EOR3 instructions.
The logic and structure are the same as in the x86 clmul implementation, with a
slight rearrangement of the constants to suit the PMULL and PMULL2 instructions.

Benchmarked on Android (Termux) with a MediaTek Dimensity 9400 SoC:

./tests/checkasm/checkasm --test=crc --bench --runs=12
benchmarking with native FFmpeg timers
nop: 0.2
checkasm: SVE 128 bits, using random seed 2502847808
checkasm: bench runs 4096 (1 << 12)
CRC:
 - crc.crc [OK]
PMULL:
 - crc.crc [OK]
checkasm: all 10 tests passed
crc_8_ATM_c:                                            26.0 ( 1.00x)
crc_8_ATM_pmull_eor3:                                    0.7 (37.17x)
crc_8_EBU_c:                                            46.4 ( 1.00x)
crc_8_EBU_pmull_eor3:                                    1.5 (31.47x)
crc_16_ANSI_c:                                          36.3 ( 1.00x)
crc_16_ANSI_pmull_eor3:                                  1.1 (31.70x)
crc_16_ANSI_LE_c:                                       90.9 ( 1.00x)
crc_16_ANSI_LE_pmull_eor3:                               2.8 (32.30x)
crc_16_CCITT_c:                                        118.0 ( 1.00x)
crc_16_CCITT_pmull_eor3:                                 3.7 (32.00x)
crc_24_IEEE_c:                                           1.6 ( 1.00x)
crc_24_IEEE_pmull_eor3:                                  0.1 (12.19x)
crc_32_IEEE_c:                                          45.2 ( 1.00x)
crc_32_IEEE_pmull_eor3:                                  1.4 (31.39x)
crc_32_IEEE_LE_c:                                       49.1 ( 1.00x)
crc_32_IEEE_LE_crc:                                      2.5 (19.51x)
crc_32_IEEE_LE_pmull_eor3:                               1.5 (32.84x)
crc_custom_polynomial_c:                                45.3 ( 1.00x)
crc_custom_polynomial_pmull_eor3:                        1.3 (35.16x)
2026-03-11 14:03:36 +00:00

207 lines
6.2 KiB
C

/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_AARCH64_CRC_H
#define AVUTIL_AARCH64_CRC_H
#include <stddef.h>
#include <stdint.h>
#include "config.h"
#include "cpu.h"
#include "libavutil/attributes_internal.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/crc.h"
#include "libavutil/intreadwrite.h"
#if HAVE_ARM_CRC
FF_VISIBILITY_PUSH_HIDDEN
/**
 * CRC routine that ff_crc_aarch64() dispatches to when the first context
 * element is AV_CRC_32_IEEE_LE + 1 (the marker context handed out by
 * ff_crc_get_table_aarch64() when have_arm_crc() reports support).
 *
 * Only the prototype is visible in this header; the definition lives
 * elsewhere.
 *
 * @param ctx    context pointer, passed through unchanged by ff_crc_aarch64()
 * @param crc    CRC value passed in by the caller
 *               (NOTE(review): presumably the running/initial CRC - confirm)
 * @param buffer input bytes
 * @param length number of bytes in buffer
 * @return the value ff_crc_aarch64() returns to its caller
 */
uint32_t ff_crc32_aarch64(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
size_t length);
FF_VISIBILITY_POP_HIDDEN
#endif
#if HAVE_PMULL && HAVE_EOR3
#include "libavutil/crc_internal.h"
FF_VISIBILITY_PUSH_HIDDEN
/**
 * CRC routine that ff_crc_aarch64() dispatches to when the first context
 * element is PMULL_BE. Only the prototype is visible in this header.
 *
 * NOTE(review): presumably reads the constants stored after ctx[0]
 * (see crc_table_pmull / crc_init_aarch64) - confirm against the
 * implementation.
 */
uint32_t ff_crc_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
size_t length);
/**
 * Counterpart of ff_crc_neon_pmull() that ff_crc_aarch64() dispatches to
 * when the first context element is PMULL_LE.
 */
uint32_t ff_crc_le_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer,
size_t length);
FF_VISIBILITY_POP_HIDDEN
/*
 * Implementation selector kept in element 0 of a CRC context.
 *
 * PMULL_BE and PMULL_LE are the values ff_crc_aarch64() switches on to pick
 * ff_crc_neon_pmull() or ff_crc_le_neon_pmull(). The same switch also uses
 * AV_CRC_32_IEEE_LE + 1 as a selector, so the values here must stay distinct
 * from that one.
 */
enum {
    CRC_C    = 0,
    PMULL_BE = 1,
    PMULL_LE = 2,
};
/*
 * Precomputed PMULL contexts for the standard CRCs, indexed by AVCRCId.
 *
 * Each row has 17 elements: element 0 is the implementation selector
 * (PMULL_BE or PMULL_LE), elements 1..16 are the constants consumed by the
 * PMULL code. crc_init_aarch64() produces the same shape at run time: a
 * selector followed by eight 64-bit values.
 * NOTE(review): each pair of elements looks like one 64-bit constant stored
 * low word first - confirm against the assembly.
 */
static const AVCRC crc_table_pmull[AV_CRC_MAX][17] = {
    [AV_CRC_8_ATM] = {
        PMULL_BE,
        0xbc000000, 0x00000000, 0x32000000, 0x00000000,
        0x94000000, 0x00000000, 0xc4000000, 0x00000000,
        0x62000000, 0x00000000, 0x79000000, 0x00000000,
        0x07156a16, 0x00000001, 0x07000000, 0x00000001,
    },
    [AV_CRC_8_EBU] = {
        PMULL_BE,
        0xf3000000, 0x00000000, 0xb5000000, 0x00000000,
        0x0d000000, 0x00000000, 0xfc000000, 0x00000000,
        0x6a000000, 0x00000000, 0x65000000, 0x00000000,
        0x1c4b8192, 0x00000001, 0x1d000000, 0x00000001,
    },
    [AV_CRC_16_ANSI] = {
        PMULL_BE,
        0x807d0000, 0x00000000, 0xf9e30000, 0x00000000,
        0xff830000, 0x00000000, 0xf9130000, 0x00000000,
        0x807b0000, 0x00000000, 0x86630000, 0x00000000,
        0xfffbffe7, 0x00000001, 0x80050000, 0x00000001,
    },
    [AV_CRC_16_CCITT] = {
        PMULL_BE,
        0x59b00000, 0x00000000, 0x60190000, 0x00000000,
        0x45630000, 0x00000000, 0xd5f60000, 0x00000000,
        0xaa510000, 0x00000000, 0xeb230000, 0x00000000,
        0x11303471, 0x00000001, 0x10210000, 0x00000001,
    },
    [AV_CRC_24_IEEE] = {
        PMULL_BE,
        0x467d2400, 0x00000000, 0x1f428700, 0x00000000,
        0x64e4d700, 0x00000000, 0x2c8c9d00, 0x00000000,
        0xd9fe8c00, 0x00000000, 0xfd7e0c00, 0x00000000,
        0xf845fe24, 0x00000001, 0x864cfb00, 0x00000001,
    },
    [AV_CRC_32_IEEE] = {
        PMULL_BE,
        0xe6228b11, 0x00000000, 0x8833794c, 0x00000000,
        0xe8a45605, 0x00000000, 0xc5b9cd4c, 0x00000000,
        0x490d678d, 0x00000000, 0xf200aa66, 0x00000000,
        0x04d101df, 0x00000001, 0x04c11db7, 0x00000001,
    },
    [AV_CRC_32_IEEE_LE] = {
        PMULL_LE,
        0x54442bd4, 0x00000001, 0xc6e41596, 0x00000001,
        0x751997d0, 0x00000001, 0xccaa009e, 0x00000000,
        0xccaa009e, 0x00000000, 0x63cd6124, 0x00000001,
        0xf7011640, 0x00000001, 0xdb710641, 0x00000001,
    },
    [AV_CRC_16_ANSI_LE] = {
        PMULL_LE,
        0x0001b0c2, 0x00000000, 0x0000bffa, 0x00000000,
        0x0001d0c2, 0x00000000, 0x00018cc2, 0x00000000,
        0x00018cc2, 0x00000000, 0x0001bc02, 0x00000000,
        0xcfffbffe, 0x00000001, 0x00014003, 0x00000000,
    },
};
/**
 * Fill a CRC context with the selector and constants used by the PMULL
 * code path, for an arbitrary polynomial.
 *
 * Writes PMULL_LE or PMULL_BE into ctx[0] and then eight 64-bit values into
 * the 64 bytes starting at ctx + 1.
 * NOTE(review): the slot order presumably has to match the rows of
 * crc_table_pmull and what the assembly expects - confirm.
 *
 * @param ctx      context to fill; must have room for ctx[0] plus 64 bytes
 * @param le       non-zero selects the PMULL_LE layout, zero the PMULL_BE one
 * @param bits     width of the polynomial; used as the reverse() width and
 *                 in the shift by (32 - bits)
 *                 NOTE(review): nothing here checks bits <= 32 - confirm the
 *                 caller guarantees it
 * @param poly     polynomial; when le is set it is first passed through
 *                 reverse(poly, bits) >> 1
 * @param ctx_size not used by this function
 *                 NOTE(review): the size of ctx is therefore not validated
 *                 here
 */
static inline void crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
{
uint64_t poly_;
if (le) {
// convert the reversed representation to regular form
poly = reverse(poly, bits) >> 1;
}
// convert to 32 degree polynomial
poly_ = ((uint64_t)poly) << (32 - bits);
// div is an out-parameter of xnmodp(); presumably every call overwrites it,
// so the value stored at dst + 48 below depends on statement order.
uint64_t div;
// constants are written right behind the selector in ctx[0]
uint8_t *dst = (uint8_t*)(ctx + 1);
if (le) {
ctx[0] = PMULL_LE;
AV_WN64(dst + 0, xnmodp(4 * 128 + 32, poly_, 32, &div, le));
AV_WN64(dst + 8, xnmodp(4 * 128 - 32, poly_, 32, &div, le));
// the 128 - 32 value is stored twice (offsets 24 and 32)
uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le);
AV_WN64(dst + 16, xnmodp(128 + 32, poly_, 32, &div, le));
AV_WN64(dst + 24, tmp);
AV_WN64(dst + 32, tmp);
AV_WN64(dst + 40, xnmodp(64, poly_, 32, &div, le));
// div as left by the xnmodp(64, ...) call just above
AV_WN64(dst + 48, div);
AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32));
} else {
ctx[0] = PMULL_BE;
AV_WN64(dst + 0, xnmodp(4 * 128, poly_, 32, &div, le));
AV_WN64(dst + 8, xnmodp(4 * 128 + 64, poly_, 32, &div, le));
AV_WN64(dst + 16, xnmodp(128, poly_, 32, &div, le));
AV_WN64(dst + 24, xnmodp(128 + 64, poly_, 32, &div, le));
AV_WN64(dst + 32, xnmodp(64, poly_, 32, &div, le));
// offset 48 is written before offset 40 on purpose: div must still be the
// one left by the xnmodp(64, ...) call above, before xnmodp(96, ...) runs
AV_WN64(dst + 48, div);
AV_WN64(dst + 40, xnmodp(96, poly_, 32, &div, le));
AV_WN64(dst + 56, poly_ | (1ULL << 32));
}
}
#endif
/**
 * Try to initialise ctx for the arch-specific PMULL CRC implementation.
 *
 * When the build has PMULL and EOR3 support and the running CPU reports
 * both, the context is filled by crc_init_aarch64(). Otherwise ctx is left
 * untouched.
 *
 * @param ctx      context to initialise
 * @param le       forwarded to crc_init_aarch64()
 * @param bits     forwarded to crc_init_aarch64()
 * @param poly     forwarded to crc_init_aarch64()
 * @param ctx_size forwarded to crc_init_aarch64()
 * @return 1 if ctx was initialised here, 0 if the caller has to do it
 */
static inline av_cold int ff_crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
{
#if HAVE_PMULL && HAVE_EOR3
    const int flags = av_get_cpu_flags();

    /* Both extensions are required by the PMULL implementation. */
    if (!have_pmull(flags) || !have_eor3(flags))
        return 0;

    crc_init_aarch64(ctx, le, bits, poly, ctx_size);
    return 1;
#else
    return 0;
#endif
}
/**
 * Run the arch-specific CRC routine selected by the first context element.
 *
 * ctx[0] is compared against PMULL_BE, PMULL_LE and AV_CRC_32_IEEE_LE + 1
 * (each only when the matching feature is compiled in). Any other value is
 * treated as unreachable.
 *
 * @param ctx    context whose first element selects the implementation
 * @param crc    forwarded to the selected routine
 * @param buffer forwarded to the selected routine
 * @param length forwarded to the selected routine
 * @return the value returned by the selected routine
 */
static inline uint32_t ff_crc_aarch64(const AVCRC *ctx, uint32_t crc,
                                      const uint8_t *buffer, size_t length)
{
#if HAVE_PMULL && HAVE_EOR3
    if (ctx[0] == PMULL_BE)
        return ff_crc_neon_pmull(ctx, crc, buffer, length);
    if (ctx[0] == PMULL_LE)
        return ff_crc_le_neon_pmull(ctx, crc, buffer, length);
#endif
#if HAVE_ARM_CRC
    if (ctx[0] == (AV_CRC_32_IEEE_LE + 1))
        return ff_crc32_aarch64(ctx, crc, buffer, length);
#endif
    /* No known selector matched. */
    av_unreachable("AARCH64 has PMULL_LE, PMULL_BE and AV_CRC_32_IEEE_LE arch-specific CRC code");
    return 0;
}
/**
 * Look up an arch-specific context for a standard CRC.
 *
 * Preference order:
 *  1. the precomputed PMULL row from crc_table_pmull, when PMULL and EOR3
 *     are compiled in and reported by the CPU;
 *  2. for AV_CRC_32_IEEE_LE only, a one-element marker context holding
 *     AV_CRC_32_IEEE_LE + 1, when the ARM CRC extension is compiled in and
 *     reported by the CPU.
 *
 * @param crc_id ID of the requested standard CRC
 * @return a context usable with ff_crc_aarch64(), or NULL when there is no
 *         arch-specific implementation for crc_id on this CPU (including
 *         when crc_id is outside [0, AV_CRC_MAX))
 */
static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id)
{
    /*
     * crc_id is used as an array index on the PMULL path. Reject values
     * outside the table so an invalid ID yields NULL on every path, as it
     * already does on the ARM CRC path, instead of reading out of bounds.
     */
    if ((unsigned)crc_id >= AV_CRC_MAX)
        return NULL;

    int cpu_flags = av_get_cpu_flags();
#if HAVE_PMULL && HAVE_EOR3
    if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) {
        return crc_table_pmull[crc_id];
    }
#endif
#if HAVE_ARM_CRC
    /* Marker context: its only element is the selector ff_crc_aarch64() checks. */
    static const AVCRC crc32_ieee_le_ctx[] = {
        AV_CRC_32_IEEE_LE + 1
    };
    if (crc_id != AV_CRC_32_IEEE_LE)
        return NULL;
    if (have_arm_crc(cpu_flags)) {
        return crc32_ieee_le_ctx;
    }
#endif
    return NULL;
}
#endif /* AVUTIL_AARCH64_CRC_H */