FFmpeg/libavutil/x86/crc.asm
Commit dc03cffe9c by Andreas Rheinhardt: avutil/crc: Use x86 clmul for CRC when available

Observed a nearly 10x speedup on an AMD Zen 4 7950X:

av_crc_c:                                            22057.0 ( 1.00x)
av_crc_clmul:                                         2202.8 (10.01x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2026-01-04 15:49:30 +01:00

;*****************************************************************************
;* Copyright (c) 2025 Shreesh Adiga <16567adigashreesh@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION_RODATA
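; pshufb mask that reverses the byte order of an XMM register; the big-endian
; (non-reflected) variant shuffles every input block through this before folding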
reverse_shuffle: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
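; 32-entry pshufb index table: the first 16 entries have the top bit set
; (pshufb writes zero for such lanes), the last 16 are the identity 0..15.
; A 16-byte load at a variable offset into this table yields a mask that
; zeroes some lanes and shifts the rest, which is used for variable byte
; shifts (XMM_SHIFT_LEFT) and for splicing the final partial block.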
partial_bytes_shuf_tab: db 255, 254, 253, 252, 251, 250, 249, 248,\
247, 246, 245, 244, 243, 242, 241, 240,\
0, 1, 2, 3, 4, 5, 6, 7,\
8, 9, 10, 11, 12, 13, 14, 15
SECTION .text
%macro FOLD_128_TO_64 4
; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
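; The two 64-bit halves of the remainder are carry-less multiplied by
; precomputed powers of x mod P so both contributions line up within 64 bits;
; XOR-ing the products gives a 64-bit value congruent to the 128-bit input.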
%if %1 == 1
mova %4, %2
pclmulqdq %2, %3, 0x00
psrldq %4, 8
pxor %2, %4
mova %4, %2
psllq %4, 32
pclmulqdq %4, %3, 0x10
pxor %2, %4
%else
movq %4, %2
pclmulqdq %2, %3, 0x11
pslldq %4, 4
pxor %4, %2
mova %2, %4
pclmulqdq %4, %3, 0x01
pxor %2, %4
%endif
%endmacro
%macro FOLD_64_TO_32 4
; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
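; Barrett reduction to the final 32-bit CRC: two carry-less multiplies by the
; precomputed Barrett constants (the quotient mu and the polynomial P) stand in
; for the polynomial division. The LE variant leaves the result in bits 64..95
; (hence pextrd with index 2); the BE variant reads the low dword and swaps it.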
%if %1 == 1
pxor %4, %4
pblendw %4, %2, 0xfc
mova %2, %4
pclmulqdq %4, %3, 0x00
pxor %4, %2
pclmulqdq %4, %3, 0x10
pxor %2, %4
pextrd eax, %2, 2
%else
mova %4, %2
pclmulqdq %2, %3, 0x00
pclmulqdq %2, %3, 0x11
pxor %2, %4
movd eax, %2
bswap eax
%endif
%endmacro
%macro FOLD_SINGLE 4
; %1 temp ; %2 fold reg ; %3 pre-computed constants ; %4 input data block
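; One fold step: %2 = clmul(%2.hi64, %3.lo64) ^ clmul(%2.lo64, %3.hi64) ^ %4
; (pclmulqdq imm8 0x01 = dst high qword * src low qword, 0x10 = dst low * src high)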
mova %1, %2
pclmulqdq %1, %3, 0x01
pxor %1, %4
pclmulqdq %2, %3, 0x10
pxor %2, %1
%endmacro
%macro XMM_SHIFT_LEFT 4
; %1 xmm input reg ; %2 shift bytes amount ; %3 temp xmm register ; %4 temp gpr
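; pslldq only accepts an immediate count, so the variable shift is done with
; pshufb: the mask window loaded at offset 16 - n has the top bit set in its
; low n entries (those lanes become zero) and identity indices in the rest.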
lea %4, [partial_bytes_shuf_tab]
movu %3, [%4 + 16 - (%2)]
pshufb %1, %3
%endmacro
%macro MEMCPY_0_15 6
; %1 dst ; %2 src ; %3 len ; %4, %5 temp gpr registers ; %6 done label
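; Copies 0-15 bytes without a byte loop: branch on the length class and use
; two possibly overlapping loads/stores that together cover the whole range
; (x86_32 uses a 4-byte loop for lengths 8-15 since no 64-bit GPRs exist).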
cmp %3, 8
jae .between_8_15
cmp %3, 4
jae .between_4_7
cmp %3, 1
ja .between_2_3
jb %6
mov %4b, [%2]
mov [%1], %4b
jmp %6
.between_8_15:
%if ARCH_X86_64
mov %4q, [%2]
mov %5q, [%2 + %3 - 8]
mov [%1], %4q
mov [%1 + %3 - 8], %5q
jmp %6
%else
xor %5, %5
.copy4b:
mov %4d, [%2 + %5]
mov [%1 + %5], %4d
add %5, 4
lea %4, [%5 + 4]
cmp %4, %3
jb .copy4b
mov %4d, [%2 + %3 - 4]
mov [%1 + %3 - 4], %4d
jmp %6
%endif
.between_4_7:
mov %4d, [%2]
mov %5d, [%2 + %3 - 4]
mov [%1], %4d
mov [%1 + %3 - 4], %5d
jmp %6
.between_2_3:
mov %4w, [%2]
mov %5w, [%2 + %3 - 2]
mov [%1], %4w
mov [%1 + %3 - 2], %5w
; fall through: the %6 label is expected to be the next instruction
%endmacro
%macro CRC 1
%define CTX r0+4
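; layout of the precomputed fold constants relative to CTX, as loaded below:
;   CTX + 0  fold-by-4 constants (64-byte stride)
;   CTX + 16 fold-by-1 constants (16-byte stride)
;   CTX + 32 constants for the 128 -> 64 bit reduction
;   CTX + 48 constants for the final 64 -> 32 bit Barrett reduction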
;-----------------------------------------------------------------------------------------------
; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, size_t length)
;-----------------------------------------------------------------------------------------------
; %1 == 1 - LE (bit-reflected) format
%if %1 == 1
cglobal crc_le, 4, 6, 6+4*ARCH_X86_64, 0x10
%else
cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10
%endif
%if ARCH_X86_32
%define m10 m6
%endif
%if %1 == 0
mova m10, [reverse_shuffle]
%endif
movd m4, r1d
%if ARCH_X86_32
; skip the 4x unrolled loop since only 8 XMM registers are available on X86_32
jmp .less_than_64bytes
%else
cmp r3, 64
jb .less_than_64bytes
movu m1, [r2 + 0]
movu m3, [r2 + 16]
movu m2, [r2 + 32]
movu m0, [r2 + 48]
pxor m1, m4
%if %1 == 0
pshufb m0, m10
pshufb m1, m10
pshufb m2, m10
pshufb m3, m10
%endif
mov r4, 64
cmp r3, 128
jb .reduce_4x_to_1
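; 128 bytes or more: load the fold-by-4 constants and fold four 16-byte
; blocks per iteration into the accumulators m1, m3, m2, m0 (blocks 0..3),
; each folding across a 64-byte stride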
movu m4, [CTX]
.fold_4x_loop:
movu m6, [r2 + r4 + 0]
movu m7, [r2 + r4 + 16]
movu m8, [r2 + r4 + 32]
movu m9, [r2 + r4 + 48]
%if %1 == 0
pshufb m6, m10
pshufb m7, m10
pshufb m8, m10
pshufb m9, m10
%endif
FOLD_SINGLE m5, m1, m4, m6
FOLD_SINGLE m5, m3, m4, m7
FOLD_SINGLE m5, m2, m4, m8
FOLD_SINGLE m5, m0, m4, m9
add r4, 64
lea r5, [r4 + 64]
cmp r5, r3
jbe .fold_4x_loop
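; fold the four accumulators into m1 using the 16-byte-stride constants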
.reduce_4x_to_1:
movu m4, [CTX + 16]
FOLD_SINGLE m5, m1, m4, m3
FOLD_SINGLE m5, m1, m4, m2
FOLD_SINGLE m5, m1, m4, m0
%endif
.fold_1x_pre:
lea r5, [r4 + 16]
cmp r5, r3
ja .partial_block
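; fold one 16-byte block per iteration while at least 16 unread bytes remain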
.fold_1x_loop:
movu m2, [r2 + r4]
%if %1 == 0
pshufb m2, m10
%endif
FOLD_SINGLE m5, m1, m4, m2
add r4, 16
lea r5, [r4 + 16]
cmp r5, r3
jbe .fold_1x_loop
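; 1-15 trailing bytes left: reload the last 16 bytes of the buffer
; (overlapping already-folded data), build split masks from
; partial_bytes_shuf_tab, splice the shifted fold register together with the
; new bytes via pblendvb, and fold once more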
.partial_block:
cmp r4, r3
jae .reduce_128_to_64
movu m2, [r2 + r3 - 16]
and r3, 0xf
lea r4, [partial_bytes_shuf_tab]
movu m0, [r3 + r4]
%if %1 == 0
pshufb m1, m10
%endif
mova m3, m1
pcmpeqd m5, m5 ; m5 = _mm_set1_epi8(0xff)
pxor m5, m0
pshufb m3, m5
pblendvb m2, m3, m0
pshufb m1, m0
%if %1 == 0
pshufb m1, m10
pshufb m2, m10
%endif
FOLD_SINGLE m5, m1, m4, m2
.reduce_128_to_64:
movu m4, [CTX + 32]
FOLD_128_TO_64 %1, m1, m4, m5
.reduce_64_to_32:
movu m4, [CTX + 48]
FOLD_64_TO_32 %1, m1, m4, m5
RET
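; fewer than 64 bytes (x86_32 always takes this path): no 4-way unrolling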
.less_than_64bytes:
cmp r3, 16
jb .less_than_16bytes
movu m1, [r2]
pxor m1, m4
%if %1 == 0
pshufb m1, m10
%endif
mov r4, 16
movu m4, [CTX + 16]
jmp .fold_1x_pre
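; 0-15 bytes in total: zero a 16-byte stack slot, copy the input there and
; reload it so the tail can be handled with full-width XMM operations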
.less_than_16bytes:
pxor m1, m1
movu [rsp], m1
MEMCPY_0_15 rsp, r2, r3, r1, r4, .memcpy_done
.memcpy_done:
movu m1, [rsp]
pxor m1, m4
cmp r3, 5
jb .less_than_5bytes
XMM_SHIFT_LEFT m1, (16 - r3), m2, r4
%if %1 == 0
pshufb m1, m10
%endif
jmp .reduce_128_to_64
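; with at most 4 input bytes, crc ^ data fits in 64 bits, so the 128 -> 64
; fold is skipped and the value goes straight to the Barrett reduction; the
; BE variant first byte-reverses the low qword via the tail of reverse_shuffle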
.less_than_5bytes:
%if %1 == 0
XMM_SHIFT_LEFT m1, (4 - r3), m2, r4
movq m10, [reverse_shuffle + 8] ; 0x0001020304050607
pshufb m1, m10
%else
XMM_SHIFT_LEFT m1, (8 - r3), m2, r4
%endif
jmp .reduce_64_to_32
%endmacro
INIT_XMM clmul
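; instantiate both variants: CRC 0 -> ff_crc_clmul (normal bit order),
; CRC 1 -> ff_crc_le_clmul (bit-reflected)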
CRC 0
CRC 1