avcodec/aarch64/vvc: Unroll vvc_bdof_grad_filter_8x_neon

Benchmark results per CPU, before (first group) and after (second group) this change:
A53:
apply_bdof_8_16x8_neon:                               2733.1 ( 4.88x)
apply_bdof_8_16x16_neon:                              5458.6 ( 4.86x)
apply_bdof_10_16x8_neon:                              2789.8 ( 4.64x)
apply_bdof_10_16x16_neon:                             5523.8 ( 4.68x)
apply_bdof_12_16x8_neon:                              2792.8 ( 4.58x)
apply_bdof_12_16x16_neon:                             5519.5 ( 4.63x)

apply_bdof_8_16x8_neon:                               2571.8 ( 5.12x)
apply_bdof_8_16x16_neon:                              5173.3 ( 5.12x)
apply_bdof_10_16x8_neon:                              2635.1 ( 4.87x)
apply_bdof_10_16x16_neon:                             5243.0 ( 4.89x)
apply_bdof_12_16x8_neon:                              2613.0 ( 4.89x)
apply_bdof_12_16x16_neon:                             5231.7 ( 4.90x)

A78:
apply_bdof_8_16x8_neon:                                565.3 ( 8.43x)
apply_bdof_8_16x16_neon:                              1109.5 ( 8.60x)
apply_bdof_10_16x8_neon:                               568.2 ( 7.92x)
apply_bdof_10_16x16_neon:                             1114.1 ( 8.08x)
apply_bdof_12_16x8_neon:                               570.2 ( 7.87x)
apply_bdof_12_16x16_neon:                             1116.3 ( 8.03x)

apply_bdof_8_16x8_neon:                                541.4 ( 8.81x)
apply_bdof_8_16x16_neon:                              1065.9 ( 8.97x)
apply_bdof_10_16x8_neon:                               543.2 ( 8.32x)
apply_bdof_10_16x16_neon:                             1071.5 ( 8.39x)
apply_bdof_12_16x8_neon:                               544.2 ( 8.25x)
apply_bdof_12_16x16_neon:                             1074.1 ( 8.37x)
This commit is contained in:
Krzysztof Pyrkosz
2025-09-14 19:13:24 +02:00
committed by Martin Storsjö
parent e5ac70042e
commit 56a638d836

View File

@@ -803,28 +803,21 @@ function vvc_bdof_grad_filter_8x_neon, export=0
src1 .req x5
width .req w6
height .req w7
tbnz w6, #4, 16f
1:
mov x10, src0
mov w11, width
mov x12, gh0
mov x13, gv0
mov x14, src1
mov x15, gh1
mov x16, gv1
2:
ldur q0, [x10, #2]
ldur q1, [x10, #-2]
ldr q2, [x10, #(VVC_MAX_PB_SIZE << 1)]
ldr q3, [x10, #-(VVC_MAX_PB_SIZE << 1)]
8:
ldur q0, [src0, #2]
ldur q1, [src0, #-2]
ldr q2, [src0, #(VVC_MAX_PB_SIZE << 1)]
ldr q3, [src0, #-(VVC_MAX_PB_SIZE << 1)]
sshr v0.8h, v0.8h, #6
sshr v1.8h, v1.8h, #6
ldur q4, [x14, #2]
ldur q5, [x14, #-2]
ldur q4, [src1, #2]
ldur q5, [src1, #-2]
sshr v2.8h, v2.8h, #6
sshr v3.8h, v3.8h, #6
ldr q6, [x14, #(VVC_MAX_PB_SIZE << 1)]
ldr q7, [x14, #-(VVC_MAX_PB_SIZE << 1)]
ldr q6, [src1, #(VVC_MAX_PB_SIZE << 1)]
ldr q7, [src1, #-(VVC_MAX_PB_SIZE << 1)]
// results of gradient_h0
sub v0.8h, v0.8h, v1.8h
// results of gradient_v0
@@ -839,26 +832,20 @@ function vvc_bdof_grad_filter_8x_neon, export=0
// results of gradient_v1
sub v6.8h, v6.8h, v7.8h
add x10, x10, #16
add x14, x14, #16
// (gradient_h0 + gradient_h1) >> 1
shadd v1.8h, v0.8h, v4.8h
// gradient_h0 - gradient_h1
sub v5.8h, v0.8h, v4.8h
subs w11, w11, #8
// (gradient_v0 + gradient_v1) >> 1
shadd v3.8h, v2.8h, v6.8h
// gradient_v0 - gradient_v1
sub v7.8h, v2.8h, v6.8h
st1 {v1.8h}, [x12], #16
st1 {v5.8h}, [x15], #16
st1 {v3.8h}, [x13], #16
st1 {v7.8h}, [x16], #16
b.ne 2b
st1 {v1.8h}, [gh0]
st1 {v5.8h}, [gh1]
st1 {v3.8h}, [gv0]
st1 {v7.8h}, [gv1]
subs height, height, #1
add gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
@@ -867,7 +854,84 @@ function vvc_bdof_grad_filter_8x_neon, export=0
add gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
add gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
add src1, src1, #(VVC_MAX_PB_SIZE << 1)
b.ne 1b
b.ne 8b
ret
16:
ldur q0, [src0, #2]
ldur q1, [src0, #18]
ldur q16, [src0, #-2]
sshr v0.8h, v0.8h, #6
ldur q17, [src0, #14]
sshr v1.8h, v1.8h, #6
ldp q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)]
sshr v16.8h, v16.8h, #6
ldp q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]!
ldur q20, [src1, #2]
sshr v17.8h, v17.8h, #6
ldur q21, [src1, #18]
sshr v2.8h, v2.8h, #6
ldur q22, [src1, #-2]
sshr v3.8h, v3.8h, #6
ldur q23, [src1, #14]
sshr v18.8h, v18.8h, #6
ldp q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)]
sshr v19.8h, v19.8h, #6
ldp q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]!
// results of gradient_h0
sub v0.8h, v0.8h, v16.8h
sub v1.8h, v1.8h, v17.8h
// results of gradient_v0
sub v2.8h, v2.8h, v18.8h
sub v3.8h, v3.8h, v19.8h
sshr v20.8h, v20.8h, #6
sshr v21.8h, v21.8h, #6
sshr v22.8h, v22.8h, #6
sshr v23.8h, v23.8h, #6
// results of gradient_h1
sub v20.8h, v20.8h, v22.8h
sub v21.8h, v21.8h, v23.8h
sshr v24.8h, v24.8h, #6
sshr v25.8h, v25.8h, #6
// gradient_h0 - gradient_h1
sub v22.8h, v0.8h, v20.8h
sub v23.8h, v1.8h, v21.8h
// (gradient_h0 + gradient_h1) >> 1
shadd v16.8h, v0.8h, v20.8h
shadd v17.8h, v1.8h, v21.8h
st1 {v22.8h, v23.8h}, [gh1], #32
sshr v26.8h, v26.8h, #6
sshr v27.8h, v27.8h, #6
st1 {v16.8h, v17.8h}, [gh0], #32
// results of gradient_v1
sub v24.8h, v24.8h, v26.8h
sub v25.8h, v25.8h, v27.8h
// (gradient_v0 + gradient_v1) >> 1
shadd v18.8h, v2.8h, v24.8h
shadd v19.8h, v3.8h, v25.8h
// gradient_v0 - gradient_v1
sub v26.8h, v2.8h, v24.8h
sub v27.8h, v3.8h, v25.8h
st1 {v18.8h,v19.8h}, [gv0], #32
subs height, height, #1
st1 {v26.8h,v27.8h}, [gv1], #32
b.ne 16b
ret
.unreq gh0