mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-22 05:40:27 +08:00
avcodec/aarch64/vvc: Unroll vvc_bdof_grad_filter_8x_neon
Before and after:

A53:
apply_bdof_8_16x8_neon:      2733.1 ( 4.88x)
apply_bdof_8_16x16_neon:     5458.6 ( 4.86x)
apply_bdof_10_16x8_neon:     2789.8 ( 4.64x)
apply_bdof_10_16x16_neon:    5523.8 ( 4.68x)
apply_bdof_12_16x8_neon:     2792.8 ( 4.58x)
apply_bdof_12_16x16_neon:    5519.5 ( 4.63x)

apply_bdof_8_16x8_neon:      2571.8 ( 5.12x)
apply_bdof_8_16x16_neon:     5173.3 ( 5.12x)
apply_bdof_10_16x8_neon:     2635.1 ( 4.87x)
apply_bdof_10_16x16_neon:    5243.0 ( 4.89x)
apply_bdof_12_16x8_neon:     2613.0 ( 4.89x)
apply_bdof_12_16x16_neon:    5231.7 ( 4.90x)

A78:
apply_bdof_8_16x8_neon:       565.3 ( 8.43x)
apply_bdof_8_16x16_neon:     1109.5 ( 8.60x)
apply_bdof_10_16x8_neon:      568.2 ( 7.92x)
apply_bdof_10_16x16_neon:    1114.1 ( 8.08x)
apply_bdof_12_16x8_neon:      570.2 ( 7.87x)
apply_bdof_12_16x16_neon:    1116.3 ( 8.03x)

apply_bdof_8_16x8_neon:       541.4 ( 8.81x)
apply_bdof_8_16x16_neon:     1065.9 ( 8.97x)
apply_bdof_10_16x8_neon:      543.2 ( 8.32x)
apply_bdof_10_16x16_neon:    1071.5 ( 8.39x)
apply_bdof_12_16x8_neon:      544.2 ( 8.25x)
apply_bdof_12_16x16_neon:    1074.1 ( 8.37x)
This commit is contained in:
committed by
Martin Storsjö
parent
e5ac70042e
commit
56a638d836
@@ -803,28 +803,21 @@ function vvc_bdof_grad_filter_8x_neon, export=0
|
||||
src1 .req x5
|
||||
width .req w6
|
||||
height .req w7
|
||||
tbnz w6, #4, 16f
|
||||
|
||||
1:
|
||||
mov x10, src0
|
||||
mov w11, width
|
||||
mov x12, gh0
|
||||
mov x13, gv0
|
||||
mov x14, src1
|
||||
mov x15, gh1
|
||||
mov x16, gv1
|
||||
2:
|
||||
ldur q0, [x10, #2]
|
||||
ldur q1, [x10, #-2]
|
||||
ldr q2, [x10, #(VVC_MAX_PB_SIZE << 1)]
|
||||
ldr q3, [x10, #-(VVC_MAX_PB_SIZE << 1)]
|
||||
8:
|
||||
ldur q0, [src0, #2]
|
||||
ldur q1, [src0, #-2]
|
||||
ldr q2, [src0, #(VVC_MAX_PB_SIZE << 1)]
|
||||
ldr q3, [src0, #-(VVC_MAX_PB_SIZE << 1)]
|
||||
sshr v0.8h, v0.8h, #6
|
||||
sshr v1.8h, v1.8h, #6
|
||||
ldur q4, [x14, #2]
|
||||
ldur q5, [x14, #-2]
|
||||
ldur q4, [src1, #2]
|
||||
ldur q5, [src1, #-2]
|
||||
sshr v2.8h, v2.8h, #6
|
||||
sshr v3.8h, v3.8h, #6
|
||||
ldr q6, [x14, #(VVC_MAX_PB_SIZE << 1)]
|
||||
ldr q7, [x14, #-(VVC_MAX_PB_SIZE << 1)]
|
||||
ldr q6, [src1, #(VVC_MAX_PB_SIZE << 1)]
|
||||
ldr q7, [src1, #-(VVC_MAX_PB_SIZE << 1)]
|
||||
// results of gradient_h0
|
||||
sub v0.8h, v0.8h, v1.8h
|
||||
// results of gradient_v0
|
||||
@@ -839,26 +832,20 @@ function vvc_bdof_grad_filter_8x_neon, export=0
|
||||
// results of gradient_v1
|
||||
sub v6.8h, v6.8h, v7.8h
|
||||
|
||||
add x10, x10, #16
|
||||
add x14, x14, #16
|
||||
|
||||
// (gradient_h0 + gradient_h1) >> 1
|
||||
shadd v1.8h, v0.8h, v4.8h
|
||||
// gradient_h0 - gradient_h1
|
||||
sub v5.8h, v0.8h, v4.8h
|
||||
|
||||
subs w11, w11, #8
|
||||
|
||||
// (gradient_v0 + gradient_v1) >> 1
|
||||
shadd v3.8h, v2.8h, v6.8h
|
||||
// gradient_v0 - gradient_v1
|
||||
sub v7.8h, v2.8h, v6.8h
|
||||
|
||||
st1 {v1.8h}, [x12], #16
|
||||
st1 {v5.8h}, [x15], #16
|
||||
st1 {v3.8h}, [x13], #16
|
||||
st1 {v7.8h}, [x16], #16
|
||||
b.ne 2b
|
||||
st1 {v1.8h}, [gh0]
|
||||
st1 {v5.8h}, [gh1]
|
||||
st1 {v3.8h}, [gv0]
|
||||
st1 {v7.8h}, [gv1]
|
||||
|
||||
subs height, height, #1
|
||||
add gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
|
||||
@@ -867,7 +854,84 @@ function vvc_bdof_grad_filter_8x_neon, export=0
|
||||
add gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
|
||||
add gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
|
||||
add src1, src1, #(VVC_MAX_PB_SIZE << 1)
|
||||
b.ne 1b
|
||||
b.ne 8b
|
||||
ret
|
||||
|
||||
16:
|
||||
ldur q0, [src0, #2]
|
||||
ldur q1, [src0, #18]
|
||||
ldur q16, [src0, #-2]
|
||||
sshr v0.8h, v0.8h, #6
|
||||
ldur q17, [src0, #14]
|
||||
sshr v1.8h, v1.8h, #6
|
||||
ldp q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)]
|
||||
sshr v16.8h, v16.8h, #6
|
||||
ldp q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]!
|
||||
ldur q20, [src1, #2]
|
||||
sshr v17.8h, v17.8h, #6
|
||||
ldur q21, [src1, #18]
|
||||
sshr v2.8h, v2.8h, #6
|
||||
ldur q22, [src1, #-2]
|
||||
sshr v3.8h, v3.8h, #6
|
||||
ldur q23, [src1, #14]
|
||||
sshr v18.8h, v18.8h, #6
|
||||
ldp q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)]
|
||||
sshr v19.8h, v19.8h, #6
|
||||
ldp q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]!
|
||||
|
||||
// results of gradient_h0
|
||||
sub v0.8h, v0.8h, v16.8h
|
||||
sub v1.8h, v1.8h, v17.8h
|
||||
|
||||
// results of gradient_v0
|
||||
sub v2.8h, v2.8h, v18.8h
|
||||
sub v3.8h, v3.8h, v19.8h
|
||||
|
||||
sshr v20.8h, v20.8h, #6
|
||||
sshr v21.8h, v21.8h, #6
|
||||
sshr v22.8h, v22.8h, #6
|
||||
sshr v23.8h, v23.8h, #6
|
||||
|
||||
// results of gradient_h1
|
||||
sub v20.8h, v20.8h, v22.8h
|
||||
sub v21.8h, v21.8h, v23.8h
|
||||
|
||||
sshr v24.8h, v24.8h, #6
|
||||
sshr v25.8h, v25.8h, #6
|
||||
|
||||
// gradient_h0 - gradient_h1
|
||||
sub v22.8h, v0.8h, v20.8h
|
||||
sub v23.8h, v1.8h, v21.8h
|
||||
|
||||
// (gradient_h0 + gradient_h1) >> 1
|
||||
shadd v16.8h, v0.8h, v20.8h
|
||||
shadd v17.8h, v1.8h, v21.8h
|
||||
|
||||
st1 {v22.8h, v23.8h}, [gh1], #32
|
||||
|
||||
sshr v26.8h, v26.8h, #6
|
||||
sshr v27.8h, v27.8h, #6
|
||||
|
||||
st1 {v16.8h, v17.8h}, [gh0], #32
|
||||
|
||||
// results of gradient_v1
|
||||
sub v24.8h, v24.8h, v26.8h
|
||||
sub v25.8h, v25.8h, v27.8h
|
||||
|
||||
// (gradient_v0 + gradient_v1) >> 1
|
||||
shadd v18.8h, v2.8h, v24.8h
|
||||
shadd v19.8h, v3.8h, v25.8h
|
||||
|
||||
// gradient_v0 - gradient_v1
|
||||
sub v26.8h, v2.8h, v24.8h
|
||||
sub v27.8h, v3.8h, v25.8h
|
||||
|
||||
st1 {v18.8h,v19.8h}, [gv0], #32
|
||||
|
||||
subs height, height, #1
|
||||
st1 {v26.8h,v27.8h}, [gv1], #32
|
||||
|
||||
b.ne 16b
|
||||
ret
|
||||
|
||||
.unreq gh0
|
||||
|
||||
Reference in New Issue
Block a user