diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index 79ff720cdd..df6b59510d 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -803,28 +803,21 @@ function vvc_bdof_grad_filter_8x_neon, export=0 src1 .req x5 width .req w6 height .req w7 + tbnz w6, #4, 16f -1: - mov x10, src0 - mov w11, width - mov x12, gh0 - mov x13, gv0 - mov x14, src1 - mov x15, gh1 - mov x16, gv1 -2: - ldur q0, [x10, #2] - ldur q1, [x10, #-2] - ldr q2, [x10, #(VVC_MAX_PB_SIZE << 1)] - ldr q3, [x10, #-(VVC_MAX_PB_SIZE << 1)] +8: + ldur q0, [src0, #2] + ldur q1, [src0, #-2] + ldr q2, [src0, #(VVC_MAX_PB_SIZE << 1)] + ldr q3, [src0, #-(VVC_MAX_PB_SIZE << 1)] sshr v0.8h, v0.8h, #6 sshr v1.8h, v1.8h, #6 - ldur q4, [x14, #2] - ldur q5, [x14, #-2] + ldur q4, [src1, #2] + ldur q5, [src1, #-2] sshr v2.8h, v2.8h, #6 sshr v3.8h, v3.8h, #6 - ldr q6, [x14, #(VVC_MAX_PB_SIZE << 1)] - ldr q7, [x14, #-(VVC_MAX_PB_SIZE << 1)] + ldr q6, [src1, #(VVC_MAX_PB_SIZE << 1)] + ldr q7, [src1, #-(VVC_MAX_PB_SIZE << 1)] // results of gradient_h0 sub v0.8h, v0.8h, v1.8h // results of gradient_v0 @@ -839,26 +832,20 @@ function vvc_bdof_grad_filter_8x_neon, export=0 // results of gradient_v1 sub v6.8h, v6.8h, v7.8h - add x10, x10, #16 - add x14, x14, #16 - // (gradient_h0 + gradient_h1) >> 1 shadd v1.8h, v0.8h, v4.8h // gradient_h0 - gradient_h1 sub v5.8h, v0.8h, v4.8h - subs w11, w11, #8 - // (gradient_v0 + gradient_v1) >> 1 shadd v3.8h, v2.8h, v6.8h // gradient_v0 - gradient_v1 sub v7.8h, v2.8h, v6.8h - st1 {v1.8h}, [x12], #16 - st1 {v5.8h}, [x15], #16 - st1 {v3.8h}, [x13], #16 - st1 {v7.8h}, [x16], #16 - b.ne 2b + st1 {v1.8h}, [gh0] + st1 {v5.8h}, [gh1] + st1 {v3.8h}, [gv0] + st1 {v7.8h}, [gv1] subs height, height, #1 add gh0, gh0, #(BDOF_BLOCK_SIZE << 1) @@ -867,7 +854,84 @@ function vvc_bdof_grad_filter_8x_neon, export=0 add gh1, gh1, #(BDOF_BLOCK_SIZE << 1) add gv1, gv1, #(BDOF_BLOCK_SIZE << 1) add src1, src1, #(VVC_MAX_PB_SIZE << 1) - b.ne 1b + b.ne 8b + ret + +16: + ldur q0, [src0, #2] + ldur q1, [src0, #18] + ldur q16, [src0, #-2] + sshr v0.8h, v0.8h, #6 + ldur q17, [src0, #14] + sshr v1.8h, v1.8h, #6 + ldp q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)] + sshr v16.8h, v16.8h, #6 + ldp q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]! + ldur q20, [src1, #2] + sshr v17.8h, v17.8h, #6 + ldur q21, [src1, #18] + sshr v2.8h, v2.8h, #6 + ldur q22, [src1, #-2] + sshr v3.8h, v3.8h, #6 + ldur q23, [src1, #14] + sshr v18.8h, v18.8h, #6 + ldp q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)] + sshr v19.8h, v19.8h, #6 + ldp q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]! + + // results of gradient_h0 + sub v0.8h, v0.8h, v16.8h + sub v1.8h, v1.8h, v17.8h + + // results of gradient_v0 + sub v2.8h, v2.8h, v18.8h + sub v3.8h, v3.8h, v19.8h + + sshr v20.8h, v20.8h, #6 + sshr v21.8h, v21.8h, #6 + sshr v22.8h, v22.8h, #6 + sshr v23.8h, v23.8h, #6 + + // results of gradient_h1 + sub v20.8h, v20.8h, v22.8h + sub v21.8h, v21.8h, v23.8h + + sshr v24.8h, v24.8h, #6 + sshr v25.8h, v25.8h, #6 + + // gradient_h0 - gradient_h1 + sub v22.8h, v0.8h, v20.8h + sub v23.8h, v1.8h, v21.8h + + // (gradient_h0 + gradient_h1) >> 1 + shadd v16.8h, v0.8h, v20.8h + shadd v17.8h, v1.8h, v21.8h + + st1 {v22.8h, v23.8h}, [gh1], #32 + + sshr v26.8h, v26.8h, #6 + sshr v27.8h, v27.8h, #6 + + st1 {v16.8h, v17.8h}, [gh0], #32 + + // results of gradient_v1 + sub v24.8h, v24.8h, v26.8h + sub v25.8h, v25.8h, v27.8h + + // (gradient_v0 + gradient_v1) >> 1 + shadd v18.8h, v2.8h, v24.8h + shadd v19.8h, v3.8h, v25.8h + + // gradient_v0 - gradient_v1 + sub v26.8h, v2.8h, v24.8h + sub v27.8h, v3.8h, v25.8h + + st1 {v18.8h,v19.8h}, [gv0], #32 + + subs height, height, #1 + st1 {v26.8h,v27.8h}, [gv1], #32 + + b.ne 16b ret .unreq gh0