avcodec/aarch64/vvc: Implement dmvr_h_8

A78:
dmvr_h_8_12x20_neon:                                    76.6 ( 4.31x)
dmvr_h_8_20x12_neon:                                    65.8 ( 3.49x)
dmvr_h_8_20x20_neon:                                   106.6 ( 3.62x)

A72:
dmvr_h_8_12x20_neon:                                   190.6 ( 4.40x)
dmvr_h_8_20x12_neon:                                   171.1 ( 4.31x)
dmvr_h_8_20x20_neon:                                   275.1 ( 4.50x)
This commit is contained in:
Krzysztof Pyrkosz
2025-09-05 22:24:55 +02:00
committed by jianhuaw
parent 1b97966199
commit 189e841cfd
2 changed files with 54 additions and 0 deletions

View File

@@ -95,6 +95,7 @@ W_AVG_FUN(12)
DMVR_FUN(, 8)
DMVR_FUN(, 12)
DMVR_FUN(h_, 8)
DMVR_FUN(hv_, 8)
DMVR_FUN(hv_, 10)
DMVR_FUN(hv_, 12)
@@ -188,6 +189,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.avg = ff_vvc_avg_8_neon;
c->inter.w_avg = vvc_w_avg_8;
c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
c->inter.dmvr[0][1] = ff_vvc_dmvr_h_8_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;

View File

@@ -385,6 +385,58 @@ function ff_vvc_dmvr_12_neon, export=1
ret
endfunc
// Horizontal 2-tap DMVR filter: 8-bit samples in, 16-bit values out.
//
// For every output sample i of a row:
//     dst[i] = (c0 * src[i] + c1 * src[i + 1] + 2) >> 2
// where {c0, c1} are the two bytes at ff_vvc_inter_luma_dmvr_filters + 2 * x4,
// multiplied as unsigned 8-bit values.
//
// Register usage, as read off the code below:
//     x0  destination pointer (16-bit elements); moves forward by
//         VVC_MAX_PB_SIZE * 2 bytes per row
//     x1  source pointer (bytes); moves forward by x2 per row
//     x2  source stride in bytes
//     w3  row count; the loops are bottom-tested, so a zero count would wrap
//     x4  filter table index
//     w6  width; only bit 4 is inspected (set: 20 samples per row,
//         clear: 12 samples per row)
//         NOTE(review): presumably callers only ever pass 12 or 20 - confirm
//     x5  not read
// Clobbers x7, v0-v7, v16, v17 and the condition flags.
//
// NOTE(review): srshr is a signed shift, so the 16-bit sums are presumably
// expected to stay below 0x8000; that depends on the filter table contents.
function ff_vvc_dmvr_h_8_neon, export=1
        movrel          x7, X(ff_vvc_inter_luma_dmvr_filters)
        add             x7, x7, x4, lsl #1              // two coefficient bytes per table entry
        ld2r            {v0.16b, v1.16b}, [x7]          // v0 = c0 in every lane, v1 = c1 in every lane
        tbnz            w6, #4, 20f                     // width bit 4 set: take the 20-sample loop

        // 12 samples per row: 8 results in v4, 4 more in the low half of v16.
12:
        ldr             d2, [x1]                        // src[0..7]
        ldur            d3, [x1, #1]                    // src[1..8]
        umull           v4.8h, v0.8b, v2.8b             // c0 * src[0..7]
        ldur            s17, [x1, #9]                   // src[9..12]
        umull           v6.8h, v1.8b, v3.8b             // c1 * src[1..8]
        ldr             s16, [x1, #8]                   // src[8..11]
        add             v4.8h, v4.8h, v6.8h
        umull           v17.8h, v1.8b, v17.8b           // c1 * src[9..12], lanes 0-3 are meaningful
        umull           v16.8h, v0.8b, v16.8b           // c0 * src[8..11], lanes 0-3 are meaningful
        srshr           v4.8h, v4.8h, #2                // rounding shift: (sum + 2) >> 2
        add             v16.4h, v16.4h, v17.4h
        srshr           v16.4h, v16.4h, #2
        st1             {v4.8h}, [x0], #16              // dst[0..7]
        subs            w3, w3, #1                      // flags are consumed by b.ne below
        st1             {v16.4h}, [x0], #8              // dst[8..11]
        add             x1, x1, x2                      // next source row
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) // rest of the destination row pitch
        b.ne            12b
        ret

        // 20 samples per row: 16 results in v4/v5, 4 more in the low half of v16.
20:
        ldr             q2, [x1]                        // src[0..15]
        ldur            q3, [x1, #1]                    // src[1..16]
        umull           v4.8h, v0.8b, v2.8b             // c0 * src[0..7]
        umull2          v5.8h, v0.16b, v2.16b           // c0 * src[8..15]
        ldur            s17, [x1, #17]                  // src[17..20]
        umull           v6.8h, v1.8b, v3.8b             // c1 * src[1..8]
        ldr             s16, [x1, #16]                  // src[16..19]
        umull2          v7.8h, v1.16b, v3.16b           // c1 * src[9..16]
        add             v4.8h, v4.8h, v6.8h
        umull           v17.8h, v1.8b, v17.8b           // c1 * src[17..20], lanes 0-3 are meaningful
        add             v5.8h, v5.8h, v7.8h
        umull           v16.8h, v0.8b, v16.8b           // c0 * src[16..19], lanes 0-3 are meaningful
        srshr           v4.8h, v4.8h, #2                // rounding shift: (sum + 2) >> 2
        add             v16.4h, v16.4h, v17.4h
        srshr           v5.8h, v5.8h, #2
        srshr           v16.4h, v16.4h, #2
        st1             {v4.8h, v5.8h}, [x0], #32       // dst[0..15]
        subs            w3, w3, #1                      // flags are consumed by b.ne below
        st1             {v16.4h}, [x0], #8              // dst[16..19]
        add             x1, x1, x2                      // next source row
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) // rest of the destination row pitch
        b.ne            20b
        ret
endfunc
function ff_vvc_dmvr_hv_8_neon, export=1
tmp0 .req x7
tmp1 .req x8