avcodec/aarch64/vvc: Implement dmvr_h_8

A78:
dmvr_h_8_12x20_neon: 76.6 ( 4.31x)
dmvr_h_8_20x12_neon: 65.8 ( 3.49x)
dmvr_h_8_20x20_neon: 106.6 ( 3.62x)

A72:
dmvr_h_8_12x20_neon: 190.6 ( 4.40x)
dmvr_h_8_20x12_neon: 171.1 ( 4.31x)
dmvr_h_8_20x20_neon: 275.1 ( 4.50x)
2026-05-01 06:13:08 +08:00 · 2025-09-05 22:24:55 +02:00
parent 1b97966199
commit 189e841cfd
2 changed files with 54 additions and 0 deletions
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -95,6 +95,7 @@ W_AVG_FUN(12)

 DMVR_FUN(, 8)
 DMVR_FUN(, 12)
+DMVR_FUN(h_, 8)
 DMVR_FUN(hv_, 8)
 DMVR_FUN(hv_, 10)
 DMVR_FUN(hv_, 12)
@@ -188,6 +189,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
        c->inter.avg = ff_vvc_avg_8_neon;
        c->inter.w_avg = vvc_w_avg_8;
        c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
+        c->inter.dmvr[0][1] = ff_vvc_dmvr_h_8_neon;
        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
        c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;

--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -385,6 +385,58 @@ function ff_vvc_dmvr_12_neon, export=1
        ret
 endfunc

+function ff_vvc_dmvr_h_8_neon, export=1 // 2-tap horizontal filter: byte rows at x1 -> 16-bit rows at x0, w3 rows; x5 is not read
+        movrel          x7, X(ff_vvc_inter_luma_dmvr_filters) // x7 = base of the coefficient table
+        add             x7, x7, x4, lsl #1 // select the 2-byte entry indexed by x4 (presumably mx; confirm against the C prototype)
+        ld2r            {v0.16b, v1.16b}, [x7] // v0 = first byte of the entry in every lane, v1 = second byte
+        tbz             w6, #4, 12f // bit 4 of w6 clear (12 = 0b01100) -> 12-wide path, set (20 = 0b10100) -> fall through; w6 is presumably the width
+20: // 20 outputs per row; the body runs before any test, so w3 must be non-zero on entry
+        ldur            q3, [x1, #1] // v3 = src[1..16], the right-hand neighbours
+        ldr             q2, [x1] // v2 = src[0..15]
+        umull           v4.8h, v0.8b, v2.8b // v4 = v0 * src[0..7]
+        umull2          v5.8h, v0.16b, v2.16b // v5 = v0 * src[8..15]
+        ldur            s17, [x1, #17] // v17 = src[17..20], so each row reads 21 bytes; issued between multiplies, presumably to hide load latency
+        umull           v6.8h, v1.8b, v3.8b // v6 = v1 * src[1..8]
+        ldr             s16, [x1, #16] // v16 = src[16..19]
+        umull2          v7.8h, v1.16b, v3.16b // v7 = v1 * src[9..16]
+        add             v4.8h, v4.8h, v6.8h // outputs 0..7 before rounding
+        umull           v17.8h, v1.8b, v17.8b // v17 = v1 * src[17..20]; only the low four lanes are used
+        add             v5.8h, v5.8h, v7.8h // outputs 8..15 before rounding
+        umull           v16.8h, v0.8b, v16.8b // v16 = v0 * src[16..19]; only the low four lanes are used
+        srshr           v4.8h, v4.8h, #2 // rounding shift: (x + 2) >> 2
+        add             v16.4h, v16.4h, v17.4h // outputs 16..19 before rounding
+        srshr           v5.8h, v5.8h, #2 // same rounding shift for outputs 8..15
+        srshr           v16.4h, v16.4h, #2 // same rounding shift for outputs 16..19
+        st1             {v4.8h, v5.8h}, [x0], #32 // write outputs 0..15, x0 += 32
+        subs            w3, w3, #1 // one row done; nothing before b.ne changes the flags
+        st1             {v16.4h}, [x0], #8 // write outputs 16..19, x0 += 8
+        add             x1, x1, x2 // next source row; x2 is the source stride in bytes
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) // with the 40 bytes already stepped, x0 advances VVC_MAX_PB_SIZE * 2 bytes per row
+        b.ne            20b // loop while rows remain
+        ret
+
+12: // 12 outputs per row; same arithmetic as the 20-wide loop, and w3 must likewise be non-zero
+        ldur            d3, [x1, #1] // v3 = src[1..8], the right-hand neighbours
+        ldr             d2, [x1] // v2 = src[0..7]
+        umull           v4.8h, v0.8b, v2.8b // v4 = v0 * src[0..7]
+        ldur            s17, [x1, #9] // v17 = src[9..12], so each row reads 13 bytes
+        umull           v6.8h, v1.8b, v3.8b // v6 = v1 * src[1..8]
+        ldr             s16, [x1, #8] // v16 = src[8..11]
+        add             v4.8h, v4.8h, v6.8h // outputs 0..7 before rounding
+        umull           v17.8h, v1.8b, v17.8b // v17 = v1 * src[9..12]; only the low four lanes are used
+        umull           v16.8h, v0.8b, v16.8b // v16 = v0 * src[8..11]; only the low four lanes are used
+        srshr           v4.8h, v4.8h, #2 // rounding shift: (x + 2) >> 2
+        add             v16.4h, v16.4h, v17.4h // outputs 8..11 before rounding
+        srshr           v16.4h, v16.4h, #2 // same rounding shift for outputs 8..11
+        st1             {v4.8h}, [x0], #16 // write outputs 0..7, x0 += 16
+        subs            w3, w3, #1 // one row done; nothing before b.ne changes the flags
+        st1             {v16.4h}, [x0], #8 // write outputs 8..11, x0 += 8
+        add             x1, x1, x2 // next source row; x2 is the source stride in bytes
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) // with the 24 bytes already stepped, x0 advances VVC_MAX_PB_SIZE * 2 bytes per row
+        b.ne            12b // loop while rows remain
+        ret
+endfunc
+
 function ff_vvc_dmvr_hv_8_neon, export=1
        tmp0            .req x7
        tmp1            .req x8