mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-30 13:50:50 +08:00
lavc/hevc: add aarch64 NEON for qpel uni-weighted vertical filter
Add NEON-optimized implementations for HEVC QPEL uni-weighted vertical
interpolation (put_hevc_qpel_uni_w_v) at 8-bit depth.

These functions perform weighted uni-directional prediction with vertical
QPEL filtering:
- 8-tap vertical QPEL filter
- Weighted prediction: (filter_result * wx + offset) >> shift

Previously only sizes 4, 8, 16, 64 were optimized. This patch adds optimized
implementations for all remaining sizes: 6, 12, 24, 32, 48.

Performance results on Apple M4:
./tests/checkasm/checkasm --test=hevc_pel --bench

put_hevc_qpel_uni_w_v6_8_neon:  3.40x
put_hevc_qpel_uni_w_v12_8_neon: 3.24x
put_hevc_qpel_uni_w_v24_8_neon: 3.06x
put_hevc_qpel_uni_w_v32_8_neon: 2.66x
put_hevc_qpel_uni_w_v48_8_neon: 2.67x

Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
This commit is contained in:
@@ -143,7 +143,7 @@ NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
int height, int denom, int wx, int ox,
|
||||
intptr_t mx, intptr_t my, int width),);
|
||||
|
||||
NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
NEON8_FNPROTO(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
const uint8_t *_src, ptrdiff_t _srcstride,
|
||||
int height, int denom, int wx, int ox,
|
||||
intptr_t mx, intptr_t my, int width),);
|
||||
|
||||
@@ -2311,6 +2311,90 @@ function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// QPEL_UNI_W_V_6 - weight one filtered row and write 6 output pixels.
//
//   in:       v26.8h  filtered samples, eight 16-bit lanes (lanes 6-7 are
//                     computed but never stored)
//             v30     multiplier applied to every sample
//             v31     per-lane shift count used by sqrshl
//             v29     value added after the shift
//                     (v29-v31 are presumably prepared by
//                      QPEL_UNI_W_V_HEADER - confirm against its definition)
//             x0      destination of this row
//             x1      amount added to x0 once the row has been written
//   out:      x0      advanced by x1
//   clobbers: v24, v25, x15 (general-purpose flags are left untouched)
.macro QPEL_UNI_W_V_6
        smull           v24.4s, v26.4h, v30.4h          // lanes 0-3: widen to 32 bit and multiply
        smull2          v25.4s, v26.8h, v30.8h          // lanes 4-7: same, from the high half
        sqrshl          v24.4s, v24.4s, v31.4s          // saturating rounding shift by v31
        sqrshl          v25.4s, v25.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s          // saturating add of v29
        sqadd           v25.4s, v25.4s, v29.4s
        sqxtn           v24.4h, v24.4s                  // 32 -> 16 bit, signed saturation
        sqxtn2          v24.8h, v25.4s
        sqxtun          v24.8b, v24.8h                  // 16 -> 8 bit, saturate to unsigned
        add             x15, x0, #4                     // tail address taken up front: avoids st1 postincrement stall
        st1             {v24.s}[0], [x0], x1            // bytes 0-3, then x0 += x1
        st1             {v24.h}[2], [x15]               // bytes 4-5
.endm
|
||||
|
||||
// ff_hevc_put_hevc_qpel_uni_w_v6_8_neon
//
// Vertical 8-tap filter plus weighting for blocks 6 pixels wide.
//
// Registers as used below (argument order per the C prototype and AAPCS64):
//   x0 = dst, advanced by x1 after every row (inside QPEL_UNI_W_V_6)
//   x2 = src, read pointer walking down the rows
//   x3 = src stride in bytes
//   w4 = rows still to produce; must be non-zero on entry because it is
//        only tested after being decremented
// QPEL_UNI_W_V_HEADER is defined elsewhere; it presumably loads the filter
// and weighting constants and positions x2 on the first tap row - confirm.
//
// v16-v23 hold a sliding window of eight source rows. Instead of moving
// data, each step loads the new row into the register holding the oldest
// row and rotates the register order passed to the filter. Eight bytes are
// loaded per source row although only six results are stored.
function ff_hevc_put_hevc_qpel_uni_w_v6_8_neon, export=1
        QPEL_UNI_W_V_HEADER

        // Prime the window with the first seven rows.
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]

// One output row: filter the eight window rows (oldest first), count the
// row, then weight and store. The flags set by subs are still valid after
// the macro because QPEL_UNI_W_V_6 uses no flag-setting instruction.
.macro qpel_uni_w_v6_row s0, s1, s2, s3, s4, s5, s6, s7
        QPEL_FILTER_B   v26, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        subs            w4, w4, #1
        QPEL_UNI_W_V_6
.endm

        // Rows alternate between [x2, x3] (then x2 += 2 * stride) and [x2].
1:      ldr             d23, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v6_row v16, v17, v18, v19, v20, v21, v22, v23
        b.eq            2f

        ldr             d16, [x2]
        qpel_uni_w_v6_row v17, v18, v19, v20, v21, v22, v23, v16
        b.eq            2f

        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v6_row v18, v19, v20, v21, v22, v23, v16, v17
        b.eq            2f

        ldr             d18, [x2]
        qpel_uni_w_v6_row v19, v20, v21, v22, v23, v16, v17, v18
        b.eq            2f

        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v6_row v20, v21, v22, v23, v16, v17, v18, v19
        b.eq            2f

        ldr             d20, [x2]
        qpel_uni_w_v6_row v21, v22, v23, v16, v17, v18, v19, v20
        b.eq            2f

        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v6_row v22, v23, v16, v17, v18, v19, v20, v21
        b.eq            2f

        ldr             d22, [x2]
        qpel_uni_w_v6_row v23, v16, v17, v18, v19, v20, v21, v22
        b.ne            1b                              // window is back in its initial order
.purgem qpel_uni_w_v6_row
2:
        ret
endfunc
|
||||
|
||||
.macro QPEL_UNI_W_V_16
|
||||
smull v24.4s, v26.4h, v30.4h
|
||||
smull2 v25.4s, v26.8h, v30.8h
|
||||
@@ -2409,6 +2493,103 @@ function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// QPEL_UNI_W_V_12 - weight one filtered row and write 12 output pixels.
//
//   in:       v26.8h  filtered samples for pixels 0-7
//             v27.4h  filtered samples for pixels 8-11 (low half only)
//             v30     multiplier applied to every sample
//             v31     per-lane shift count used by sqrshl
//             v29     value added after the shift
//                     (v29-v31 are presumably prepared by
//                      QPEL_UNI_W_V_HEADER - confirm against its definition)
//             x0      destination of this row
//             x1      amount added to x0 once the row has been written
//   out:      x0      advanced by x1
//   clobbers: v24, v25, v26, x15 (general-purpose flags are left untouched)
.macro QPEL_UNI_W_V_12
        smull           v24.4s, v26.4h, v30.4h          // pixels 0-3: widen to 32 bit and multiply
        smull2          v25.4s, v26.8h, v30.8h          // pixels 4-7
        smull           v26.4s, v27.4h, v30.4h          // pixels 8-11; v26 is free once smull2 has read it
        sqrshl          v24.4s, v24.4s, v31.4s          // saturating rounding shift by v31
        sqrshl          v25.4s, v25.4s, v31.4s
        sqrshl          v26.4s, v26.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s          // saturating add of v29
        sqadd           v25.4s, v25.4s, v29.4s
        sqadd           v26.4s, v26.4s, v29.4s
        sqxtn           v24.4h, v24.4s                  // 32 -> 16 bit, signed saturation
        sqxtn2          v24.8h, v25.4s
        sqxtn           v26.4h, v26.4s
        sqxtun          v24.8b, v24.8h                  // 16 -> 8 bit, saturate to unsigned
        sqxtun          v26.8b, v26.8h
        add             x15, x0, #8                     // tail address taken up front: avoids st1 postincrement stall
        st1             {v24.d}[0], [x0], x1            // bytes 0-7, then x0 += x1
        st1             {v26.s}[0], [x15]               // bytes 8-11
.endm
|
||||
|
||||
// ff_hevc_put_hevc_qpel_uni_w_v12_8_neon
//
// Vertical 8-tap filter plus weighting for blocks 12 pixels wide.
//
// Registers as used below (argument order per the C prototype and AAPCS64):
//   x0 = dst, advanced by x1 after every row (inside QPEL_UNI_W_V_12)
//   x2 = src, read pointer walking down the rows
//   x3 = src stride in bytes
//   w4 = rows still to produce; must be non-zero on entry because it is
//        only tested after being decremented
// QPEL_UNI_W_V_HEADER is defined elsewhere; it presumably loads the filter
// and weighting constants and positions x2 on the first tap row - confirm.
//
// v16-v23 hold a sliding window of eight source rows, rotated by register
// renaming. Sixteen bytes are loaded per source row although only twelve
// results are stored. QPEL_UNI_W_V_12 uses x15 as scratch.
function ff_hevc_put_hevc_qpel_uni_w_v12_8_neon, export=1
        QPEL_UNI_W_V_HEADER

        // Prime the window with the first seven rows.
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]

// One output row: filter both halves of the eight window rows (oldest
// first) into v26/v27, count the row, then weight and store. The flags set
// by subs survive because QPEL_UNI_W_V_12 uses no flag-setting instruction.
.macro qpel_uni_w_v12_row s0, s1, s2, s3, s4, s5, s6, s7
        QPEL_FILTER_B   v26, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        QPEL_FILTER_B2  v27, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        subs            w4, w4, #1
        QPEL_UNI_W_V_12
.endm

        // Rows alternate between [x2, x3] (then x2 += 2 * stride) and [x2].
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v12_row v16, v17, v18, v19, v20, v21, v22, v23
        b.eq            2f

        ldr             q16, [x2]
        qpel_uni_w_v12_row v17, v18, v19, v20, v21, v22, v23, v16
        b.eq            2f

        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v12_row v18, v19, v20, v21, v22, v23, v16, v17
        b.eq            2f

        ldr             q18, [x2]
        qpel_uni_w_v12_row v19, v20, v21, v22, v23, v16, v17, v18
        b.eq            2f

        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v12_row v20, v21, v22, v23, v16, v17, v18, v19
        b.eq            2f

        ldr             q20, [x2]
        qpel_uni_w_v12_row v21, v22, v23, v16, v17, v18, v19, v20
        b.eq            2f

        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v12_row v22, v23, v16, v17, v18, v19, v20, v21
        b.eq            2f

        ldr             q22, [x2]
        qpel_uni_w_v12_row v23, v16, v17, v18, v19, v20, v21, v22
        b.ne            1b                              // window is back in its initial order
.purgem qpel_uni_w_v12_row
2:
        ret
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
|
||||
QPEL_UNI_W_V_HEADER
|
||||
ldur w13, [sp, #16]
|
||||
@@ -2499,6 +2680,345 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// ff_hevc_put_hevc_qpel_uni_w_v24_8_neon
//
// Vertical 8-tap filter plus weighting for blocks 24 pixels wide, done as
// one 16-pixel column over all rows followed by one 8-pixel column.
//
// Registers as used below (argument order per the C prototype and AAPCS64):
//   x0  = dst write pointer, advanced by x1 inside the store macros
//   x2  = src read pointer, x3 = src stride in bytes
//   w4  = rows still to produce in the current column (non-zero on entry)
//   w13 = width still to process
//   x14 = dst of the current column, x15 = src of the current column
//         (x15 starts as x2 as left by QPEL_UNI_W_V_HEADER)
//   w11 = saved row count, reloaded into w4 for every column
// QPEL_UNI_W_V_HEADER, QPEL_UNI_W_V_16 and QPEL_UNI_W_V_8 are defined
// elsewhere; the store macros are assumed not to touch x14, x15 or the
// condition flags - confirm against their definitions.
function ff_hevc_put_hevc_qpel_uni_w_v24_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        mov             w13, #24                        // width
        mov             x14, x0
        mov             x15, x2
        mov             w11, w4

// One 16-pixel output row: filter both halves of the eight window rows
// (oldest first), count the row, then weight and store.
.macro qpel_uni_w_v24_row16 s0, s1, s2, s3, s4, s5, s6, s7
        QPEL_FILTER_B   v26, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        QPEL_FILTER_B2  v27, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        subs            w4, w4, #1
        QPEL_UNI_W_V_16
.endm

// One 8-pixel output row: low halves only.
.macro qpel_uni_w_v24_row8 s0, s1, s2, s3, s4, s5, s6, s7
        QPEL_FILTER_B   v26, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        subs            w4, w4, #1
        QPEL_UNI_W_V_8
.endm

3:
        cmp             w13, #16
        b.le            4f                              // 16 or fewer pixels left: take the 8-pixel path

        // 16-pixel column: prime the window with the first seven rows.
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]

1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v24_row16 v16, v17, v18, v19, v20, v21, v22, v23
        b.eq            2f

        ldr             q16, [x2]
        qpel_uni_w_v24_row16 v17, v18, v19, v20, v21, v22, v23, v16
        b.eq            2f

        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v24_row16 v18, v19, v20, v21, v22, v23, v16, v17
        b.eq            2f

        ldr             q18, [x2]
        qpel_uni_w_v24_row16 v19, v20, v21, v22, v23, v16, v17, v18
        b.eq            2f

        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v24_row16 v20, v21, v22, v23, v16, v17, v18, v19
        b.eq            2f

        ldr             q20, [x2]
        qpel_uni_w_v24_row16 v21, v22, v23, v16, v17, v18, v19, v20
        b.eq            2f

        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v24_row16 v22, v23, v16, v17, v18, v19, v20, v21
        b.eq            2f

        ldr             q22, [x2]
        qpel_uni_w_v24_row16 v23, v16, v17, v18, v19, v20, v21, v22
        b.ne            1b

        // Column finished: step 16 pixels right and restart from the top row.
        // b.hi consumes the flags of the subs; the add/mov in between do not
        // modify them.
2:
        subs            w13, w13, #16
        add             x14, x14, #16
        add             x15, x15, #16
        mov             x0, x14
        mov             x2, x15
        mov             w4, w11
        b.hi            3b
        ret                                             // not reached for the fixed width of 24

4:      // Remaining 8-pixel column: prime the window with seven rows.
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]

5:      ldr             d23, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v24_row8 v16, v17, v18, v19, v20, v21, v22, v23
        b.eq            6f

        ldr             d16, [x2]
        qpel_uni_w_v24_row8 v17, v18, v19, v20, v21, v22, v23, v16
        b.eq            6f

        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v24_row8 v18, v19, v20, v21, v22, v23, v16, v17
        b.eq            6f

        ldr             d18, [x2]
        qpel_uni_w_v24_row8 v19, v20, v21, v22, v23, v16, v17, v18
        b.eq            6f

        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v24_row8 v20, v21, v22, v23, v16, v17, v18, v19
        b.eq            6f

        ldr             d20, [x2]
        qpel_uni_w_v24_row8 v21, v22, v23, v16, v17, v18, v19, v20
        b.eq            6f

        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v24_row8 v22, v23, v16, v17, v18, v19, v20, v21
        b.eq            6f

        ldr             d22, [x2]
        qpel_uni_w_v24_row8 v23, v16, v17, v18, v19, v20, v21, v22
        b.ne            5b
.purgem qpel_uni_w_v24_row16
.purgem qpel_uni_w_v24_row8
6:
        ret
endfunc
|
||||
|
||||
// ff_hevc_put_hevc_qpel_uni_w_v32_8_neon
//
// Vertical 8-tap filter plus weighting for blocks 32 pixels wide, done as
// two 16-pixel columns; each column is processed over all rows before
// moving 16 pixels to the right.
//
// Registers as used below (argument order per the C prototype and AAPCS64):
//   x0  = dst write pointer, advanced by x1 inside QPEL_UNI_W_V_16
//   x2  = src read pointer, x3 = src stride in bytes
//   w4  = rows still to produce in the current column (non-zero on entry)
//   w13 = width still to process
//   x14 = dst of the current column, x15 = src of the current column
//         (x15 starts as x2 as left by QPEL_UNI_W_V_HEADER)
//   w11 = saved row count, reloaded into w4 for every column
// QPEL_UNI_W_V_HEADER and QPEL_UNI_W_V_16 are defined elsewhere; the store
// macro is assumed not to touch x14, x15 or the condition flags - confirm.
function ff_hevc_put_hevc_qpel_uni_w_v32_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        mov             w13, #32                        // width
        mov             x14, x0
        mov             x15, x2
        mov             w11, w4

// One 16-pixel output row: filter both halves of the eight window rows
// (oldest first), count the row, then weight and store.
.macro qpel_uni_w_v32_row s0, s1, s2, s3, s4, s5, s6, s7
        QPEL_FILTER_B   v26, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        QPEL_FILTER_B2  v27, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        subs            w4, w4, #1
        QPEL_UNI_W_V_16
.endm

        // Column start: prime the window with the first seven rows.
3:
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]

1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v32_row v16, v17, v18, v19, v20, v21, v22, v23
        b.eq            2f

        ldr             q16, [x2]
        qpel_uni_w_v32_row v17, v18, v19, v20, v21, v22, v23, v16
        b.eq            2f

        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v32_row v18, v19, v20, v21, v22, v23, v16, v17
        b.eq            2f

        ldr             q18, [x2]
        qpel_uni_w_v32_row v19, v20, v21, v22, v23, v16, v17, v18
        b.eq            2f

        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v32_row v20, v21, v22, v23, v16, v17, v18, v19
        b.eq            2f

        ldr             q20, [x2]
        qpel_uni_w_v32_row v21, v22, v23, v16, v17, v18, v19, v20
        b.eq            2f

        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v32_row v22, v23, v16, v17, v18, v19, v20, v21
        b.eq            2f

        ldr             q22, [x2]
        qpel_uni_w_v32_row v23, v16, v17, v18, v19, v20, v21, v22
        b.ne            1b
.purgem qpel_uni_w_v32_row

        // Column finished: step 16 pixels right and restart from the top row.
        // b.hi consumes the flags of the subs; the add/mov in between do not
        // modify them.
2:
        subs            w13, w13, #16
        add             x14, x14, #16
        add             x15, x15, #16
        mov             x0, x14
        mov             x2, x15
        mov             w4, w11
        b.hi            3b
        ret
endfunc
|
||||
|
||||
// ff_hevc_put_hevc_qpel_uni_w_v48_8_neon
//
// Vertical 8-tap filter plus weighting for blocks 48 pixels wide, done as
// three 16-pixel columns; each column is processed over all rows before
// moving 16 pixels to the right.
//
// Registers as used below (argument order per the C prototype and AAPCS64):
//   x0  = dst write pointer, advanced by x1 inside QPEL_UNI_W_V_16
//   x2  = src read pointer, x3 = src stride in bytes
//   w4  = rows still to produce in the current column (non-zero on entry)
//   w13 = width still to process
//   x14 = dst of the current column, x15 = src of the current column
//         (x15 starts as x2 as left by QPEL_UNI_W_V_HEADER)
//   w11 = saved row count, reloaded into w4 for every column
// QPEL_UNI_W_V_HEADER and QPEL_UNI_W_V_16 are defined elsewhere; the store
// macro is assumed not to touch x14, x15 or the condition flags - confirm.
function ff_hevc_put_hevc_qpel_uni_w_v48_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        mov             w13, #48                        // width
        mov             x14, x0
        mov             x15, x2
        mov             w11, w4

// One 16-pixel output row: filter both halves of the eight window rows
// (oldest first), count the row, then weight and store.
.macro qpel_uni_w_v48_row s0, s1, s2, s3, s4, s5, s6, s7
        QPEL_FILTER_B   v26, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        QPEL_FILTER_B2  v27, \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7
        subs            w4, w4, #1
        QPEL_UNI_W_V_16
.endm

        // Column start: prime the window with the first seven rows.
3:
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]

1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v48_row v16, v17, v18, v19, v20, v21, v22, v23
        b.eq            2f

        ldr             q16, [x2]
        qpel_uni_w_v48_row v17, v18, v19, v20, v21, v22, v23, v16
        b.eq            2f

        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v48_row v18, v19, v20, v21, v22, v23, v16, v17
        b.eq            2f

        ldr             q18, [x2]
        qpel_uni_w_v48_row v19, v20, v21, v22, v23, v16, v17, v18
        b.eq            2f

        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v48_row v20, v21, v22, v23, v16, v17, v18, v19
        b.eq            2f

        ldr             q20, [x2]
        qpel_uni_w_v48_row v21, v22, v23, v16, v17, v18, v19, v20
        b.eq            2f

        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        qpel_uni_w_v48_row v22, v23, v16, v17, v18, v19, v20, v21
        b.eq            2f

        ldr             q22, [x2]
        qpel_uni_w_v48_row v23, v16, v17, v18, v19, v20, v21, v22
        b.ne            1b
.purgem qpel_uni_w_v48_row

        // Column finished: step 16 pixels right and restart from the top row.
        // b.hi consumes the flags of the subs; the add/mov in between do not
        // modify them.
2:
        subs            w13, w13, #16
        add             x14, x14, #16
        add             x15, x15, #16
        mov             x0, x14
        mov             x2, x15
        mov             w4, w11
        b.hi            3b
        ret
endfunc
|
||||
|
||||
function hevc_put_hevc_qpel_uni_hv4_8_end_neon
|
||||
mov x9, #(HEVC_MAX_PB_SIZE * 2)
|
||||
load_qpel_filterh x6, x5
|
||||
|
||||
@@ -274,7 +274,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
|
||||
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
|
||||
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
|
||||
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
|
||||
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
|
||||
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
|
||||
|
||||
NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel, 0, 1, epel_h,);
|
||||
NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h,);
|
||||
|
||||
Reference in New Issue
Block a user