aarch64/hpeldsp_neon: fix out-of-bounds read

Fix #21141

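The x2 and xy2 macros loaded 32 bytes per row (16-pixel versions) or
16 bytes per row (8-pixel versions) and used ext to build the copy of
the row shifted by one byte, although only 17 or 9 bytes of each row
are needed. Load the row with ld1 and the shifted row with ldur from
[x1, #1] instead, so that nothing beyond the needed bytes is read.

Performance also improves a little as a side effect.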
On A76:
                              Before            After
put_pixels_tab[0][1]_neon:    32.4 ( 3.91x)     31.6 ( 3.99x)
put_pixels_tab[0][3]_neon:    88.0 ( 4.50x)     74.6 ( 5.31x)
put_pixels_tab[1][1]_neon:    33.5 ( 2.52x)     31.2 ( 2.71x)
put_pixels_tab[1][3]_neon:    30.5 ( 3.61x)     21.7 ( 5.08x)

On A55:
                             Before            After
put_pixels_tab[0][1]_neon:   175.2 ( 2.41x)    138.7 ( 3.04x)
put_pixels_tab[0][3]_neon:   334.3 ( 2.71x)    296.1 ( 3.07x)
put_pixels_tab[1][1]_neon:   168.3 ( 1.78x)     94.1 ( 3.19x)
put_pixels_tab[1][3]_neon:   112.3 ( 2.20x)     90.0 ( 2.74x)

(cherry picked from commit 840183d823)
Signed-off-by: Marvin Scholz <epirat07@gmail.com>
This commit is contained in:
Zhao Zhili
2026-01-01 00:52:44 +08:00
committed by Marvin Scholz
parent 3424262575
commit e791fab239

View File

@@ -50,12 +50,13 @@
.endm .endm
.macro pixels16_x2 rnd=1, avg=0 .macro pixels16_x2 rnd=1, avg=0
1: ld1 {v0.16b, v1.16b}, [x1], x2 1:
ld1 {v2.16b, v3.16b}, [x1], x2 ldur q1, [x1, #1]
ld1 {v0.16b}, [x1], x2
subs w3, w3, #2 subs w3, w3, #2
ext v1.16b, v0.16b, v1.16b, #1 ldur q3, [x1, #1]
ld1 {v2.16b}, [x1], x2
avg v0.16b, v0.16b, v1.16b avg v0.16b, v0.16b, v1.16b
ext v3.16b, v2.16b, v3.16b, #1
avg v2.16b, v2.16b, v3.16b avg v2.16b, v2.16b, v3.16b
.if \avg .if \avg
ld1 {v1.16b}, [x0], x2 ld1 {v1.16b}, [x0], x2
@@ -108,20 +109,20 @@
.macro pixels16_xy2 rnd=1, avg=0 .macro pixels16_xy2 rnd=1, avg=0
sub w3, w3, #2 sub w3, w3, #2
ld1 {v0.16b, v1.16b}, [x1], x2 ldur q1, [x1, #1]
ld1 {v4.16b, v5.16b}, [x1], x2 ld1 {v0.16b}, [x1], x2
NRND movi v26.8H, #1 NRND movi v26.8H, #1
ext v1.16b, v0.16b, v1.16b, #1 ldur q5, [x1, #1]
ext v5.16b, v4.16b, v5.16b, #1 ld1 {v4.16b}, [x1], x2
uaddl v16.8h, v0.8b, v1.8b uaddl v16.8h, v0.8b, v1.8b
uaddl2 v20.8h, v0.16b, v1.16b uaddl2 v20.8h, v0.16b, v1.16b
uaddl v18.8h, v4.8b, v5.8b uaddl v18.8h, v4.8b, v5.8b
uaddl2 v22.8h, v4.16b, v5.16b uaddl2 v22.8h, v4.16b, v5.16b
1: subs w3, w3, #2 1: subs w3, w3, #2
ld1 {v0.16b, v1.16b}, [x1], x2 ldur q30, [x1, #1]
ld1 {v0.16b}, [x1], x2
add v24.8h, v16.8h, v18.8h add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H NRND add v24.8H, v24.8H, v26.8H
ext v30.16b, v0.16b, v1.16b, #1
add v1.8h, v20.8h, v22.8h add v1.8h, v20.8h, v22.8h
mshrn v28.8b, v24.8h, #2 mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H NRND add v1.8H, v1.8H, v26.8H
@@ -131,12 +132,12 @@ NRND add v1.8H, v1.8H, v26.8H
urhadd v28.16b, v28.16b, v16.16b urhadd v28.16b, v28.16b, v16.16b
.endif .endif
uaddl v16.8h, v0.8b, v30.8b uaddl v16.8h, v0.8b, v30.8b
ld1 {v2.16b, v3.16b}, [x1], x2 ldur q3, [x1, #1]
ld1 {v2.16b}, [x1], x2
uaddl2 v20.8h, v0.16b, v30.16b uaddl2 v20.8h, v0.16b, v30.16b
st1 {v28.16b}, [x0], x2 st1 {v28.16b}, [x0], x2
add v24.8h, v16.8h, v18.8h add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H NRND add v24.8H, v24.8H, v26.8H
ext v3.16b, v2.16b, v3.16b, #1
add v0.8h, v20.8h, v22.8h add v0.8h, v20.8h, v22.8h
mshrn v30.8b, v24.8h, #2 mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H NRND add v0.8H, v0.8H, v26.8H
@@ -150,10 +151,10 @@ NRND add v0.8H, v0.8H, v26.8H
st1 {v30.16b}, [x0], x2 st1 {v30.16b}, [x0], x2
b.gt 1b b.gt 1b
ld1 {v0.16b, v1.16b}, [x1], x2 ldur q30, [x1, #1]
ld1 {v0.16b}, [x1], x2
add v24.8h, v16.8h, v18.8h add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H NRND add v24.8H, v24.8H, v26.8H
ext v30.16b, v0.16b, v1.16b, #1
add v1.8h, v20.8h, v22.8h add v1.8h, v20.8h, v22.8h
mshrn v28.8b, v24.8h, #2 mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H NRND add v1.8H, v1.8H, v26.8H
@@ -206,10 +207,11 @@ NRND add v0.8H, v0.8H, v26.8H
.endm .endm
.macro pixels8_x2 rnd=1, avg=0 .macro pixels8_x2 rnd=1, avg=0
1: ld1 {v0.8b, v1.8b}, [x1], x2 1:
ext v1.8b, v0.8b, v1.8b, #1 ldur d1, [x1, #1]
ld1 {v2.8b, v3.8b}, [x1], x2 ld1 {v0.8b}, [x1], x2
ext v3.8b, v2.8b, v3.8b, #1 ldur d3, [x1, #1]
ld1 {v2.8b}, [x1], x2
subs w3, w3, #2 subs w3, w3, #2
avg v0.8b, v0.8b, v1.8b avg v0.8b, v0.8b, v1.8b
avg v2.8b, v2.8b, v3.8b avg v2.8b, v2.8b, v3.8b
@@ -263,22 +265,23 @@ NRND add v0.8H, v0.8H, v26.8H
.endm .endm
.macro pixels8_xy2 rnd=1, avg=0 .macro pixels8_xy2 rnd=1, avg=0
ldur d4, [x1, #1]
sub w3, w3, #2 sub w3, w3, #2
ld1 {v0.16b}, [x1], x2 ld1 {v0.8b}, [x1], x2
ld1 {v1.16b}, [x1], x2
NRND movi v19.8H, #1 NRND movi v19.8H, #1
ext v4.16b, v0.16b, v4.16b, #1 ldur d6, [x1, #1]
ext v6.16b, v1.16b, v6.16b, #1 ld1 {v1.8b}, [x1], x2
uaddl v16.8h, v0.8b, v4.8b uaddl v16.8h, v0.8b, v4.8b
uaddl v17.8h, v1.8b, v6.8b uaddl v17.8h, v1.8b, v6.8b
1: subs w3, w3, #2 1: subs w3, w3, #2
ld1 {v0.16b}, [x1], x2 ldur d4, [x1, #1]
ld1 {v0.8b}, [x1], x2
add v18.8h, v16.8h, v17.8h add v18.8h, v16.8h, v17.8h
ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8h, v0.8b, v4.8b uaddl v16.8h, v0.8b, v4.8b
mshrn v5.8b, v18.8h, #2 mshrn v5.8b, v18.8h, #2
ld1 {v1.16b}, [x1], x2 ldur d6, [x1, #1]
ld1 {v1.8b}, [x1], x2
add v18.8h, v16.8h, v17.8h add v18.8h, v16.8h, v17.8h
.if \avg .if \avg
ld1 {v7.8b}, [x0] ld1 {v7.8b}, [x0]
@@ -291,14 +294,13 @@ NRND add v18.8H, v18.8H, v19.8H
ld1 {v5.8b}, [x0] ld1 {v5.8b}, [x0]
urhadd v7.8b, v7.8b, v5.8b urhadd v7.8b, v7.8b, v5.8b
.endif .endif
ext v6.16b, v1.16b, v6.16b, #1
uaddl v17.8h, v1.8b, v6.8b uaddl v17.8h, v1.8b, v6.8b
st1 {v7.8b}, [x0], x2 st1 {v7.8b}, [x0], x2
b.gt 1b b.gt 1b
ld1 {v0.16b}, [x1], x2 ldur d4, [x1, #1]
ld1 {v0.8b}, [x1], x2
add v18.8h, v16.8h, v17.8h add v18.8h, v16.8h, v17.8h
ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8h, v0.8b, v4.8b uaddl v16.8h, v0.8b, v4.8b
mshrn v5.8b, v18.8h, #2 mshrn v5.8b, v18.8h, #2