From e791fab2391129c4289510b34c9057198f4fdaee Mon Sep 17 00:00:00 2001 From: Zhao Zhili Date: Thu, 1 Jan 2026 00:52:44 +0800 Subject: [PATCH] aarch64/hpeldsp_neon: fix out-of-bounds read Fix #21141 The performance improved a little bit. On A76: Before After put_pixels_tab[0][1]_neon: 32.4 ( 3.91x) 31.6 ( 3.99x) put_pixels_tab[0][3]_neon: 88.0 ( 4.50x) 74.6 ( 5.31x) put_pixels_tab[1][1]_neon: 33.5 ( 2.52x) 31.2 ( 2.71x) put_pixels_tab[1][3]_neon: 30.5 ( 3.61x) 21.7 ( 5.08x) On A55: Before After put_pixels_tab[0][1]_neon: 175.2 ( 2.41x) 138.7 ( 3.04x) put_pixels_tab[0][3]_neon: 334.3 ( 2.71x) 296.1 ( 3.07x) put_pixels_tab[1][1]_neon: 168.3 ( 1.78x) 94.1 ( 3.19x) put_pixels_tab[1][3]_neon: 112.3 ( 2.20x) 90.0 ( 2.74x) (cherry picked from commit 840183d823788306adb30153183614c3769bf6d3) Signed-off-by: Marvin Scholz --- libavcodec/aarch64/hpeldsp_neon.S | 58 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S index e7c1549c40..fd2c2c98c4 100644 --- a/libavcodec/aarch64/hpeldsp_neon.S +++ b/libavcodec/aarch64/hpeldsp_neon.S @@ -50,12 +50,13 @@ .endm .macro pixels16_x2 rnd=1, avg=0 -1: ld1 {v0.16b, v1.16b}, [x1], x2 - ld1 {v2.16b, v3.16b}, [x1], x2 +1: + ldur q1, [x1, #1] + ld1 {v0.16b}, [x1], x2 subs w3, w3, #2 - ext v1.16b, v0.16b, v1.16b, #1 + ldur q3, [x1, #1] + ld1 {v2.16b}, [x1], x2 avg v0.16b, v0.16b, v1.16b - ext v3.16b, v2.16b, v3.16b, #1 avg v2.16b, v2.16b, v3.16b .if \avg ld1 {v1.16b}, [x0], x2 @@ -108,20 +109,20 @@ .macro pixels16_xy2 rnd=1, avg=0 sub w3, w3, #2 - ld1 {v0.16b, v1.16b}, [x1], x2 - ld1 {v4.16b, v5.16b}, [x1], x2 + ldur q1, [x1, #1] + ld1 {v0.16b}, [x1], x2 NRND movi v26.8H, #1 - ext v1.16b, v0.16b, v1.16b, #1 - ext v5.16b, v4.16b, v5.16b, #1 + ldur q5, [x1, #1] + ld1 {v4.16b}, [x1], x2 uaddl v16.8h, v0.8b, v1.8b uaddl2 v20.8h, v0.16b, v1.16b uaddl v18.8h, v4.8b, v5.8b uaddl2 v22.8h, v4.16b, v5.16b 1: subs w3, w3, #2 - ld1 {v0.16b, v1.16b}, [x1], x2 + ldur q30, [x1, #1] + ld1 {v0.16b}, [x1], x2 add v24.8h, v16.8h, v18.8h NRND add v24.8H, v24.8H, v26.8H - ext v30.16b, v0.16b, v1.16b, #1 add v1.8h, v20.8h, v22.8h mshrn v28.8b, v24.8h, #2 NRND add v1.8H, v1.8H, v26.8H @@ -131,12 +132,12 @@ NRND add v1.8H, v1.8H, v26.8H urhadd v28.16b, v28.16b, v16.16b .endif uaddl v16.8h, v0.8b, v30.8b - ld1 {v2.16b, v3.16b}, [x1], x2 + ldur q3, [x1, #1] + ld1 {v2.16b}, [x1], x2 uaddl2 v20.8h, v0.16b, v30.16b st1 {v28.16b}, [x0], x2 add v24.8h, v16.8h, v18.8h NRND add v24.8H, v24.8H, v26.8H - ext v3.16b, v2.16b, v3.16b, #1 add v0.8h, v20.8h, v22.8h mshrn v30.8b, v24.8h, #2 NRND add v0.8H, v0.8H, v26.8H @@ -150,10 +151,10 @@ NRND add v0.8H, v0.8H, v26.8H st1 {v30.16b}, [x0], x2 b.gt 1b - ld1 {v0.16b, v1.16b}, [x1], x2 + ldur q30, [x1, #1] + ld1 {v0.16b}, [x1], x2 add v24.8h, v16.8h, v18.8h NRND add v24.8H, v24.8H, v26.8H - ext v30.16b, v0.16b, v1.16b, #1 add v1.8h, v20.8h, v22.8h mshrn v28.8b, v24.8h, #2 NRND add v1.8H, v1.8H, v26.8H @@ -206,10 +207,11 @@ NRND add v0.8H, v0.8H, v26.8H .endm .macro pixels8_x2 rnd=1, avg=0 -1: ld1 {v0.8b, v1.8b}, [x1], x2 - ext v1.8b, v0.8b, v1.8b, #1 - ld1 {v2.8b, v3.8b}, [x1], x2 - ext v3.8b, v2.8b, v3.8b, #1 +1: + ldur d1, [x1, #1] + ld1 {v0.8b}, [x1], x2 + ldur d3, [x1, #1] + ld1 {v2.8b}, [x1], x2 subs w3, w3, #2 avg v0.8b, v0.8b, v1.8b avg v2.8b, v2.8b, v3.8b @@ -263,22 +265,23 @@ NRND add v0.8H, v0.8H, v26.8H .endm .macro pixels8_xy2 rnd=1, avg=0 + ldur d4, [x1, #1] sub w3, w3, #2 - ld1 {v0.16b}, [x1], x2 - ld1 {v1.16b}, [x1], x2 + ld1 {v0.8b}, [x1], x2 NRND movi v19.8H, #1 - ext v4.16b, v0.16b, v4.16b, #1 - ext v6.16b, v1.16b, v6.16b, #1 + ldur d6, [x1, #1] + ld1 {v1.8b}, [x1], x2 uaddl v16.8h, v0.8b, v4.8b uaddl v17.8h, v1.8b, v6.8b 1: subs w3, w3, #2 - ld1 {v0.16b}, [x1], x2 + ldur d4, [x1, #1] + ld1 {v0.8b}, [x1], x2 add v18.8h, v16.8h, v17.8h - ext v4.16b, v0.16b, v4.16b, #1 NRND add v18.8H, v18.8H, v19.8H uaddl v16.8h, v0.8b, v4.8b mshrn v5.8b, v18.8h, #2 - ld1 {v1.16b}, [x1], x2 + ldur d6, [x1, #1] + ld1 {v1.8b}, [x1], x2 add v18.8h, v16.8h, v17.8h .if \avg ld1 {v7.8b}, [x0] @@ -291,14 +294,13 @@ NRND add v18.8H, v18.8H, v19.8H ld1 {v5.8b}, [x0] urhadd v7.8b, v7.8b, v5.8b .endif - ext v6.16b, v1.16b, v6.16b, #1 uaddl v17.8h, v1.8b, v6.8b st1 {v7.8b}, [x0], x2 b.gt 1b - ld1 {v0.16b}, [x1], x2 + ldur d4, [x1, #1] + ld1 {v0.8b}, [x1], x2 add v18.8h, v16.8h, v17.8h - ext v4.16b, v0.16b, v4.16b, #1 NRND add v18.8H, v18.8H, v19.8H uaddl v16.8h, v0.8b, v4.8b mshrn v5.8b, v18.8h, #2