mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
aarch64/vvc: Optimisations of put_chroma_h() functions for 10/12-bit
RPi4: put_chroma_h_10_2x2_c: 63.4 ( 1.00x) put_chroma_h_10_4x4_c: 151.4 ( 1.00x) put_chroma_h_10_8x8_c: 555.1 ( 1.00x) put_chroma_h_10_8x8_neon: 113.9 ( 4.88x) put_chroma_h_10_16x16_c: 1068.5 ( 1.00x) put_chroma_h_10_16x16_neon: 439.4 ( 2.43x) put_chroma_h_10_32x32_c: 3432.6 ( 1.00x) put_chroma_h_10_32x32_neon: 1878.3 ( 1.83x) put_chroma_h_10_64x64_c: 12872.2 ( 1.00x) put_chroma_h_10_64x64_neon: 7868.2 ( 1.64x) put_chroma_h_10_128x128_c: 45612.2 ( 1.00x) put_chroma_h_10_128x128_neon: 28742.1 ( 1.59x) put_chroma_h_12_2x2_c: 63.7 ( 1.00x) put_chroma_h_12_4x4_c: 151.5 ( 1.00x) put_chroma_h_12_8x8_c: 555.2 ( 1.00x) put_chroma_h_12_8x8_neon: 114.2 ( 4.86x) put_chroma_h_12_16x16_c: 1068.1 ( 1.00x) put_chroma_h_12_16x16_neon: 438.8 ( 2.43x) put_chroma_h_12_32x32_c: 3419.7 ( 1.00x) put_chroma_h_12_32x32_neon: 1878.7 ( 1.82x) put_chroma_h_12_64x64_c: 12862.2 ( 1.00x) put_chroma_h_12_64x64_neon: 7868.2 ( 1.63x) put_chroma_h_12_128x128_c: 45613.5 ( 1.00x) put_chroma_h_12_128x128_neon: 28743.3 ( 1.59x) Apple M4: put_chroma_h_10_2x2_c: 2.5 ( 1.00x) put_chroma_h_10_4x4_c: 6.5 ( 1.00x) put_chroma_h_10_8x8_c: 17.8 ( 1.00x) put_chroma_h_10_8x8_neon: 6.8 ( 2.60x) put_chroma_h_10_16x16_c: 53.3 ( 1.00x) put_chroma_h_10_16x16_neon: 30.4 ( 1.75x) put_chroma_h_10_32x32_c: 181.8 ( 1.00x) put_chroma_h_10_32x32_neon: 116.2 ( 1.56x) put_chroma_h_10_64x64_c: 684.2 ( 1.00x) put_chroma_h_10_64x64_neon: 470.3 ( 1.45x) put_chroma_h_10_128x128_c: 2567.6 ( 1.00x) put_chroma_h_10_128x128_neon: 1879.3 ( 1.37x) put_chroma_h_12_2x2_c: 1.9 ( 1.00x) put_chroma_h_12_4x4_c: 7.0 ( 1.00x) put_chroma_h_12_8x8_c: 16.8 ( 1.00x) put_chroma_h_12_8x8_neon: 7.9 ( 2.12x) put_chroma_h_12_16x16_c: 55.0 ( 1.00x) put_chroma_h_12_16x16_neon: 29.0 ( 1.90x) put_chroma_h_12_32x32_c: 182.5 ( 1.00x) put_chroma_h_12_32x32_neon: 116.9 ( 1.56x) put_chroma_h_12_64x64_c: 666.8 ( 1.00x) put_chroma_h_12_64x64_neon: 474.5 ( 1.41x) put_chroma_h_12_128x128_c: 2588.1 ( 1.00x) put_chroma_h_12_128x128_neon: 1912.2 ( 1.35x)
This commit is contained in:
committed by
Martin Storsjö
parent
125bb2e045
commit
c1be2107c9
@@ -43,6 +43,19 @@ void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdif
|
||||
void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
const int height, const int8_t *hf, const int8_t *vf, const int width);
|
||||
|
||||
// Prototypes for the NEON horizontal chroma "put" routines defined in the assembly file.
// Name suffixes: h8 / h16 / h_x16 select the width variant, _10 / _12 the bit-depth variant.
// All six share one signature; the assembly reads dst, _src, _src_stride, height and hf,
// and only the h_x16 variants additionally read width. vf is not read by any of them.
void ff_vvc_put_chroma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
const int height, const int8_t *hf, const int8_t *vf, const int width);
|
||||
void ff_vvc_put_chroma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
const int height, const int8_t *hf, const int8_t *vf, const int width);
|
||||
void ff_vvc_put_chroma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
const int height, const int8_t *hf, const int8_t *vf, const int width);
|
||||
void ff_vvc_put_chroma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
const int height, const int8_t *hf, const int8_t *vf, const int width);
|
||||
void ff_vvc_put_chroma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
const int height, const int8_t *hf, const int8_t *vf, const int width);
|
||||
void ff_vvc_put_chroma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
const int height, const int8_t *hf, const int8_t *vf, const int width);
|
||||
|
||||
void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
const int height, const int8_t *hf, const int8_t *vf, const int width);
|
||||
void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
|
||||
@@ -290,6 +303,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
|
||||
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
|
||||
c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
|
||||
|
||||
// Install the _10 horizontal chroma put routines. First index 1 holds the chroma
// functions here, while the luma ones below go to index 0.
// NOTE(review): the second index looks like a block-width class (2 -> h8, 3 -> h16,
// 4..6 -> generic x16) and the trailing [0][1] like "no vertical / horizontal filter";
// confirm against the VVCDSPContext definition.
c->inter.put[1][2][0][1] = ff_vvc_put_chroma_h8_10_neon;
|
||||
c->inter.put[1][3][0][1] = ff_vvc_put_chroma_h16_10_neon;
|
||||
// One chained assignment: slots 4, 5 and 6 all receive the generic x16 routine.
c->inter.put[1][4][0][1] =
|
||||
c->inter.put[1][5][0][1] =
|
||||
c->inter.put[1][6][0][1] = ff_vvc_put_chroma_h_x16_10_neon;
|
||||
|
||||
c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_10_neon;
|
||||
c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_10_neon;
|
||||
c->inter.put[0][4][0][1] =
|
||||
@@ -322,6 +342,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
|
||||
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
|
||||
c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
|
||||
|
||||
// Install the _12 horizontal chroma put routines. First index 1 holds the chroma
// functions here, while the luma ones below go to index 0.
// NOTE(review): the second index looks like a block-width class (2 -> h8, 3 -> h16,
// 4..6 -> generic x16) and the trailing [0][1] like "no vertical / horizontal filter";
// confirm against the VVCDSPContext definition.
c->inter.put[1][2][0][1] = ff_vvc_put_chroma_h8_12_neon;
|
||||
c->inter.put[1][3][0][1] = ff_vvc_put_chroma_h16_12_neon;
|
||||
// One chained assignment: slots 4, 5 and 6 all receive the generic x16 routine.
c->inter.put[1][4][0][1] =
|
||||
c->inter.put[1][5][0][1] =
|
||||
c->inter.put[1][6][0][1] = ff_vvc_put_chroma_h_x16_12_neon;
|
||||
|
||||
c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_12_neon;
|
||||
c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_12_neon;
|
||||
c->inter.put[0][4][0][1] =
|
||||
|
||||
@@ -1833,6 +1833,137 @@ function ff_vvc_put_luma_h_x16_12_neon, export=1
|
||||
put_luma_h_x16_xx_neon 4
|
||||
endfunc
|
||||
|
||||
// 4-tap horizontal filter over 16-bit samples; produces 8 outputs per expansion.
// Inputs:   v0.h[0..3] = filter taps (already widened to 16 bits by the caller)
//           v20.8h     = 8 consecutive samples
//           v21        = the samples that follow v20; only its first three halfwords
//                        are pulled in by the ext instructions below
// Outputs:  v24.4h = results 0-3, v25.4h = results 4-7
// Clobbers: v1, v2, v3 (shifted sample windows), v24, v25
// \shift is the right-shift applied when narrowing the 32-bit sums back to 16 bits.
.macro put_chroma_h_x8_horizontal_filter shift
|
||||
// 4 bytes from hf loaded to v0.4h
|
||||
// 24 bytes from _src loaded to v20.8h & v21.4h where v21.4h is loaded for shift to v1.8h,v2.8h,v3.8h
|
||||
// v24.4h & v25.4h are output vectors to store
|
||||
// v1 / v2 / v3 = the v20:v21 byte stream advanced by 2, 4 and 6 bytes (1, 2, 3 samples)
ext v1.16b, v20.16b, v21.16b, #2
|
||||
ext v2.16b, v20.16b, v21.16b, #4
|
||||
ext v3.16b, v20.16b, v21.16b, #6
|
||||
// Widening multiply-accumulate into 32-bit lanes: the plain forms use the low four
// input lanes (-> v24), the "2" forms use the high four input lanes (-> v25)
smull v24.4s, v20.4h, v0.h[0]
|
||||
smull2 v25.4s, v20.8h, v0.h[0]
|
||||
smlal v24.4s, v1.4h, v0.h[1]
|
||||
smlal2 v25.4s, v1.8h, v0.h[1]
|
||||
smlal v24.4s, v2.4h, v0.h[2]
|
||||
smlal2 v25.4s, v2.8h, v0.h[2]
|
||||
smlal v24.4s, v3.4h, v0.h[3]
|
||||
smlal2 v25.4s, v3.8h, v0.h[3]
|
||||
// Signed saturating shift-right-narrow of each sum to a 16-bit result
sqshrn v24.4h, v24.4s, #(\shift)
|
||||
sqshrn v25.4h, v25.4s, #(\shift)
|
||||
.endm
|
||||
|
||||
// Function body for the 8-wide horizontal chroma put: each loop iteration filters one
// row and stores 8 int16 results. Ends with ret, so it must be the whole function body.
// \shift is forwarded to put_chroma_h_x8_horizontal_filter.
// x5 (vf) and x6 (width) are not read here. Clobbers x9, v0-v3, v20, v21, v24, v25, flags.
.macro put_chroma_h8_xx_neon shift
|
||||
// dst .req x0
|
||||
// _src .req x1
|
||||
// _src_stride .req x2
|
||||
// height .req x3
|
||||
// hf .req x4
|
||||
// vf .req x5
|
||||
// width .req x6
|
||||
// x9 = byte step between consecutive dst rows
mov x9, #(VVC_MAX_PB_SIZE * 2)
|
||||
// s0 = the four 8-bit taps at hf (sign-extended to 16 bits by the sxtl below)
ldr s0, [x4]
|
||||
// Back the source up by one 16-bit sample so lane i of v20 holds sample i-1
sub x1, x1, #2
|
||||
// The first load of every row post-increments x1 by 16; fold that into the stride
sub x2, x2, #16
|
||||
sxtl v0.8h, v0.8b
|
||||
1:
|
||||
// 8 samples into v20, 4 more into v21; the second post-increment lands x1 on the next row
ld1 {v20.8h}, [x1], #16
|
||||
ld1 {v21.4h}, [x1], x2
|
||||
put_chroma_h_x8_horizontal_filter \shift
|
||||
// Row counter; the st1 between subs and b.gt does not alter the flags
subs w3, w3, #1
|
||||
st1 {v24.4h, v25.4h}, [x0], x9
|
||||
b.gt 1b
|
||||
ret
|
||||
.endm
|
||||
|
||||
// Function body for the 16-wide horizontal chroma put: each loop iteration filters one
// row as two groups of 8 outputs. Ends with ret, so it must be the whole function body.
// \shift is forwarded to put_chroma_h_x8_horizontal_filter.
// x5 (vf) and x6 (width) are not read here. Clobbers x9, v0-v3, v20-v22, v24, v25, flags.
.macro put_chroma_h16_xx_neon shift
|
||||
// dst .req x0
|
||||
// _src .req x1
|
||||
// _src_stride .req x2
|
||||
// height .req x3
|
||||
// hf .req x4
|
||||
// vf .req x5
|
||||
// width .req x6
|
||||
mov x9, #(VVC_MAX_PB_SIZE * 2)
|
||||
// s0 = the four 8-bit taps at hf (sign-extended to 16 bits by the sxtl below)
ldr s0, [x4]
|
||||
// The first store of each row already advances dst by 16 bytes, so the second store
// only has to add the remainder of the dst row step
sub x9, x9, #16
|
||||
// Back the source up by one 16-bit sample so lane i of v20 holds sample i-1
sub x1, x1, #2
|
||||
// The first load of every row post-increments x1 by 32; fold that into the stride
sub x2, x2, #32
|
||||
sxtl v0.8h, v0.8b
|
||||
1:
|
||||
// 16 samples into v20/v21, 4 more into v22; x1 ends up on the next row
ld1 {v20.8h, v21.8h}, [x1], #32
|
||||
ld1 {v22.4h}, [x1], x2
|
||||
// Outputs 0-7 from v20 with v21 as the follow-on samples
put_chroma_h_x8_horizontal_filter \shift
|
||||
// Slide the window by 8 samples: v21 becomes the base, v22 the follow-on samples
mov v20.16b, v21.16b
|
||||
mov v21.16b, v22.16b
|
||||
st1 {v24.4h, v25.4h}, [x0], #16
|
||||
// Outputs 8-15
put_chroma_h_x8_horizontal_filter \shift
|
||||
// Row counter; the st1 between subs and b.gt does not alter the flags
subs w3, w3, #1
|
||||
st1 {v24.4h, v25.4h}, [x0], x9
|
||||
b.gt 1b
|
||||
ret
|
||||
.endm
|
||||
|
||||
// Function body for the generic horizontal chroma put: the outer loop walks rows, the
// inner loop produces 16 outputs per iteration until the column counter (initialised
// from width) is no longer positive. Ends with ret, so it must be the whole function body.
// \shift is forwarded to put_chroma_h_x8_horizontal_filter.
// x5 (vf) is not read here. Clobbers x8, x9, v0-v3, v20-v22, v24, v25, flags.
// NOTE(review): the name and the 16-column step suggest width is expected to be a
// multiple of 16; confirm against the callers.
.macro put_chroma_h_x16_xx_neon shift
|
||||
// dst .req x0
|
||||
// _src .req x1
|
||||
// _src_stride .req x2
|
||||
// height .req x3
|
||||
// hf .req x4
|
||||
// vf .req x5
|
||||
// width .req x6
|
||||
mov x9, #(VVC_MAX_PB_SIZE * 2)
|
||||
// s0 = the four 8-bit taps at hf (sign-extended to 16 bits by the sxtl below)
ldr s0, [x4]
|
||||
// x9 = dst row step minus 2*width bytes, applied with the add at the end of each row
sub x9, x9, w6, uxtw #1
|
||||
// x2 = src stride minus 2*width bytes (further reduced by 16 below), applied likewise
sub x2, x2, w6, uxtw #1
|
||||
sxtl v0.8h, v0.8b
|
||||
// Back the source up by one 16-bit sample so lane i of v20 holds sample i-1
sub x1, x1, #2
|
||||
// Accounts for the 16-byte priming load at the start of every row
sub x2, x2, #16
|
||||
1:
|
||||
// Prime v20 with the first 8 samples of the row
ld1 {v20.8h}, [x1], #16
|
||||
// w8 = columns left in this row
mov w8, w6
|
||||
2:
|
||||
// Next 16 samples; v20 still holds the 8 samples carried over from before
ld1 {v21.8h, v22.8h}, [x1], #32
|
||||
// Outputs 0-7 of this chunk from v20 with v21 as the follow-on samples
put_chroma_h_x8_horizontal_filter \shift
|
||||
// Slide the window by 8 samples
mov v20.16b, v21.16b
|
||||
mov v21.16b, v22.16b
|
||||
st1 {v24.4h, v25.4h}, [x0], #16
|
||||
// Outputs 8-15 of this chunk
put_chroma_h_x8_horizontal_filter \shift
|
||||
// Carry the last 8 loaded samples (originally v22) into the next iteration as v20
mov v20.16b, v21.16b
|
||||
// Column counter; the st1 between subs and b.gt does not alter the flags
subs w8, w8, #16
|
||||
st1 {v24.4h, v25.4h}, [x0], #16
|
||||
b.gt 2b
|
||||
// Row counter, then move both pointers to the start of the next row; the adds use the
// non-flag-setting form, so b.gt still tests the result of the subs
subs w3, w3, #1
|
||||
add x0, x0, x9
|
||||
add x1, x1, x2
|
||||
b.gt 1b
|
||||
ret
|
||||
.endm
|
||||
|
||||
|
||||
// Exported 8-wide horizontal chroma put, _10 variant: expands put_chroma_h8_xx_neon
// with a narrowing shift of 2. The macro supplies the ret.
function ff_vvc_put_chroma_h8_10_neon, export=1
|
||||
put_chroma_h8_xx_neon 2
|
||||
endfunc
|
||||
|
||||
// Exported 8-wide horizontal chroma put, _12 variant: expands put_chroma_h8_xx_neon
// with a narrowing shift of 4. The macro supplies the ret.
function ff_vvc_put_chroma_h8_12_neon, export=1
|
||||
put_chroma_h8_xx_neon 4
|
||||
endfunc
|
||||
|
||||
// Exported 16-wide horizontal chroma put, _10 variant: expands put_chroma_h16_xx_neon
// with a narrowing shift of 2. The macro supplies the ret.
function ff_vvc_put_chroma_h16_10_neon, export=1
|
||||
put_chroma_h16_xx_neon 2
|
||||
endfunc
|
||||
|
||||
// Exported 16-wide horizontal chroma put, _12 variant: expands put_chroma_h16_xx_neon
// with a narrowing shift of 4. The macro supplies the ret.
function ff_vvc_put_chroma_h16_12_neon, export=1
|
||||
put_chroma_h16_xx_neon 4
|
||||
endfunc
|
||||
|
||||
// Exported generic-width horizontal chroma put, _10 variant: expands
// put_chroma_h_x16_xx_neon (16 columns per inner-loop pass) with a narrowing shift of 2.
// The macro supplies the ret.
function ff_vvc_put_chroma_h_x16_10_neon, export=1
|
||||
put_chroma_h_x16_xx_neon 2
|
||||
endfunc
|
||||
|
||||
// Exported generic-width horizontal chroma put, _12 variant: expands
// put_chroma_h_x16_xx_neon (16 columns per inner-loop pass) with a narrowing shift of 4.
// The macro supplies the ret.
function ff_vvc_put_chroma_h_x16_12_neon, export=1
|
||||
put_chroma_h_x16_xx_neon 4
|
||||
endfunc
|
||||
|
||||
.macro put_luma_v4_xx_neon shift
|
||||
mov x9, #(VVC_MAX_PB_SIZE * 2)
|
||||
sub x1, x1, x2, lsl #1
|
||||
|
||||
Reference in New Issue
Block a user