Re-indent the NEON resample filter kernels in libswresample/arm/resample.S.

Every removed line is token-for-token identical to the added line that
replaces it, so this hunk changes whitespace only; no instruction, operand
or comment text differs.

NOTE(review): this patch had been collapsed onto three physical lines, which
destroyed both the line structure and the column spacing that the change is
about. Line breaks and the four blank context lines are restored from the
hunk header (57 old / 57 new lines). The spacing inside the -/+ lines is a
best-effort reconstruction (assumed: 4-column indent before, 8-column indent
after) and must be confirmed against blobs 791f4cc016 and e48df3c8fb before
this patch is applied.

diff --git a/libswresample/arm/resample.S b/libswresample/arm/resample.S
index 791f4cc016..e48df3c8fb 100644
--- a/libswresample/arm/resample.S
+++ b/libswresample/arm/resample.S
@@ -21,57 +21,57 @@
 #include "libavutil/arm/asm.S"
 
 function ff_resample_common_apply_filter_x4_float_neon, export=1
-    vmov.f32            q0, #0.0                                   @ accumulator
-1:  vld1.32             {q1}, [r1]!                                @ src[0..3]
-    vld1.32             {q2}, [r2]!                                @ filter[0..3]
-    vmla.f32            q0, q1, q2                                 @ accumulator += src[0..3] * filter[0..3]
-    subs                r3, #4                                     @ filter_length -= 4
-    bgt                 1b                                         @ loop until filter_length
-    vpadd.f32           d0, d0, d1                                 @ pair adding of the 4x32-bit accumulated values
-    vpadd.f32           d0, d0, d0                                 @ pair adding of the 4x32-bit accumulator values
-    vst1.32             {d0[0]}, [r0]                              @ write accumulator
-    bx                  lr
+        vmov.f32        q0, #0.0                                   @ accumulator
+1:      vld1.32         {q1}, [r1]!                                @ src[0..3]
+        vld1.32         {q2}, [r2]!                                @ filter[0..3]
+        vmla.f32        q0, q1, q2                                 @ accumulator += src[0..3] * filter[0..3]
+        subs            r3, #4                                     @ filter_length -= 4
+        bgt             1b                                         @ loop until filter_length
+        vpadd.f32       d0, d0, d1                                 @ pair adding of the 4x32-bit accumulated values
+        vpadd.f32       d0, d0, d0                                 @ pair adding of the 4x32-bit accumulator values
+        vst1.32         {d0[0]}, [r0]                              @ write accumulator
+        bx              lr
 endfunc
 
 function ff_resample_common_apply_filter_x8_float_neon, export=1
-    vmov.f32            q0, #0.0                                   @ accumulator
-1:  vld1.32             {q1}, [r1]!                                @ src[0..3]
-    vld1.32             {q2}, [r2]!                                @ filter[0..3]
-    vld1.32             {q8}, [r1]!                                @ src[4..7]
-    vld1.32             {q9}, [r2]!                                @ filter[4..7]
-    vmla.f32            q0, q1, q2                                 @ accumulator += src[0..3] * filter[0..3]
-    vmla.f32            q0, q8, q9                                 @ accumulator += src[4..7] * filter[4..7]
-    subs                r3, #8                                     @ filter_length -= 8
-    bgt                 1b                                         @ loop until filter_length
-    vpadd.f32           d0, d0, d1                                 @ pair adding of the 4x32-bit accumulated values
-    vpadd.f32           d0, d0, d0                                 @ pair adding of the 4x32-bit accumulator values
-    vst1.32             {d0[0]}, [r0]                              @ write accumulator
-    bx                  lr
+        vmov.f32        q0, #0.0                                   @ accumulator
+1:      vld1.32         {q1}, [r1]!                                @ src[0..3]
+        vld1.32         {q2}, [r2]!                                @ filter[0..3]
+        vld1.32         {q8}, [r1]!                                @ src[4..7]
+        vld1.32         {q9}, [r2]!                                @ filter[4..7]
+        vmla.f32        q0, q1, q2                                 @ accumulator += src[0..3] * filter[0..3]
+        vmla.f32        q0, q8, q9                                 @ accumulator += src[4..7] * filter[4..7]
+        subs            r3, #8                                     @ filter_length -= 8
+        bgt             1b                                         @ loop until filter_length
+        vpadd.f32       d0, d0, d1                                 @ pair adding of the 4x32-bit accumulated values
+        vpadd.f32       d0, d0, d0                                 @ pair adding of the 4x32-bit accumulator values
+        vst1.32         {d0[0]}, [r0]                              @ write accumulator
+        bx              lr
 endfunc
 
 function ff_resample_common_apply_filter_x4_s16_neon, export=1
-    vmov.s32            q0, #0                                     @ accumulator
-1:  vld1.16             {d2}, [r1]!                                @ src[0..3]
-    vld1.16             {d4}, [r2]!                                @ filter[0..3]
-    vmlal.s16           q0, d2, d4                                 @ accumulator += src[0..3] * filter[0..3]
-    subs                r3, #4                                     @ filter_length -= 4
-    bgt                 1b                                         @ loop until filter_length
-    vpadd.s32           d0, d0, d1                                 @ pair adding of the 4x32-bit accumulated values
-    vpadd.s32           d0, d0, d0                                 @ pair adding of the 4x32-bit accumulator values
-    vst1.32             {d0[0]}, [r0]                              @ write accumulator
-    bx                  lr
+        vmov.s32        q0, #0                                     @ accumulator
+1:      vld1.16         {d2}, [r1]!                                @ src[0..3]
+        vld1.16         {d4}, [r2]!                                @ filter[0..3]
+        vmlal.s16       q0, d2, d4                                 @ accumulator += src[0..3] * filter[0..3]
+        subs            r3, #4                                     @ filter_length -= 4
+        bgt             1b                                         @ loop until filter_length
+        vpadd.s32       d0, d0, d1                                 @ pair adding of the 4x32-bit accumulated values
+        vpadd.s32       d0, d0, d0                                 @ pair adding of the 4x32-bit accumulator values
+        vst1.32         {d0[0]}, [r0]                              @ write accumulator
+        bx              lr
 endfunc
 
 function ff_resample_common_apply_filter_x8_s16_neon, export=1
-    vmov.s32            q0, #0                                     @ accumulator
-1:  vld1.16             {q1}, [r1]!                                @ src[0..7]
-    vld1.16             {q2}, [r2]!                                @ filter[0..7]
-    vmlal.s16           q0, d2, d4                                 @ accumulator += src[0..3] * filter[0..3]
-    vmlal.s16           q0, d3, d5                                 @ accumulator += src[4..7] * filter[4..7]
-    subs                r3, #8                                     @ filter_length -= 8
-    bgt                 1b                                         @ loop until filter_length
-    vpadd.s32           d0, d0, d1                                 @ pair adding of the 4x32-bit accumulated values
-    vpadd.s32           d0, d0, d0                                 @ pair adding of the 4x32-bit accumulator values
-    vst1.32             {d0[0]}, [r0]                              @ write accumulator
-    bx                  lr
+        vmov.s32        q0, #0                                     @ accumulator
+1:      vld1.16         {q1}, [r1]!                                @ src[0..7]
+        vld1.16         {q2}, [r2]!                                @ filter[0..7]
+        vmlal.s16       q0, d2, d4                                 @ accumulator += src[0..3] * filter[0..3]
+        vmlal.s16       q0, d3, d5                                 @ accumulator += src[4..7] * filter[4..7]
+        subs            r3, #8                                     @ filter_length -= 8
+        bgt             1b                                         @ loop until filter_length
+        vpadd.s32       d0, d0, d1                                 @ pair adding of the 4x32-bit accumulated values
+        vpadd.s32       d0, d0, d0                                 @ pair adding of the 4x32-bit accumulator values
+        vst1.32         {d0[0]}, [r0]                              @ write accumulator
+        bx              lr
 endfunc