avcodec/x86/snow_dwt: Avoid slice_buffer in inner_add_yblock

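The slice_buffer is not needed by this function: passing a pointer to the
relevant line pointers (sb->line + src_y) instead also removes the src_y
parameter and makes this function more ASM-friendly.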

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2026-04-08 10:39:04 +02:00
parent fd77f00a8f
commit 5f373872c0
4 changed files with 27 additions and 23 deletions

View File

@@ -116,7 +116,8 @@ static av_cold void init_qpel(SnowContext *const s)
}
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
int src_x, int src_stride, IDWTELEM *const *lines, int add, uint8_t *dst8)
{
int y, x;
av_assume(add); // add == 0 is currently unused
@@ -127,7 +128,7 @@ void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_
const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
IDWTELEM *dst = sb->line[src_y+y];
IDWTELEM *dst = lines[y];
av_assert2(dst);
for(x=0; x<b_w; x++){

View File

@@ -313,7 +313,7 @@ static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer
ff_snow_pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
}
if(sliced){
s->dwt.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
s->dwt.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, sb->line + src_y, add, dst8);
}else{
for(y=0; y<b_h; y++){
//FIXME ugly misuse of obmc_stride

View File

@@ -62,7 +62,7 @@ typedef struct SnowDWTContext {
void (*horizontal_compose97i)(IDWTELEM *b, IDWTELEM *temp, int width);
void (*inner_add_yblock)(const uint8_t *obmc, const int obmc_stride,
uint8_t **block, int b_w, int b_h, int src_x,
int src_y, int src_stride, slice_buffer *sb,
int src_stride, IDWTELEM * const *lines,
int add, uint8_t *dst8);
} SnowDWTContext;
@@ -141,7 +141,7 @@ IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line);
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride,
uint8_t **block, int b_w, int b_h, int src_x,
int src_y, int src_stride, slice_buffer *sb,
int src_stride, IDWTELEM *const *lines,
int add, uint8_t *dst8);
int ff_w53_32_c(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size, int h);

View File

@@ -608,7 +608,6 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM
#if HAVE_6REGS
#define snow_inner_add_yblock_sse2_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"FF_REG_c" \n\t"\
@@ -669,7 +668,7 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM
#define snow_inner_add_yblock_sse2_end_common2\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:"+m"(dst8),"+m"(lines),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
@@ -690,7 +689,8 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM
snow_inner_add_yblock_sse2_end_common2
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
int src_x, x86_reg src_stride, IDWTELEM *const *lines, int add, uint8_t * dst8)
{
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
@@ -738,7 +738,8 @@ snow_inner_add_yblock_sse2_end_8
}
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
int src_x, x86_reg src_stride, IDWTELEM *const *lines, int add, uint8_t * dst8)
{
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
@@ -762,7 +763,6 @@ snow_inner_add_yblock_sse2_end_16
}
#define snow_inner_add_yblock_mmx_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"FF_REG_c" \n\t"\
@@ -818,13 +818,14 @@ snow_inner_add_yblock_sse2_end_16
"add %%"FF_REG_c", %0 \n\t"\
"dec %2 \n\t"\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:"+m"(dst8),"+m"(lines),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
"%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
int src_x, x86_reg src_stride, IDWTELEM *const *lines, int add, uint8_t * dst8)
{
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
@@ -835,7 +836,8 @@ snow_inner_add_yblock_mmx_end("16")
}
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
int src_x, x86_reg src_stride, IDWTELEM *const *lines, int add, uint8_t * dst8)
{
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
@@ -852,27 +854,28 @@ snow_inner_add_yblock_mmx_end("32")
}
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
int src_x, int src_stride, IDWTELEM *const *lines, int add, uint8_t * dst8)
{
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8);
else if (b_w == 8 && obmc_stride == 16) {
if (!(b_h & 1))
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8);
else
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8);
} else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8);
}
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
int src_x, int src_stride, IDWTELEM *const *lines, int add, uint8_t * dst8)
{
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8);
else if (b_w == 8 && obmc_stride == 16)
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8);
else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8);
}
#endif /* HAVE_6REGS */