mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
Compared to the MMX version, this version benefits from wider registers and pmaddubsw. It also has fewer unnecessary loads and stores: On x64, the MMX version has 12 unnecessary GPR loads and six stores per line when the width is eight; for width 16, there are 17 unnecessary GPR loads and six stores per line. Even the 32-bit SSSE3 version only has six loads and zero stores per line more than the x64 version. Furthermore, in contrast to the MMX version, the SSSE3 version also does not clobber the array of block pointers given to it.

Benchmarks:
inner_add_yblock_2_c:        29.2 ( 1.00x)
inner_add_yblock_2_mmx:      32.5 ( 0.90x)
inner_add_yblock_2_ssse3:    28.6 ( 1.02x)
inner_add_yblock_4_c:        85.2 ( 1.00x)
inner_add_yblock_4_mmx:      89.2 ( 0.96x)
inner_add_yblock_4_ssse3:    84.5 ( 1.01x)
inner_add_yblock_8_c:       302.0 ( 1.00x)
inner_add_yblock_8_mmx:      77.0 ( 3.92x)
inner_add_yblock_8_ssse3:    30.6 ( 9.85x)
inner_add_yblock_16_c:     1164.7 ( 1.00x)
inner_add_yblock_16_mmx:    260.4 ( 4.47x)
inner_add_yblock_16_ssse3:   82.3 (14.15x)

Both the MMX and SSSE3 versions leave the size 2 and 4 cases to ff_snow_inner_add_yblock_c() (but the MMX version has a prologue at the beginning that it needs to undo before the call, leading to the higher overhead for these sizes). I don't know why the SSSE3 version is marginally faster than the C version in these cases.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
165 lines
5.7 KiB
C
165 lines
5.7 KiB
C
/*
|
|
* Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#ifndef AVCODEC_SNOW_DWT_H
|
|
#define AVCODEC_SNOW_DWT_H
|
|
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
|
|
#include "libavutil/attributes.h"
|
|
|
|
/* Opaque forward declaration: this header only passes pointers to the
 * encoder context (see ff_w53_32_c() / ff_w97_32_c()). */
typedef struct MPVEncContext MPVEncContext;

/* Sample types.
 * IDWTELEM is the element type of the inverse-transform buffers and line
 * pointers declared in this header.
 * DWTELEM is not referenced anywhere else in this header; presumably it is
 * the forward-transform element type -- confirm against snow_dwt.c. */
typedef int   DWTELEM;
typedef short IDWTELEM;

/* NOTE(review): presumably the upper bound for the decomposition_count
 * arguments taken by the transform functions -- confirm against the callers. */
#define MAX_DECOMPOSITIONS 8
|
|
|
|
typedef struct DWTCompose {
|
|
IDWTELEM *b0;
|
|
IDWTELEM *b1;
|
|
IDWTELEM *b2;
|
|
IDWTELEM *b3;
|
|
int y;
|
|
} DWTCompose;
|
|
|
|
/** Used to minimize the amount of memory used in order to
|
|
* optimize cache performance. **/
|
|
typedef struct slice_buffer_s {
|
|
IDWTELEM **line; ///< For use by idwt and predict_slices.
|
|
IDWTELEM **data_stack; ///< Used for internal purposes.
|
|
int data_stack_top;
|
|
int line_count;
|
|
int line_width;
|
|
int data_count;
|
|
IDWTELEM *base_buffer; ///< Buffer that this structure is caching.
|
|
} slice_buffer;
|
|
|
|
typedef struct SnowDWTContext {
|
|
void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
|
|
IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
|
|
int width);
|
|
void (*horizontal_compose97i)(IDWTELEM *b, IDWTELEM *temp, int width);
|
|
void (*inner_add_yblock)(const uint8_t *obmc, const int obmc_stride,
|
|
uint8_t **block, int b_w, int b_h, int src_x,
|
|
int src_stride, IDWTELEM * const *lines,
|
|
int add, uint8_t *dst8);
|
|
} SnowDWTContext;
|
|
|
|
|
|
/* Wavelet selectors.
 * NOTE(review): presumably the accepted values of the `type` argument of the
 * ff_spatial_dwt()/ff_spatial_idwt*() functions -- confirm against snow_dwt.c. */
#define DWT_97 0
#define DWT_53 1
|
|
|
|
/*
 * Lifting-step constants, one M/O/S triple per step A..D.
 *
 * W_BO and W_BS are used in this header by
 * snow_horizontal_compose_liftS_lead_out() as the additive offset and the
 * right-shift amount. NOTE(review): by analogy the *M / *O / *S suffixes
 * presumably stand for multiplier / offset / shift in every triple --
 * confirm against snow_dwt.c.
 */

/* NOTE(review): liftS is defined here and undefined again right after the
 * step-A constants without any use in between; it looks like a leftover.
 * Kept as is because the #undef also removes any liftS defined by an
 * includer. */
#define liftS lift
#define W_AM 3
#define W_AO 0
#define W_AS 1

#undef liftS
#define W_BM 1
#define W_BO 8
#define W_BS 4

#define W_CM 1
#define W_CO 0
#define W_CS 0

#define W_DM 3
#define W_DO 4
#define W_DS 3
|
|
|
|
/**
 * Yield the line pointer stored at index line_num of slice_buf.
 *
 * If slice_buf->line[line_num] is non-NULL it is used directly; otherwise
 * the result of ff_slice_buffer_load_line(slice_buf, line_num) is used.
 *
 * Both arguments are expanded more than once, so they must be free of
 * side effects.
 */
#define slice_buffer_get_line(slice_buf, line_num)                          \
    ((slice_buf)->line[line_num]                                            \
         ? (slice_buf)->line[line_num]                                      \
         : ff_slice_buffer_load_line((slice_buf), (line_num)))
|
|
|
|
/* C helper code used by the mmx/sse2/altivec implementations */
|
|
|
|
/**
 * Prepare the start index for interleaving a line in place.
 *
 * Stores width - 2 in *i. When width is odd, the final element
 * low[width - 1] is first filled from low[(width - 1) >> 1] and *i is
 * lowered by one more, to width - 3.
 *
 * @param i     receives the index at which snow_interleave_line_footer()
 *              (or an equivalent loop) should start
 * @param width number of elements in the line
 * @param low   line buffer, modified only when width is odd
 * @param high  not accessed by this function
 */
static av_always_inline void snow_interleave_line_header(int *i, int width, IDWTELEM *low, IDWTELEM *high)
{
    int start = width - 2;

    if (width & 1) {
        const int last = start + 1;

        low[last] = low[last >> 1];
        start--;
    }

    *i = start;
}
|
|
|
|
/**
 * Interleave the remaining elements of a line in place, working downwards.
 *
 * Starting at index *i and stepping by -2 while the index is non-negative,
 * each step stores high[k >> 1] in low[k + 1] and low[k >> 1] in low[k].
 * Walking from the top down means every low[k >> 1] is read before a later
 * step could overwrite it.
 *
 * @param i    in: first index to process; out: the first negative index
 *             reached (unchanged if it was already negative)
 * @param low  buffer holding the low-band samples in its first half;
 *             receives the interleaved result
 * @param high high-band samples
 */
static av_always_inline void snow_interleave_line_footer(int *i, IDWTELEM *low, const IDWTELEM *high)
{
    int k = *i;

    while (k >= 0) {
        const int half = k >> 1;

        low[k + 1] = high[half];
        low[k]     = low[half];
        k -= 2;
    }

    *i = k;
}
|
|
|
|
/**
 * Scalar tail of a horizontal lifting step.
 *
 * For every index k in [i, w) computes
 *     dst[k] = src[k] - ((mul * (ref[k] + ref[k + 1]) + add) >> shift)
 * and, when width and lift_high differ in their lowest bit, additionally
 *     dst[w] = src[w] - ((mul * 2 * ref[w] + add) >> shift)
 * i.e. the missing right neighbour is replaced by ref[w] itself.
 *
 * @param i         first index that still has to be processed
 * @param dst       output samples
 * @param src       samples being updated
 * @param ref       neighbouring samples the prediction is built from
 * @param width     line width; only its parity is used
 * @param w         end of the regular range / index of the boundary sample
 * @param lift_high only its lowest bit is used, combined with width's parity
 * @param mul       multiplier of the lifting step
 * @param add       rounding offset added before the shift
 * @param shift     right shift applied to the weighted sum
 */
static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, IDWTELEM *dst, const IDWTELEM *src, const IDWTELEM *ref, int width, int w, int lift_high, int mul, int add, int shift)
{
    const int has_boundary_sample = (width ^ lift_high) & 1;

    while (i < w) {
        const int pair = ref[i] + ref[i + 1];

        dst[i] = src[i] - ((mul * pair + add) >> shift);
        i++;
    }

    if (has_boundary_sample) {
        dst[w] = src[w] - ((mul * 2 * ref[w] + add) >> shift);
    }
}
|
|
|
|
/**
 * Scalar tail of the horizontal "liftS" step (uses W_BO and W_BS).
 *
 * For every index k in [i, w) computes
 *     dst[k] = src[k] + ((ref[k] + ref[k + 1] + W_BO + 4 * src[k]) >> W_BS)
 * and, when width is odd, additionally
 *     dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS)
 * i.e. the missing right neighbour is replaced by ref[w] itself.
 *
 * @param i     first index that still has to be processed
 * @param dst   output samples
 * @param src   samples being updated
 * @param ref   neighbouring samples the update is built from
 * @param width line width; only its parity is used
 * @param w     end of the regular range / index of the boundary sample
 */
static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, IDWTELEM *dst, const IDWTELEM *src, const IDWTELEM *ref, int width, int w)
{
    const int has_boundary_sample = width & 1;

    while (i < w) {
        const int pair = ref[i] + ref[i + 1];

        dst[i] = src[i] + ((pair + W_BO + 4 * src[i]) >> W_BS);
        i++;
    }

    if (has_boundary_sample) {
        dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS);
    }
}
|
|
|
|
int ff_slice_buffer_init(slice_buffer *buf, int line_count,
|
|
int max_allocated_lines, int line_width,
|
|
IDWTELEM *base_buffer);
|
|
void ff_slice_buffer_release(slice_buffer *buf, int line);
|
|
void ff_slice_buffer_flush(slice_buffer *buf);
|
|
void ff_slice_buffer_destroy(slice_buffer *buf);
|
|
IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line);
|
|
|
|
/* C implementation with the signature of SnowDWTContext.inner_add_yblock. */
void ff_snow_inner_add_yblock_c(const uint8_t *obmc, const int obmc_stride,
                                uint8_t **block, int b_w, int b_h, int src_x,
                                int src_stride, IDWTELEM *const *lines,
                                int add, uint8_t *dst8);
|
|
|
|
/* NOTE(review): presumably wavelet-domain comparison functions for the 5/3
 * and 9/7 wavelets on 32-wide blocks (going by the names) -- confirm against
 * the implementation. */
int ff_w53_32_c(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                ptrdiff_t line_size, int h);
int ff_w97_32_c(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                ptrdiff_t line_size, int h);
|
|
|
|
void ff_spatial_dwt(int *buffer, int *temp, int width, int height, int stride,
|
|
int type, int decomposition_count);
|
|
|
|
void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer *sb, int width,
|
|
int height, int stride_line, int type,
|
|
int decomposition_count);
|
|
void ff_spatial_idwt_buffered_slice(SnowDWTContext *dsp, DWTCompose *cs,
|
|
slice_buffer *slice_buf, IDWTELEM *temp,
|
|
int width, int height, int stride_line,
|
|
int type, int decomposition_count, int y);
|
|
void ff_spatial_idwt(IDWTELEM *buffer, IDWTELEM *temp, int width, int height,
|
|
int stride, int type, int decomposition_count);
|
|
|
|
/* Initialisers for a SnowDWTContext.
 * NOTE(review): presumably ff_dwt_init() installs the C routines and
 * ff_dwt_init_x86() overrides them with x86 SIMD versions -- confirm. */
void ff_dwt_init(SnowDWTContext *c);
void ff_dwt_init_x86(SnowDWTContext *c);
|
|
|
|
#endif /* AVCODEC_SNOW_DWT_H */
|