mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
lavc: implement a Vulkan-based prores encoder
Adds a Vulkan implementation of the reference ProRes (Kostya) encoder. Provides about a 3-4x speedup over the CPU code.
This commit is contained in:
1
configure
vendored
1
configure
vendored
@@ -3239,6 +3239,7 @@ prores_decoder_select="blockdsp idctdsp"
|
||||
prores_encoder_select="fdctdsp"
|
||||
prores_aw_encoder_select="fdctdsp"
|
||||
prores_ks_encoder_select="fdctdsp"
|
||||
prores_ks_vulkan_encoder_select="vulkan spirv_compiler"
|
||||
prores_raw_decoder_select="blockdsp idctdsp"
|
||||
qcelp_decoder_select="lsp"
|
||||
qdm2_decoder_select="mpegaudiodsp"
|
||||
|
||||
@@ -648,6 +648,7 @@ OBJS-$(CONFIG_PRORES_DECODER) += proresdec.o proresdsp.o proresdata.o
|
||||
OBJS-$(CONFIG_PRORES_ENCODER) += proresenc_anatoliy.o proresdata.o
|
||||
OBJS-$(CONFIG_PRORES_AW_ENCODER) += proresenc_anatoliy.o proresdata.o
|
||||
OBJS-$(CONFIG_PRORES_KS_ENCODER) += proresenc_kostya.o proresdata.o proresenc_kostya_common.o
|
||||
OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += proresenc_kostya_vulkan.o proresdata.o proresenc_kostya_common.o
|
||||
OBJS-$(CONFIG_PRORES_RAW_DECODER) += prores_raw.o proresdsp.o proresdata.o
|
||||
OBJS-$(CONFIG_PRORES_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o
|
||||
OBJS-$(CONFIG_PROSUMER_DECODER) += prosumer.o
|
||||
|
||||
@@ -271,6 +271,7 @@ extern const FFCodec ff_prores_encoder;
|
||||
extern const FFCodec ff_prores_decoder;
|
||||
extern const FFCodec ff_prores_aw_encoder;
|
||||
extern const FFCodec ff_prores_ks_encoder;
|
||||
extern const FFCodec ff_prores_ks_vulkan_encoder;
|
||||
extern const FFCodec ff_prores_raw_decoder;
|
||||
extern const FFCodec ff_prosumer_decoder;
|
||||
extern const FFCodec ff_psd_decoder;
|
||||
|
||||
1015
libavcodec/proresenc_kostya_vulkan.c
Normal file
1015
libavcodec/proresenc_kostya_vulkan.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -18,6 +18,12 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/ffv1_dec_setup.comp.spv.o \
|
||||
vulkan/ffv1_dec_rgb.comp.spv.o \
|
||||
vulkan/ffv1_dec_rgb_golomb.comp.spv.o
|
||||
|
||||
OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += vulkan/prores_ks_alpha_data.comp.spv.o \
|
||||
vulkan/prores_ks_slice_data.comp.spv.o \
|
||||
vulkan/prores_ks_estimate_slice.comp.spv.o \
|
||||
vulkan/prores_ks_encode_slice.comp.spv.o \
|
||||
vulkan/prores_ks_trellis_node.comp.spv.o
|
||||
|
||||
OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/prores_raw_decode.comp.spv.o \
|
||||
vulkan/prores_raw_idct.comp.spv.o
|
||||
|
||||
|
||||
@@ -118,4 +118,53 @@ void idct8(uint block, uint offset, uint stride)
|
||||
blocks[block][7*stride + offset] = u7;
|
||||
}
|
||||
|
||||
/*
 * In-place forward 8-point DCT over one line of blocks[block].
 *
 * Sample k (k = 0..7) is read from blocks[block][k * stride + offset] and the
 * transformed value k is written back to the same location.
 */
void fdct8(uint block, uint offset, uint stride)
{
    const float pi    = radians(180);
    const float root2 = sqrt(2.0);
    const float scale = 1 / sqrt(8.0);

    /* wN = sqrt(2) * cos(N * pi / 16) */
    const float w1 = root2 * cos(    pi / 16);
    const float w2 = root2 * cos(    pi / 8);
    const float w3 = root2 * cos(3 * pi / 16);
    const float w5 = root2 * cos(5 * pi / 16);
    const float w6 = root2 * cos(3 * pi / 8);
    const float w7 = root2 * cos(7 * pi / 16);

    /* Gather the eight input samples */
    float x[8];
    for (uint k = 0; k < 8; k++)
        x[k] = blocks[block][k*stride + offset];

    /* First stage: sums and differences of mirrored sample pairs */
    float s07 = x[0] + x[7];
    float s16 = x[1] + x[6];
    float s25 = x[2] + x[5];
    float s34 = x[3] + x[4];

    float d07 = x[0] - x[7];
    float d61 = x[6] - x[1];
    float d25 = x[2] - x[5];
    float d43 = x[4] - x[3];

    /* Second stage on the sums; these feed the even-indexed outputs */
    float outer_sum  = s07 + s34;
    float outer_diff = s07 - s34;
    float inner_sum  = s16 + s25;
    float inner_diff = s16 - s25;

    float y[8];
    y[0] = scale * (outer_sum + inner_sum);
    y[2] = scale * (w2 * outer_diff + w6 * inner_diff);
    y[4] = scale * (outer_sum - inner_sum);
    y[6] = scale * (w6 * outer_diff - w2 * inner_diff);
    y[1] = scale * (w1 * d07 - w3 * d61 + w5 * d25 - w7 * d43);
    y[3] = scale * (w3 * d07 + w7 * d61 - w1 * d25 + w5 * d43);
    y[5] = scale * (w5 * d07 + w1 * d61 + w7 * d25 - w3 * d43);
    y[7] = scale * (w7 * d07 + w5 * d61 + w3 * d25 + w1 * d43);

    /* Scatter the results back over the inputs */
    for (uint k = 0; k < 8; k++)
        blocks[block][k*stride + offset] = y[k];
}
|
||||
|
||||
#endif /* VULKAN_DCT_H */
|
||||
|
||||
87
libavcodec/vulkan/prores_ks_alpha_data.comp.glsl
Normal file
87
libavcodec/vulkan/prores_ks_alpha_data.comp.glsl
Normal file
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#version 460
|
||||
#pragma shader_stage(compute)
|
||||
|
||||
#extension GL_EXT_shader_image_load_formatted : require
|
||||
#extension GL_EXT_scalar_block_layout : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in;
|
||||
|
||||
layout (constant_id = 0) const int alpha_bits = 0;
|
||||
layout (constant_id = 1) const int slices_per_row = 0;
|
||||
layout (constant_id = 2) const int width_in_mb = 0;
|
||||
layout (constant_id = 3) const int max_mbs_per_slice = 0;
|
||||
|
||||
struct SliceData {
|
||||
uint mbs_per_slice;
|
||||
int16_t coeffs[4][8 * 256];
|
||||
};
|
||||
|
||||
layout (set = 0, binding = 0, scalar) writeonly buffer SliceBuffer {
|
||||
SliceData slices[];
|
||||
};
|
||||
layout (set = 0, binding = 1) uniform readonly iimage2D plane;
|
||||
|
||||
/* Table of possible edge slice configurations */
|
||||
const uvec3 edge_mps_table[8] = uvec3[](
|
||||
uvec3(0, 0, 0),
|
||||
uvec3(1, 0, 0),
|
||||
uvec3(2, 0, 0),
|
||||
uvec3(2, 1, 0),
|
||||
uvec3(4, 0, 0),
|
||||
uvec3(4, 1, 0),
|
||||
uvec3(4, 2, 0),
|
||||
uvec3(4, 2, 1)
|
||||
);
|
||||
|
||||
/*
 * Copies one alpha sample per invocation from the alpha image into
 * coeffs[3] of the slice that contains it.
 *
 * The indexing below treats each workgroup as one 16x16 macroblock:
 * gl_WorkGroupID.x is the macroblock column and gl_WorkGroupID.y the slice row.
 * NOTE(review): this relies on the local size specialization constants
 * (ids 253-255) being 16x16x1 - confirm against the host code.
 */
void main()
{
    /* Destination position inside the (macroblock padded) picture */
    uvec2 dst_coord = gl_GlobalInvocationID.xy;

    /*
     * Source position, clamped to the image. Only the load is clamped: the
     * store must still use the unclamped position so that samples in the
     * padding area are filled with the nearest edge value instead of being
     * left unwritten.
     */
    ivec2 coord = min(ivec2(dst_coord), imageSize(plane) - ivec2(1));
    uint16_t alpha = uint16_t(imageLoad(plane, coord).x);

    /* NOTE(review): the shifts look like they assume a 10-bit input sample - confirm */
    if (alpha_bits == 8)
        alpha >>= 2;
    else
        alpha = (alpha << 6) | (alpha >> 4);

    uint mbs_per_slice = max_mbs_per_slice;
    uint slices_width = width_in_mb / mbs_per_slice;
    /* Number of macroblock columns covered by full-width slices */
    uint mb_width = slices_width * mbs_per_slice;
    uint slice_x = gl_WorkGroupID.x / mbs_per_slice;
    uint slice_y = gl_WorkGroupID.y;
    uvec2 slice_base = uvec2(slice_x * mbs_per_slice * 16u, slice_y * 16u);

    /* Handle slice macroblock size reduction on edge slices */
    if (gl_WorkGroupID.x >= mb_width) {
        uint edge_mb = gl_WorkGroupID.x - mb_width;
        /* Widths (in macroblocks) of the up to three slices that tile the remainder */
        uvec3 table = edge_mps_table[width_in_mb - mb_width];
        /* First macroblock of each of those slices, relative to mb_width */
        uvec3 base = uvec3(0, table.x, table.x + table.y);
        uint edge_slice = edge_mb < base.y ? 0 : (edge_mb < base.z ? 1 : 2);
        slice_x += edge_slice;
        slice_base.x += base[edge_slice] * 16u;
        mbs_per_slice = table[edge_slice];
    }

    uint slice = slice_y * slices_per_row + slice_x;
    uvec2 coeff_coord = dst_coord - slice_base;
    /* Samples are stored row by row, each row mbs_per_slice * 16 samples wide */
    uint coeff = coeff_coord.y * (mbs_per_slice * 16u) + coeff_coord.x;
    slices[slice].coeffs[3][coeff] = int16_t(alpha);
}
|
||||
273
libavcodec/vulkan/prores_ks_encode_slice.comp.glsl
Normal file
273
libavcodec/vulkan/prores_ks_encode_slice.comp.glsl
Normal file
@@ -0,0 +1,273 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#version 460
|
||||
#pragma shader_stage(compute)
|
||||
|
||||
#extension GL_EXT_scalar_block_layout : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
#define PB_UNALIGNED
|
||||
#include "common.glsl"
|
||||
|
||||
layout (constant_id = 0) const int max_mbs_per_slice = 8;
|
||||
layout (constant_id = 1) const int chroma_factor = 0;
|
||||
layout (constant_id = 2) const int alpha_bits = 0;
|
||||
layout (constant_id = 3) const int num_planes = 0;
|
||||
layout (constant_id = 4) const int slices_per_picture = 0;
|
||||
layout (constant_id = 5) const int max_quant = 0;
|
||||
|
||||
struct SliceData {
|
||||
uint32_t mbs_per_slice;
|
||||
int16_t coeffs[4][8 * 256];
|
||||
};
|
||||
|
||||
struct SliceScore {
|
||||
ivec4 bits[16];
|
||||
ivec4 score[16];
|
||||
int total_bits[16];
|
||||
int total_score[16];
|
||||
int overquant;
|
||||
int buf_start;
|
||||
int quant;
|
||||
};
|
||||
|
||||
layout(push_constant, scalar) uniform EncodeSliceInfo {
|
||||
u8buf bytestream;
|
||||
u8vec2buf seek_table;
|
||||
};
|
||||
|
||||
layout (set = 0, binding = 0, scalar) readonly buffer SliceBuffer {
|
||||
SliceData slices[];
|
||||
};
|
||||
layout (set = 0, binding = 1, scalar) readonly buffer SliceScores {
|
||||
SliceScore scores[];
|
||||
};
|
||||
layout (set = 0, binding = 2, scalar) uniform ProresDataTables {
|
||||
int16_t qmat[128][64]; int16_t qmat_chroma[128][64];
|
||||
};
|
||||
|
||||
#define CFACTOR_Y444 3
|
||||
|
||||
/*
 * Writes val to pb using the hybrid Rice / exp-Golomb code described by
 * codebook.
 *
 * codebook layout: bits 0-1 hold (switch_bits - 1), bits 2-4 the exp-Golomb
 * order and bits 5+ the Rice order. Values below switch_bits << rice_order
 * are Rice coded, everything else is exp-Golomb coded.
 */
void encode_vlc_codeword(inout PutBitContext pb, uint codebook, int val)
{
    uint rice_order  = codebook >> 5;
    uint exp_order   = (codebook >> 2) & 7;
    uint switch_bits = (codebook & 3) + 1;
    uint switch_val  = switch_bits << rice_order;

    if (val < switch_val) {
        /* Rice code: unary quotient, terminating 1, then rice_order remainder bits */
        int quotient = val >> rice_order;
        if (quotient != 0)
            put_bits(pb, quotient, 0);
        put_bits(pb, 1, 1);
        if (rice_order != 0)
            put_bits(pb, rice_order, zero_extend(val, rice_order));
        return;
    }

    /* Exp-Golomb code: run of zeros followed by the value with its leading 1 */
    val -= int(switch_val - (1 << exp_order));
    int msb = findMSB(val);

    put_bits(pb, msb - exp_order + switch_bits, 0);
    put_bits(pb, msb + 1, val);
}
|
||||
|
||||
#define GET_SIGN(x) ((x) >> 31)
|
||||
#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x))
|
||||
|
||||
#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits = 0
|
||||
|
||||
/*
 * Writes the DC coefficients of the current invocation's slice
 * (gl_GlobalInvocationID.x) and plane (gl_GlobalInvocationID.y) to pb.
 *
 * The DC of block n is read from coeffs[plane][n]. The first DC is coded
 * directly with FIRST_DC_CB; the following ones are coded as sign-adapted
 * differences using a codebook selected by the previous code.
 */
void encode_dcs(inout PutBitContext pb, bool is_chroma, int q)
{
    const uint8_t dc_codebook[7] = { U8(0x04), U8(0x28), U8(0x28), U8(0x4D), U8(0x4D), U8(0x70), U8(0x70) };

    uint slice = gl_GlobalInvocationID.x;
    uint plane = gl_GlobalInvocationID.y;

    /* Chroma planes carry two blocks per macroblock unless the format is 4:4:4 */
    uint blocks_per_mb = 4;
    if (is_chroma && chroma_factor != CFACTOR_Y444)
        blocks_per_mb = 2;
    uint dc_count = slices[slice].mbs_per_slice * blocks_per_mb;

    int scale;
    if (is_chroma)
        scale = qmat_chroma[q][0];
    else
        scale = qmat[q][0];

    /* First DC: coded on its own */
    int raw = slices[slice].coeffs[plane][0];
    int last_dc = (raw - 0x4000) / scale;
    encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(last_dc));

    /* Remaining DCs: deltas, flipped by the sign of the previous delta */
    int cb_index  = 5;
    int last_sign = 0;
    for (uint n = 1; n < dc_count; n++) {
        raw = slices[slice].coeffs[plane][n];
        int dc         = (raw - 0x4000) / scale;
        int delta      = dc - last_dc;
        int delta_sign = GET_SIGN(delta);

        delta = (delta ^ last_sign) - last_sign;
        int code = MAKE_CODE(delta);
        encode_vlc_codeword(pb, dc_codebook[cb_index], code);

        cb_index  = min(code, 6);
        last_sign = delta_sign;
        last_dc   = dc;
    }
}
|
||||
|
||||
/*
 * Writes the AC coefficients of the current invocation's slice
 * (gl_GlobalInvocationID.x) and plane (gl_GlobalInvocationID.y) to pb as
 * (zero run, level, sign) triples.
 *
 * Coefficients are visited frequency by frequency: for each position i in
 * 1..63, the value of every block in the slice is read from
 * coeffs[plane][i * blocks_per_slice + j].
 */
void encode_acs(inout PutBitContext pb, bool is_chroma, int q)
{
    const uint8_t run_to_cb[16] = { U8(0x06), U8(0x06), U8(0x05), U8(0x05), U8(0x04), U8(0x29),
                                    U8(0x29), U8(0x29), U8(0x29), U8(0x28), U8(0x28), U8(0x28),
                                    U8(0x28), U8(0x28), U8(0x28), U8(0x4C) };

    const uint8_t level_to_cb[10] = { U8(0x04), U8(0x0A), U8(0x05), U8(0x06), U8(0x04), U8(0x28),
                                      U8(0x28), U8(0x28), U8(0x28), U8(0x4C) };

    uint slice = gl_GlobalInvocationID.x;
    uint plane = gl_GlobalInvocationID.y;

    /* Chroma planes carry two blocks per macroblock unless the format is 4:4:4 */
    uint blocks_per_mb = 4;
    if (is_chroma && chroma_factor != CFACTOR_Y444)
        blocks_per_mb = 2;
    uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;

    /* Codebook selectors start from fixed values and adapt to the last coded pair */
    int last_run   = 4;
    int last_level = 2;
    int zeros      = 0;

    for (uint i = 1; i < 64; i++) {
        int quant;
        if (is_chroma)
            quant = qmat_chroma[q][i];
        else
            quant = qmat[q][i];

        for (uint j = 0; j < blocks_per_slice; j++) {
            int coeff = slices[slice].coeffs[plane][i * blocks_per_slice + j];
            int level = coeff / quant;

            if (level == 0) {
                zeros++;
                continue;
            }

            int magnitude = abs(level);
            encode_vlc_codeword(pb, run_to_cb[last_run], zeros);
            encode_vlc_codeword(pb, level_to_cb[last_level], magnitude - 1);
            put_bits(pb, 1, zero_extend(GET_SIGN(level), 1));

            last_run   = min(zeros, 15);
            last_level = min(magnitude, 9);
            zeros      = 0;
        }
    }
}
|
||||
|
||||
/*
 * Encodes the DC and then the AC coefficients of the plane selected by
 * gl_GlobalInvocationID.y into pb, using quantiser index q.
 * Planes 1 and 2 are treated as chroma.
 */
void encode_slice_plane(inout PutBitContext pb, int q)
{
    bool is_chroma;
    switch (gl_GlobalInvocationID.y) {
    case 1u:
    case 2u:
        is_chroma = true;
        break;
    default:
        is_chroma = false;
        break;
    }

    encode_dcs(pb, is_chroma, q);
    encode_acs(pb, is_chroma, q);
}
|
||||
|
||||
/*
 * Writes the difference between two consecutive alpha values to pb.
 *
 * The difference is taken modulo 2^alpha_bits. Non-zero differences with a
 * magnitude of at most dsize use a short form (flag 0, magnitude - 1, sign);
 * everything else, including zero, uses flag 1 followed by alpha_bits raw bits.
 */
void put_alpha_diff(inout PutBitContext pb, int cur, int prev)
{
    const int dbits = (alpha_bits == 8) ? 4 : 7;
    const int dsize = 1 << (dbits - 1);

    int diff = zero_extend(cur - prev, alpha_bits);
    /* Fold the top of the unsigned range back to small negative values */
    if (diff >= (1 << alpha_bits) - dsize)
        diff -= 1 << alpha_bits;

    bool short_form = diff != 0 && diff >= -dsize && diff <= dsize;
    if (short_form) {
        put_bits(pb, 1, 0);
        put_bits(pb, dbits - 1, abs(diff) - 1);
        put_bits(pb, 1, int(diff < 0));
    } else {
        put_bits(pb, 1, 1);
        put_bits(pb, alpha_bits, diff);
    }
}
|
||||
|
||||
/*
 * Writes an alpha run length to pb: a single 1 bit for an empty run,
 * otherwise a 0 bit followed by the run in 4 bits (run < 16) or 15 bits.
 */
void put_alpha_run(inout PutBitContext pb, int run)
{
    if (run == 0) {
        put_bits(pb, 1, 1);
        return;
    }

    put_bits(pb, 1, 0);
    int run_bits = run < 0x10 ? 4 : 15;
    put_bits(pb, run_bits, run);
}
|
||||
|
||||
/*
 * Run-length encodes the alpha samples (coeffs[3]) of the slice selected by
 * gl_GlobalInvocationID.x into pb.
 *
 * The first sample is coded as a difference from the all-ones value; after
 * that each change of value emits the length of the preceding run followed
 * by the difference. The final run is flushed after the last sample.
 */
void encode_alpha_plane(inout PutBitContext pb)
{
    uint slice = gl_GlobalInvocationID.x;
    const int mask = (1 << alpha_bits) - 1;
    const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;

    int last = slices[slice].coeffs[3][0];
    put_alpha_diff(pb, last, mask);

    int repeat = 0;
    int pos = 1;
    /* The sample at index 1 is always read, matching a do/while traversal */
    do {
        int sample = slices[slice].coeffs[3][pos];
        pos++;

        if (sample == last) {
            repeat++;
        } else {
            put_alpha_run(pb, repeat);
            put_alpha_diff(pb, sample, last);
            last = sample;
            repeat = 0;
        }
    } while (pos < num_coeffs);

    put_alpha_run(pb, repeat);
}
|
||||
|
||||
/* Returns the low 16 bits of value as two bytes, high byte first. */
u8vec2 byteswap16(int value)
{
    u8vec2 bytes = unpack8(uint16_t(value));
    return u8vec2(bytes.y, bytes.x);
}
|
||||
|
||||
/*
 * Encodes one plane of one slice into the output bytestream.
 *
 * gl_GlobalInvocationID.x selects the slice and gl_GlobalInvocationID.y the
 * plane. The plane 0 invocation additionally writes the slice header and the
 * slice's seek table entry. Plane payload positions come from the per-plane
 * bit counts stored in the score buffer.
 */
void main()
{
    uint slice = gl_GlobalInvocationID.x;
    uint plane = gl_GlobalInvocationID.y;
    if (slice >= slices_per_picture)
        return;

    int q = scores[slice].quant;
    /* Quantisers above max_quant share the single entry at max_quant + 1 */
    ivec4 bits = scores[slice].bits[min(q, max_quant + 1)];

    int slice_hdr_size = 2 * num_planes;
    int payload_bits = bits.x + bits.y + bits.z + bits.w;
    int slice_size = slice_hdr_size + (payload_bits / 8);

    u8buf buf = OFFBUF(u8buf, bytestream, scores[slice].buf_start);

    /* Write slice header */
    if (plane == 0) {
        buf[0].v = uint8_t(slice_hdr_size * 8);
        buf[1].v = uint8_t(q);

        /* Byte sizes of every plane but the last, big endian */
        u8vec2buf slice_hdr = OFFBUF(u8vec2buf, buf, 2);
        for (int p = 0; p < num_planes - 1; p++)
            slice_hdr[p].v = byteswap16(bits[p] / 8);

        seek_table[slice].v = byteswap16(slice_size);
    }

    /* This plane's payload starts after the header and all preceding planes */
    int plane_offset = 0;
    for (uint p = 0; p < plane; p++)
        plane_offset += bits[p] / 8;

    /* Encode slice plane */
    PutBitContext pb;
    init_put_bits(pb, OFFBUF(u8buf, buf, slice_hdr_size + plane_offset), 0);
    if (plane != 3)
        encode_slice_plane(pb, q);
    else
        encode_alpha_plane(pb);
    flush_put_bits(pb);
}
|
||||
302
libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl
Normal file
302
libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl
Normal file
@@ -0,0 +1,302 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#version 460
|
||||
#pragma shader_stage(compute)
|
||||
|
||||
#extension GL_EXT_scalar_block_layout : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
#extension GL_KHR_shader_subgroup_clustered : require
|
||||
#extension GL_KHR_shader_subgroup_shuffle : require
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
#include "common.glsl"
|
||||
|
||||
layout (constant_id = 0) const int max_mbs_per_slice = 8;
|
||||
layout (constant_id = 1) const int chroma_factor = 0;
|
||||
layout (constant_id = 2) const int alpha_bits = 0;
|
||||
layout (constant_id = 3) const int num_planes = 0;
|
||||
layout (constant_id = 4) const int slices_per_picture = 0;
|
||||
layout (constant_id = 5) const int min_quant = 0;
|
||||
layout (constant_id = 6) const int max_quant = 0;
|
||||
layout (constant_id = 7) const int bits_per_mb = 0;
|
||||
|
||||
struct SliceData {
|
||||
uint32_t mbs_per_slice;
|
||||
int16_t coeffs[4][8 * 256];
|
||||
};
|
||||
|
||||
struct SliceScore {
|
||||
ivec4 bits[16];
|
||||
ivec4 score[16];
|
||||
int total_bits[16];
|
||||
int total_score[16];
|
||||
int overquant;
|
||||
int buf_start;
|
||||
int quant;
|
||||
};
|
||||
|
||||
layout (set = 0, binding = 0, scalar) readonly buffer SliceBuffer {
|
||||
SliceData slices[];
|
||||
};
|
||||
layout (set = 0, binding = 1, scalar) writeonly buffer SliceScores {
|
||||
SliceScore scores[];
|
||||
};
|
||||
layout (set = 0, binding = 2, scalar) uniform ProresDataTables {
|
||||
int16_t qmat[128][64];
|
||||
int16_t qmat_chroma[128][64];
|
||||
};
|
||||
|
||||
#define CFACTOR_Y444 3
|
||||
|
||||
#define GET_SIGN(x) ((x) >> 31)
|
||||
#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x))
|
||||
|
||||
/*
 * Returns the number of bits needed to code val with the hybrid
 * Rice / exp-Golomb code described by codebook.
 *
 * codebook layout: bits 0-1 hold (switch_bits - 1), bits 2-4 the exp-Golomb
 * order and bits 5+ the Rice order.
 */
int estimate_vlc(uint codebook, int val)
{
    uint rice_order  = codebook >> 5;
    uint exp_order   = (codebook >> 2) & 7;
    uint switch_bits = (codebook & 3) + 1;
    uint switch_val  = switch_bits << rice_order;

    /* Rice code: unary quotient, stop bit and rice_order remainder bits */
    if (val < switch_val)
        return int((val >> rice_order) + rice_order + 1);

    /* Exp-Golomb code: zero prefix plus the value down to its leading 1 */
    val -= int(switch_val - (1 << exp_order));
    int msb = findMSB(val);
    return int(msb * 2 - exp_order + switch_bits + 1);
}
|
||||
|
||||
#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits = 0
|
||||
|
||||
/*
 * Returns the number of bits needed to code the DC coefficients of the given
 * slice and plane at quantiser index q.
 *
 * The quantisation remainder of every DC except the first one is added to
 * error.
 */
int estimate_dcs(inout int error, uint slice, uint plane, uint q)
{
    const uint8_t dc_codebook[7] = { U8(0x04), U8(0x28), U8(0x28), U8(0x4D), U8(0x4D), U8(0x70), U8(0x70) };

    /* Planes other than 0 carry two blocks per macroblock unless the format is 4:4:4 */
    uint blocks_per_mb = 4;
    if (plane != 0 && chroma_factor != CFACTOR_Y444)
        blocks_per_mb = 2;
    uint dc_count = slices[slice].mbs_per_slice * blocks_per_mb;

    int raw = slices[slice].coeffs[plane][0];

    int scale;
    if (plane != 0)
        scale = qmat_chroma[q][0];
    else
        scale = qmat[q][0];

    /* First DC: coded on its own */
    int last_dc = (raw - 0x4000) / scale;
    int bits = estimate_vlc(FIRST_DC_CB, MAKE_CODE(last_dc));

    /* Remaining DCs: deltas, flipped by the sign of the previous delta */
    int cb_index  = 5;
    int last_sign = 0;
    for (uint n = 1; n < dc_count; ++n) {
        raw = slices[slice].coeffs[plane][n];
        int dc = (raw - 0x4000) / scale;
        error += abs(raw - 0x4000) % scale;

        int delta      = dc - last_dc;
        int delta_sign = GET_SIGN(delta);
        delta = (delta ^ last_sign) - last_sign;

        int code = MAKE_CODE(delta);
        bits += estimate_vlc(dc_codebook[cb_index], code);

        cb_index  = min(code, 6);
        last_sign = delta_sign;
        last_dc   = dc;
    }

    return bits;
}
|
||||
|
||||
#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1))
|
||||
#define SCORE_LIMIT 1073741823
|
||||
|
||||
/*
 * Returns the number of bits needed to code the AC coefficients of the given
 * slice and plane at quantiser index q, and adds the quantisation remainder
 * of every AC coefficient to error.
 *
 * Coefficients are visited frequency by frequency: for each position i in
 * 1..63, the value of every block in the slice is read from
 * coeffs[plane][i * blocks_per_slice + j].
 */
int estimate_acs(inout int error, uint slice, uint plane, uint q)
{
    const uint8_t run_to_cb[16] = { U8(0x06), U8(0x06), U8(0x05), U8(0x05), U8(0x04), U8(0x29),
                                    U8(0x29), U8(0x29), U8(0x29), U8(0x28), U8(0x28), U8(0x28),
                                    U8(0x28), U8(0x28), U8(0x28), U8(0x4C) };

    const uint8_t level_to_cb[10] = { U8(0x04), U8(0x0A), U8(0x05), U8(0x06), U8(0x04), U8(0x28),
                                      U8(0x28), U8(0x28), U8(0x28), U8(0x4C) };

    /* Planes other than 0 carry two blocks per macroblock unless the format is 4:4:4 */
    uint blocks_per_mb = plane != 0 && chroma_factor != CFACTOR_Y444 ? 2 : 4;
    uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;

    /* Codebook selectors start from fixed values and adapt to the last coded pair */
    int prev_run = 4;
    int prev_level = 2;
    int bits = 0;
    int run = 0;

    for (uint i = 1; i < 64; i++) {
        int quant = plane != 0 ? qmat_chroma[q][i] : qmat[q][i];
        for (uint j = 0; j < blocks_per_slice; j++) {
            uint idx = i * blocks_per_slice + j;
            int coeff = slices[slice].coeffs[plane][idx];
            int level = coeff / quant;
            error += abs(coeff) % quant;
            if (level != 0) {
                int abs_level = abs(level);
                bits += estimate_vlc(run_to_cb[prev_run], run);
                /* + 1 for the sign bit */
                bits += estimate_vlc(level_to_cb[prev_level], abs_level - 1) + 1;
                prev_run = min(run, 15);
                prev_level = min(abs_level, 9);
                run = 0;
            } else {
                run++;
            }
        }
    }

    return bits;
}
|
||||
|
||||
/*
 * Returns the estimated size in bits, rounded up to a whole byte, of the DC
 * and AC coefficients of one slice plane at quantiser index q. The
 * quantisation error of both passes is accumulated into error.
 */
int estimate_slice_plane(inout int error, uint slice, uint plane, uint q)
{
    int dc_bits = estimate_dcs(error, slice, plane, q);
    int ac_bits = estimate_acs(error, slice, plane, q);
    return FFALIGN(dc_bits + ac_bits, 8);
}
|
||||
|
||||
/*
 * Returns the number of bits needed to code the difference between two
 * consecutive alpha values: dbits + 1 for a non-zero difference whose
 * magnitude is at most dsize, alpha_bits + 1 otherwise.
 */
int est_alpha_diff(int cur, int prev)
{
    const int dbits = (alpha_bits == 8) ? 4 : 7;
    const int dsize = 1 << (dbits - 1);

    int diff = zero_extend(cur - prev, alpha_bits);
    /* Fold the top of the unsigned range back to small negative values */
    if (diff >= (1 << alpha_bits) - dsize)
        diff -= 1 << alpha_bits;

    bool short_form = diff != 0 && diff >= -dsize && diff <= dsize;
    return short_form ? dbits + 1 : alpha_bits + 1;
}
|
||||
|
||||
/* Bits needed for one alpha run length: 1 for an empty run, else 5 or 16. */
int est_alpha_run_bits(int run)
{
    if (run == 0)
        return 1;
    return run < 0x10 ? 5 : 16;
}

/*
 * Returns the number of bits needed to run-length code the alpha samples
 * (coeffs[3]) of the given slice.
 *
 * The first sample is costed as a difference from the all-ones value; after
 * that each change of value costs the preceding run plus the difference, and
 * the final run is added once all samples have been visited.
 */
int estimate_alpha_plane(uint slice)
{
    const int mask = (1 << alpha_bits) - 1;
    const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;

    int last = slices[slice].coeffs[3][0];
    int bits = est_alpha_diff(last, mask);

    int repeat = 0;
    int pos = 1;
    /* The sample at index 1 is always read, matching a do/while traversal */
    do {
        int sample = slices[slice].coeffs[3][pos];
        pos++;

        if (sample == last) {
            repeat++;
        } else {
            bits += est_alpha_run_bits(repeat);
            bits += est_alpha_diff(sample, last);
            last = sample;
            repeat = 0;
        }
    } while (pos < num_coeffs);

    bits += est_alpha_run_bits(repeat);

    return bits;
}
|
||||
|
||||
/*
 * Returns the sum of value over the group of subgroup invocations that this
 * invocation belongs to: consecutive groups of three when num_planes is 3,
 * clusters of four otherwise.
 */
int sum_of_planes(int value)
{
    if (num_planes != 3)
        return subgroupClusteredAdd(value, 4);

    /* First invocation of this group of three */
    uint first = gl_SubgroupInvocationID - gl_SubgroupInvocationID % 3;
    int total = subgroupShuffle(value, first);
    total += subgroupShuffle(value, first + 1);
    total += subgroupShuffle(value, first + 2);
    return total;
}
|
||||
|
||||
/*
 * Estimates the coded size and quantisation error of one slice plane for one
 * quantiser and stores the results in the score buffer.
 *
 * gl_GlobalInvocationID.x / num_planes selects the slice,
 * gl_LocalInvocationID.x % num_planes the plane and
 * min_quant + gl_GlobalInvocationID.y the quantiser. Invocations handling
 * max_quant additionally fill the entry at max_quant + 1 ("overquant"),
 * searching higher quantisers when the slice does not fit its bit budget.
 *
 * Every per-plane bit count stored here is a multiple of 8, since the encode
 * shader derives byte sizes from it with bits / 8.
 */
void main()
{
    uint slice = gl_GlobalInvocationID.x / num_planes;
    uint plane = gl_LocalInvocationID.x % num_planes;
    uint q = min_quant + gl_GlobalInvocationID.y;
    if (slice >= slices_per_picture)
        return;

    /* Estimate slice bits and error for specified quantizer and plane */
    int error = 0;
    int bits = 0;
    if (plane == 3)
        bits = estimate_alpha_plane(slice);
    else
        bits = estimate_slice_plane(error, slice, plane, q);

    /* Write results to score buffer */
    scores[slice].bits[q][plane] = FFALIGN(bits, 8);
    scores[slice].score[q][plane] = error;

    /* Accumulate total bits and error of all planes */
    int total_bits = sum_of_planes(bits);
    int total_score = sum_of_planes(error);
    if (total_bits > 65000 * 8)
        total_score = SCORE_LIMIT;
    scores[slice].total_bits[q] = total_bits;
    scores[slice].total_score[q] = total_score;

    if (q != max_quant)
        return;

    /* Task threads that computed max_quant to also compute overquant if necessary */
    uint mbs_per_slice = slices[slice].mbs_per_slice;
    if (total_bits <= bits_per_mb * mbs_per_slice) {
        /* Overquant isn't needed for this slice */
        scores[slice].total_bits[max_quant + 1] = total_bits;
        scores[slice].total_score[max_quant + 1] = total_score + 1;
        scores[slice].bits[max_quant + 1][plane] = FFALIGN(bits, 8);
        scores[slice].score[max_quant + 1][plane] = error;
        scores[slice].overquant = int(max_quant);
    } else {
        /* Keep searching until an encoding fits our budget */
        for (q = max_quant + 1; q < 128; ++q) {
            /* Estimate slice bits and error for specified quantizer and plane */
            error = 0;
            bits = 0;
            if (plane == 3)
                bits = estimate_alpha_plane(slice);
            else
                bits = estimate_slice_plane(error, slice, plane, q);

            /* Accumulate total bits and error of all planes */
            total_bits = sum_of_planes(bits);
            total_score = sum_of_planes(error);

            /* If estimated bits fit within budget, we are done */
            if (total_bits <= bits_per_mb * mbs_per_slice)
                break;
        }

        /*
         * If nothing fit, q left the loop as 128 while bits/error still
         * describe quantiser 127. Record 127 so the stored quantiser matches
         * the stored estimate and stays inside the 128-entry qmat tables.
         */
        if (q > 127)
            q = 127;

        /* Alpha bits are not byte aligned by the estimator, so align here too */
        scores[slice].bits[max_quant + 1][plane] = FFALIGN(bits, 8);
        scores[slice].score[max_quant + 1][plane] = error;
        scores[slice].total_bits[max_quant + 1] = total_bits;
        scores[slice].total_score[max_quant + 1] = total_score;
        scores[slice].overquant = int(q);
    }
}
|
||||
179
libavcodec/vulkan/prores_ks_slice_data.comp.glsl
Normal file
179
libavcodec/vulkan/prores_ks_slice_data.comp.glsl
Normal file
@@ -0,0 +1,179 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#version 460
|
||||
#pragma shader_stage(compute)
|
||||
|
||||
#extension GL_EXT_scalar_block_layout : require
|
||||
#extension GL_EXT_shared_memory_block : require
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
#include "common.glsl"
|
||||
#include "dct.glsl"
|
||||
|
||||
layout (constant_id = 0) const int max_mbs_per_slice = 8;
|
||||
layout (constant_id = 1) const int blocks_per_mb = 0;
|
||||
layout (constant_id = 2) const int width_in_mb = 0;
|
||||
layout (constant_id = 3) const int pictures_per_frame = 0;
|
||||
|
||||
layout(push_constant, scalar) uniform SliceDataInfo {
|
||||
int plane;
|
||||
int line_add;
|
||||
int bits_per_sample;
|
||||
};
|
||||
|
||||
struct SliceData {
|
||||
uint32_t mbs_per_slice;
|
||||
i16vec4 rows[4][8 * 32][2];
|
||||
};
|
||||
|
||||
layout (set = 0, binding = 0, scalar) writeonly buffer SliceBuffer {
|
||||
SliceData slices[];
|
||||
};
|
||||
layout (set = 0, binding = 1) uniform readonly iimage2D planes[3];
|
||||
|
||||
/* Table of possible edge slice configurations */
|
||||
const uvec3 edge_mps_table[8] = uvec3[](
|
||||
uvec3(0, 0, 0),
|
||||
uvec3(1, 0, 0),
|
||||
uvec3(2, 0, 0),
|
||||
uvec3(2, 1, 0),
|
||||
uvec3(4, 0, 0),
|
||||
uvec3(4, 1, 0),
|
||||
uvec3(4, 2, 0),
|
||||
uvec3(4, 2, 1)
|
||||
);
|
||||
|
||||
const u8vec2 progressive_scan[64] = {
|
||||
u8vec2(0, 0), u8vec2(1, 0), u8vec2(0, 1), u8vec2(1, 1),
|
||||
u8vec2(2, 0), u8vec2(3, 0), u8vec2(2, 1), u8vec2(3, 1),
|
||||
u8vec2(0, 2), u8vec2(1, 2), u8vec2(0, 3), u8vec2(1, 3),
|
||||
u8vec2(2, 2), u8vec2(3, 2), u8vec2(2, 3), u8vec2(3, 3),
|
||||
u8vec2(4, 0), u8vec2(5, 0), u8vec2(4, 1), u8vec2(4, 2),
|
||||
u8vec2(5, 1), u8vec2(6, 0), u8vec2(7, 0), u8vec2(6, 1),
|
||||
u8vec2(5, 2), u8vec2(4, 3), u8vec2(5, 3), u8vec2(6, 2),
|
||||
u8vec2(7, 1), u8vec2(7, 2), u8vec2(6, 3), u8vec2(7, 3),
|
||||
u8vec2(0, 4), u8vec2(1, 4), u8vec2(0, 5), u8vec2(0, 6),
|
||||
u8vec2(1, 5), u8vec2(2, 4), u8vec2(3, 4), u8vec2(2, 5),
|
||||
u8vec2(1, 6), u8vec2(0, 7), u8vec2(1, 7), u8vec2(2, 6),
|
||||
u8vec2(3, 5), u8vec2(4, 4), u8vec2(5, 4), u8vec2(4, 5),
|
||||
u8vec2(3, 6), u8vec2(2, 7), u8vec2(3, 7), u8vec2(4, 6),
|
||||
u8vec2(5, 5), u8vec2(6, 4), u8vec2(7, 4), u8vec2(6, 5),
|
||||
u8vec2(5, 6), u8vec2(4, 7), u8vec2(5, 7), u8vec2(6, 6),
|
||||
u8vec2(7, 5), u8vec2(7, 6), u8vec2(6, 7), u8vec2(7, 7),
|
||||
};
|
||||
|
||||
/*
 * Scan order used when pictures_per_frame != 1.
 * Entry n is the (x, y) position inside the 8x8 block of the coefficient
 * emitted at scan position n.
 */
const u8vec2 interlaced_scan[64] = {
    u8vec2(0, 0), u8vec2(0, 1), u8vec2(1, 0), u8vec2(1, 1), u8vec2(0, 2), u8vec2(0, 3), u8vec2(1, 2), u8vec2(1, 3),
    u8vec2(2, 0), u8vec2(2, 1), u8vec2(3, 0), u8vec2(3, 1), u8vec2(2, 2), u8vec2(2, 3), u8vec2(3, 2), u8vec2(3, 3),
    u8vec2(0, 4), u8vec2(0, 5), u8vec2(1, 4), u8vec2(2, 4), u8vec2(1, 5), u8vec2(0, 6), u8vec2(0, 7), u8vec2(1, 6),
    u8vec2(2, 5), u8vec2(3, 4), u8vec2(3, 5), u8vec2(2, 6), u8vec2(1, 7), u8vec2(2, 7), u8vec2(3, 6), u8vec2(3, 7),
    u8vec2(4, 0), u8vec2(4, 1), u8vec2(5, 0), u8vec2(6, 0), u8vec2(5, 1), u8vec2(4, 2), u8vec2(4, 3), u8vec2(5, 2),
    u8vec2(6, 1), u8vec2(7, 0), u8vec2(7, 1), u8vec2(6, 2), u8vec2(5, 3), u8vec2(4, 4), u8vec2(4, 5), u8vec2(5, 4),
    u8vec2(6, 3), u8vec2(7, 2), u8vec2(7, 3), u8vec2(6, 4), u8vec2(5, 5), u8vec2(4, 6), u8vec2(4, 7), u8vec2(5, 6),
    u8vec2(6, 5), u8vec2(7, 4), u8vec2(7, 5), u8vec2(6, 6), u8vec2(5, 7), u8vec2(6, 7), u8vec2(7, 6), u8vec2(7, 7),
};
|
||||
|
||||
#define DCTSIZE 8
|
||||
|
||||
/**
 * Fetch one transformed coefficient in the swizzled output order.
 *
 * The linear output position (slice_row * DCTSIZE + idx) is split into a
 * scan position (quotient by blocks_per_slice) and a block index
 * (remainder), so equal scan positions of consecutive blocks end up next to
 * each other in the output.
 *
 * @param blocks_per_slice number of 8x8 blocks in the slice; must be non-zero
 * @param slice_row        output row inside the slice
 * @param idx              coefficient index inside the output row (0..7)
 * @return the coefficient read from blocks[] (row stride 9), scaled by 2^11
 *         and converted to int16_t
 */
int16_t get_swizzled_coeff(uint blocks_per_slice, uint slice_row, uint idx)
{
    uint linear_pos = slice_row * DCTSIZE + idx;
    uint scan_pos   = linear_pos / blocks_per_slice;
    uint block_idx  = linear_pos % blocks_per_slice;

    /* The scan table is selected by the frame structure. */
    u8vec2 pos;
    if (pictures_per_frame == 1)
        pos = progressive_scan[scan_pos];
    else
        pos = interlaced_scan[scan_pos];

    float value = blocks[block_idx][pos.y * 9 + pos.x];
    return int16_t(value * float(1 << 11));
}
|
||||
|
||||
/**
 * Load one slice of the selected plane, run a separable 8x8 forward DCT on
 * each of its blocks and store the reordered coefficients in the slice
 * buffer.
 *
 * Invocation layout: x is the row inside an 8x8 block, y the block inside a
 * macroblock and z the macroblock inside the slice. One workgroup handles
 * one slice: gl_WorkGroupID.x is the slice column, gl_WorkGroupID.y the
 * slice row.
 */
void main()
{
    uint row         = gl_LocalInvocationID.x;
    uint block       = gl_LocalInvocationID.y;
    uint macroblock  = gl_LocalInvocationID.z;
    uint slice_x     = gl_WorkGroupID.x;
    uint slice_block = macroblock * blocks_per_mb + block;
    uint slice       = gl_WorkGroupID.y * gl_NumWorkGroups.x + slice_x;

    /* Calculate the current thread coordinate in input plane */
    uint mbs_per_slice = max_mbs_per_slice;
    uint mb_width      = 4u * blocks_per_mb;
    uint slices_width  = width_in_mb / max_mbs_per_slice;
    uvec2 slice_base   = gl_WorkGroupID.xy * uvec2(max_mbs_per_slice * mb_width, DCTSIZE * 2u);

    /* Handle slice macroblock size reduction on edge slices */
    if (slice_x >= slices_width) {
        uint edge_slice = slice_x - slices_width;
        uvec3 table = edge_mps_table[width_in_mb - slices_width * max_mbs_per_slice];
        /* Macroblock offset of each edge slice relative to the first one. */
        uvec3 base = uvec3(0u, table.x, table.x + table.y);
        slice_base.x = (max_mbs_per_slice * slices_width + base[edge_slice]) * mb_width;
        mbs_per_slice = table[edge_slice];
    }

    uvec2 mb_base = slice_base + uvec2(macroblock * mb_width, 0u);
    /* Plane 0 numbers its blocks row-major, the other planes column-major. */
    uvec2 block_coord = plane != 0 ? uvec2(block >> 1u, block & 1u) : uvec2(block & 1u, block >> 1u);
    ivec2 coord = ivec2(mb_base + block_coord * DCTSIZE + uvec2(0u, row));
    coord.y = coord.y * pictures_per_frame + line_add;

    ivec2 max_coord = imageSize(planes[plane]) - ivec2(1);
    coord = min(coord, max_coord);

    /* Load and normalize coefficients to [-1, 1] for increased precision during the DCT. */
    [[unroll]] for (int i = 0; i < 8; i++) {
        /* Clamp every sample, not only the first one of the row: otherwise
         * the columns after a clamped start would be loaded from outside
         * the image instead of repeating the last valid sample. */
        ivec2 src = min(coord + ivec2(i, 0), max_coord);
        int c = imageLoad(planes[plane], src).x;
        blocks[slice_block][row * 9 + i] = float(c) / (1 << (bits_per_sample - 1));
    }

    /* Each invocation stored only its own row above. The first pass below is
     * called with offset `row` and stride 9, which (assuming the signature is
     * fdct8(block, offset, stride) - confirm against its definition) reads
     * samples stored by the other row invocations of the block, so those
     * stores must be visible first. */
    barrier();

    /* First 1-D pass: offset row, stride 9 */
    fdct8(slice_block, row, 9);
    barrier();

    /* Second 1-D pass: offset row * 9, stride 1 */
    fdct8(slice_block, row*9, 1);
    barrier();

    uint slice_row = slice_block * DCTSIZE + row;
    uint blocks_per_slice = mbs_per_slice * blocks_per_mb;

    slices[slice].mbs_per_slice = mbs_per_slice;

    /* Invocations beyond the blocks that exist in this slice (possible on
     * edge slices if the workgroup is sized for max_mbs_per_slice - confirm
     * against the host code) have nothing to store. Letting them continue
     * would index the 64-entry scan tables out of range and, for an empty
     * slice, divide by zero. All barriers are behind us, so leaving is safe. */
    if (slice_block >= blocks_per_slice)
        return;

    /**
     * Swizzle coefficients in morton order before storing to output buffer.
     * This allows for more cache friendly and coalesced coefficient loads.
     */
    i16vec4 dst_low;
    dst_low.x = get_swizzled_coeff(blocks_per_slice, slice_row, 0);
    dst_low.y = get_swizzled_coeff(blocks_per_slice, slice_row, 1);
    dst_low.z = get_swizzled_coeff(blocks_per_slice, slice_row, 2);
    dst_low.w = get_swizzled_coeff(blocks_per_slice, slice_row, 3);

    i16vec4 dst_hi;
    dst_hi.x = get_swizzled_coeff(blocks_per_slice, slice_row, 4);
    dst_hi.y = get_swizzled_coeff(blocks_per_slice, slice_row, 5);
    dst_hi.z = get_swizzled_coeff(blocks_per_slice, slice_row, 6);
    dst_hi.w = get_swizzled_coeff(blocks_per_slice, slice_row, 7);

    /* Store DCT result to slice buffer */
    slices[slice].rows[plane][slice_row][0] = dst_low;
    slices[slice].rows[plane][slice_row][1] = dst_hi;
}
|
||||
194
libavcodec/vulkan/prores_ks_trellis_node.comp.glsl
Normal file
194
libavcodec/vulkan/prores_ks_trellis_node.comp.glsl
Normal file
@@ -0,0 +1,194 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#version 460
|
||||
#pragma shader_stage(compute)
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_EXT_scalar_block_layout : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in;
|
||||
|
||||
/* Specialization constants used by this shader. */
layout (constant_id = 0) const int slices_per_row = 1; /* slices handled by one invocation (one slice row) */
layout (constant_id = 1) const int num_subgroups  = 1; /* length of subgroup_sizes[] */
layout (constant_id = 2) const int num_planes     = 0; /* each slice header is counted as 2 * num_planes */
layout (constant_id = 3) const int force_quant    = 0; /* 0: run the trellis search; otherwise the quantiser stored for every slice */
layout (constant_id = 4) const int min_quant      = 0; /* first quantiser index considered by the trellis */
layout (constant_id = 5) const int max_quant      = 0; /* last regular quantiser index; index max_quant + 1 is the overquant slot */
layout (constant_id = 6) const int mbs_per_slice  = 0; /* macroblocks per slice, used for the bit budget */
layout (constant_id = 7) const int bits_per_mb    = 0; /* bit budget per macroblock */
|
||||
|
||||
/* Per-slice rate/distortion data; quant and buf_start are filled in here. */
struct SliceScore {
    ivec4 bits[16];      /* per quantiser index; the four components are summed to get the slice size in bits */
    ivec4 score[16];     /* not accessed by this shader */
    int total_bits[16];  /* bit cost per quantiser index, read by the trellis */
    int total_score[16]; /* error per quantiser index, read by the trellis */
    int overquant;       /* quantiser reported when the slot max_quant + 1 is chosen */
    int buf_start;       /* written here: start offset of the slice, in the same units as the slice sizes (bits / 8) */
    int quant;           /* written here: quantiser selected for the slice */
};
|
||||
|
||||
/* Receives the running offset reached after the last slice row. */
layout (set = 0, binding = 0, scalar) writeonly buffer FrameSize {
    int frame_size;
};

/* One SliceScore per slice, indexed as x + y * slices_per_row. */
layout (set = 0, binding = 1, scalar) buffer SliceScores {
    SliceScore scores[];
};
|
||||
|
||||
#define TRELLIS_WIDTH 16
|
||||
#define SCORE_LIMIT 1073741823
|
||||
|
||||
/* One cell of the quantiser trellis. */
struct TrellisNode {
    int prev_node; /* index into nodes[] of the chosen predecessor, -1 while unset */
    int quant;     /* quantiser this cell stands for */
    int bits;      /* bits accumulated along the path ending in this cell */
    int score;     /* error accumulated along that path, SCORE_LIMIT if unusable */
};
|
||||
|
||||
/* Sum of the slice row sizes of each subgroup, shared across the workgroup. */
shared int subgroup_sizes[num_subgroups];

/* Sizes of the slices in the row handled by this invocation. */
int slice_sizes[slices_per_row];

/* Trellis for one slice row: TRELLIS_WIDTH cells per column, one start
 * column followed by one column per slice. */
TrellisNode nodes[(slices_per_row + 1) * TRELLIS_WIDTH];
|
||||
|
||||
/**
 * Extend the trellis by the column belonging to slice slice_x of the slice
 * row handled by this invocation.
 *
 * Every cell of the new column is linked to the predecessor cell giving the
 * lowest accumulated score; on equal scores the later predecessor wins.
 * Paths whose accumulated bits exceed the budget of the slices processed so
 * far are marked with SCORE_LIMIT.
 *
 * @param slice_x index of the slice inside its row
 * @return index into nodes[] of the lowest-score cell of the new column
 *         (on equal scores the highest quantiser index wins)
 */
int find_slice_quant(int slice_x)
{
    int slice_idx   = int(gl_LocalInvocationID.x) * slices_per_row + slice_x;
    int column      = int(slice_x + 1) * TRELLIS_WIDTH;
    int prev_column = column - TRELLIS_WIDTH;

    /* Reset the new column. */
    [[unroll]] for (int q = min_quant; q < max_quant + 2; q++) {
        nodes[column + q].prev_node = -1;
        nodes[column + q].quant     = q;
    }
    /* The extra slot reports the slice's overquant value. */
    nodes[column + max_quant + 1].quant = scores[slice_idx].overquant;

    /* Bit budget for all slices up to and including this one. */
    int bits_limit = int(slice_x + 1) * mbs_per_slice * bits_per_mb;

    for (int pq = min_quant; pq < max_quant + 2; pq++) {
        /* The predecessor column is not modified below, so read it once. */
        int prev       = prev_column + pq;
        int prev_bits  = nodes[prev].bits;
        int prev_score = nodes[prev].score;

        for (int q = min_quant; q < max_quant + 2; q++) {
            int cur  = column + q;
            int bits = prev_bits + scores[slice_idx].total_bits[q];
            int cost = bits > bits_limit ? SCORE_LIMIT
                                         : scores[slice_idx].total_score[q];

            bool usable   = prev_score < SCORE_LIMIT && cost < SCORE_LIMIT;
            int candidate = usable ? prev_score + cost : SCORE_LIMIT;

            if (nodes[cur].prev_node == -1 || nodes[cur].score >= candidate) {
                nodes[cur].bits      = bits;
                nodes[cur].score     = candidate;
                nodes[cur].prev_node = prev;
            }
        }
    }

    /* Pick the cheapest cell of the new column. */
    int best       = column + min_quant;
    int best_score = nodes[best].score;
    for (int q = min_quant + 1; q < max_quant + 2; q++) {
        int cell = column + q;
        if (nodes[cell].score <= best_score) {
            best_score = nodes[cell].score;
            best       = cell;
        }
    }

    return best;
}
|
||||
|
||||
/**
 * Run the trellis over the slice row handled by this invocation, then walk
 * the best path backwards to assign a quantiser to every slice.
 *
 * Writes scores[].quant and slice_sizes[] for the row.
 *
 * @return size of the slice row: the slice sizes plus 2 * num_planes per slice
 */
int find_slice_row_quants()
{
    /* Start column: no predecessor, nothing spent yet. */
    for (int q = min_quant; q < max_quant + 2; q++) {
        nodes[q].prev_node = -1;
        nodes[q].bits      = 0;
        nodes[q].score     = 0;
    }

    /* Forward pass; the result of the last column is the path end. */
    int node = 0;
    for (int slice_x = 0; slice_x < slices_per_row; ++slice_x)
        node = find_slice_quant(slice_x);

    int row       = int(gl_LocalInvocationID.x);
    int row_bytes = 2 * num_planes * slices_per_row;

    /* Backward pass along the prev_node links. */
    for (int x = slices_per_row - 1; x >= 0; x--) {
        int slice_idx = x + row * slices_per_row;
        int chosen    = nodes[node].quant;

        /* An overquant value is looked up in the slot max_quant + 1. */
        ivec4 plane_bits = scores[slice_idx].bits[min(chosen, max_quant + 1)];

        slice_sizes[x] = (plane_bits.x + plane_bits.y + plane_bits.z + plane_bits.w) / 8;
        row_bytes     += slice_sizes[x];

        scores[slice_idx].quant = chosen;
        node = nodes[node].prev_node;
    }

    return row_bytes;
}
|
||||
|
||||
/**
 * Assign force_quant to every slice of the row handled by this invocation,
 * taking the slice sizes from entry 0 of scores[].bits.
 *
 * Writes scores[].quant and slice_sizes[] for the row.
 *
 * @return size of the slice row: the slice sizes plus 2 * num_planes per slice
 */
int force_slice_row_quants()
{
    int row       = int(gl_LocalInvocationID.x);
    int row_bytes = 2 * num_planes * slices_per_row;

    for (int x = slices_per_row - 1; x >= 0; x--) {
        int slice_idx    = x + row * slices_per_row;
        ivec4 plane_bits = scores[slice_idx].bits[0];

        slice_sizes[x] = (plane_bits.x + plane_bits.y + plane_bits.z + plane_bits.w) / 8;
        row_bytes     += slice_sizes[x];

        scores[slice_idx].quant = force_quant;
    }

    return row_bytes;
}
|
||||
|
||||
/**
 * Choose the quantisers of one slice row per invocation, then compute the
 * start offset of every slice with a prefix sum over the slice row sizes
 * (within the subgroup first, then across the preceding subgroups).
 * The invocation with the highest x index stores the final offset as
 * frame_size.
 */
void main()
{
    int slice_row_size = (force_quant == 0) ? find_slice_row_quants()
                                            : force_slice_row_quants();

    /* Publish the total of this subgroup for the subgroups after it. */
    subgroup_sizes[gl_SubgroupID] = subgroupAdd(slice_row_size);
    barrier();

    /* Rows before this one: same subgroup, then every earlier subgroup. */
    int offset = subgroupExclusiveAdd(slice_row_size);
    [[unroll]] for (int sg = 0; sg < num_subgroups && sg < gl_SubgroupID; ++sg)
        offset += subgroup_sizes[sg];

    int hdr_size = 2 * num_planes;
    int row      = int(gl_LocalInvocationID.x);
    [[unroll]] for (int x = 0; x < slices_per_row; ++x) {
        scores[x + row * slices_per_row].buf_start = offset;
        offset += hdr_size + slice_sizes[x];
    }

    /* After the last row the running offset covers all slices. */
    if (row == gl_WorkGroupSize.x - 1)
        frame_size = offset;
}
|
||||
Reference in New Issue
Block a user