mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
vulkan_ffv1: use regular descriptors for slice state
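Slice state is now accessed through storage-buffer descriptors (slice_state_buf) instead of buffer_reference pointers built from the slice_state address.
HUGE speedup on AMD, and a HUGE speedup everywhere else too.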
This commit is contained in:
@@ -33,10 +33,14 @@ layout (set = 1, binding = 1, scalar) readonly buffer slice_offsets_buf {
|
||||
layout (set = 1, binding = 2, scalar) writeonly buffer slice_status_buf {
|
||||
uint32_t slice_status[];
|
||||
};
|
||||
layout (set = 1, binding = 3) uniform uimage2D dec[];
|
||||
layout (set = 1, binding = 4) uniform uimage2D dec[];
|
||||
|
||||
#ifndef GOLOMB
|
||||
|
||||
layout (set = 1, binding = 3, scalar) buffer slice_state_buf {
|
||||
uint8_t slice_rc_state[];
|
||||
};
|
||||
|
||||
#define READ(c, idx) get_rac_noadapt(c, idx)
|
||||
int get_isymbol(inout RangeCoder c)
|
||||
{
|
||||
@@ -114,10 +118,9 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
|
||||
ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w,
|
||||
quant_table_idx, extend_lookup[quant_table_idx]);
|
||||
|
||||
uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]);
|
||||
u8buf cd = u8buf(uint64_t(slice_state) + context_off);
|
||||
uint rc_off = state_off + CONTEXT_SIZE*abs(pr[0]) + gl_LocalInvocationID.x;
|
||||
|
||||
rc_state[gl_LocalInvocationID.x] = cd[gl_LocalInvocationID.x].v;
|
||||
rc_state[gl_LocalInvocationID.x] = slice_rc_state[rc_off];
|
||||
rc_dec[gl_LocalInvocationID.x] = false;
|
||||
barrier();
|
||||
|
||||
@@ -128,7 +131,8 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
|
||||
barrier();
|
||||
uint i = gl_LocalInvocationID.x;
|
||||
if (rc_dec[i])
|
||||
cd[i].v = zero_one_state[rc_state[i] + (rc_data[i] ? 256 : 0)];
|
||||
slice_rc_state[rc_off] = zero_one_state[rc_state[i] +
|
||||
(rc_data[i] ? 256 : 0)];
|
||||
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
if (pr[0] < 0)
|
||||
@@ -139,7 +143,13 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
#else /* GOLOMB */
|
||||
|
||||
layout (set = 1, binding = 3, scalar) buffer slice_state_buf {
|
||||
VlcState slice_vlc_state[];
|
||||
};
|
||||
|
||||
GetBitContext gb;
|
||||
|
||||
void golomb_init(inout SliceContext sc)
|
||||
@@ -172,8 +182,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
|
||||
ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w,
|
||||
quant_table_idx, extend_lookup[quant_table_idx]);
|
||||
|
||||
uint context_off = state_off + VLC_STATE_SIZE*abs(pr[0]);
|
||||
VlcState sb = VlcState(uint64_t(slice_state) + context_off);
|
||||
uint vlc_off = state_off + abs(pr[0]);
|
||||
|
||||
if (pr[0] == 0 && run_mode == 0)
|
||||
run_mode = 1;
|
||||
@@ -201,14 +210,14 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
|
||||
if (run_count < 0) {
|
||||
run_mode = 0;
|
||||
run_count = 0;
|
||||
diff = read_vlc_symbol(gb, sb, bits);
|
||||
diff = read_vlc_symbol(gb, slice_vlc_state[vlc_off], bits);
|
||||
if (diff >= 0)
|
||||
diff++;
|
||||
} else {
|
||||
diff = 0;
|
||||
}
|
||||
} else {
|
||||
diff = read_vlc_symbol(gb, sb, bits);
|
||||
diff = read_vlc_symbol(gb, slice_vlc_state[vlc_off], bits);
|
||||
}
|
||||
|
||||
if (pr[0] < 0)
|
||||
@@ -298,6 +307,7 @@ void decode_slice(inout SliceContext sc, const uint slice_idx)
|
||||
uvec4(0, 1, 1, 2))*plane_state_size;
|
||||
|
||||
#ifdef GOLOMB
|
||||
slice_state_off >>= 3; // division by VLC_STATE_SIZE
|
||||
golomb_init(sc);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -26,6 +26,18 @@
|
||||
#include "common.glsl"
|
||||
#include "ffv1_common.glsl"
|
||||
|
||||
#ifdef GOLOMB
|
||||
#define PS_SHIFT 3
|
||||
layout (set = 1, binding = 1, scalar) writeonly buffer slice_state_buf {
|
||||
VlcState slice_vlc_state[];
|
||||
};
|
||||
#else
|
||||
#define PS_SHIFT 2
|
||||
layout (set = 1, binding = 1, scalar) writeonly buffer slice_state_buf {
|
||||
uint32_t slice_rc_state[];
|
||||
};
|
||||
#endif
|
||||
|
||||
void main(void)
|
||||
{
|
||||
const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
|
||||
@@ -34,30 +46,26 @@ void main(void)
|
||||
return;
|
||||
|
||||
const uint8_t qidx = slice_ctx[slice_idx].quant_table_idx[gl_WorkGroupID.z];
|
||||
|
||||
uint contexts = context_count[qidx];
|
||||
uint64_t slice_state_off = uint64_t(slice_state) +
|
||||
slice_idx*plane_state_size*codec_planes;
|
||||
uint plane_state_len = plane_state_size >> PS_SHIFT;
|
||||
uint offs = slice_idx*plane_state_len*codec_planes +
|
||||
gl_WorkGroupID.z*plane_state_len +
|
||||
gl_LocalInvocationID.x;
|
||||
|
||||
#ifdef GOLOMB
|
||||
uint64_t start = slice_state_off +
|
||||
(gl_WorkGroupID.z*(plane_state_size/VLC_STATE_SIZE) +
|
||||
gl_LocalInvocationID.x)*VLC_STATE_SIZE;
|
||||
for (uint x = gl_LocalInvocationID.x; x < contexts; x += gl_WorkGroupSize.x) {
|
||||
VlcState sb = VlcState(start);
|
||||
sb.drift = int16_t(0);
|
||||
sb.error_sum = uint16_t(4);
|
||||
sb.bias = int8_t(0);
|
||||
sb.count = uint8_t(1);
|
||||
start += gl_WorkGroupSize.x*VLC_STATE_SIZE;
|
||||
slice_vlc_state[offs].drift = int16_t(0);
|
||||
slice_vlc_state[offs].error_sum = uint16_t(4);
|
||||
slice_vlc_state[offs].bias = int8_t(0);
|
||||
slice_vlc_state[offs].count = uint8_t(1);
|
||||
offs += gl_WorkGroupSize.x;
|
||||
}
|
||||
#else
|
||||
uint64_t start = slice_state_off +
|
||||
gl_WorkGroupID.z*plane_state_size +
|
||||
(gl_LocalInvocationID.x << 2 /* dwords */); /* Bytes */
|
||||
uint count_total = contexts*(CONTEXT_SIZE /* bytes */ >> 2 /* dwords */);
|
||||
for (uint x = gl_LocalInvocationID.x; x < count_total; x += gl_WorkGroupSize.x) {
|
||||
u32buf(start).v = 0x80808080;
|
||||
start += gl_WorkGroupSize.x*(CONTEXT_SIZE >> 3 /* 1/8th of context */);
|
||||
slice_rc_state[offs] = 0x80808080;
|
||||
offs += gl_WorkGroupSize.x;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
#extension GL_EXT_shader_image_load_formatted : require
|
||||
|
||||
layout (set = 1, binding = 4) writeonly uniform uimage2D dst[];
|
||||
layout (set = 1, binding = 5) writeonly uniform uimage2D dst[];
|
||||
|
||||
#define RGB
|
||||
#include "ffv1_dec.comp.glsl"
|
||||
|
||||
@@ -23,5 +23,6 @@
|
||||
#pragma shader_stage(compute)
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
#define VLC_BUFFER
|
||||
#define GOLOMB
|
||||
#include "ffv1_enc.comp.glsl"
|
||||
|
||||
@@ -23,5 +23,6 @@
|
||||
#pragma shader_stage(compute)
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
#define VLC_BUFFER
|
||||
#define GOLOMB
|
||||
#include "ffv1_enc_reset.comp.glsl"
|
||||
|
||||
@@ -23,5 +23,6 @@
|
||||
#pragma shader_stage(compute)
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
#define VLC_BUFFER
|
||||
#define GOLOMB
|
||||
#include "ffv1_enc_rgb.comp.glsl"
|
||||
|
||||
@@ -24,7 +24,13 @@
|
||||
#define VULKAN_FFV1_VLC_H
|
||||
|
||||
#define VLC_STATE_SIZE 8
|
||||
layout(buffer_reference, buffer_reference_align = VLC_STATE_SIZE) buffer VlcState {
|
||||
#ifdef VLC_BUFFER
|
||||
layout(buffer_reference, buffer_reference_align = VLC_STATE_SIZE) buffer
|
||||
#else
|
||||
struct
|
||||
#endif
|
||||
|
||||
VlcState {
|
||||
uint32_t error_sum;
|
||||
int16_t drift;
|
||||
int8_t bias;
|
||||
|
||||
@@ -404,6 +404,12 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
|
||||
slice_state,
|
||||
0, fp->slice_data_size*f->slice_count,
|
||||
VK_FORMAT_UNDEFINED);
|
||||
ff_vk_shader_update_desc_buffer(&ctx->s, exec, reset_shader,
|
||||
1, 1, 0,
|
||||
slice_state,
|
||||
f->slice_count*fp->slice_data_size,
|
||||
VK_WHOLE_SIZE,
|
||||
VK_FORMAT_UNDEFINED);
|
||||
|
||||
ff_vk_exec_bind_shader(&ctx->s, exec, reset_shader);
|
||||
ff_vk_shader_update_push_const(&ctx->s, exec, reset_shader,
|
||||
@@ -458,16 +464,22 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
|
||||
slice_status,
|
||||
0, 2*f->slice_count*sizeof(uint32_t),
|
||||
VK_FORMAT_UNDEFINED);
|
||||
ff_vk_shader_update_desc_buffer(&ctx->s, exec, decode_shader,
|
||||
1, 3, 0,
|
||||
slice_state,
|
||||
f->slice_count*fp->slice_data_size,
|
||||
VK_WHOLE_SIZE,
|
||||
VK_FORMAT_UNDEFINED);
|
||||
|
||||
ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader,
|
||||
decode_dst, decode_dst_view,
|
||||
1, 3,
|
||||
1, 4,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_NULL_HANDLE);
|
||||
if (is_rgb)
|
||||
ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader,
|
||||
f->picture.f, vp->view.out,
|
||||
1, 4,
|
||||
1, 5,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_NULL_HANDLE);
|
||||
|
||||
@@ -602,8 +614,12 @@ static int init_reset_shader(FFV1Context *f, FFVulkanContext *s,
|
||||
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
},
|
||||
{ /* slice_state_buf */
|
||||
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
},
|
||||
};
|
||||
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 1, 0, 0);
|
||||
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0);
|
||||
|
||||
if (ac == AC_GOLOMB_RICE)
|
||||
RET(ff_vk_shader_link(s, shd,
|
||||
@@ -660,6 +676,10 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
|
||||
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
},
|
||||
{ /* slice_state_buf */
|
||||
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
},
|
||||
{ /* dec */
|
||||
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
|
||||
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
@@ -671,7 +691,7 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
|
||||
.elems = av_pix_fmt_count_planes(out_frames_ctx->sw_format),
|
||||
},
|
||||
};
|
||||
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 4 + rgb, 0, 0);
|
||||
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 5 + rgb, 0, 0);
|
||||
|
||||
if (ac == AC_GOLOMB_RICE) {
|
||||
if (rgb)
|
||||
|
||||
Reference in New Issue
Block a user