vulkan_ffv1: use regular descriptors for slice state

HUGE speedup on AMD, HUGE speedup everywhere.
This commit is contained in:
Lynne
2026-02-08 03:58:11 +01:00
parent da99d3f209
commit c0a697a1bc
8 changed files with 79 additions and 32 deletions

View File

@@ -33,10 +33,14 @@ layout (set = 1, binding = 1, scalar) readonly buffer slice_offsets_buf {
layout (set = 1, binding = 2, scalar) writeonly buffer slice_status_buf {
uint32_t slice_status[];
};
layout (set = 1, binding = 3) uniform uimage2D dec[];
layout (set = 1, binding = 4) uniform uimage2D dec[];
#ifndef GOLOMB
/* Range-coder slice state, exposed as a plain storage buffer of bytes at
 * set 1, binding 3. decode_line() both reads and writes it, using a byte
 * index of the form
 *   state_off + CONTEXT_SIZE*abs(pr[0]) + gl_LocalInvocationID.x */
layout (set = 1, binding = 3, scalar) buffer slice_state_buf {
uint8_t slice_rc_state[];
};
#define READ(c, idx) get_rac_noadapt(c, idx)
int get_isymbol(inout RangeCoder c)
{
@@ -114,10 +118,9 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w,
quant_table_idx, extend_lookup[quant_table_idx]);
uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]);
u8buf cd = u8buf(uint64_t(slice_state) + context_off);
uint rc_off = state_off + CONTEXT_SIZE*abs(pr[0]) + gl_LocalInvocationID.x;
rc_state[gl_LocalInvocationID.x] = cd[gl_LocalInvocationID.x].v;
rc_state[gl_LocalInvocationID.x] = slice_rc_state[rc_off];
rc_dec[gl_LocalInvocationID.x] = false;
barrier();
@@ -128,7 +131,8 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
barrier();
uint i = gl_LocalInvocationID.x;
if (rc_dec[i])
cd[i].v = zero_one_state[rc_state[i] + (rc_data[i] ? 256 : 0)];
slice_rc_state[rc_off] = zero_one_state[rc_state[i] +
(rc_data[i] ? 256 : 0)];
if (gl_LocalInvocationID.x == 0) {
if (pr[0] < 0)
@@ -139,7 +143,13 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
}
}
}
#else
#else /* GOLOMB */
/* Golomb-Rice slice state, exposed as a plain storage buffer of VlcState
 * records at set 1, binding 3. Indices are in VlcState elements, not bytes:
 * decode_line() uses state_off + abs(pr[0]), and decode_slice() shifts
 * slice_state_off right by 3 (division by VLC_STATE_SIZE).
 * NOTE(review): state_off is presumably derived from that shifted
 * slice_state_off -- confirm against the full decode_slice(). */
layout (set = 1, binding = 3, scalar) buffer slice_state_buf {
VlcState slice_vlc_state[];
};
GetBitContext gb;
void golomb_init(inout SliceContext sc)
@@ -172,8 +182,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w,
quant_table_idx, extend_lookup[quant_table_idx]);
uint context_off = state_off + VLC_STATE_SIZE*abs(pr[0]);
VlcState sb = VlcState(uint64_t(slice_state) + context_off);
uint vlc_off = state_off + abs(pr[0]);
if (pr[0] == 0 && run_mode == 0)
run_mode = 1;
@@ -201,14 +210,14 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
if (run_count < 0) {
run_mode = 0;
run_count = 0;
diff = read_vlc_symbol(gb, sb, bits);
diff = read_vlc_symbol(gb, slice_vlc_state[vlc_off], bits);
if (diff >= 0)
diff++;
} else {
diff = 0;
}
} else {
diff = read_vlc_symbol(gb, sb, bits);
diff = read_vlc_symbol(gb, slice_vlc_state[vlc_off], bits);
}
if (pr[0] < 0)
@@ -298,6 +307,7 @@ void decode_slice(inout SliceContext sc, const uint slice_idx)
uvec4(0, 1, 1, 2))*plane_state_size;
#ifdef GOLOMB
slice_state_off >>= 3; // division by VLC_STATE_SIZE
golomb_init(sc);
#endif

View File

@@ -26,6 +26,18 @@
#include "common.glsl"
#include "ffv1_common.glsl"
/* Slice state binding for the reset shader (set 1, binding 1, write-only).
 *
 * PS_SHIFT is the right shift main() applies to plane_state_size to obtain
 * plane_state_len, i.e. the per-plane length counted in elements of the
 * array declared below rather than in bytes. */
#ifdef GOLOMB
/* Elements are VlcState records; 3 matches the ">>= 3 (division by
 * VLC_STATE_SIZE)" used by the decoder. NOTE(review): relies on
 * VLC_STATE_SIZE staying 8 -- keep in sync with the VLC header. */
#define PS_SHIFT 3
layout (set = 1, binding = 1, scalar) writeonly buffer slice_state_buf {
VlcState slice_vlc_state[];
};
#else
/* Elements are 32-bit words (4 bytes each), hence a shift of 2. main()
 * stores 0x80808080 into each word it resets. */
#define PS_SHIFT 2
layout (set = 1, binding = 1, scalar) writeonly buffer slice_state_buf {
uint32_t slice_rc_state[];
};
#endif
void main(void)
{
const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
@@ -34,30 +46,26 @@ void main(void)
return;
const uint8_t qidx = slice_ctx[slice_idx].quant_table_idx[gl_WorkGroupID.z];
uint contexts = context_count[qidx];
uint64_t slice_state_off = uint64_t(slice_state) +
slice_idx*plane_state_size*codec_planes;
uint plane_state_len = plane_state_size >> PS_SHIFT;
uint offs = slice_idx*plane_state_len*codec_planes +
gl_WorkGroupID.z*plane_state_len +
gl_LocalInvocationID.x;
#ifdef GOLOMB
uint64_t start = slice_state_off +
(gl_WorkGroupID.z*(plane_state_size/VLC_STATE_SIZE) +
gl_LocalInvocationID.x)*VLC_STATE_SIZE;
for (uint x = gl_LocalInvocationID.x; x < contexts; x += gl_WorkGroupSize.x) {
VlcState sb = VlcState(start);
sb.drift = int16_t(0);
sb.error_sum = uint16_t(4);
sb.bias = int8_t(0);
sb.count = uint8_t(1);
start += gl_WorkGroupSize.x*VLC_STATE_SIZE;
slice_vlc_state[offs].drift = int16_t(0);
slice_vlc_state[offs].error_sum = uint16_t(4);
slice_vlc_state[offs].bias = int8_t(0);
slice_vlc_state[offs].count = uint8_t(1);
offs += gl_WorkGroupSize.x;
}
#else
uint64_t start = slice_state_off +
gl_WorkGroupID.z*plane_state_size +
(gl_LocalInvocationID.x << 2 /* dwords */); /* Bytes */
uint count_total = contexts*(CONTEXT_SIZE /* bytes */ >> 2 /* dwords */);
for (uint x = gl_LocalInvocationID.x; x < count_total; x += gl_WorkGroupSize.x) {
u32buf(start).v = 0x80808080;
start += gl_WorkGroupSize.x*(CONTEXT_SIZE >> 3 /* 1/8th of context */);
slice_rc_state[offs] = 0x80808080;
offs += gl_WorkGroupSize.x;
}
#endif
}

View File

@@ -24,7 +24,7 @@
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_shader_image_load_formatted : require
layout (set = 1, binding = 4) writeonly uniform uimage2D dst[];
layout (set = 1, binding = 5) writeonly uniform uimage2D dst[];
#define RGB
#include "ffv1_dec.comp.glsl"

View File

@@ -23,5 +23,6 @@
#pragma shader_stage(compute)
#extension GL_GOOGLE_include_directive : require
#define VLC_BUFFER
#define GOLOMB
#include "ffv1_enc.comp.glsl"

View File

@@ -23,5 +23,6 @@
#pragma shader_stage(compute)
#extension GL_GOOGLE_include_directive : require
#define VLC_BUFFER
#define GOLOMB
#include "ffv1_enc_reset.comp.glsl"

View File

@@ -23,5 +23,6 @@
#pragma shader_stage(compute)
#extension GL_GOOGLE_include_directive : require
#define VLC_BUFFER
#define GOLOMB
#include "ffv1_enc_rgb.comp.glsl"

View File

@@ -24,7 +24,13 @@
#define VULKAN_FFV1_VLC_H
#define VLC_STATE_SIZE 8
layout(buffer_reference, buffer_reference_align = VLC_STATE_SIZE) buffer VlcState {
#ifdef VLC_BUFFER
layout(buffer_reference, buffer_reference_align = VLC_STATE_SIZE) buffer
#else
struct
#endif
VlcState {
uint32_t error_sum;
int16_t drift;
int8_t bias;

View File

@@ -404,6 +404,12 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
slice_state,
0, fp->slice_data_size*f->slice_count,
VK_FORMAT_UNDEFINED);
ff_vk_shader_update_desc_buffer(&ctx->s, exec, reset_shader,
1, 1, 0,
slice_state,
f->slice_count*fp->slice_data_size,
VK_WHOLE_SIZE,
VK_FORMAT_UNDEFINED);
ff_vk_exec_bind_shader(&ctx->s, exec, reset_shader);
ff_vk_shader_update_push_const(&ctx->s, exec, reset_shader,
@@ -458,16 +464,22 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
slice_status,
0, 2*f->slice_count*sizeof(uint32_t),
VK_FORMAT_UNDEFINED);
ff_vk_shader_update_desc_buffer(&ctx->s, exec, decode_shader,
1, 3, 0,
slice_state,
f->slice_count*fp->slice_data_size,
VK_WHOLE_SIZE,
VK_FORMAT_UNDEFINED);
ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader,
decode_dst, decode_dst_view,
1, 3,
1, 4,
VK_IMAGE_LAYOUT_GENERAL,
VK_NULL_HANDLE);
if (is_rgb)
ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader,
f->picture.f, vp->view.out,
1, 4,
1, 5,
VK_IMAGE_LAYOUT_GENERAL,
VK_NULL_HANDLE);
@@ -602,8 +614,12 @@ static int init_reset_shader(FFV1Context *f, FFVulkanContext *s,
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
{ /* slice_state_buf */
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
};
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 1, 0, 0);
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0);
if (ac == AC_GOLOMB_RICE)
RET(ff_vk_shader_link(s, shd,
@@ -660,6 +676,10 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
{ /* slice_state_buf */
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
{ /* dec */
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
@@ -671,7 +691,7 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
.elems = av_pix_fmt_count_planes(out_frames_ctx->sw_format),
},
};
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 4 + rgb, 0, 0);
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 5 + rgb, 0, 0);
if (ac == AC_GOLOMB_RICE) {
if (rgb)