mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
avfilter/af_whisper: Add max_len parameter
This commit is contained in:
@@ -7768,6 +7768,12 @@ The destination format string; it could be "text" (only the transcribed text
|
||||
will be sent to the destination), "srt" (subtitle format) or "json".
|
||||
Default value: @code{"text"}
|
||||
|
||||
@item max_len
|
||||
Maximum segment length in characters. When set to a value greater than 0,
|
||||
transcription segments will be split to not exceed this length. This is useful
|
||||
for generating subtitles with shorter lines.
|
||||
Default value: @code{"0"}
|
||||
|
||||
@item vad_model
|
||||
Path to the VAD model file. If set, the filter will load an additional voice
|
||||
activity detection module (https://github.com/snakers4/silero-vad) that will be
|
||||
|
||||
@@ -52,6 +52,7 @@ typedef struct WhisperContext {
|
||||
int64_t queue;
|
||||
char *destination;
|
||||
char *format;
|
||||
int max_len;
|
||||
|
||||
struct whisper_context *ctx_wsp;
|
||||
struct whisper_vad_context *ctx_vad;
|
||||
@@ -204,6 +205,8 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
|
||||
params.print_progress = 0;
|
||||
params.print_realtime = 0;
|
||||
params.print_timestamps = 0;
|
||||
params.max_len = wctx->max_len;
|
||||
params.token_timestamps = (wctx->max_len > 0);
|
||||
|
||||
if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {
|
||||
av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");
|
||||
@@ -224,6 +227,14 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip segments that are parts of [BLANK_AUDIO] when max_len splits them
|
||||
if (wctx->max_len > 0 && (strcmp(text_cleaned, "[") == 0 || strcmp(text_cleaned, "]") == 0 ||
|
||||
strcmp(text_cleaned, "BLANK") == 0 || strcmp(text_cleaned, "_") == 0 ||
|
||||
strcmp(text_cleaned, "AUDIO") == 0)) {
|
||||
av_freep(&text_cleaned);
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
|
||||
const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
|
||||
const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;
|
||||
@@ -437,6 +448,7 @@ static const AVOption whisper_options[] = {
|
||||
{ "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
|
||||
{ "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },
|
||||
{ "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },
|
||||
{ "max_len", "Max segment length in characters", OFFSET(max_len), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
|
||||
{ "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
|
||||
{ "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },
|
||||
{ "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },
|
||||
|
||||
Reference in New Issue
Block a user