From e48eaa8c6211069454b054e0cad4f8270fb563cc Mon Sep 17 00:00:00 2001 From: WyattBlue Date: Sun, 21 Dec 2025 23:51:15 -0500 Subject: [PATCH] avfilter/af_whisper: Add max_len parameter --- doc/filters.texi | 6 ++++++ libavfilter/af_whisper.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/doc/filters.texi b/doc/filters.texi index bd9f881aa1..22d0fcf90d 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -7768,6 +7768,12 @@ The destination format string; it could be "text" (only the transcribed text will be sent to the destination), "srt" (subtitle format) or "json". Default value: @code{"text"} +@item max_len +Maximum segment length in characters. When set to a value greater than 0, +transcription segments will be split to not exceed this length. This is useful +for generating subtitles with shorter lines. +Default value: @code{0} + @item vad_model Path to the VAD model file. If set, the filter will load an additional voice activity detection module (https://github.com/snakers4/silero-vad) that will be diff --git a/libavfilter/af_whisper.c b/libavfilter/af_whisper.c index fcc7e415cc..299a8bca7a 100644 --- a/libavfilter/af_whisper.c +++ b/libavfilter/af_whisper.c @@ -52,6 +52,7 @@ typedef struct WhisperContext { int64_t queue; char *destination; char *format; + int max_len; struct whisper_context *ctx_wsp; struct whisper_vad_context *ctx_vad; @@ -204,6 +205,8 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples) params.print_progress = 0; params.print_realtime = 0; params.print_timestamps = 0; + params.max_len = wctx->max_len; + params.token_timestamps = (wctx->max_len > 0); if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) { av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n"); @@ -224,6 +227,14 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples) continue; } + // Skip segments that are parts of [BLANK_AUDIO] when max_len splits them + if 
(wctx->max_len > 0 && (strcmp(text_cleaned, "[") == 0 || strcmp(text_cleaned, "]") == 0 || + strcmp(text_cleaned, "BLANK") == 0 || strcmp(text_cleaned, "_") == 0 || + strcmp(text_cleaned, "AUDIO") == 0)) { + av_freep(&text_cleaned); + continue; + } + const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i); const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10; const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10; @@ -437,6 +448,7 @@ static const AVOption whisper_options[] = { { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS }, { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS }, { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS }, + { "max_len", "Max segment length in characters", OFFSET(max_len), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS }, { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS }, { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS }, { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },