diff --git a/doc/filters.texi b/doc/filters.texi
index 7f0c3cb99c..973a93345d 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -7743,6 +7743,11 @@ The file path of the downloaded whisper.cpp model (mandatory).
 The language to use for transcription ('auto' for auto-detect).
 Default value: @code{"auto"}
 
+@item translate
+If enabled, translate the transcription from the source language to English. A
+multilingual model is required to enable this option.
+Default value: @code{"false"}
+
 @item queue
 The maximum size that will be queued into the filter before processing the audio
 with whisper. Using a small value the audio stream will be processed more often,
diff --git a/libavfilter/af_whisper.c b/libavfilter/af_whisper.c
index 299a8bca7a..cb1c7b2ecf 100644
--- a/libavfilter/af_whisper.c
+++ b/libavfilter/af_whisper.c
@@ -42,6 +42,7 @@ typedef struct WhisperContext {
     const AVClass *class;
     char *model_path;
     char *language;
+    bool translate;
     bool use_gpu;
     int gpu_device;
     char *vad_model_path;
@@ -150,6 +151,23 @@ static int init(AVFilterContext *ctx)
         wctx->avio_context->direct = AVIO_FLAG_DIRECT;
     }
 
+    if (!whisper_is_multilingual(wctx->ctx_wsp)) {
+        if (!wctx->translate && strcmp(wctx->language, "auto") == 0) {
+            av_log(ctx, AV_LOG_WARNING,
+                   "Multilingual model not provided. Non-English audio may not be correctly transcribed.\n");
+        } else if (wctx->translate || (strcmp(wctx->language, "auto") != 0 && strcmp(wctx->language, "en") != 0)) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "%s requested but multilingual model not provided.\n",
+                   wctx->translate ? "Translation" : "Transcription");
+            return AVERROR(ENOSYS);
+        }
+        /* "language" is an AVOptions-managed string: replace it with a heap
+         * copy, never a literal, or av_opt_free() will free invalid memory. */
+        av_freep(&wctx->language);
+        wctx->language = av_strdup("en");
+        if (!wctx->language)
+            return AVERROR(ENOMEM);
+    }
+
     av_log(ctx, AV_LOG_INFO,
            "Whisper filter initialized: model: %s lang: %s queue: %" PRId64 " ms\n",
            wctx->model_path, wctx->language, wctx->queue / 1000);
@@ -200,6 +218,7 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
     struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
     params.language = wctx->language;
+    params.translate = wctx->translate;
     params.n_threads = ff_filter_get_nb_threads(ctx);
     params.print_special = 0;
     params.print_progress = 0;
@@ -443,6 +462,7 @@ static int query_formats(const AVFilterContext *ctx,
 static const AVOption whisper_options[] = {
     { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
     { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },
+    { "translate", "Translate from source language to English", OFFSET(translate), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, .flags = FLAGS },
     { "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },
     { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },
     { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },