mirror of
https://mirror.skon.top/https://github.com/FFmpeg/FFmpeg
synced 2026-04-20 21:00:41 +08:00
avfilter/af_whisper: Add translate parameter
This commit is contained in:
@@ -7743,6 +7743,11 @@ The file path of the downloaded whisper.cpp model (mandatory).
|
||||
The language to use for transcription ('auto' for auto-detect).
|
||||
Default value: @code{"auto"}
|
||||
|
||||
@item translate
|
||||
If enabled, translate the transcription from the source language to English. A
|
||||
multilingual model is required to enable this option.
|
||||
Default value: @code{"false"}
|
||||
|
||||
@item queue
|
||||
The maximum size that will be queued into the filter before processing the audio
|
||||
with whisper. Using a small value the audio stream will be processed more often,
|
||||
|
||||
@@ -42,6 +42,7 @@ typedef struct WhisperContext {
|
||||
const AVClass *class;
|
||||
char *model_path;
|
||||
char *language;
|
||||
bool translate;
|
||||
bool use_gpu;
|
||||
int gpu_device;
|
||||
char *vad_model_path;
|
||||
@@ -150,6 +151,18 @@ static int init(AVFilterContext *ctx)
|
||||
wctx->avio_context->direct = AVIO_FLAG_DIRECT;
|
||||
}
|
||||
|
||||
if (!whisper_is_multilingual(wctx->ctx_wsp)) {
|
||||
if (!wctx->translate && strcmp(wctx->language, "auto") == 0) {
|
||||
av_log(ctx, AV_LOG_WARNING,
|
||||
"Multilingual model not provided. Non-English audio may not be correctly transcribed.\n");
|
||||
} else if (wctx->translate || (strcmp(wctx->language, "auto") != 0 && strcmp(wctx->language, "en") != 0)) {
|
||||
av_log(ctx, AV_LOG_ERROR,
|
||||
"%s requested but multilingual model not provided.\n", wctx->translate ? "Translation" : "Transcription");
|
||||
return AVERROR(ENOSYS);
|
||||
}
|
||||
wctx->language = "en";
|
||||
}
|
||||
|
||||
av_log(ctx, AV_LOG_INFO,
|
||||
"Whisper filter initialized: model: %s lang: %s queue: %" PRId64 " ms\n",
|
||||
wctx->model_path, wctx->language, wctx->queue / 1000);
|
||||
@@ -200,6 +213,7 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
|
||||
|
||||
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
params.language = wctx->language;
|
||||
params.translate = wctx->translate;
|
||||
params.n_threads = ff_filter_get_nb_threads(ctx);
|
||||
params.print_special = 0;
|
||||
params.print_progress = 0;
|
||||
@@ -443,6 +457,7 @@ static int query_formats(const AVFilterContext *ctx,
|
||||
static const AVOption whisper_options[] = {
|
||||
{ "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
|
||||
{ "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },
|
||||
{ "translate", "Translate from source language to English", OFFSET(translate), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, .flags = FLAGS },
|
||||
{ "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },
|
||||
{ "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },
|
||||
{ "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
|
||||
|
||||
Reference in New Issue
Block a user