fix(media): surface OpenAI audio transcription failures (#65096)

* fix(media): surface audio transcription provider failures

* fix(media): prefer failed reasons in surfaced errors

* fix(media): import attempt outcome type

* fix(media): guard malformed decision arrays

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
Daniel Alkurdi
2026-04-13 02:05:18 +10:00
committed by GitHub
parent d46f684898
commit 1f0431cd11
10 changed files with 201 additions and 15 deletions

View File

@@ -26,7 +26,9 @@ Docs: https://docs.openclaw.ai
- Gateway/plugins: always send a non-empty `idempotencyKey` for plugin subagent runs, so dreaming narrative jobs stop failing gateway schema validation. (#65354) Thanks @CodeForgeNet and @vincentkoc.
- Cron/isolated sessions: persist the right transcript path for each isolated run, including fresh session rollovers, so cron runs stop appending to stale session files. Thanks @samrusani and @vincentkoc.
- Dreaming/cron: wake managed dreaming jobs immediately instead of waiting for the next heartbeat, so scheduled dreaming runs start when the cron fires. (#65053) Thanks @l0cka and @vincentkoc.
- QA/packaging: stop packaged QA helpers from crashing when optional scenario execution config is unavailable, so npm distributions can skip the repo-only scenario pack without breaking completion-cache and startup paths. (#65118) Thanks @EdderTalmor and @vincentkoc.
- Media/audio transcription: surface the real provider failure when every audio transcription attempt fails, so status output and the CLI stop collapsing those errors into generic skips. (#65096) Thanks @l0cka and @vincentkoc.
## 2026.4.11

View File

@@ -739,6 +739,43 @@ describe("buildStatusMessage", () => {
expect(normalized).toContain("Media: image ok (openai/gpt-5.4) · audio skipped (maxBytes)");
});
// A decision with outcome "failed" should render as `audio failed (<reason>)`,
// where the reason comes from the failed attempt only — the earlier skipped
// attempt's "empty output" must not leak into the status line.
it("includes failed media understanding decisions with the surfaced reason", () => {
const text = buildStatusMessage({
agent: { model: "anthropic/claude-opus-4-6" },
sessionEntry: { sessionId: "media-failed", updatedAt: 0 },
sessionKey: "agent:main:main",
queue: { mode: "none" },
mediaDecisions: [
{
capability: "audio",
outcome: "failed",
attachments: [
{
attachmentIndex: 0,
// Two attempts: a skip followed by a failure; only the failed
// attempt's reason should be surfaced (and its "Error: " prefix stripped).
attempts: [
{
type: "provider",
outcome: "skipped",
reason: "empty output",
},
{
type: "provider",
outcome: "failed",
reason: "Error: Audio transcription response missing text",
},
],
},
],
},
],
});
expect(normalizeTestText(text)).toContain(
"Media: audio failed (Audio transcription response missing text)",
);
// The skipped attempt's reason must not appear anywhere in the output.
expect(normalizeTestText(text)).not.toContain("empty output");
});
it("omits media line when all decisions are none", () => {
const text = buildStatusMessage({
agent: { model: "anthropic/claude-opus-4-6" },

View File

@@ -26,6 +26,7 @@ import type { OpenClawConfig } from "../config/types.openclaw.js";
import { readLatestSessionUsageFromTranscript } from "../gateway/session-utils.fs.js";
import { formatTimeAgo } from "../infra/format-time/format-relative.ts";
import { resolveCommitHash } from "../infra/git-commit.js";
import { findDecisionReason, summarizeDecisionReason } from "../media-understanding/runner.entries.js";
import type { MediaUnderstandingDecision } from "../media-understanding/types.js";
import { resolveAgentIdFromSessionKey } from "../routing/session-key.js";
import {
@@ -375,12 +376,15 @@ const formatMediaUnderstandingLine = (decisions?: ReadonlyArray<MediaUnderstandi
return `${decision.capability} denied`;
}
if (decision.outcome === "skipped") {
const reason = decision.attachments
.flatMap((entry) => entry.attempts.map((attempt) => attempt.reason).filter(Boolean))
.find(Boolean);
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
const reason = findDecisionReason(decision);
const shortReason = summarizeDecisionReason(reason);
return `${decision.capability} skipped${shortReason ? ` (${shortReason})` : ""}`;
}
if (decision.outcome === "failed") {
const reason = findDecisionReason(decision, "failed");
const shortReason = summarizeDecisionReason(reason);
return `${decision.capability} failed${shortReason ? ` (${shortReason})` : ""}`;
}
return null;
})
.filter((part): part is string => part != null);

View File

@@ -550,6 +550,22 @@ describe("capability cli", () => {
);
});
// When the transcriber rejects, the CLI should exit non-zero and print the
// underlying provider error instead of a generic failure message.
it("surfaces the underlying transcription failure for audio transcribe", async () => {
mocks.transcribeAudioFile.mockRejectedValueOnce(
new Error("Audio transcription response missing text"),
);
await expect(
runRegisteredCli({
register: registerCapabilityCli as (program: Command) => void,
argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"],
}),
).rejects.toThrow("exit 1");
// The real error text must reach the user via the runtime error channel.
expect(mocks.runtime.error).toHaveBeenCalledWith(
expect.stringMatching(/Audio transcription response missing text/),
);
});
it("forwards transcription prompt and language hints", async () => {
await runRegisteredCli({
register: registerCapabilityCli as (program: Command) => void,

View File

@@ -36,6 +36,7 @@ import { extractGeminiResponse } from "./output-extract.js";
import { getMediaUnderstandingProvider, normalizeMediaProviderId } from "./provider-registry.js";
import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js";
import type {
MediaUnderstandingAttemptOutcome,
MediaUnderstandingCapability,
MediaUnderstandingDecision,
MediaUnderstandingModelDecision,
@@ -444,21 +445,54 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str
const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined;
const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined;
const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
const reason = attachments
.flatMap((entry) => {
const attempts = Array.isArray(entry?.attempts) ? entry.attempts : [];
return attempts
.map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined))
.filter((value): value is string => Boolean(value));
})
.find((value) => value.trim().length > 0);
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
const reason = findDecisionReason(
decision,
decision.outcome === "failed" ? "failed" : undefined,
);
const shortReason = summarizeDecisionReason(reason);
const countLabel = total > 0 ? ` (${success}/${total})` : "";
const viaLabel = modelLabel ? ` via ${modelLabel}` : "";
const reasonLabel = shortReason ? ` reason=${shortReason}` : "";
return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
}
/**
 * Finds the first non-empty attempt reason recorded on a decision, scanning
 * attachments (and their attempts) in order.
 *
 * @param decision - The media-understanding decision to inspect.
 * @param outcome - When given, only attempts with this outcome are considered
 *   (e.g. "failed" to prefer failure reasons over skip reasons).
 * @returns The first matching non-blank reason string, or undefined.
 */
export function findDecisionReason(
  decision: MediaUnderstandingDecision,
  outcome?: MediaUnderstandingAttemptOutcome,
): string | undefined {
  // Guard malformed decisions: tolerate non-array attachments/attempts.
  const attachments = Array.isArray(decision.attachments) ? decision.attachments : [];
  return attachments
    .flatMap((attachment) => (Array.isArray(attachment?.attempts) ? attachment.attempts : []))
    .filter((attempt) => !outcome || attempt.outcome === outcome)
    .map((attempt) => attempt.reason)
    .find((reason): reason is string => typeof reason === "string" && reason.trim().length > 0);
}
/**
 * Trims a reason string and strips a leading "Error: " prefix (case-insensitive).
 *
 * @param reason - Raw reason text, possibly undefined or blank.
 * @returns The cleaned reason, or undefined when nothing meaningful remains.
 */
export function normalizeDecisionReason(reason?: string): string | undefined {
  if (typeof reason !== "string") {
    return undefined;
  }
  const trimmed = reason.trim();
  if (trimmed.length === 0) {
    return undefined;
  }
  // Drop the "Error: " prefix that Error#toString prepends, then re-trim.
  const withoutPrefix = trimmed.replace(/^Error:\s*/i, "").trim();
  return withoutPrefix.length > 0 ? withoutPrefix : undefined;
}

/**
 * Produces a short one-phrase summary of a reason: normalizes it, then keeps
 * only the text before the first ":" separator.
 *
 * @param reason - Raw reason text, possibly undefined or blank.
 * @returns The summarized reason, or undefined when nothing meaningful remains.
 */
export function summarizeDecisionReason(reason?: string): string | undefined {
  const normalized = normalizeDecisionReason(reason);
  const head = normalized?.split(":")[0]?.trim();
  return head || undefined;
}
function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void {
if (params.size >= MIN_AUDIO_FILE_BYTES) {
return;

View File

@@ -182,4 +182,30 @@ describe("runCapability skips tiny audio files", () => {
},
});
});
// When the only transcription attempt throws, the capability run should
// produce no outputs and a decision with outcome "failed" whose attempt
// reason carries the provider error text.
it("marks the decision as failed when every audio model attempt fails", async () => {
await withAudioFixture({
filePrefix: "openclaw-failed-audio",
extension: "ogg",
mediaType: "audio/ogg",
// Just above the minimum size so the tiny-file skip path is not taken.
fileContents: Buffer.alloc(MIN_AUDIO_FILE_BYTES + 100),
run: async ({ ctx, media, cache }) => {
const result = await runAudioCapabilityWithTranscriber({
ctx,
media,
cache,
// Simulate a hard provider failure on every attempt.
transcribeAudio: async () => {
throw new Error("upstream 500");
},
});
expect(result.outputs).toHaveLength(0);
expect(result.decision.outcome).toBe("failed");
expect(result.decision.attachments).toHaveLength(1);
expect(result.decision.attachments[0]?.attempts).toHaveLength(1);
expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("failed");
expect(result.decision.attachments[0]?.attempts[0]?.reason).toContain("upstream 500");
},
});
});
});

View File

@@ -20,6 +20,7 @@ import type {
MediaUnderstandingModelConfig,
} from "../config/types.tools.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { logWarn } from "../logger.js";
import { resolveChannelInboundAttachmentRoots } from "../media/channel-inbound-roots.js";
import { mergeInboundPathRoots } from "../media/inbound-path-policy.js";
import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
@@ -725,6 +726,12 @@ async function runAttachmentEntries(params: {
return { output: null, attempts };
}
/**
 * Reports whether any attachment recorded at least one attempt with a
 * "failed" outcome.
 *
 * @param attachments - Per-attachment attempt records from a decision.
 * @returns True as soon as a failed attempt is found, false otherwise.
 */
function hasFailedMediaAttempt(attachments: MediaUnderstandingDecision["attachments"]): boolean {
  for (const attachment of attachments) {
    for (const attempt of attachment.attempts) {
      if (attempt.outcome === "failed") {
        return true;
      }
    }
  }
  return false;
}
export async function runCapability(params: {
capability: MediaUnderstandingCapability;
cfg: OpenClawConfig;
@@ -861,10 +868,17 @@ export async function runCapability(params: {
}
const decision: MediaUnderstandingDecision = {
capability,
outcome: outputs.length > 0 ? "success" : "skipped",
outcome:
outputs.length > 0
? "success"
: hasFailedMediaAttempt(attachmentDecisions)
? "failed"
: "skipped",
attachments: attachmentDecisions,
};
if (shouldLogVerbose()) {
if (decision.outcome === "failed") {
logWarn(`media-understanding: ${formatDecisionSummary(decision)}`);
} else if (shouldLogVerbose()) {
logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
}
return {

View File

@@ -101,4 +101,43 @@ describe("media-understanding runtime", () => {
expect(mocks.runCapability).toHaveBeenCalledTimes(1);
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
});
// When runCapability yields no outputs and a "failed" decision, the runtime
// wrapper should reject with the underlying provider reason (minus the
// "Error: " prefix) and still run temp-file cleanup.
it("surfaces the underlying provider failure when media understanding fails", async () => {
mocks.normalizeMediaAttachments.mockReturnValue([
{ index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },
]);
mocks.runCapability.mockResolvedValue({
outputs: [],
decision: {
capability: "audio",
outcome: "failed",
attachments: [
{
attachmentIndex: 0,
attempts: [
{
type: "provider",
provider: "openai",
model: "gpt-4o-mini-transcribe",
outcome: "failed",
reason: "Error: Audio transcription response missing text",
},
],
},
],
},
});
await expect(
runMediaUnderstandingFile({
capability: "audio",
filePath: "/tmp/sample.ogg",
mime: "audio/ogg",
cfg: {} as OpenClawConfig,
agentDir: "/tmp/agent",
}),
).rejects.toThrow("Audio transcription response missing text");
// Cleanup must run even on the failure path.
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
});
});

View File

@@ -1,6 +1,7 @@
import fs from "node:fs/promises";
import path from "node:path";
import { normalizeMediaProviderId } from "./provider-registry.js";
import { findDecisionReason, normalizeDecisionReason } from "./runner.entries.js";
import {
buildProviderRegistry,
createMediaAttachmentCache,
@@ -33,6 +34,12 @@ const KIND_BY_CAPABILITY: Record<MediaUnderstandingCapability, MediaUnderstandin
video: "video.description",
};
/**
 * Extracts the first failed-attempt reason from a capability decision and
 * normalizes it (trims whitespace, strips a leading "Error: " prefix).
 *
 * @param decision - The decision returned by runCapability.
 * @returns A cleaned failure reason, or undefined when none was recorded.
 */
function resolveDecisionFailureReason(
  decision: Awaited<ReturnType<typeof runCapability>>["decision"],
): string | undefined {
  const failedReason = findDecisionReason(decision, "failed");
  return normalizeDecisionReason(failedReason);
}
function buildFileContext(params: { filePath: string; mime?: string }) {
return {
MediaPath: params.filePath,
@@ -75,6 +82,12 @@ export async function runMediaUnderstandingFile(
config,
activeModel: params.activeModel,
});
if (result.outputs.length === 0 && result.decision.outcome === "failed") {
throw new Error(
resolveDecisionFailureReason(result.decision) ??
`${params.capability} understanding failed`,
);
}
const output = result.outputs.find(
(entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
);

View File

@@ -32,6 +32,7 @@ export type MediaUnderstandingOutput = {
export type MediaUnderstandingDecisionOutcome =
| "success"
| "failed"
| "skipped"
| "disabled"
| "no-attachment"