mirror of
https://fastgit.cc/github.com/openclaw/openclaw
synced 2026-05-01 06:36:23 +08:00
fix(media): surface OpenAI audio transcription failures (#65096)
* fix(media): surface audio transcription provider failures
* fix(media): prefer failed reasons in surfaced errors
* fix(media): import attempt outcome type
* fix(media): guard malformed decision arrays

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
@@ -26,7 +26,9 @@ Docs: https://docs.openclaw.ai
|
||||
- Gateway/plugins: always send a non-empty `idempotencyKey` for plugin subagent runs, so dreaming narrative jobs stop failing gateway schema validation. (#65354) Thanks @CodeForgeNet and @vincentkoc.
|
||||
- Cron/isolated sessions: persist the right transcript path for each isolated run, including fresh session rollovers, so cron runs stop appending to stale session files. Thanks @samrusani and @vincentkoc.
|
||||
- Dreaming/cron: wake managed dreaming jobs immediately instead of waiting for the next heartbeat, so scheduled dreaming runs start when the cron fires. (#65053) Thanks @l0cka and @vincentkoc.
|
||||
- QA/packaging: stop packaged QA helpers from crashing when optional scenario execution config is unavailable, so npm distributions can skip the repo-only scenario pack without breaking completion-cache and startup paths. (#65118) Thanks @EdderTalmor and @vincentkoc.
|
||||
- Media/audio transcription: surface the real provider failure when every audio transcription attempt fails, so status output and the CLI stop collapsing those errors into generic skips. (#65096) Thanks @l0cka and @vincentkoc.
|
||||
|
||||
## 2026.4.11
|
||||
|
||||
|
||||
@@ -739,6 +739,43 @@ describe("buildStatusMessage", () => {
|
||||
expect(normalized).toContain("Media: image ok (openai/gpt-5.4) · audio skipped (maxBytes)");
|
||||
});
|
||||
|
||||
it("includes failed media understanding decisions with the surfaced reason", () => {
|
||||
const text = buildStatusMessage({
|
||||
agent: { model: "anthropic/claude-opus-4-6" },
|
||||
sessionEntry: { sessionId: "media-failed", updatedAt: 0 },
|
||||
sessionKey: "agent:main:main",
|
||||
queue: { mode: "none" },
|
||||
mediaDecisions: [
|
||||
{
|
||||
capability: "audio",
|
||||
outcome: "failed",
|
||||
attachments: [
|
||||
{
|
||||
attachmentIndex: 0,
|
||||
attempts: [
|
||||
{
|
||||
type: "provider",
|
||||
outcome: "skipped",
|
||||
reason: "empty output",
|
||||
},
|
||||
{
|
||||
type: "provider",
|
||||
outcome: "failed",
|
||||
reason: "Error: Audio transcription response missing text",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
expect(normalizeTestText(text)).toContain(
|
||||
"Media: audio failed (Audio transcription response missing text)",
|
||||
);
|
||||
expect(normalizeTestText(text)).not.toContain("empty output");
|
||||
});
|
||||
|
||||
it("omits media line when all decisions are none", () => {
|
||||
const text = buildStatusMessage({
|
||||
agent: { model: "anthropic/claude-opus-4-6" },
|
||||
|
||||
@@ -26,6 +26,7 @@ import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import { readLatestSessionUsageFromTranscript } from "../gateway/session-utils.fs.js";
|
||||
import { formatTimeAgo } from "../infra/format-time/format-relative.ts";
|
||||
import { resolveCommitHash } from "../infra/git-commit.js";
|
||||
import { findDecisionReason, summarizeDecisionReason } from "../media-understanding/runner.entries.js";
|
||||
import type { MediaUnderstandingDecision } from "../media-understanding/types.js";
|
||||
import { resolveAgentIdFromSessionKey } from "../routing/session-key.js";
|
||||
import {
|
||||
@@ -375,12 +376,15 @@ const formatMediaUnderstandingLine = (decisions?: ReadonlyArray<MediaUnderstandi
|
||||
return `${decision.capability} denied`;
|
||||
}
|
||||
if (decision.outcome === "skipped") {
|
||||
const reason = decision.attachments
|
||||
.flatMap((entry) => entry.attempts.map((attempt) => attempt.reason).filter(Boolean))
|
||||
.find(Boolean);
|
||||
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
|
||||
const reason = findDecisionReason(decision);
|
||||
const shortReason = summarizeDecisionReason(reason);
|
||||
return `${decision.capability} skipped${shortReason ? ` (${shortReason})` : ""}`;
|
||||
}
|
||||
if (decision.outcome === "failed") {
|
||||
const reason = findDecisionReason(decision, "failed");
|
||||
const shortReason = summarizeDecisionReason(reason);
|
||||
return `${decision.capability} failed${shortReason ? ` (${shortReason})` : ""}`;
|
||||
}
|
||||
return null;
|
||||
})
|
||||
.filter((part): part is string => part != null);
|
||||
|
||||
@@ -550,6 +550,22 @@ describe("capability cli", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("surfaces the underlying transcription failure for audio transcribe", async () => {
|
||||
mocks.transcribeAudioFile.mockRejectedValueOnce(
|
||||
new Error("Audio transcription response missing text"),
|
||||
);
|
||||
|
||||
await expect(
|
||||
runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"],
|
||||
}),
|
||||
).rejects.toThrow("exit 1");
|
||||
expect(mocks.runtime.error).toHaveBeenCalledWith(
|
||||
expect.stringMatching(/Audio transcription response missing text/),
|
||||
);
|
||||
});
|
||||
|
||||
it("forwards transcription prompt and language hints", async () => {
|
||||
await runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
|
||||
@@ -36,6 +36,7 @@ import { extractGeminiResponse } from "./output-extract.js";
|
||||
import { getMediaUnderstandingProvider, normalizeMediaProviderId } from "./provider-registry.js";
|
||||
import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js";
|
||||
import type {
|
||||
MediaUnderstandingAttemptOutcome,
|
||||
MediaUnderstandingCapability,
|
||||
MediaUnderstandingDecision,
|
||||
MediaUnderstandingModelDecision,
|
||||
@@ -444,21 +445,54 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str
|
||||
const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined;
|
||||
const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined;
|
||||
const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
|
||||
const reason = attachments
|
||||
.flatMap((entry) => {
|
||||
const attempts = Array.isArray(entry?.attempts) ? entry.attempts : [];
|
||||
return attempts
|
||||
.map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined))
|
||||
.filter((value): value is string => Boolean(value));
|
||||
})
|
||||
.find((value) => value.trim().length > 0);
|
||||
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
|
||||
const reason = findDecisionReason(
|
||||
decision,
|
||||
decision.outcome === "failed" ? "failed" : undefined,
|
||||
);
|
||||
const shortReason = summarizeDecisionReason(reason);
|
||||
const countLabel = total > 0 ? ` (${success}/${total})` : "";
|
||||
const viaLabel = modelLabel ? ` via ${modelLabel}` : "";
|
||||
const reasonLabel = shortReason ? ` reason=${shortReason}` : "";
|
||||
return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
|
||||
}
|
||||
|
||||
/**
 * Returns the first non-blank attempt reason recorded on a decision.
 *
 * Attachments are scanned in order, and each attachment's attempts are
 * scanned in order; the first attempt that passes the filters wins.
 *
 * The decision may come from persisted or external data, so every level is
 * treated defensively: a non-array `attachments`/`attempts` value is read as
 * empty, and null/undefined attachment or attempt entries are skipped instead
 * of throwing.
 *
 * @param decision - Decision whose attachments and attempts are searched.
 * @param outcome - When provided, only attempts whose `outcome` equals this
 *   value are considered; otherwise any attempt with a reason qualifies.
 * @returns The matching reason exactly as recorded (not trimmed or
 *   normalized), or `undefined` when no attempt carries a usable reason.
 */
export function findDecisionReason(
  decision: MediaUnderstandingDecision,
  outcome?: MediaUnderstandingAttemptOutcome,
): string | undefined {
  const attachments = Array.isArray(decision.attachments) ? decision.attachments : [];
  for (const attachment of attachments) {
    const attempts = Array.isArray(attachment?.attempts) ? attachment.attempts : [];
    for (const attempt of attempts) {
      // Optional chaining keeps a malformed (null/undefined) attempt entry
      // from crashing the lookup; such an entry simply never matches.
      if (outcome && attempt?.outcome !== outcome) {
        continue;
      }
      const reason = attempt?.reason;
      // Whitespace-only reasons carry no information, so keep looking.
      if (typeof reason !== "string" || reason.trim().length === 0) {
        continue;
      }
      return reason;
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * Cleans up a raw attempt reason for display.
 *
 * Surrounding whitespace is removed and any leading `Error:` prefixes
 * (case-insensitive) are dropped. The whole run of prefixes is removed, so a
 * re-wrapped error such as `"Error: Error: upstream 500"` normalizes to
 * `"upstream 500"` rather than to a string that still starts with `Error:`.
 * Other prefixes (for example `TypeError:`) are left untouched.
 *
 * @param reason - Raw reason text; may be missing.
 * @returns The cleaned reason, or `undefined` when the input is missing,
 *   blank, or consists only of `Error:` prefixes.
 */
export function normalizeDecisionReason(reason?: string): string | undefined {
  if (typeof reason !== "string") {
    return undefined;
  }
  const normalized = reason
    .trim()
    .replace(/^(?:Error:\s*)+/i, "")
    .trim();
  return normalized.length > 0 ? normalized : undefined;
}
|
||||
|
||||
/**
 * Produces the short label shown next to a decision outcome.
 *
 * The reason is first passed through `normalizeDecisionReason`; the label is
 * then the text before the first `:` of the result, trimmed.
 *
 * @param reason - Raw reason text; may be missing.
 * @returns The short label, or `undefined` when normalization yields nothing
 *   or the text before the first colon is blank.
 */
export function summarizeDecisionReason(reason?: string): string | undefined {
  const cleaned = normalizeDecisionReason(reason);
  if (cleaned === undefined) {
    return undefined;
  }
  // Keep only the head segment: everything up to (not including) the first colon.
  const colonAt = cleaned.indexOf(":");
  const head = colonAt === -1 ? cleaned : cleaned.slice(0, colonAt);
  const label = head.trim();
  return label.length > 0 ? label : undefined;
}
|
||||
|
||||
function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void {
|
||||
if (params.size >= MIN_AUDIO_FILE_BYTES) {
|
||||
return;
|
||||
|
||||
@@ -182,4 +182,30 @@ describe("runCapability skips tiny audio files", () => {
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("marks the decision as failed when every audio model attempt fails", async () => {
|
||||
await withAudioFixture({
|
||||
filePrefix: "openclaw-failed-audio",
|
||||
extension: "ogg",
|
||||
mediaType: "audio/ogg",
|
||||
fileContents: Buffer.alloc(MIN_AUDIO_FILE_BYTES + 100),
|
||||
run: async ({ ctx, media, cache }) => {
|
||||
const result = await runAudioCapabilityWithTranscriber({
|
||||
ctx,
|
||||
media,
|
||||
cache,
|
||||
transcribeAudio: async () => {
|
||||
throw new Error("upstream 500");
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.outputs).toHaveLength(0);
|
||||
expect(result.decision.outcome).toBe("failed");
|
||||
expect(result.decision.attachments).toHaveLength(1);
|
||||
expect(result.decision.attachments[0]?.attempts).toHaveLength(1);
|
||||
expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("failed");
|
||||
expect(result.decision.attachments[0]?.attempts[0]?.reason).toContain("upstream 500");
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -20,6 +20,7 @@ import type {
|
||||
MediaUnderstandingModelConfig,
|
||||
} from "../config/types.tools.js";
|
||||
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||
import { logWarn } from "../logger.js";
|
||||
import { resolveChannelInboundAttachmentRoots } from "../media/channel-inbound-roots.js";
|
||||
import { mergeInboundPathRoots } from "../media/inbound-path-policy.js";
|
||||
import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
|
||||
@@ -725,6 +726,12 @@ async function runAttachmentEntries(params: {
|
||||
return { output: null, attempts };
|
||||
}
|
||||
|
||||
/**
 * Reports whether any attempt on any attachment ended with outcome `"failed"`.
 *
 * @param attachments - Per-attachment decisions, each carrying its attempts.
 * @returns `true` as soon as one failed attempt is found, otherwise `false`.
 */
function hasFailedMediaAttempt(attachments: MediaUnderstandingDecision["attachments"]): boolean {
  return attachments.some(({ attempts }) => {
    return attempts.some(({ outcome }) => outcome === "failed");
  });
}
|
||||
|
||||
export async function runCapability(params: {
|
||||
capability: MediaUnderstandingCapability;
|
||||
cfg: OpenClawConfig;
|
||||
@@ -861,10 +868,17 @@ export async function runCapability(params: {
|
||||
}
|
||||
const decision: MediaUnderstandingDecision = {
|
||||
capability,
|
||||
outcome: outputs.length > 0 ? "success" : "skipped",
|
||||
outcome:
|
||||
outputs.length > 0
|
||||
? "success"
|
||||
: hasFailedMediaAttempt(attachmentDecisions)
|
||||
? "failed"
|
||||
: "skipped",
|
||||
attachments: attachmentDecisions,
|
||||
};
|
||||
if (shouldLogVerbose()) {
|
||||
if (decision.outcome === "failed") {
|
||||
logWarn(`media-understanding: ${formatDecisionSummary(decision)}`);
|
||||
} else if (shouldLogVerbose()) {
|
||||
logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
|
||||
}
|
||||
return {
|
||||
|
||||
@@ -101,4 +101,43 @@ describe("media-understanding runtime", () => {
|
||||
expect(mocks.runCapability).toHaveBeenCalledTimes(1);
|
||||
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("surfaces the underlying provider failure when media understanding fails", async () => {
|
||||
mocks.normalizeMediaAttachments.mockReturnValue([
|
||||
{ index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },
|
||||
]);
|
||||
mocks.runCapability.mockResolvedValue({
|
||||
outputs: [],
|
||||
decision: {
|
||||
capability: "audio",
|
||||
outcome: "failed",
|
||||
attachments: [
|
||||
{
|
||||
attachmentIndex: 0,
|
||||
attempts: [
|
||||
{
|
||||
type: "provider",
|
||||
provider: "openai",
|
||||
model: "gpt-4o-mini-transcribe",
|
||||
outcome: "failed",
|
||||
reason: "Error: Audio transcription response missing text",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
await expect(
|
||||
runMediaUnderstandingFile({
|
||||
capability: "audio",
|
||||
filePath: "/tmp/sample.ogg",
|
||||
mime: "audio/ogg",
|
||||
cfg: {} as OpenClawConfig,
|
||||
agentDir: "/tmp/agent",
|
||||
}),
|
||||
).rejects.toThrow("Audio transcription response missing text");
|
||||
|
||||
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { normalizeMediaProviderId } from "./provider-registry.js";
|
||||
import { findDecisionReason, normalizeDecisionReason } from "./runner.entries.js";
|
||||
import {
|
||||
buildProviderRegistry,
|
||||
createMediaAttachmentCache,
|
||||
@@ -33,6 +34,12 @@ const KIND_BY_CAPABILITY: Record<MediaUnderstandingCapability, MediaUnderstandin
|
||||
video: "video.description",
|
||||
};
|
||||
|
||||
/**
 * Extracts a display-ready failure reason from a capability decision.
 *
 * Looks up the first reason recorded on a `"failed"` attempt via
 * `findDecisionReason` and passes it through `normalizeDecisionReason`.
 *
 * @param decision - The `decision` part of a `runCapability` result.
 * @returns The normalized failure reason, or `undefined` when none is found.
 */
function resolveDecisionFailureReason(
  decision: Awaited<ReturnType<typeof runCapability>>["decision"],
): string | undefined {
  const failedReason = findDecisionReason(decision, "failed");
  return normalizeDecisionReason(failedReason);
}
|
||||
|
||||
function buildFileContext(params: { filePath: string; mime?: string }) {
|
||||
return {
|
||||
MediaPath: params.filePath,
|
||||
@@ -75,6 +82,12 @@ export async function runMediaUnderstandingFile(
|
||||
config,
|
||||
activeModel: params.activeModel,
|
||||
});
|
||||
if (result.outputs.length === 0 && result.decision.outcome === "failed") {
|
||||
throw new Error(
|
||||
resolveDecisionFailureReason(result.decision) ??
|
||||
`${params.capability} understanding failed`,
|
||||
);
|
||||
}
|
||||
const output = result.outputs.find(
|
||||
(entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
|
||||
);
|
||||
|
||||
@@ -32,6 +32,7 @@ export type MediaUnderstandingOutput = {
|
||||
|
||||
export type MediaUnderstandingDecisionOutcome =
|
||||
| "success"
|
||||
| "failed"
|
||||
| "skipped"
|
||||
| "disabled"
|
||||
| "no-attachment"
|
||||
|
||||
Reference in New Issue
Block a user