fix(media): surface OpenAI audio transcription failures (#65096)

* fix(media): surface audio transcription provider failures

* fix(media): prefer failed reasons in surfaced errors

* fix(media): import attempt outcome type

* fix(media): guard malformed decision arrays

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
Daniel Alkurdi
2026-04-13 02:05:18 +10:00
committed by GitHub
parent d46f684898
commit 1f0431cd11
10 changed files with 201 additions and 15 deletions

View File

@@ -26,7 +26,9 @@ Docs: https://docs.openclaw.ai
- Gateway/plugins: always send a non-empty `idempotencyKey` for plugin subagent runs, so dreaming narrative jobs stop failing gateway schema validation. (#65354) Thanks @CodeForgeNet and @vincentkoc.
- Cron/isolated sessions: persist the right transcript path for each isolated run, including fresh session rollovers, so cron runs stop appending to stale session files. Thanks @samrusani and @vincentkoc.
- Dreaming/cron: wake managed dreaming jobs immediately instead of waiting for the next heartbeat, so scheduled dreaming runs start when the cron fires. (#65053) Thanks @l0cka and @vincentkoc.
- QA/packaging: stop packaged QA helpers from crashing when optional scenario execution config is unavailable, so npm distributions can skip the repo-only scenario pack without breaking completion-cache and startup paths. (#65118) Thanks @EdderTalmor and @vincentkoc.
- Media/audio transcription: surface the real provider failure when every audio transcription attempt fails, so status output and the CLI stop collapsing those errors into generic skips. (#65096) Thanks @l0cka and @vincentkoc.
## 2026.4.11

View File

@@ -739,6 +739,43 @@ describe("buildStatusMessage", () => {
expect(normalized).toContain("Media: image ok (openai/gpt-5.4) · audio skipped (maxBytes)");
});
// A decision with outcome "failed" should render as `audio failed (<reason>)`,
// where the reason comes from the failed attempt only — the earlier skipped
// attempt's "empty output" must not leak into the status line.
it("includes failed media understanding decisions with the surfaced reason", () => {
const text = buildStatusMessage({
agent: { model: "anthropic/claude-opus-4-6" },
sessionEntry: { sessionId: "media-failed", updatedAt: 0 },
sessionKey: "agent:main:main",
queue: { mode: "none" },
mediaDecisions: [
{
capability: "audio",
outcome: "failed",
attachments: [
{
attachmentIndex: 0,
// Two attempts: a skip followed by a failure; only the failed
// attempt's reason should be surfaced (and its "Error: " prefix stripped).
attempts: [
{
type: "provider",
outcome: "skipped",
reason: "empty output",
},
{
type: "provider",
outcome: "failed",
reason: "Error: Audio transcription response missing text",
},
],
},
],
},
],
});
expect(normalizeTestText(text)).toContain(
"Media: audio failed (Audio transcription response missing text)",
);
// The skipped attempt's reason must not appear anywhere in the output.
expect(normalizeTestText(text)).not.toContain("empty output");
});
it("omits media line when all decisions are none", () => {
const text = buildStatusMessage({
agent: { model: "anthropic/claude-opus-4-6" },

View File

@@ -26,6 +26,7 @@ import type { OpenClawConfig } from "../config/types.openclaw.js";
import { readLatestSessionUsageFromTranscript } from "../gateway/session-utils.fs.js";
import { formatTimeAgo } from "../infra/format-time/format-relative.ts";
import { resolveCommitHash } from "../infra/git-commit.js";
import { findDecisionReason, summarizeDecisionReason } from "../media-understanding/runner.entries.js";
import type { MediaUnderstandingDecision } from "../media-understanding/types.js";
import { resolveAgentIdFromSessionKey } from "../routing/session-key.js";
import {
@@ -375,12 +376,15 @@ const formatMediaUnderstandingLine = (decisions?: ReadonlyArray<MediaUnderstandi
return `${decision.capability} denied`;
}
if (decision.outcome === "skipped") {
const reason = decision.attachments
.flatMap((entry) => entry.attempts.map((attempt) => attempt.reason).filter(Boolean))
.find(Boolean);
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
const reason = findDecisionReason(decision);
const shortReason = summarizeDecisionReason(reason);
return `${decision.capability} skipped${shortReason ? ` (${shortReason})` : ""}`;
}
if (decision.outcome === "failed") {
const reason = findDecisionReason(decision, "failed");
const shortReason = summarizeDecisionReason(reason);
return `${decision.capability} failed${shortReason ? ` (${shortReason})` : ""}`;
}
return null;
})
.filter((part): part is string => part != null);

View File

@@ -550,6 +550,22 @@ describe("capability cli", () => {
);
});
// When the transcriber rejects, the CLI should exit non-zero and print the
// underlying provider error instead of a generic failure message.
it("surfaces the underlying transcription failure for audio transcribe", async () => {
mocks.transcribeAudioFile.mockRejectedValueOnce(
new Error("Audio transcription response missing text"),
);
await expect(
runRegisteredCli({
register: registerCapabilityCli as (program: Command) => void,
argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"],
}),
).rejects.toThrow("exit 1");
// The real error text must reach the user via the runtime error channel.
expect(mocks.runtime.error).toHaveBeenCalledWith(
expect.stringMatching(/Audio transcription response missing text/),
);
});
it("forwards transcription prompt and language hints", async () => {
await runRegisteredCli({
register: registerCapabilityCli as (program: Command) => void,

View File

@@ -36,6 +36,7 @@ import { extractGeminiResponse } from "./output-extract.js";
import { getMediaUnderstandingProvider, normalizeMediaProviderId } from "./provider-registry.js";
import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js";
import type {
MediaUnderstandingAttemptOutcome,
MediaUnderstandingCapability,
MediaUnderstandingDecision,
MediaUnderstandingModelDecision,
@@ -444,21 +445,54 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str
const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined;
const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined;
const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
const reason = attachments
.flatMap((entry) => {
const attempts = Array.isArray(entry?.attempts) ? entry.attempts : [];
return attempts
.map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined))
.filter((value): value is string => Boolean(value));
})
.find((value) => value.trim().length > 0);
const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
const reason = findDecisionReason(
decision,
decision.outcome === "failed" ? "failed" : undefined,
);
const shortReason = summarizeDecisionReason(reason);
const countLabel = total > 0 ? ` (${success}/${total})` : "";
const viaLabel = modelLabel ? ` via ${modelLabel}` : "";
const reasonLabel = shortReason ? ` reason=${shortReason}` : "";
return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
}
/**
 * Finds the first non-empty attempt reason recorded on a decision, scanning
 * attachments (and their attempts) in order.
 *
 * @param decision - The media-understanding decision to inspect.
 * @param outcome - When given, only attempts with this outcome are considered
 *   (e.g. "failed" to prefer failure reasons over skip reasons).
 * @returns The first matching non-blank reason string, or undefined.
 */
export function findDecisionReason(
  decision: MediaUnderstandingDecision,
  outcome?: MediaUnderstandingAttemptOutcome,
): string | undefined {
  // Guard malformed decisions: tolerate non-array attachments/attempts.
  const attachments = Array.isArray(decision.attachments) ? decision.attachments : [];
  return attachments
    .flatMap((attachment) => (Array.isArray(attachment?.attempts) ? attachment.attempts : []))
    .filter((attempt) => !outcome || attempt.outcome === outcome)
    .map((attempt) => attempt.reason)
    .find((reason): reason is string => typeof reason === "string" && reason.trim().length > 0);
}
/**
 * Trims a reason string and strips a leading "Error: " prefix (case-insensitive).
 *
 * @param reason - Raw reason text, possibly undefined or blank.
 * @returns The cleaned reason, or undefined when nothing meaningful remains.
 */
export function normalizeDecisionReason(reason?: string): string | undefined {
  if (typeof reason !== "string") {
    return undefined;
  }
  const trimmed = reason.trim();
  if (trimmed.length === 0) {
    return undefined;
  }
  // Drop the "Error: " prefix that Error#toString prepends, then re-trim.
  const withoutPrefix = trimmed.replace(/^Error:\s*/i, "").trim();
  return withoutPrefix.length > 0 ? withoutPrefix : undefined;
}

/**
 * Produces a short one-phrase summary of a reason: normalizes it, then keeps
 * only the text before the first ":" separator.
 *
 * @param reason - Raw reason text, possibly undefined or blank.
 * @returns The summarized reason, or undefined when nothing meaningful remains.
 */
export function summarizeDecisionReason(reason?: string): string | undefined {
  const normalized = normalizeDecisionReason(reason);
  const head = normalized?.split(":")[0]?.trim();
  return head || undefined;
}
function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void {
if (params.size >= MIN_AUDIO_FILE_BYTES) {
return;

View File

@@ -182,4 +182,30 @@ describe("runCapability skips tiny audio files", () => {
},
});
});
// When the only transcription attempt throws, the capability run should
// produce no outputs and a decision with outcome "failed" whose attempt
// reason carries the provider error text.
it("marks the decision as failed when every audio model attempt fails", async () => {
await withAudioFixture({
filePrefix: "openclaw-failed-audio",
extension: "ogg",
mediaType: "audio/ogg",
// Just above the minimum size so the tiny-file skip path is not taken.
fileContents: Buffer.alloc(MIN_AUDIO_FILE_BYTES + 100),
run: async ({ ctx, media, cache }) => {
const result = await runAudioCapabilityWithTranscriber({
ctx,
media,
cache,
// Simulate a hard provider failure on every attempt.
transcribeAudio: async () => {
throw new Error("upstream 500");
},
});
expect(result.outputs).toHaveLength(0);
expect(result.decision.outcome).toBe("failed");
expect(result.decision.attachments).toHaveLength(1);
expect(result.decision.attachments[0]?.attempts).toHaveLength(1);
expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("failed");
expect(result.decision.attachments[0]?.attempts[0]?.reason).toContain("upstream 500");
},
});
});
});

View File

@@ -20,6 +20,7 @@ import type {
MediaUnderstandingModelConfig,
} from "../config/types.tools.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { logWarn } from "../logger.js";
import { resolveChannelInboundAttachmentRoots } from "../media/channel-inbound-roots.js";
import { mergeInboundPathRoots } from "../media/inbound-path-policy.js";
import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
@@ -725,6 +726,12 @@ async function runAttachmentEntries(params: {
return { output: null, attempts };
}
/**
 * Reports whether any attachment recorded at least one attempt with a
 * "failed" outcome.
 *
 * @param attachments - Per-attachment attempt records from a decision.
 * @returns True as soon as a failed attempt is found, false otherwise.
 */
function hasFailedMediaAttempt(attachments: MediaUnderstandingDecision["attachments"]): boolean {
  for (const attachment of attachments) {
    for (const attempt of attachment.attempts) {
      if (attempt.outcome === "failed") {
        return true;
      }
    }
  }
  return false;
}
export async function runCapability(params: {
capability: MediaUnderstandingCapability;
cfg: OpenClawConfig;
@@ -861,10 +868,17 @@ export async function runCapability(params: {
}
const decision: MediaUnderstandingDecision = {
capability,
outcome: outputs.length > 0 ? "success" : "skipped",
outcome:
outputs.length > 0
? "success"
: hasFailedMediaAttempt(attachmentDecisions)
? "failed"
: "skipped",
attachments: attachmentDecisions,
};
if (shouldLogVerbose()) {
if (decision.outcome === "failed") {
logWarn(`media-understanding: ${formatDecisionSummary(decision)}`);
} else if (shouldLogVerbose()) {
logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
}
return {

View File

@@ -101,4 +101,43 @@ describe("media-understanding runtime", () => {
expect(mocks.runCapability).toHaveBeenCalledTimes(1);
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
});
// When runCapability yields no outputs and a "failed" decision, the runtime
// wrapper should reject with the underlying provider reason (minus the
// "Error: " prefix) and still run temp-file cleanup.
it("surfaces the underlying provider failure when media understanding fails", async () => {
mocks.normalizeMediaAttachments.mockReturnValue([
{ index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },
]);
mocks.runCapability.mockResolvedValue({
outputs: [],
decision: {
capability: "audio",
outcome: "failed",
attachments: [
{
attachmentIndex: 0,
attempts: [
{
type: "provider",
provider: "openai",
model: "gpt-4o-mini-transcribe",
outcome: "failed",
reason: "Error: Audio transcription response missing text",
},
],
},
],
},
});
await expect(
runMediaUnderstandingFile({
capability: "audio",
filePath: "/tmp/sample.ogg",
mime: "audio/ogg",
cfg: {} as OpenClawConfig,
agentDir: "/tmp/agent",
}),
).rejects.toThrow("Audio transcription response missing text");
// Cleanup must run even on the failure path.
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
});
});

View File

@@ -1,6 +1,7 @@
import fs from "node:fs/promises";
import path from "node:path";
import { normalizeMediaProviderId } from "./provider-registry.js";
import { findDecisionReason, normalizeDecisionReason } from "./runner.entries.js";
import {
buildProviderRegistry,
createMediaAttachmentCache,
@@ -33,6 +34,12 @@ const KIND_BY_CAPABILITY: Record<MediaUnderstandingCapability, MediaUnderstandin
video: "video.description",
};
/**
 * Extracts the first failed-attempt reason from a capability decision and
 * normalizes it (trims whitespace, strips a leading "Error: " prefix).
 *
 * @param decision - The decision returned by runCapability.
 * @returns A cleaned failure reason, or undefined when none was recorded.
 */
function resolveDecisionFailureReason(
  decision: Awaited<ReturnType<typeof runCapability>>["decision"],
): string | undefined {
  const failedReason = findDecisionReason(decision, "failed");
  return normalizeDecisionReason(failedReason);
}
function buildFileContext(params: { filePath: string; mime?: string }) {
return {
MediaPath: params.filePath,
@@ -75,6 +82,12 @@ export async function runMediaUnderstandingFile(
config,
activeModel: params.activeModel,
});
if (result.outputs.length === 0 && result.decision.outcome === "failed") {
throw new Error(
resolveDecisionFailureReason(result.decision) ??
`${params.capability} understanding failed`,
);
}
const output = result.outputs.find(
(entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
);

View File

@@ -32,6 +32,7 @@ export type MediaUnderstandingOutput = {
export type MediaUnderstandingDecisionOutcome =
| "success"
| "failed"
| "skipped"
| "disabled"
| "no-attachment"