diff --git a/.github/workflows/parity-gate.yml b/.github/workflows/parity-gate.yml new file mode 100644 index 00000000000..a96958eb106 --- /dev/null +++ b/.github/workflows/parity-gate.yml @@ -0,0 +1,93 @@ +name: Parity gate + +on: + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + paths: + - "extensions/qa-lab/**" + - "extensions/qa-channel/**" + - "extensions/openai/**" + - "qa/scenarios/**" + - "src/agents/**" + - "src/context-engine/**" + - "src/gateway/**" + - "src/media/**" + - ".github/workflows/parity-gate.yml" + +permissions: + contents: read + +concurrency: + group: parity-gate-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + +jobs: + parity-gate: + name: Run the GPT-5.4 / Opus 4.6 parity gate against the qa-lab mock + if: ${{ github.event.pull_request.draft != true }} + runs-on: blacksmith-8vcpu-ubuntu-2404 + timeout-minutes: 20 + env: + # Fence the gate off from any real provider credentials. The qa-lab + # mock server + auth staging (PR N) should be enough to produce a + # meaningful verdict without touching a real API. If any of these + # leak into the job env, fail hard instead of silently running + # against a live provider and burning real budget. 
+ OPENAI_API_KEY: "" + ANTHROPIC_API_KEY: "" + OPENCLAW_LIVE_OPENAI_KEY: "" + OPENCLAW_LIVE_ANTHROPIC_KEY: "" + OPENCLAW_LIVE_GEMINI_KEY: "" + OPENCLAW_LIVE_SETUP_TOKEN_VALUE: "" + steps: + - name: Checkout PR + uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: "22.14.0" + cache: "pnpm" + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Run GPT-5.4 lane + run: | + pnpm openclaw qa suite \ + --provider-mode mock-openai \ + --parity-pack agentic \ + --model openai/gpt-5.4 \ + --alt-model openai/gpt-5.4-alt \ + --output-dir .artifacts/qa-e2e/gpt54 + + - name: Run Opus 4.6 lane + run: | + pnpm openclaw qa suite \ + --provider-mode mock-openai \ + --parity-pack agentic \ + --model anthropic/claude-opus-4-6 \ + --alt-model anthropic/claude-sonnet-4-6 \ + --output-dir .artifacts/qa-e2e/opus46 + + - name: Generate parity report + run: | + pnpm openclaw qa parity-report \ + --repo-root . 
\ + --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \ + --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \ + --candidate-label openai/gpt-5.4 \ + --baseline-label anthropic/claude-opus-4-6 \ + --output-dir .artifacts/qa-e2e/parity + + - name: Upload parity artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: parity-gate-${{ github.event.pull_request.number || github.sha }} + path: .artifacts/qa-e2e/ + retention-days: 14 + if-no-files-found: warn diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts index a8291027ac4..8fcee784939 100644 --- a/extensions/qa-lab/src/agentic-parity-report.test.ts +++ b/extensions/qa-lab/src/agentic-parity-report.test.ts @@ -2,16 +2,42 @@ import { describe, expect, it } from "vitest"; import { buildQaAgenticParityComparison, computeQaAgenticParityMetrics, + QaParityLabelMismatchError, renderQaAgenticParityMarkdownReport, + type QaParityReportScenario, type QaParitySuiteSummary, } from "./agentic-parity-report.js"; +const FULL_PARITY_PASS_SCENARIOS: QaParityReportScenario[] = [ + { name: "Approval turn tool followthrough", status: "pass" as const }, + { name: "Compaction retry after mutating tool", status: "pass" as const }, + { name: "Model switch with tool continuity", status: "pass" as const }, + { name: "Source and docs discovery report", status: "pass" as const }, + { name: "Image understanding from attachment", status: "pass" as const }, + { name: "Subagent handoff", status: "pass" as const }, + { name: "Subagent fanout synthesis", status: "pass" as const }, + { name: "Memory recall after context switch", status: "pass" as const }, + { name: "Thread memory isolation", status: "pass" as const }, + { name: "Config restart capability flip", status: "pass" as const }, + { name: "Instruction followthrough repo contract", status: "pass" as const }, +]; + +function withScenarioOverride(name: string, override: Partial) 
{ + return FULL_PARITY_PASS_SCENARIOS.map((scenario) => + scenario.name === name ? { ...scenario, ...override } : scenario, + ); +} + describe("qa agentic parity report", () => { it("computes first-wave parity metrics from suite summaries", () => { const summary: QaParitySuiteSummary = { scenarios: [ - { name: "Scenario A", status: "pass" }, - { name: "Scenario B", status: "fail", details: "incomplete turn detected" }, + { name: "Approval turn tool followthrough", status: "pass" }, + { + name: "Compaction retry after mutating tool", + status: "fail", + details: "incomplete turn detected", + }, ], }; @@ -28,6 +54,23 @@ describe("qa agentic parity report", () => { }); }); + it("keeps non-tool scenarios out of the valid-tool-call metric", () => { + const summary: QaParitySuiteSummary = { + scenarios: [ + { name: "Approval turn tool followthrough", status: "pass" }, + { name: "Memory recall after context switch", status: "pass" }, + { name: "Image understanding from attachment", status: "pass" }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary)).toMatchObject({ + totalScenarios: 3, + passedScenarios: 3, + validToolCallCount: 1, + validToolCallRate: 1, + }); + }); + it("fails the parity gate when the candidate regresses against baseline", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.4", @@ -207,33 +250,70 @@ describe("qa agentic parity report", () => { ); }); + it("fails the parity gate when a required parity scenario fails on both sides", () => { + // Regression for the loop-7 Codex-connector P1 finding: without this + // check, a required parity scenario that fails on both candidate and + // baseline still produces pass=true because the downstream metric + // comparisons are purely relative (candidate vs baseline). Cover the + // whole parity pack as pass on both sides except the one scenario we + // deliberately fail on both sides, so the assertion can pin the + // isolated gate failure under test. 
+ const scenariosWithBothFail = withScenarioOverride("Approval turn tool followthrough", { + status: "fail", + }); + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { scenarios: scenariosWithBothFail }, + baselineSummary: { scenarios: scenariosWithBothFail }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(false); + expect(comparison.failures).toContain( + "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.4=fail, anthropic/claude-opus-4-6=fail.", + ); + // Metric comparisons are relative, so a same-on-both-sides failure + // must not appear as a relative metric failure. The required-scenario + // failure line is the only thing keeping the gate honest here. + expect(comparison.failures.some((failure) => failure.includes("completion rate"))).toBe(false); + }); + + it("fails the parity gate when a required parity scenario fails on the candidate only", () => { + // A candidate regression below a passing baseline is already caught + // by the relative completion-rate comparison, but surface it as a + // named required-scenario failure too so operators see a concrete + // scenario name alongside the rate differential. 
+ const candidateWithOneFail = withScenarioOverride("Approval turn tool followthrough", { + status: "fail", + }); + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { scenarios: candidateWithOneFail }, + baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(false); + expect(comparison.failures).toContain( + "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.4=fail, anthropic/claude-opus-4-6=pass.", + ); + }); + it("fails the parity gate when the baseline contains suspicious pass results", () => { - // Cover the full first-wave pack on both sides so the suspicious-pass assertion + // Cover the full second-wave pack on both sides so the suspicious-pass assertion // below is the isolated gate failure under test (no coverage-gap noise). const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.4", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { - scenarios: [ - { name: "Approval turn tool followthrough", status: "pass" }, - { name: "Compaction retry after mutating tool", status: "pass" }, - { name: "Model switch with tool continuity", status: "pass" }, - { name: "Source and docs discovery report", status: "pass" }, - { name: "Image understanding from attachment", status: "pass" }, - ], + scenarios: FULL_PARITY_PASS_SCENARIOS, }, baselineSummary: { - scenarios: [ - { - name: "Approval turn tool followthrough", - status: "pass", - details: "timed out before it continued", - }, - { name: "Compaction retry after mutating tool", status: "pass" }, - { name: "Model switch with tool continuity", status: "pass" }, - { name: "Source and docs discovery report", status: "pass" }, - { name: "Image understanding from attachment", status: "pass" }, - ], + scenarios: withScenarioOverride("Approval turn tool followthrough", { + 
details: "timed out before it continued", + }), }, comparedAt: "2026-04-11T00:00:00.000Z", }); @@ -303,36 +383,333 @@ Follow-up: expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(1); }); - it("renders a readable markdown parity report", () => { + it("does not flag positive-tone prose as fake success (positive-tone detection removed)", () => { + // Positive-tone detection was removed because for passing runs the + // `details` field is the model's prose, which never contains tool-call + // evidence. Criterion 2 is enforced by per-scenario tool-call assertions. + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Subagent handoff", + status: "pass", + details: "Successfully completed the delegation. The subagent returned its result.", + }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0); + }); + + it("does not flag bare 'Done.' prose as fake success", () => { + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Approval turn tool followthrough", + status: "pass", + details: "Done.", + }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0); + }); + + it("does not flag structured status lines that end in `done`", () => { + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Compaction retry after mutating tool", + status: "pass", + details: `Confirmed, replay unsafe after write. +compactionCount=0 +status=done`, + }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0); + }); + + it("does not flag positive-tone passes when the scenario shows real tool-call evidence", () => { + // A legitimate tool-mediated pass that happens to include + // "successfully" in its prose must not be flagged. The + // `plannedToolName` evidence (or any of the other tool-call + // evidence patterns) exempts the scenario from positive-tone + // detection. 
Without this exemption, real tool-backed passes with + // self-congratulatory prose would count as fake successes and break + // the gate. + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Source and docs discovery report", + status: "pass", + details: + "Successfully completed the report. plannedToolName=read recorded via /debug/requests.", + }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0); + }); + + it("only flags failure-tone passes, not positive-tone", () => { + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Approval turn tool followthrough", + status: "pass", + details: "Task executed successfully without errors.", + }, + { + name: "Subagent handoff", + status: "pass", + details: "Tool call completed, but an error occurred mid-turn.", + }, + ], + }; + + // Only the failure-tone scenario ("error occurred") counts. + // The positive-tone one ("successfully") is not flagged. + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(1); + }); + + it("throws QaParityLabelMismatchError when the candidate run.primaryProvider does not match the label", () => { + // Regression for the gate footgun: if an operator swaps the + // --candidate-summary and --baseline-summary paths, the gate would + // silently produce a reversed verdict. PR L #64789 ships the `run` + // block on every summary so the parity report can verify it against + // the caller-supplied label; this test pins the precondition check. 
+ const parityPassScenarios = [ + { name: "Approval turn tool followthrough", status: "pass" as const }, + { name: "Compaction retry after mutating tool", status: "pass" as const }, + { name: "Model switch with tool continuity", status: "pass" as const }, + { name: "Source and docs discovery report", status: "pass" as const }, + { name: "Image understanding from attachment", status: "pass" as const }, + ]; + + expect(() => + buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { + scenarios: parityPassScenarios, + run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-6" }, + }, + baselineSummary: { + scenarios: parityPassScenarios, + run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-6" }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }), + ).toThrow(QaParityLabelMismatchError); + }); + + it("throws QaParityLabelMismatchError when the baseline run.primaryProvider does not match the label", () => { + const parityPassScenarios = [ + { name: "Approval turn tool followthrough", status: "pass" as const }, + ]; + + expect(() => + buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { + scenarios: parityPassScenarios, + run: { primaryProvider: "openai" }, + }, + baselineSummary: { + scenarios: parityPassScenarios, + run: { primaryProvider: "openai", primaryModel: "gpt-5.4" }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }), + ).toThrow( + /baseline summary run\.primaryProvider=openai and run\.primaryModel=gpt-5\.4 do not match --baseline-label/, + ); + }); + + it("accepts matching run.primaryProvider labels without throwing", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.4", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { - scenarios: [ - { name: "Approval turn tool followthrough", status: "pass" }, - { name: 
"Compaction retry after mutating tool", status: "pass" }, - { name: "Model switch with tool continuity", status: "pass" }, - { name: "Source and docs discovery report", status: "pass" }, - { name: "Image understanding from attachment", status: "pass" }, - ], + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4", + primaryModelName: "gpt-5.4", + }, }, baselineSummary: { - scenarios: [ - { name: "Approval turn tool followthrough", status: "pass" }, - { name: "Compaction retry after mutating tool", status: "pass" }, - { name: "Model switch with tool continuity", status: "pass" }, - { name: "Source and docs discovery report", status: "pass" }, - { name: "Image understanding from attachment", status: "pass" }, - ], + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, }, comparedAt: "2026-04-11T00:00:00.000Z", }); + expect(comparison.pass).toBe(true); + }); + + it("skips run.primaryProvider verification when the summary is missing a run block (legacy summaries)", () => { + // Pre-PR-L summaries don't carry a `run` block. The gate must still + // work against those, trusting the caller-supplied label. 
+ const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + expect(comparison.pass).toBe(true); + }); + + it("skips provider verification for arbitrary display labels when run metadata is present", () => { + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "GPT-5.4 candidate", + baselineLabel: "Opus 4.6 baseline", + candidateSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4", + primaryModelName: "gpt-5.4", + }, + }, + baselineSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(true); + }); + + it("skips provider verification for mixed-case or decorated display labels", () => { + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "Candidate: GPT-5.4", + baselineLabel: "Opus 4.6 / baseline", + candidateSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4", + primaryModelName: "gpt-5.4", + }, + }, + baselineSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(true); + }); + + it("throws when a structured label mismatches the recorded model even if the provider matches", () => { + expect(() => + buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + 
candidateSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4-alt", + primaryModelName: "gpt-5.4-alt", + }, + }, + baselineSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }), + ).toThrow( + /candidate summary run\.primaryProvider=openai and run\.primaryModel=openai\/gpt-5\.4-alt do not match --candidate-label=openai\/gpt-5\.4/, + ); + }); + + it("accepts colon-delimited structured labels when provider and model both match", () => { + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai:gpt-5.4", + baselineLabel: "anthropic:claude-opus-4-6", + candidateSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4", + primaryModelName: "gpt-5.4", + }, + }, + baselineSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(true); + }); + + it("renders a readable markdown parity report", () => { + // Cover the full parity pack on both sides so the pass + // verdict is not disrupted by required-scenario coverage failures + // added by the second-wave expansion. 
+ const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); const report = renderQaAgenticParityMarkdownReport(comparison); - expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report"); + expect(report).toContain( + "# OpenClaw Agentic Parity Report — openai/gpt-5.4 vs anthropic/claude-opus-4-6", + ); expect(report).toContain("| Completion rate | 100.0% | 100.0% |"); expect(report).toContain("### Approval turn tool followthrough"); expect(report).toContain("- Verdict: pass"); }); + + it("parametrizes the markdown header from the comparison labels", () => { + // Regression for the loop-7 Copilot finding: callers that configure + // non-gpt-5.4 / non-opus labels (for example an internal candidate vs + // another candidate) must see the labels in the rendered H1 instead of + // the hardcoded "GPT-5.4 / Opus 4.6" title that would otherwise confuse + // readers of saved reports. 
+ const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4-alt", + baselineLabel: "openai/gpt-5.4", + candidateSummary: { scenarios: [] }, + baselineSummary: { scenarios: [] }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + const report = renderQaAgenticParityMarkdownReport(comparison); + expect(report).toContain( + "# OpenClaw Agentic Parity Report — openai/gpt-5.4-alt vs openai/gpt-5.4", + ); + }); }); diff --git a/extensions/qa-lab/src/agentic-parity-report.ts b/extensions/qa-lab/src/agentic-parity-report.ts index 0e900253b0d..d057ceffb6c 100644 --- a/extensions/qa-lab/src/agentic-parity-report.ts +++ b/extensions/qa-lab/src/agentic-parity-report.ts @@ -1,4 +1,7 @@ -import { QA_AGENTIC_PARITY_SCENARIO_TITLES } from "./agentic-parity.js"; +import { + QA_AGENTIC_PARITY_SCENARIO_TITLES, + QA_AGENTIC_PARITY_TOOL_BACKED_SCENARIO_TITLES, +} from "./agentic-parity.js"; export type QaParityReportStep = { name: string; @@ -13,6 +16,21 @@ export type QaParityReportScenario = { steps?: QaParityReportStep[]; }; +/** + * Optional self-describing run metadata written by PR L (#64789). Before + * that PR merges, older summaries only have `scenarios` + `counts`; the + * parity report treats a missing `run` block as "unknown provenance" and + * skips the label-match verification for backwards compatibility + * with legacy summaries that predate the run metadata block. + */ +export type QaParityRunBlock = { + primaryProvider?: string; + primaryModel?: string; + primaryModelName?: string; + providerMode?: string; + scenarioIds?: readonly string[] | null; +}; + export type QaParitySuiteSummary = { scenarios: QaParityReportScenario[]; counts?: { @@ -20,6 +38,8 @@ export type QaParitySuiteSummary = { passed?: number; failed?: number; }; + /** Self-describing run metadata — see PR L #64789 for the writer side. 
*/ + run?: QaParityRunBlock; }; export type QaAgenticParityMetrics = { @@ -64,7 +84,11 @@ const UNINTENDED_STOP_PATTERNS = [ /did not continue/i, ] as const; -const SUSPICIOUS_PASS_PATTERNS = [ +// Failure-tone patterns: a passing scenario whose details text matches any +// of these is treated as a "fake success" — the scenario is marked pass but +// the supporting text reveals something went wrong. Adding new patterns here +// widens the net for bad prose that correlates with runtime failure modes. +const SUSPICIOUS_PASS_FAILURE_TONE_PATTERNS = [ /incomplete turn/i, /\btimed out\b/i, /\btimeout\b/i, @@ -76,6 +100,13 @@ const SUSPICIOUS_PASS_PATTERNS = [ /an error was/i, ] as const; +// Positive-tone patterns (e.g. "Successfully completed", "Done.") are NOT +// checked in fakeSuccessCount. For passing runs, `details` is the model's +// outbound prose, which never contains tool-call evidence strings, so a +// tool-call-evidence exemption would false-positive on every legitimate +// pass. Criterion 2 ("no fake progress") is enforced by per-scenario +// `/debug/requests` tool-call assertions in the YAML flows (PR J) instead. + function normalizeScenarioStatus(status: string | undefined): "pass" | "fail" | "skip" { return status === "pass" || status === "fail" || status === "skip" ? status : "fail"; } @@ -103,6 +134,9 @@ export function computeQaAgenticParityMetrics( ...scenario, status: normalizeScenarioStatus(scenario.status), })); + const toolBackedTitleSet: ReadonlySet = new Set( + QA_AGENTIC_PARITY_TOOL_BACKED_SCENARIO_TITLES, + ); const totalScenarios = summary.counts?.total ?? scenarios.length; const passedScenarios = summary.counts?.passed ?? 
scenarios.filter((scenario) => scenario.status === "pass").length; @@ -112,16 +146,40 @@ export function computeQaAgenticParityMetrics( (scenario) => scenario.status !== "pass" && scenarioHasPattern(scenario, UNINTENDED_STOP_PATTERNS), ).length; - const fakeSuccessCount = scenarios.filter( - (scenario) => - scenario.status === "pass" && scenarioHasPattern(scenario, SUSPICIOUS_PASS_PATTERNS), + const fakeSuccessCount = scenarios.filter((scenario) => { + if (scenario.status !== "pass") { + return false; + } + // Failure-tone patterns catch obviously-broken passes regardless of + // whether the scenario shows tool-call evidence — "timed out" under a + // pass is always fake. + if (scenarioHasPattern(scenario, SUSPICIOUS_PASS_FAILURE_TONE_PATTERNS)) { + return true; + } + // Positive-tone patterns (like "Successfully completed") are NOT checked + // here because for passing runs the `details` field is the model's + // outbound prose, which never contains tool-call evidence strings. + // The `scenarioLacksToolCallEvidence` check would return true for ALL + // passes and false-positive on legitimate completions. Criterion 2 + // ("no fake tool completion") is instead enforced by the per-scenario + // `/debug/requests` tool-call assertions from the scenario YAML flows. + return false; + }).length; + + // Count only the scenarios that are supposed to exercise a real tool, + // subagent, or capability invocation. Memory recall and image-only + // understanding lanes stay in the parity pack, but they should not inflate + // the tool-call metric just by passing. + const toolBackedScenarioCount = scenarios.filter((scenario) => + toolBackedTitleSet.has(scenario.name), + ).length; + const validToolCallCount = scenarios.filter( + (scenario) => toolBackedTitleSet.has(scenario.name) && scenario.status === "pass", ).length; - // First-wave parity scenarios are all tool-mediated tasks, so a passing scenario is our - // verified unit of valid tool-backed execution in this harness. 
- const validToolCallCount = passedScenarios; - const rate = (value: number) => (totalScenarios > 0 ? value / totalScenarios : 0); + const rate = (value: number) => (totalScenarios > 0 ? value / totalScenarios : 0); + const toolRate = (value: number) => (toolBackedScenarioCount > 0 ? value / toolBackedScenarioCount : 0); return { totalScenarios, passedScenarios, @@ -130,7 +188,7 @@ unintendedStopCount, unintendedStopRate: rate(unintendedStopCount), validToolCallCount, - validToolCallRate: rate(validToolCallCount), + validToolCallRate: toolRate(validToolCallCount), fakeSuccessCount, }; } @@ -149,14 +207,116 @@ function scopeSummaryToParityPack( summary: QaParitySuiteSummary, parityTitleSet: ReadonlySet<string>, ): QaParitySuiteSummary { - // The parity verdict must only consider the declared first-wave parity scenarios. - // Drop `counts` so the metric helper recomputes totals from the filtered scenario - // list instead of inheriting the caller's full-suite counters. + // The parity verdict must only consider the declared parity scenarios + // (the full first-wave + second-wave pack from QA_AGENTIC_PARITY_SCENARIOS). + // Drop `counts` so the metric helper recomputes totals from the filtered + // scenario list instead of inheriting the caller's full-suite counters. return { scenarios: summary.scenarios.filter((scenario) => parityTitleSet.has(scenario.name)), + ...(summary.run ? { run: summary.run } : {}), }; } +type StructuredQaParityLabel = { + provider: string; + model: string; +}; + +/** + * Only treat caller labels as provenance-checked identifiers when they are + * exact lower-case provider/model refs. Human-facing display labels like + * "GPT-5.4 candidate" or "Candidate: GPT-5.4" should render in the report + * without being misread as structured provider ids. 
+ */ +function parseStructuredLabelRef(label: string): StructuredQaParityLabel | null { + const trimmed = label.trim(); + if (trimmed.length === 0) { + return null; + } + if (trimmed !== trimmed.toLowerCase()) { + return null; + } + const separatorMatch = /^([a-z0-9][a-z0-9-]*)[/:]([a-z0-9][a-z0-9._-]*)$/.exec(trimmed); + if (!separatorMatch) { + return null; + } + return { + provider: separatorMatch[1] ?? "", + model: separatorMatch[2] ?? "", + }; +} + +/** + * Verify the `run.primaryProvider` + `run.primaryModel` fields on a summary + * match the caller-supplied label when that label is a structured + * `provider/model` or `provider:model` ref. PR L #64789 ships the `run` + * block; before it lands, older summaries don't have the field and this check + * is a no-op. + * + * Throws `QaParityLabelMismatchError` when the summary reports a different + * provider/model than the caller claimed — this catches the "swapped + * candidate and baseline summary paths" footgun the earlier adversarial + * review flagged. Returns silently when the fields are absent (legacy + * summaries) or when the fields match. 
+ */ +function verifySummaryLabelMatch(params: { + summary: QaParitySuiteSummary; + label: string; + role: "candidate" | "baseline"; +}): void { + const runProvider = params.summary.run?.primaryProvider?.trim(); + const runModel = params.summary.run?.primaryModel?.trim(); + const runModelName = params.summary.run?.primaryModelName?.trim(); + if (!runProvider || !runModel) { + return; + } + const labelRef = parseStructuredLabelRef(params.label); + if (!labelRef) { + return; + } + const normalizedRunModel = runModel.toLowerCase(); + const normalizedRunModelName = runModelName?.toLowerCase(); + const normalizedLabelModel = labelRef.model; + if ( + runProvider.toLowerCase() === labelRef.provider && + (normalizedRunModel === normalizedLabelModel || + normalizedRunModelName === normalizedLabelModel || + normalizedRunModel === `${labelRef.provider}/${normalizedLabelModel}`) + ) { + return; + } + throw new QaParityLabelMismatchError({ + role: params.role, + label: params.label, + runProvider, + runModel, + }); +} + +export class QaParityLabelMismatchError extends Error { + readonly role: "candidate" | "baseline"; + readonly label: string; + readonly runProvider: string; + readonly runModel: string; + + constructor(params: { + role: "candidate" | "baseline"; + label: string; + runProvider: string; + runModel: string; + }) { + super( + `${params.role} summary run.primaryProvider=${params.runProvider} and run.primaryModel=${params.runModel} do not match --${params.role}-label=${params.label}. 
` + + `Check that the --candidate-summary / --baseline-summary paths weren't swapped.`, + ); + this.name = "QaParityLabelMismatchError"; + this.role = params.role; + this.label = params.label; + this.runProvider = params.runProvider; + this.runModel = params.runModel; + } +} + export function buildQaAgenticParityComparison(params: { candidateLabel: string; baselineLabel: string; @@ -164,6 +324,22 @@ export function buildQaAgenticParityComparison(params: { baselineSummary: QaParitySuiteSummary; comparedAt?: string; }): QaAgenticParityComparison { + // Precondition: verify the `run.primaryProvider` field on each summary + // matches the caller-supplied label (when the `run` block is present). + // Throws `QaParityLabelMismatchError` on mismatch so the release gate + // fails loudly instead of silently producing a reversed verdict when an + // operator swaps the --candidate-summary and --baseline-summary paths. + // Legacy summaries without a `run` block are accepted as-is. + verifySummaryLabelMatch({ + summary: params.candidateSummary, + label: params.candidateLabel, + role: "candidate", + }); + verifySummaryLabelMatch({ + summary: params.baselineSummary, + label: params.baselineLabel, + role: "baseline", + }); const parityTitleSet: ReadonlySet = new Set(QA_AGENTIC_PARITY_SCENARIO_TITLES); // Rates and fake-success counts are computed from the parity-scoped summaries only, // so extra non-parity scenarios in the input (for example when a caller feeds a full @@ -203,7 +379,7 @@ export function buildQaAgenticParityComparison(params: { }); const failures: string[] = []; - const requiredScenarioCoverage = QA_AGENTIC_PARITY_SCENARIO_TITLES.map((name) => { + const requiredScenarioStatuses = QA_AGENTIC_PARITY_SCENARIO_TITLES.map((name) => { const candidate = candidateByName.get(name); const baseline = baselineByName.get(name); return { @@ -211,7 +387,8 @@ export function buildQaAgenticParityComparison(params: { candidateStatus: requiredCoverageStatus(candidate), 
baselineStatus: requiredCoverageStatus(baseline), }; - }).filter( + }); + const requiredScenarioCoverage = requiredScenarioStatuses.filter( (scenario) => scenario.candidateStatus === "missing" || scenario.baselineStatus === "missing" || @@ -223,6 +400,26 @@ export function buildQaAgenticParityComparison(params: { `Missing required parity scenario coverage for ${scenario.name}: ${params.candidateLabel}=${scenario.candidateStatus}, ${params.baselineLabel}=${scenario.baselineStatus}.`, ); } + // Required parity scenarios that ran on both sides but FAILED also fail + // the gate. Without this check, a run where both models fail the same + // required scenarios still produced pass=true, because the downstream + // metric comparisons are purely relative (candidate vs baseline) and + // the suspicious-pass fake-success check only catches passes that carry + // failure-sounding details. Excluding missing/skip here keeps operator + // output from double-counting the same scenario with two lines. 
+ const requiredScenarioFailures = requiredScenarioStatuses.filter( + (scenario) => + scenario.candidateStatus !== "missing" && + scenario.baselineStatus !== "missing" && + scenario.candidateStatus !== "skip" && + scenario.baselineStatus !== "skip" && + (scenario.candidateStatus === "fail" || scenario.baselineStatus === "fail"), + ); + for (const scenario of requiredScenarioFailures) { + failures.push( + `Required parity scenario ${scenario.name} failed: ${params.candidateLabel}=${scenario.candidateStatus}, ${params.baselineLabel}=${scenario.baselineStatus}.`, + ); + } // Required parity scenarios are already reported via `requiredScenarioCoverage` // above; excluding them here keeps the operator-facing failure list from // double-counting the same missing scenario (one "Missing required parity scenario @@ -281,8 +478,13 @@ export function buildQaAgenticParityComparison(params: { } export function renderQaAgenticParityMarkdownReport(comparison: QaAgenticParityComparison): string { + // Title is parametrized from the candidate / baseline labels so reports + // for any candidate/baseline pair (not only gpt-5.4 vs opus 4.6) render + // with an accurate header. The default CLI labels are still + // openai/gpt-5.4 vs anthropic/claude-opus-4-6, but the helper works for + // any parity comparison a caller configures. 
const lines = [ - "# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report", + `# OpenClaw Agentic Parity Report — ${comparison.candidateLabel} vs ${comparison.baselineLabel}`, "", `- Compared at: ${comparison.comparedAt}`, `- Candidate: ${comparison.candidateLabel}`, diff --git a/extensions/qa-lab/src/agentic-parity.ts b/extensions/qa-lab/src/agentic-parity.ts index e2972c92e17..d997778e85f 100644 --- a/extensions/qa-lab/src/agentic-parity.ts +++ b/extensions/qa-lab/src/agentic-parity.ts @@ -4,22 +4,57 @@ export const QA_AGENTIC_PARITY_SCENARIOS = [ { id: "approval-turn-tool-followthrough", title: "Approval turn tool followthrough", + countsTowardValidToolCallRate: true, }, { id: "model-switch-tool-continuity", title: "Model switch with tool continuity", + countsTowardValidToolCallRate: true, }, { id: "source-docs-discovery-report", title: "Source and docs discovery report", + countsTowardValidToolCallRate: true, }, { id: "image-understanding-attachment", title: "Image understanding from attachment", + countsTowardValidToolCallRate: false, }, { id: "compaction-retry-mutating-tool", title: "Compaction retry after mutating tool", + countsTowardValidToolCallRate: true, + }, + { + id: "subagent-handoff", + title: "Subagent handoff", + countsTowardValidToolCallRate: true, + }, + { + id: "subagent-fanout-synthesis", + title: "Subagent fanout synthesis", + countsTowardValidToolCallRate: true, + }, + { + id: "memory-recall", + title: "Memory recall after context switch", + countsTowardValidToolCallRate: false, + }, + { + id: "thread-memory-isolation", + title: "Thread memory isolation", + countsTowardValidToolCallRate: true, + }, + { + id: "config-restart-capability-flip", + title: "Config restart capability flip", + countsTowardValidToolCallRate: true, + }, + { + id: "instruction-followthrough-repo-contract", + title: "Instruction followthrough repo contract", + countsTowardValidToolCallRate: true, }, ] as const; @@ -27,6 +62,9 @@ export const QA_AGENTIC_PARITY_SCENARIO_IDS 
= QA_AGENTIC_PARITY_SCENARIOS.map(({ export const QA_AGENTIC_PARITY_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.map( ({ title }) => title, ); +export const QA_AGENTIC_PARITY_TOOL_BACKED_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.filter( + ({ countsTowardValidToolCallRate }) => countsTowardValidToolCallRate, +).map(({ title }) => title); export function resolveQaParityPackScenarioIds(params: { parityPack?: string; diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index 6c0efab1755..75629b1afec 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -338,6 +338,12 @@ describe("qa cli runtime", () => { "source-docs-discovery-report", "image-understanding-attachment", "compaction-retry-mutating-tool", + "subagent-handoff", + "subagent-fanout-synthesis", + "memory-recall", + "thread-memory-isolation", + "config-restart-capability-flip", + "instruction-followthrough-repo-contract", ], }), ); @@ -566,6 +572,39 @@ describe("qa cli runtime", () => { ); }); + it("passes provider-qualified mock parity suite selection through to the host runner", async () => { + await runQaSuiteCommand({ + repoRoot: "/tmp/openclaw-repo", + providerMode: "mock-openai", + parityPack: "agentic", + primaryModel: "openai/gpt-5.4", + alternateModel: "anthropic/claude-opus-4-6", + }); + + expect(runQaSuiteFromRuntime).toHaveBeenCalledWith({ + repoRoot: path.resolve("/tmp/openclaw-repo"), + outputDir: undefined, + transportId: "qa-channel", + providerMode: "mock-openai", + primaryModel: "openai/gpt-5.4", + alternateModel: "anthropic/claude-opus-4-6", + fastMode: undefined, + scenarioIds: [ + "approval-turn-tool-followthrough", + "model-switch-tool-continuity", + "source-docs-discovery-report", + "image-understanding-attachment", + "compaction-retry-mutating-tool", + "subagent-handoff", + "subagent-fanout-synthesis", + "memory-recall", + "thread-memory-isolation", + "config-restart-capability-flip", 
+ "instruction-followthrough-repo-contract", + ], + }); + }); + it("rejects multipass-only suite flags on the host runner", async () => { await expect( runQaSuiteCommand({ diff --git a/extensions/qa-lab/src/gateway-child.test.ts b/extensions/qa-lab/src/gateway-child.test.ts index 5f340ae238e..c70559457b3 100644 --- a/extensions/qa-lab/src/gateway-child.test.ts +++ b/extensions/qa-lab/src/gateway-child.test.ts @@ -64,6 +64,11 @@ describe("buildQaRuntimeEnv", () => { expect(env.GEMINI_API_KEY).toBe("gemini-live"); }); + it("defaults gateway-child provider mode to mock-openai when omitted", () => { + expect(__testing.resolveQaGatewayChildProviderMode(undefined)).toBe("mock-openai"); + expect(__testing.resolveQaGatewayChildProviderMode("live-frontier")).toBe("live-frontier"); + }); + it("keeps explicit provider env vars over live aliases", () => { const env = buildQaRuntimeEnv({ ...createParams({ @@ -299,6 +304,88 @@ describe("buildQaRuntimeEnv", () => { }); }); + it("stages placeholder mock auth profiles per agent dir so mock-openai runs can resolve credentials", async () => { + const stateDir = await mkdtemp(path.join(os.tmpdir(), "qa-mock-auth-")); + cleanups.push(async () => { + await rm(stateDir, { recursive: true, force: true }); + }); + + const cfg = await __testing.stageQaMockAuthProfiles({ + cfg: {}, + stateDir, + }); + + // Config side: both providers should have a profile entry with mode + // "api_key" so the runtime picks up the staging without any further + // config mutation. + expect(cfg.auth?.profiles?.["qa-mock-openai"]).toMatchObject({ + provider: "openai", + mode: "api_key", + displayName: "QA mock openai credential", + }); + expect(cfg.auth?.profiles?.["qa-mock-anthropic"]).toMatchObject({ + provider: "anthropic", + mode: "api_key", + displayName: "QA mock anthropic credential", + }); + + // Store side: each agent dir should have its own auth-profiles.json + // containing the placeholder credential for each staged provider. 
This + // is what the scenario runner actually reads when it resolves auth + // before calling the mock. + for (const agentId of ["main", "qa"]) { + const storeRaw = await readFile( + path.join(stateDir, "agents", agentId, "agent", "auth-profiles.json"), + "utf8", + ); + const parsed = JSON.parse(storeRaw) as { + profiles: Record; + }; + expect(parsed.profiles["qa-mock-openai"]).toMatchObject({ + type: "api_key", + provider: "openai", + key: "qa-mock-not-a-real-key", + }); + expect(parsed.profiles["qa-mock-anthropic"]).toMatchObject({ + type: "api_key", + provider: "anthropic", + key: "qa-mock-not-a-real-key", + }); + } + }); + + it("stages mock profiles only for the requested agents and providers when callers override the defaults", async () => { + const stateDir = await mkdtemp(path.join(os.tmpdir(), "qa-mock-auth-override-")); + cleanups.push(async () => { + await rm(stateDir, { recursive: true, force: true }); + }); + + const cfg = await __testing.stageQaMockAuthProfiles({ + cfg: {}, + stateDir, + agentIds: ["qa"], + providers: ["openai"], + }); + + expect(cfg.auth?.profiles?.["qa-mock-openai"]).toMatchObject({ + provider: "openai", + mode: "api_key", + }); + // Anthropic should NOT be staged when the caller restricts providers. + expect(cfg.auth?.profiles?.["qa-mock-anthropic"]).toBeUndefined(); + + const qaStore = JSON.parse( + await readFile(path.join(stateDir, "agents", "qa", "agent", "auth-profiles.json"), "utf8"), + ) as { profiles: Record }; + expect(qaStore.profiles["qa-mock-openai"]).toBeDefined(); + expect(qaStore.profiles["qa-mock-anthropic"]).toBeUndefined(); + + // main/agent should not exist because it wasn't in the agentIds list. 
+ await expect( + readFile(path.join(stateDir, "agents", "main", "agent", "auth-profiles.json"), "utf8"), + ).rejects.toThrow(/ENOENT/); + }); + it("allows loopback gateway health probes through the SSRF guard", async () => { const release = vi.fn(async () => {}); fetchWithSsrFGuardMock.mockResolvedValue({ diff --git a/extensions/qa-lab/src/gateway-child.ts b/extensions/qa-lab/src/gateway-child.ts index 3ab565ba015..94c6e69edf7 100644 --- a/extensions/qa-lab/src/gateway-child.ts +++ b/extensions/qa-lab/src/gateway-child.ts @@ -222,6 +222,12 @@ export function normalizeQaProviderModeEnv( return env; } +export function resolveQaGatewayChildProviderMode( + providerMode?: "mock-openai" | "live-frontier", +): "mock-openai" | "live-frontier" { + return providerMode ?? "mock-openai"; +} + function resolveQaLiveCliAuthEnv( baseEnv: NodeJS.ProcessEnv, opts?: { @@ -395,6 +401,72 @@ export async function stageQaLiveAnthropicSetupToken(params: { }); } +/** Providers the mock-openai harness stages placeholder credentials for. */ +export const QA_MOCK_AUTH_PROVIDERS = Object.freeze(["openai", "anthropic"] as const); + +/** Agent IDs the mock-openai harness stages credentials under. */ +export const QA_MOCK_AUTH_AGENT_IDS = Object.freeze(["main", "qa"] as const); + +export function buildQaMockProfileId(provider: string): string { + return `qa-mock-${provider}`; +} + +/** + * In mock-openai mode the qa suite runs against the embedded mock server + * instead of a real provider API. The mock does not validate credentials, but + * the agent auth layer still needs a matching `api_key` auth profile in + * `auth-profiles.json` before it will route the request through + * `providerBaseUrl`. Without this staging step, every scenario fails with + * `FailoverError: No API key found for provider "openai"` before the mock + * server ever sees a request. 
+ * + * Stages a placeholder `api_key` profile per provider in each of the agent + * dirs the qa suite uses (`main` for the runtime config, `qa` for scenario + * runs) and returns a config with matching `auth.profiles` entries so the + * runtime accepts the profile on the first lookup. + * + * The placeholder value `qa-mock-not-a-real-key` is intentionally not + * shaped like a real API key (no `sk-` prefix that would trip secret + * scanners). It only needs to be non-empty to pass the credential + * serializer; anything beyond that is ignored by the mock. + */ +export async function stageQaMockAuthProfiles(params: { + cfg: OpenClawConfig; + stateDir: string; + agentIds?: readonly string[]; + providers?: readonly string[]; +}): Promise { + const agentIds = [...new Set(params.agentIds ?? QA_MOCK_AUTH_AGENT_IDS)]; + const providers = [...new Set(params.providers ?? QA_MOCK_AUTH_PROVIDERS)]; + let next = params.cfg; + for (const agentId of agentIds) { + const agentDir = path.join(params.stateDir, "agents", agentId, "agent"); + await fs.mkdir(agentDir, { recursive: true }); + for (const provider of providers) { + const profileId = buildQaMockProfileId(provider); + upsertAuthProfile({ + profileId, + credential: { + type: "api_key", + provider, + key: "qa-mock-not-a-real-key", + displayName: `QA mock ${provider} credential`, + }, + agentDir, + }); + } + } + for (const provider of providers) { + next = applyAuthProfileConfig(next, { + profileId: buildQaMockProfileId(provider), + provider, + mode: "api_key", + displayName: `QA mock ${provider} credential`, + }); + } + return next; +} + function isRetryableGatewayCallError(details: string): boolean { return ( details.includes("handshake timeout") || @@ -440,8 +512,10 @@ export const __testing = { preserveQaGatewayDebugArtifacts, redactQaGatewayDebugText, readQaLiveProviderConfigOverrides, + resolveQaGatewayChildProviderMode, resolveQaLiveAnthropicSetupToken, stageQaLiveAnthropicSetupToken, + stageQaMockAuthProfiles, 
resolveQaLiveCliAuthEnv, resolveQaOwnerPluginIdsForProviderIds, resolveQaBundledPluginsSourceRoot, @@ -868,8 +942,9 @@ export async function startQaGatewayChild(params: { fs.mkdir(xdgDataHome, { recursive: true }), fs.mkdir(xdgCacheHome, { recursive: true }), ]); + const providerMode = resolveQaGatewayChildProviderMode(params.providerMode); const liveProviderIds = - params.providerMode === "live-frontier" + providerMode === "live-frontier" ? [params.primaryModel, params.alternateModel] .map((modelRef) => typeof modelRef === "string" ? splitQaModelRef(modelRef)?.provider : undefined, @@ -902,7 +977,7 @@ export async function startQaGatewayChild(params: { controlUiEnabled: params.controlUiEnabled, }), controlUiAllowedOrigins: params.controlUiAllowedOrigins, - providerMode: params.providerMode, + providerMode, primaryModel: params.primaryModel, alternateModel: params.alternateModel, enabledPluginIds, @@ -921,6 +996,12 @@ export async function startQaGatewayChild(params: { cfg, stateDir, }); + if (providerMode === "mock-openai") { + cfg = await stageQaMockAuthProfiles({ + cfg, + stateDir, + }); + } return params.mutateConfig ? 
params.mutateConfig(cfg) : cfg; }; const stdout: Buffer[] = []; @@ -981,7 +1062,7 @@ export async function startQaGatewayChild(params: { xdgCacheHome, bundledPluginsDir, compatibilityHostVersion: runtimeHostVersion, - providerMode: params.providerMode, + providerMode, forwardHostHomeForClaudeCli: liveProviderIds.includes("claude-cli"), claudeCliAuthMode: params.claudeCliAuthMode, }); diff --git a/extensions/qa-lab/src/mock-openai-server.test.ts b/extensions/qa-lab/src/mock-openai-server.test.ts index 578569f09fc..5e598f7949c 100644 --- a/extensions/qa-lab/src/mock-openai-server.test.ts +++ b/extensions/qa-lab/src/mock-openai-server.test.ts @@ -1,5 +1,5 @@ import { afterEach, describe, expect, it } from "vitest"; -import { startQaMockOpenAiServer } from "./mock-openai-server.js"; +import { resolveProviderVariant, startQaMockOpenAiServer } from "./mock-openai-server.js"; const cleanups: Array<() => Promise> = []; const QA_IMAGE_PNG_BASE64 = @@ -11,42 +11,15 @@ afterEach(async () => { } }); -async function startMockServer() { - const server = await startQaMockOpenAiServer({ - host: "127.0.0.1", - port: 0, - }); - cleanups.push(async () => { - await server.stop(); - }); - return server; -} - -async function postResponses(server: { baseUrl: string }, body: unknown) { - return fetch(`${server.baseUrl}/v1/responses`, { - method: "POST", - headers: { - "content-type": "application/json", - }, - body: JSON.stringify(body), - }); -} - -async function expectResponsesText(server: { baseUrl: string }, body: unknown) { - const response = await postResponses(server, body); - expect(response.status).toBe(200); - return response.text(); -} - -async function expectResponsesJson(server: { baseUrl: string }, body: unknown) { - const response = await postResponses(server, body); - expect(response.status).toBe(200); - return (await response.json()) as T; -} - describe("qa mock openai server", () => { it("serves health and streamed responses", async () => { - const server = await 
startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); const health = await fetch(`${server.baseUrl}/healthz`); expect(health.status).toBe(200); @@ -75,22 +48,36 @@ describe("qa mock openai server", () => { }); it("prefers path-like refs over generic quoted keys in prompts", async () => { - const server = await startMockServer(); - - const body = await expectResponsesText(server, { - stream: true, - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: 'Please inspect "message_id" metadata first, then read `./QA_KICKOFF_TASK.md`.', - }, - ], - }, - ], + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: 'Please inspect "message_id" metadata first, then read `./QA_KICKOFF_TASK.md`.', + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = await response.text(); expect(body).toContain('"arguments":"{\\"path\\":\\"QA_KICKOFF_TASK.md\\"}"'); const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); @@ -103,7 +90,13 @@ describe("qa mock openai server", () => { }); it("drives the Lobster Invaders write flow and memory recall responses", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); const lobster = await fetch(`${server.baseUrl}/v1/responses`, { method: "POST", @@ -132,32 +125,40 @@ describe("qa mock openai server", () => { expect(lobsterBody).toContain('"name":"write"'); 
expect(lobsterBody).toContain("lobster-invaders.html"); - const payload = await expectResponsesJson<{ - output?: Array<{ content?: Array<{ text?: string }> }>; - }>(server, { - stream: false, - model: "gpt-5.4-alt", - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: "Please remember this fact for later: the QA canary code is ALPHA-7.", - }, - ], - }, - { - role: "user", - content: [ - { - type: "input_text", - text: "What was the QA canary code I asked you to remember earlier?", - }, - ], - }, - ], + const recall = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + model: "gpt-5.4-alt", + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Please remember this fact for later: the QA canary code is ALPHA-7.", + }, + ], + }, + { + role: "user", + content: [ + { + type: "input_text", + text: "What was the QA canary code I asked you to remember earlier?", + }, + ], + }, + ], + }), }); + expect(recall.status).toBe(200); + const payload = (await recall.json()) as { + output?: Array<{ content?: Array<{ text?: string }> }>; + }; expect(payload.output?.[0]?.content?.[0]?.text).toContain("ALPHA-7"); const requests = await fetch(`${server.baseUrl}/debug/requests`); @@ -168,8 +169,157 @@ describe("qa mock openai server", () => { ]); }); + it("keeps remember prompts prose-only even when they mention repo cleanup", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Please remember this fact for later: the QA canary code is 
ALPHA-7. Use your normal memory mechanism, avoid manual repo cleanup, and reply exactly `Remembered ALPHA-7.` once stored.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = await response.text(); + expect(body).toContain("Remembered ALPHA-7."); + expect(body).not.toContain('"name":"read"'); + }); + + it("drives repo-contract followthrough as read-read-read-write-then-report", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const prompt = + "Repo contract followthrough check. Read AGENT.md, SOUL.md, and FOLLOWTHROUGH_INPUT.md first. Then follow the repo contract exactly, write ./repo-contract-summary.txt, and reply with three labeled lines: Read, Wrote, Status."; + + const first = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [{ role: "user", content: [{ type: "input_text", text: prompt }] }], + }), + }); + expect(first.status).toBe(200); + expect(await first.text()).toContain('"arguments":"{\\"path\\":\\"AGENT.md\\"}"'); + + const second = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [ + { role: "user", content: [{ type: "input_text", text: prompt }] }, + { + type: "function_call_output", + output: + "# Repo contract\n\nStep order:\n1. Read AGENT.md.\n2. Read SOUL.md.\n3. Read FOLLOWTHROUGH_INPUT.md.\n4. 
Write ./repo-contract-summary.txt.\n", + }, + ], + }), + }); + expect(second.status).toBe(200); + expect(await second.text()).toContain('"arguments":"{\\"path\\":\\"SOUL.md\\"}"'); + + const third = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [ + { role: "user", content: [{ type: "input_text", text: prompt }] }, + { + type: "function_call_output", + output: "# Execution style\n\nStay brief, honest, and action-first.\n", + }, + ], + }), + }); + expect(third.status).toBe(200); + expect(await third.text()).toContain('"arguments":"{\\"path\\":\\"FOLLOWTHROUGH_INPUT.md\\"}"'); + + const fourth = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [ + { role: "user", content: [{ type: "input_text", text: prompt }] }, + { + type: "function_call_output", + output: + "Mission: prove you followed the repo contract.\nEvidence path: AGENT.md -> SOUL.md -> FOLLOWTHROUGH_INPUT.md -> repo-contract-summary.txt\n", + }, + ], + }), + }); + expect(fourth.status).toBe(200); + const fourthBody = await fourth.text(); + expect(fourthBody).toContain('"name":"write"'); + expect(fourthBody).toContain("repo-contract-summary.txt"); + + const fifth = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: false, + model: "gpt-5.4", + input: [ + { role: "user", content: [{ type: "input_text", text: prompt }] }, + { + type: "function_call_output", + output: + "Successfully wrote repo-contract-summary.txt\nMission: prove you followed the repo contract.\nStatus: complete\n", + }, + ], + }), + }); + expect(fifth.status).toBe(200); + const payload = (await fifth.json()) as { + output?: Array<{ content?: Array<{ text?: string }> }>; 
+ }; + expect(payload.output?.[0]?.content?.[0]?.text).toContain("Read: AGENT.md, SOUL.md"); + expect(payload.output?.[0]?.content?.[0]?.text).toContain("Wrote: repo-contract-summary.txt"); + expect(payload.output?.[0]?.content?.[0]?.text).toContain("Status: complete"); + }); + it("drives the compaction retry mutating tool parity flow", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); const writePlan = await fetch(`${server.baseUrl}/v1/responses`, { method: "POST", @@ -201,27 +351,35 @@ describe("qa mock openai server", () => { expect(writePlanBody).toContain('"name":"write"'); expect(writePlanBody).toContain("compaction-retry-summary.txt"); - const finalPayload = await expectResponsesJson<{ - output?: Array<{ content?: Array<{ text?: string }> }>; - }>(server, { - stream: false, - model: "gpt-5.4", - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.", - }, - ], - }, - { - type: "function_call_output", - output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.", - }, - ], + const finalReply = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + model: "gpt-5.4", + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.", + }, + ], + }, + { + type: "function_call_output", + output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.", + }, + ], + }), }); + expect(finalReply.status).toBe(200); + const finalPayload = (await finalReply.json()) as { + 
output?: Array<{ content?: Array<{ text?: string }> }>; + }; expect(finalPayload.output?.[0]?.content?.[0]?.text).toContain("replay unsafe after write"); }); @@ -282,22 +440,36 @@ describe("qa mock openai server", () => { }); it("requests non-threaded subagent handoff for QA channel runs", async () => { - const server = await startMockServer(); - - const body = await expectResponsesText(server, { - stream: true, - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: "Delegate a bounded QA task to a subagent, then summarize the delegated result clearly.", - }, - ], - }, - ], + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Delegate a bounded QA task to a subagent, then summarize the delegated result clearly.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = await response.text(); expect(body).toContain('"name":"sessions_spawn"'); expect(body).toContain('\\"label\\":\\"qa-sidecar\\"'); expect(body).toContain('\\"thread\\":false'); @@ -672,11 +844,63 @@ describe("qa mock openai server", () => { }); }); - it("answers heartbeat prompts without spawning extra subagents", async () => { - const server = await startMockServer(); + it("keeps subagent fanout state isolated per mock server instance", async () => { + const serverA = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await serverA.stop(); + }); + const serverB = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await serverB.stop(); + }); - expect( - await expectResponsesJson(server, { + const prompt = 
+ "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together."; + + const firstA = await fetch(`${serverA.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + input: [{ role: "user", content: [{ type: "input_text", text: prompt }] }], + }), + }); + expect(firstA.status).toBe(200); + expect(await firstA.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"'); + + const firstB = await fetch(`${serverB.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + input: [{ role: "user", content: [{ type: "input_text", text: prompt }] }], + }), + }); + expect(firstB.status).toBe(200); + expect(await firstB.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"'); + }); + + it("answers heartbeat prompts without spawning extra subagents", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ stream: false, input: [ { @@ -690,7 +914,10 @@ describe("qa mock openai server", () => { }, ], }), - ).toMatchObject({ + }); + + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ output: [ { content: [{ text: "HEARTBEAT_OK" }], @@ -768,10 +995,20 @@ describe("qa mock openai server", () => { }); it("uses the latest exact marker directive from conversation history", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); - expect( - await expectResponsesJson(server, { + const response = await fetch(`${server.baseUrl}/v1/responses`, { 
+ method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ stream: false, input: [ { @@ -794,7 +1031,10 @@ describe("qa mock openai server", () => { }, ], }), - ).toMatchObject({ + }); + + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ output: [ { content: [{ text: "NEW_TOKEN" }], @@ -854,33 +1094,45 @@ describe("qa mock openai server", () => { }); it("describes reattached generated images in the roundtrip flow", async () => { - const server = await startMockServer(); - - const payload = await expectResponsesJson<{ - output?: Array<{ content?: Array<{ text?: string }> }>; - }>(server, { - stream: false, - model: "mock-openai/gpt-5.4", - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.", - }, - { - type: "input_image", - source: { - type: "base64", - mime_type: "image/png", - data: QA_IMAGE_PNG_BASE64, - }, - }, - ], - }, - ], + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: false, + model: "mock-openai/gpt-5.4", + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.", + }, + { + type: "input_image", + source: { + type: "base64", + mime_type: "image/png", + data: QA_IMAGE_PNG_BASE64, + }, + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const payload = (await response.json()) as { + output?: Array<{ content?: Array<{ text?: string }> }>; + }; const text = payload.output?.[0]?.content?.[0]?.text ?? 
""; expect(text.toLowerCase()).toContain("lighthouse"); }); @@ -927,10 +1179,20 @@ describe("qa mock openai server", () => { }); it("returns continuity language after the model-switch reread completes", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); - expect( - await expectResponsesJson(server, { + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ stream: false, model: "gpt-5.4-alt", input: [ @@ -949,7 +1211,10 @@ describe("qa mock openai server", () => { }, ], }), - ).toMatchObject({ + }); + + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ output: [ { content: [ @@ -963,10 +1228,20 @@ describe("qa mock openai server", () => { }); it("returns NO_REPLY for unmentioned group chatter", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); - expect( - await expectResponsesJson(server, { + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ stream: false, input: [ { @@ -980,7 +1255,9 @@ describe("qa mock openai server", () => { }, ], }), - ).toMatchObject({ + }); + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ output: [ { content: [{ text: "NO_REPLY" }], @@ -988,4 +1265,605 @@ describe("qa mock openai server", () => { ], }); }); + + it("advertises Anthropic claude-opus-4-6 baseline model on /v1/models", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response 
= await fetch(`${server.baseUrl}/v1/models`); + expect(response.status).toBe(200); + const body = (await response.json()) as { data: Array<{ id: string }> }; + const ids = body.data.map((entry) => entry.id); + expect(ids).toContain("claude-opus-4-6"); + expect(ids).toContain("gpt-5.4"); + }); + + it("dispatches an Anthropic /v1/messages read tool call for source discovery prompts", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Read the seeded docs and report worked, failed, blocked, and follow-up items.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = (await response.json()) as { + type: string; + role: string; + model: string; + stop_reason: string; + content: Array>; + }; + expect(body.type).toBe("message"); + expect(body.role).toBe("assistant"); + expect(body.model).toBe("claude-opus-4-6"); + expect(body.stop_reason).toBe("tool_use"); + const toolUseBlock = body.content.find((block) => block.type === "tool_use") as + | { name: string; input: Record } + | undefined; + expect(toolUseBlock?.name).toBe("read"); + expect(toolUseBlock?.input).toEqual({ path: "QA_SCENARIO_PLAN.md" }); + + const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); + expect(debugResponse.status).toBe(200); + expect(await debugResponse.json()).toMatchObject({ + model: "claude-opus-4-6", + plannedToolName: "read", + }); + }); + + it("dispatches Anthropic /v1/messages tool_result follow-ups through the shared scenario logic", async () => { + // This verifies the Anthropic adapter correctly feeds tool_result + // content blocks into the shared 
scenario dispatcher so downstream + // "has this scenario already called a tool?" logic fires the same way + // it does on the OpenAI /v1/responses route. The subagent handoff + // scenario is ideal because the mock has a two-stage flow: first + // delegate prompt → sessions_spawn tool_use, then tool_result → + // "Delegated task: ..." prose summary. + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Delegate one bounded QA task to a subagent, wait for it to finish, then reply with Delegated task, Result, and Evidence sections.", + }, + ], + }, + { + role: "assistant", + content: [ + { + type: "tool_use", + id: "toolu_mock_spawn_1", + name: "sessions_spawn", + input: { task: "Inspect the QA workspace", label: "qa-sidecar", thread: false }, + }, + ], + }, + { + role: "user", + content: [ + { + type: "tool_result", + tool_use_id: "toolu_mock_spawn_1", + content: "SUBAGENT-OK", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = (await response.json()) as { + stop_reason: string; + content: Array<{ type: string; text?: string }>; + }; + expect(body.stop_reason).toBe("end_turn"); + const textBlock = body.content.find((block) => block.type === "text") as + | { text: string } + | undefined; + // The mock's subagent-handoff branch echoes "Delegated task", a + // tool-output evidence line, and a folded-back "Evidence" marker. 
+ expect(textBlock?.text).toContain("Delegated task"); + expect(textBlock?.text).toContain("Evidence"); + }); + + it("places tool_result after the parent user message even in mixed-content turns", async () => { + // Regression for the loop-6 Copilot / Greptile finding: a user message + // that mixes a tool_result block with fresh text blocks must still land + // the function_call_output AFTER the parent user message in the + // converted ResponsesInputItem[], otherwise extractToolOutput (which + // scans AFTER the last user-role index) fails to see the tool output + // and the downstream scenario dispatcher behaves as if no tool output + // was returned. We verify the conversion directly via the snapshot + // that /debug/last-request exposes: the last-request `toolOutput` + // field should be the stringified tool_result content, and `prompt` + // should be the trailing fresh-text block. + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Delegate one bounded QA task to a subagent.", + }, + ], + }, + { + role: "assistant", + content: [ + { + type: "tool_use", + id: "toolu_mock_spawn_mixed", + name: "sessions_spawn", + input: { task: "Inspect the QA workspace", label: "qa-sidecar", thread: false }, + }, + ], + }, + { + role: "user", + content: [ + { + type: "tool_result", + tool_use_id: "toolu_mock_spawn_mixed", + content: "SUBAGENT-OK", + }, + // A trailing fresh text block in the same user turn. 
Before + // the loop-6 fix, the tool_result was pushed BEFORE the + // parent user message, so extractToolOutput saw the text + // turn as the last user-role item and found no + // function_call_output after it → returned "". The + // downstream dispatcher then behaved as if no tool output + // was present at all. + { + type: "text", + text: "Keep going with the fanout.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + + const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); + expect(debugResponse.status).toBe(200); + const debug = (await debugResponse.json()) as { + prompt: string; + allInputText: string; + toolOutput: string; + }; + // extractToolOutput should surface the tool_result content because + // the function_call_output item is placed AFTER the parent user + // message in the converted input array. + expect(debug.toolOutput).toBe("SUBAGENT-OK"); + // extractLastUserText should surface the fresh-text block (the parent + // user message that was pushed BEFORE the function_call_output). + expect(debug.prompt).toBe("Keep going with the fanout."); + // The converted history still records both turns, including the + // original delegate prompt from the first user turn. 
+ expect(debug.allInputText).toContain("Delegate one bounded QA task"); + }); + + it("streams Anthropic /v1/messages tool_use responses as SSE", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + stream: true, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Read the seeded docs and report worked, failed, blocked, and follow-up items.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + expect(response.headers.get("content-type")).toContain("text/event-stream"); + const body = await response.text(); + expect(body).toContain("event: message_start"); + expect(body).toContain("event: content_block_start"); + expect(body).toContain('"type":"tool_use"'); + expect(body).toContain('"name":"read"'); + expect(body).toContain("QA_SCENARIO_PLAN.md"); + expect(body).toContain("event: message_delta"); + expect(body).toContain("event: message_stop"); + }); + + it("streams Anthropic /v1/messages tool_result follow-ups as text deltas", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + stream: true, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Delegate one bounded QA task to a subagent, wait for it to finish, then reply with Delegated task, Result, and Evidence sections.", + }, + ], + }, + { + role: "assistant", + content: [ + { + type: "tool_use", + id: "toolu_mock_spawn_1", 
+ name: "sessions_spawn", + input: { task: "Inspect the QA workspace", label: "qa-sidecar", thread: false }, + }, + ], + }, + { + role: "user", + content: [ + { + type: "tool_result", + tool_use_id: "toolu_mock_spawn_1", + content: "SUBAGENT-OK", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + expect(response.headers.get("content-type")).toContain("text/event-stream"); + const body = await response.text(); + expect(body).toContain("event: content_block_delta"); + expect(body).toContain('"type":"text_delta"'); + expect(body).toContain("Delegated task"); + expect(body).toContain("Evidence"); + }); + + it("keeps Anthropic remember prompts on the prose branch even when system text mentions HEARTBEAT", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + stream: true, + system: [ + { + type: "text", + text: "Read HEARTBEAT.md if it exists (workspace context). Follow it strictly. If nothing needs attention, reply HEARTBEAT_OK.", + }, + ], + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Please remember this fact for later: the QA canary code is ALPHA-7. 
Use your normal memory mechanism, avoid manual repo cleanup, and reply exactly `Remembered ALPHA-7.` once stored.", + }, + ], + }, + ], + }), + }); + + expect(response.status).toBe(200); + const body = await response.text(); + expect(body).toContain("Remembered ALPHA-7."); + expect(body).not.toContain("HEARTBEAT_OK"); + expect(body).not.toContain('"name":"read"'); + }); + + it("prefers the prompt-local exact reply directive over heartbeat context", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + stream: true, + system: [ + { + type: "text", + text: [ + "Read HEARTBEAT.md if it exists (workspace context). Follow it strictly.", + "If the current user message is a heartbeat poll and nothing needs attention, reply exactly:", + "HEARTBEAT_OK", + ].join("\n"), + }, + ], + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Please remember this fact for later: the QA canary code is ALPHA-7. 
Use your normal memory mechanism, avoid manual repo cleanup, and reply exactly `Remembered ALPHA-7.` once stored.", + }, + ], + }, + ], + }), + }); + + expect(response.status).toBe(200); + const body = await response.text(); + expect(body).toContain("Remembered ALPHA-7."); + expect(body).not.toContain("HEARTBEAT_OK"); + }); + + it("rejects malformed Anthropic /v1/messages JSON with an invalid_request_error", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: '{"model":"claude-opus-4-6","messages":[', + }); + + expect(response.status).toBe(400); + const body = (await response.json()) as { + type: string; + error: { type: string; message: string }; + }; + expect(body.type).toBe("error"); + expect(body.error.type).toBe("invalid_request_error"); + expect(body.error.message).toContain("Malformed JSON body"); + }); + + it("defaults empty-string Anthropic /v1/messages model to claude-opus-4-6", async () => { + // Regression for the loop-7 Copilot finding: a bare `typeof + // body.model === "string"` check lets an empty-string model leak + // through to `lastRequest.model` and `responseBody.model`. Empty + // strings must be treated the same as absent and default to + // `"claude-opus-4-6"` so parity consumers can trust the echoed label. 
+ const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "", + max_tokens: 256, + messages: [ + { + role: "user", + content: "Read the plan", + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = (await response.json()) as { model: string }; + expect(body.model).toBe("claude-opus-4-6"); + + const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); + expect(debugResponse.status).toBe(200); + const debug = (await debugResponse.json()) as { model: string }; + expect(debug.model).toBe("claude-opus-4-6"); + }); +}); + +describe("resolveProviderVariant", () => { + it("tags prefix-qualified openai models", () => { + expect(resolveProviderVariant("openai/gpt-5.4")).toBe("openai"); + expect(resolveProviderVariant("openai:gpt-5.4")).toBe("openai"); + expect(resolveProviderVariant("openai-codex/gpt-5.4")).toBe("openai"); + }); + + it("tags prefix-qualified anthropic models", () => { + expect(resolveProviderVariant("anthropic/claude-opus-4-6")).toBe("anthropic"); + expect(resolveProviderVariant("anthropic:claude-opus-4-6")).toBe("anthropic"); + expect(resolveProviderVariant("claude-cli/claude-opus-4-6")).toBe("anthropic"); + }); + + it("tags bare model names by prefix", () => { + expect(resolveProviderVariant("gpt-5.4")).toBe("openai"); + expect(resolveProviderVariant("gpt-5.4-alt")).toBe("openai"); + expect(resolveProviderVariant("gpt-4.5")).toBe("openai"); + expect(resolveProviderVariant("o1-preview")).toBe("openai"); + expect(resolveProviderVariant("claude-opus-4-6")).toBe("anthropic"); + expect(resolveProviderVariant("claude-sonnet-4-6")).toBe("anthropic"); + }); + + it("handles case drift and whitespace", () => { + expect(resolveProviderVariant(" OpenAI/GPT-5.4 ")).toBe("openai"); 
+ expect(resolveProviderVariant("ANTHROPIC/CLAUDE-OPUS-4-6")).toBe("anthropic"); + }); + + it("falls through to unknown for unrecognized providers", () => { + expect(resolveProviderVariant("")).toBe("unknown"); + expect(resolveProviderVariant(undefined)).toBe("unknown"); + expect(resolveProviderVariant("mistral/mistral-large")).toBe("unknown"); + expect(resolveProviderVariant("some-random-model")).toBe("unknown"); + }); +}); + +describe("qa mock openai server provider variant tagging", () => { + it("records providerVariant on /debug/last-request for openai requests", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "openai/gpt-5.4", + stream: false, + input: [{ role: "user", content: [{ type: "input_text", text: "Heartbeat check" }] }], + }), + }); + + const debug = (await (await fetch(`${server.baseUrl}/debug/last-request`)).json()) as { + model: string; + providerVariant: string; + }; + expect(debug.model).toBe("openai/gpt-5.4"); + expect(debug.providerVariant).toBe("openai"); + }); + + it("records providerVariant=anthropic on /v1/messages requests", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + messages: [{ role: "user", content: "Heartbeat check" }], + }), + }); + + const debug = (await (await fetch(`${server.baseUrl}/debug/last-request`)).json()) as { + model: string; + providerVariant: string; + }; + expect(debug.model).toBe("claude-opus-4-6"); + expect(debug.providerVariant).toBe("anthropic"); + 
}); + + it("records providerVariant=unknown for unrecognized models", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "mistral/mistral-large", + stream: false, + input: [{ role: "user", content: [{ type: "input_text", text: "Heartbeat check" }] }], + }), + }); + + const debug = (await (await fetch(`${server.baseUrl}/debug/last-request`)).json()) as { + providerVariant: string; + }; + expect(debug.providerVariant).toBe("unknown"); + }); }); diff --git a/extensions/qa-lab/src/mock-openai-server.ts b/extensions/qa-lab/src/mock-openai-server.ts index 1c3e7863f5c..c0e9b6fdcab 100644 --- a/extensions/qa-lab/src/mock-openai-server.ts +++ b/extensions/qa-lab/src/mock-openai-server.ts @@ -22,6 +22,58 @@ type StreamEvent = }; }; +/** + * Provider variant tag for `body.model`. The mock previously ignored + * `body.model` for dispatch and only echoed it in the prose output, which + * made the parity gate tautological when run against the mock alone + * (both providers produced identical scenario plans by construction). + * Tagging requests with a normalized variant lets individual scenario + * branches opt into provider-specific behavior while the rest of the + * dispatcher stays shared, and lets `/debug/requests` consumers verify + * which provider lane a given request came from without re-parsing the + * raw model string. + * + * Policy: + * - `openai/*`, `gpt-*`, `o1-*`, anything starting with `gpt-` → `"openai"` + * - `anthropic/*`, `claude-*` → `"anthropic"` + * - Everything else (including empty strings) → `"unknown"` + * + * The `/v1/messages` route always feeds `body.model` straight through, + * so an Anthropic request with an `openai/gpt-5.4` model string is still + * classified as `"openai"`. 
That matches the parity program's convention + * where the provider label is the source of truth, not the HTTP route. + */ +export type MockOpenAiProviderVariant = "openai" | "anthropic" | "unknown"; + +export function resolveProviderVariant(model: string | undefined): MockOpenAiProviderVariant { + if (typeof model !== "string") { + return "unknown"; + } + const trimmed = model.trim().toLowerCase(); + if (trimmed.length === 0) { + return "unknown"; + } + // Prefer the explicit `provider/model` or `provider:model` prefix when + // the caller supplied one — that's the most reliable signal. + const separatorMatch = /^([^/:]+)[/:]/.exec(trimmed); + const provider = separatorMatch?.[1] ?? trimmed; + if (provider === "openai" || provider === "openai-codex") { + return "openai"; + } + if (provider === "anthropic" || provider === "claude-cli") { + return "anthropic"; + } + // Fall back to model-name prefix matching for bare model strings like + // `gpt-5.4` or `claude-opus-4-6`. + if (/^(?:gpt-|o1-|openai-)/.test(trimmed)) { + return "openai"; + } + if (/^(?:claude-|anthropic-)/.test(trimmed)) { + return "anthropic"; + } + return "unknown"; +} + type MockOpenAiRequestSnapshot = { raw: string; body: Record; @@ -30,13 +82,52 @@ type MockOpenAiRequestSnapshot = { instructions?: string; toolOutput: string; model: string; + providerVariant: MockOpenAiProviderVariant; imageInputCount: number; plannedToolName?: string; }; +// Anthropic /v1/messages request/response shapes the mock actually needs. +// This is a subset of the real Anthropic Messages API — just enough so the +// QA suite can run its parity pack against a "baseline" Anthropic provider +// without needing real API keys. The scenarios drive their dispatch through +// the shared mock scenario logic (buildResponsesPayload), so whatever +// behavior the OpenAI mock exposes is automatically mirrored on this route. 
+type AnthropicMessageContentBlock = + | { type: "text"; text: string } + | { + type: "tool_use"; + id: string; + name: string; + input: Record; + } + | { + type: "tool_result"; + tool_use_id: string; + content: string | Array<{ type: "text"; text: string }>; + } + | { type: "image"; source: Record }; + +type AnthropicMessage = { + role: "user" | "assistant"; + content: string | AnthropicMessageContentBlock[]; +}; + +type AnthropicMessagesRequest = { + model?: string; + max_tokens?: number; + system?: string | Array<{ type: "text"; text: string }>; + messages?: AnthropicMessage[]; + tools?: Array>; + stream?: boolean; +}; + const TINY_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII="; -let subagentFanoutPhase = 0; + +type MockScenarioState = { + subagentFanoutPhase: number; +}; function readBody(req: IncomingMessage): Promise { return new Promise((resolve, reject) => { @@ -68,6 +159,23 @@ function writeSse(res: ServerResponse, events: StreamEvent[]) { res.end(body); } +type AnthropicStreamEvent = Record & { + type: string; +}; + +function writeAnthropicSse(res: ServerResponse, events: AnthropicStreamEvent[]) { + const body = events + .map((event) => `event: ${event.type}\ndata: ${JSON.stringify(event)}\n\n`) + .join(""); + res.writeHead(200, { + "content-type": "text/event-stream", + "cache-control": "no-store", + connection: "keep-alive", + "content-length": Buffer.byteLength(body), + }); + res.end(body); +} + function countApproxTokens(text: string) { const trimmed = text.trim(); if (!trimmed) { @@ -376,11 +484,11 @@ function extractLastCapture(text: string, pattern: RegExp) { } function extractExactReplyDirective(text: string) { - const colonMatch = extractLastCapture(text, /reply(?: with)? exactly:\s*([^\n]+)/i); - if (colonMatch) { - return colonMatch; + const backtickedMatch = extractLastCapture(text, /reply(?: with)? 
exactly\s+`([^`]+)`/i); + if (backtickedMatch) { + return backtickedMatch; } - return extractLastCapture(text, /reply(?: with)? exactly\s+`([^`]+)`/i); + return extractLastCapture(text, /reply(?: with)? exactly:\s*([^\n]+)/i); } function extractExactMarkerDirective(text: string) { @@ -392,10 +500,18 @@ function extractExactMarkerDirective(text: string) { } function isHeartbeatPrompt(text: string) { - return /Read HEARTBEAT\.md if it exists/i.test(text); + const trimmed = text.trim(); + if (!trimmed || /remember this fact/i.test(trimmed)) { + return false; + } + return /(?:^|\n)Read HEARTBEAT\.md if it exists\b/i.test(trimmed); } -function buildAssistantText(input: ResponsesInputItem[], body: Record) { +function buildAssistantText( + input: ResponsesInputItem[], + body: Record, + scenarioState: MockScenarioState, +) { const prompt = extractLastUserText(input); const toolOutput = extractToolOutput(input); const toolJson = parseToolOutputJson(toolOutput); @@ -411,8 +527,10 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record= 2) { + if ( + /subagent fanout synthesis check/i.test(prompt) && + toolOutput && + scenarioState.subagentFanoutPhase >= 2 + ) { return "Protocol note: delegated fanout complete. Alpha=ALPHA-OK. Beta=BETA-OK."; } if (toolOutput && (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt))) { @@ -579,7 +718,10 @@ function buildAssistantEvents(text: string): StreamEvent[] { ]; } -async function buildResponsesPayload(body: Record) { +async function buildResponsesPayload( + body: Record, + scenarioState: MockScenarioState, +) { const input = Array.isArray(body.input) ? 
(body.input as ResponsesInputItem[]) : []; const prompt = extractLastUserText(input); const toolOutput = extractToolOutput(input); @@ -587,6 +729,9 @@ async function buildResponsesPayload(body: Record) { const allInputText = extractAllRequestTexts(input, body); const isGroupChat = allInputText.includes('"is_group_chat": true'); const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt); + if (/remember this fact/i.test(prompt)) { + return buildAssistantEvents(buildAssistantText(input, body, scenarioState)); + } if (isHeartbeatPrompt(prompt)) { return buildAssistantEvents("HEARTBEAT_OK"); } @@ -756,16 +901,16 @@ async function buildResponsesPayload(body: Record) { }); } if (/subagent fanout synthesis check/i.test(prompt)) { - if (!toolOutput && subagentFanoutPhase === 0) { - subagentFanoutPhase = 1; + if (!toolOutput && scenarioState.subagentFanoutPhase === 0) { + scenarioState.subagentFanoutPhase = 1; return buildToolCallEventsWithArgs("sessions_spawn", { task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.", label: "qa-fanout-alpha", thread: false, }); } - if (toolOutput && subagentFanoutPhase === 1) { - subagentFanoutPhase = 2; + if (toolOutput && scenarioState.subagentFanoutPhase === 1) { + scenarioState.subagentFanoutPhase = 2; return buildToolCallEventsWithArgs("sessions_spawn", { task: "Fanout worker beta: inspect the QA workspace and finish with exactly BETA-OK.", label: "qa-fanout-beta", @@ -776,6 +921,30 @@ async function buildResponsesPayload(body: Record) { if (/tool continuity check/i.test(prompt) && !toolOutput) { return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" }); } + if (/repo contract followthrough check/i.test(prompt)) { + if (!toolOutput) { + return buildToolCallEventsWithArgs("read", { path: "AGENT.md" }); + } + if (toolOutput.includes("# Repo contract")) { + return buildToolCallEventsWithArgs("read", { path: "SOUL.md" }); + } + if (toolOutput.includes("# Execution 
style")) { + return buildToolCallEventsWithArgs("read", { path: "FOLLOWTHROUGH_INPUT.md" }); + } + if ( + toolOutput.includes("Mission: prove you followed the repo contract.") && + toolOutput.includes("Evidence path: AGENT.md -> SOUL.md -> FOLLOWTHROUGH_INPUT.md") + ) { + return buildToolCallEventsWithArgs("write", { + path: "repo-contract-summary.txt", + content: [ + "Mission: prove you followed the repo contract.", + "Evidence: AGENT.md -> SOUL.md -> FOLLOWTHROUGH_INPUT.md", + "Status: complete", + ].join("\n"), + }); + } + } if ((/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt)) && !toolOutput) { return buildToolCallEventsWithArgs("sessions_spawn", { task: "Inspect the QA workspace and return one concise protocol note.", @@ -807,12 +976,390 @@ async function buildResponsesPayload(body: Record) { ) { await sleep(60_000); } - return buildAssistantEvents(buildAssistantText(input, body)); + return buildAssistantEvents(buildAssistantText(input, body, scenarioState)); +} + +// --------------------------------------------------------------------------- +// Anthropic /v1/messages adapter +// --------------------------------------------------------------------------- +// +// The QA parity gate needs two comparable scenario runs: one against the +// "candidate" (openai/gpt-5.4) and one against the "baseline" +// (anthropic/claude-opus-4-6). The OpenAI mock above already dispatches all +// the scenario prompt branches we care about. Rather than duplicating that +// machinery, the /v1/messages route below translates Anthropic request +// shapes into the shared ResponsesInputItem[] format, calls the same +// buildResponsesPayload() dispatcher, and then re-serializes the resulting +// events into an Anthropic response. This gives the parity harness a +// baseline lane that exercises the same scenario logic without requiring +// real Anthropic API keys. 
+// +// Scope: handles Anthropic Messages requests with text and tool_result +// content blocks, supporting both non-streaming JSON responses and the +// streaming SSE path used by the parity harness. + +function normalizeAnthropicSystemToString( + system: AnthropicMessagesRequest["system"], +): string | undefined { + if (typeof system === "string") { + return system.trim() || undefined; + } + if (Array.isArray(system)) { + const joined = system + .map((block) => (block?.type === "text" ? block.text : "")) + .filter(Boolean) + .join("\n") + .trim(); + return joined || undefined; + } + return undefined; +} + +function stringifyToolResultContent( + content: Extract["content"], +): string { + if (typeof content === "string") { + return content; + } + if (Array.isArray(content)) { + return content + .map((block) => (block?.type === "text" ? block.text : "")) + .filter(Boolean) + .join("\n"); + } + return ""; +} + +function convertAnthropicMessagesToResponsesInput(params: { + system?: AnthropicMessagesRequest["system"]; + messages: AnthropicMessage[]; +}): ResponsesInputItem[] { + const items: ResponsesInputItem[] = []; + const systemText = normalizeAnthropicSystemToString(params.system); + if (systemText) { + items.push({ + role: "system", + content: [{ type: "input_text", text: systemText }], + }); + } + for (const message of params.messages) { + const content = message.content; + if (typeof content === "string") { + items.push({ + role: message.role, + content: [ + message.role === "assistant" + ? { type: "output_text", text: content } + : { type: "input_text", text: content }, + ], + }); + continue; + } + if (!Array.isArray(content)) { + continue; + } + // Buffer each block type so we can push in OpenAI-Responses order instead + // of the order they appear in the Anthropic content array. 
The parent + // role message must precede any function_call_output items from the same + // turn, otherwise extractToolOutput() (which scans for + // function_call_output AFTER the last user-role index) will not see the + // output and the downstream scenario dispatcher will behave as if no + // tool output was returned. Similarly, assistant tool_use blocks become + // function_call items that must follow the assistant text message they + // narrate. + const textPieces: Array<{ type: "input_text" | "output_text"; text: string }> = []; + const imagePieces: Array<{ type: "input_image"; image_url: string }> = []; + const toolResultItems: ResponsesInputItem[] = []; + const toolUseItems: ResponsesInputItem[] = []; + for (const block of content) { + if (!block || typeof block !== "object") { + continue; + } + if (block.type === "text") { + textPieces.push({ + type: message.role === "assistant" ? "output_text" : "input_text", + text: block.text ?? "", + }); + continue; + } + if (block.type === "image") { + // Mock only needs to count image inputs; a placeholder URL is fine. + imagePieces.push({ type: "input_image", image_url: "anthropic-mock:image" }); + continue; + } + if (block.type === "tool_result") { + const output = stringifyToolResultContent(block.content); + if (output.trim()) { + toolResultItems.push({ type: "function_call_output", output }); + } + continue; + } + if (block.type === "tool_use") { + // Mirror OpenAI's function_call output_item shape so downstream + // prompt extraction still sees "the assistant just emitted a tool + // call". The scenario dispatcher looks for tool_output on the next + // user turn, not the assistant's prior tool_use, so a minimal + // placeholder is enough. + toolUseItems.push({ + type: "function_call", + name: block.name, + arguments: JSON.stringify(block.input ?? 
{}), + call_id: block.id, + }); + continue; + } + } + if (textPieces.length > 0 || imagePieces.length > 0) { + const combinedContent: Array> = [...textPieces, ...imagePieces]; + items.push({ role: message.role, content: combinedContent }); + } + // Emit tool_use (assistant prior calls) and tool_result (user-side + // returns) AFTER the parent role message so extractLastUserText and + // extractToolOutput walk the array in the order they expect. For a + // tool_result-only user turn with no text/image blocks, the parent + // message is intentionally omitted — the function_call_output itself + // represents the user's "return the tool output" turn. + for (const toolUse of toolUseItems) { + items.push(toolUse); + } + for (const toolResult of toolResultItems) { + items.push(toolResult); + } + } + return items; +} + +type ExtractedAssistantOutput = { + text: string; + toolCalls: Array<{ id: string; name: string; input: Record }>; +}; + +function extractFinalAssistantOutputFromEvents(events: StreamEvent[]): ExtractedAssistantOutput { + const toolCalls: ExtractedAssistantOutput["toolCalls"] = []; + let text = ""; + for (const event of events) { + if (event.type !== "response.output_item.done") { + continue; + } + const item = event.item as { + type?: unknown; + name?: unknown; + call_id?: unknown; + id?: unknown; + arguments?: unknown; + content?: unknown; + }; + if (item.type === "function_call" && typeof item.name === "string") { + let input: Record = {}; + if (typeof item.arguments === "string" && item.arguments.trim()) { + try { + const parsed = JSON.parse(item.arguments) as unknown; + if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) { + input = parsed as Record; + } + } catch { + // keep empty input on malformed args — mock dispatcher owns arg shape + } + } + toolCalls.push({ + id: typeof item.call_id === "string" ? 
item.call_id : `toolu_mock_${toolCalls.length + 1}`, + name: item.name, + input, + }); + continue; + } + if (item.type === "message" && Array.isArray(item.content)) { + for (const piece of item.content as Array<{ type?: unknown; text?: unknown }>) { + if (piece?.type === "output_text" && typeof piece.text === "string") { + text = piece.text; + } + } + } + } + return { text, toolCalls }; +} + +function buildAnthropicMessageResponse(params: { + model: string; + extracted: ExtractedAssistantOutput; +}): Record { + const content: Array> = []; + if (params.extracted.text) { + content.push({ type: "text", text: params.extracted.text }); + } + for (const call of params.extracted.toolCalls) { + content.push({ + type: "tool_use", + id: call.id, + name: call.name, + input: call.input, + }); + } + if (content.length === 0) { + content.push({ type: "text", text: "" }); + } + const stopReason = params.extracted.toolCalls.length > 0 ? "tool_use" : "end_turn"; + const approxInputTokens = 64; + const approxOutputTokens = Math.max( + 16, + countApproxTokens(params.extracted.text) + params.extracted.toolCalls.length * 16, + ); + return { + id: `msg_mock_${Math.floor(Math.random() * 1_000_000).toString(16)}`, + type: "message", + role: "assistant", + model: params.model || "claude-opus-4-6", + content, + stop_reason: stopReason, + stop_sequence: null, + usage: { + input_tokens: approxInputTokens, + output_tokens: approxOutputTokens, + }, + }; +} + +function buildAnthropicMessageStreamEvents(params: { + model: string; + extracted: ExtractedAssistantOutput; +}): AnthropicStreamEvent[] { + const approxInputTokens = 64; + const approxOutputTokens = Math.max( + 16, + countApproxTokens(params.extracted.text) + params.extracted.toolCalls.length * 16, + ); + const messageId = `msg_mock_${Math.floor(Math.random() * 1_000_000).toString(16)}`; + const events: AnthropicStreamEvent[] = [ + { + type: "message_start", + message: { + id: messageId, + type: "message", + role: "assistant", + model: 
params.model || "claude-opus-4-6", + content: [], + stop_reason: null, + stop_sequence: null, + usage: { + input_tokens: approxInputTokens, + output_tokens: 0, + }, + }, + }, + ]; + let index = 0; + if (params.extracted.text || params.extracted.toolCalls.length === 0) { + events.push({ + type: "content_block_start", + index, + content_block: { + type: "text", + text: "", + }, + }); + if (params.extracted.text) { + events.push({ + type: "content_block_delta", + index, + delta: { + type: "text_delta", + text: params.extracted.text, + }, + }); + } + events.push({ + type: "content_block_stop", + index, + }); + index += 1; + } + for (const call of params.extracted.toolCalls) { + events.push({ + type: "content_block_start", + index, + content_block: { + type: "tool_use", + id: call.id, + name: call.name, + input: {}, + }, + }); + events.push({ + type: "content_block_delta", + index, + delta: { + type: "input_json_delta", + partial_json: JSON.stringify(call.input ?? {}), + }, + }); + events.push({ + type: "content_block_stop", + index, + }); + index += 1; + } + events.push({ + type: "message_delta", + delta: { + stop_reason: params.extracted.toolCalls.length > 0 ? "tool_use" : "end_turn", + }, + usage: { + input_tokens: approxInputTokens, + output_tokens: approxOutputTokens, + }, + }); + events.push({ + type: "message_stop", + }); + return events; +} + +async function buildMessagesPayload( + body: AnthropicMessagesRequest, + scenarioState: MockScenarioState, +): Promise<{ + events: StreamEvent[]; + input: ResponsesInputItem[]; + extracted: ExtractedAssistantOutput; + responseBody: Record; + streamEvents: AnthropicStreamEvent[]; + model: string; +}> { + const messages = Array.isArray(body.messages) ? body.messages : []; + const input = convertAnthropicMessagesToResponsesInput({ + system: body.system, + messages, + }); + // Treat empty-string model the same as absent. 
A bare typeof check lets + // `""` leak through to `responseBody.model` and `lastRequest.model`, + // which then confuses parity consumers that assume the mock always + // echoes the real provider label. Normalize once and reuse everywhere. + const normalizedModel = + typeof body.model === "string" && body.model.trim() !== "" ? body.model : "claude-opus-4-6"; + // Dispatch through the same scenario logic the /v1/responses route uses. + // The mock dispatcher only reads `body.input`, `body.model`, and + // `body.stream`, so a synthetic shim body is sufficient. + const dispatchBody: Record = { + input, + model: normalizedModel, + stream: false, + }; + const events = await buildResponsesPayload(dispatchBody, scenarioState); + const extracted = extractFinalAssistantOutputFromEvents(events); + const responseBody = buildAnthropicMessageResponse({ + model: normalizedModel, + extracted, + }); + const streamEvents = buildAnthropicMessageStreamEvents({ + model: normalizedModel, + extracted, + }); + return { events, input, extracted, responseBody, streamEvents, model: normalizedModel }; } export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) { const host = params?.host ?? "127.0.0.1"; - subagentFanoutPhase = 0; + const scenarioState: MockScenarioState = { subagentFanoutPhase: 0 }; let lastRequest: MockOpenAiRequestSnapshot | null = null; const requests: MockOpenAiRequestSnapshot[] = []; const imageGenerationRequests: Array> = []; @@ -829,6 +1376,8 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n { id: "gpt-5.4-alt", object: "model" }, { id: "gpt-image-1", object: "model" }, { id: "text-embedding-3-small", object: "model" }, + { id: "claude-opus-4-6", object: "model" }, + { id: "claude-sonnet-4-6", object: "model" }, ], }); return; @@ -888,7 +1437,8 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n const raw = await readBody(req); const body = raw ? 
(JSON.parse(raw) as Record) : {}; const input = Array.isArray(body.input) ? (body.input as ResponsesInputItem[]) : []; - const events = await buildResponsesPayload(body); + const events = await buildResponsesPayload(body, scenarioState); + const resolvedModel = typeof body.model === "string" ? body.model : ""; lastRequest = { raw, body, @@ -896,7 +1446,8 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n allInputText: extractAllRequestTexts(input, body), instructions: extractInstructionsText(body) || undefined, toolOutput: extractToolOutput(input), - model: typeof body.model === "string" ? body.model : "", + model: resolvedModel, + providerVariant: resolveProviderVariant(resolvedModel), imageInputCount: countImageInputs(input), plannedToolName: extractPlannedToolName(events), }; @@ -916,6 +1467,56 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n writeSse(res, events); return; } + if (req.method === "POST" && url.pathname === "/v1/messages") { + const raw = await readBody(req); + let body: AnthropicMessagesRequest = {}; + try { + body = raw ? (JSON.parse(raw) as AnthropicMessagesRequest) : {}; + } catch { + writeJson(res, 400, { + type: "error", + error: { + type: "invalid_request_error", + message: "Malformed JSON body for Anthropic Messages request.", + }, + }); + return; + } + const { + events, + input, + responseBody, + streamEvents, + model: normalizedModel, + } = await buildMessagesPayload(body, scenarioState); + // Record the adapted request snapshot so /debug/requests gives the QA + // suite the same plannedToolName / allInputText / toolOutput signals + // on the Anthropic route that the OpenAI route already exposes. This + // is what lets a single parity run diff assertions across both lanes. + // Reuse the normalized model so an empty-string body.model no longer + // leaks through to `lastRequest.model`. 
+ lastRequest = { + raw, + body: body as Record, + prompt: extractLastUserText(input), + allInputText: extractAllInputTexts(input), + toolOutput: extractToolOutput(input), + model: normalizedModel, + providerVariant: resolveProviderVariant(normalizedModel), + imageInputCount: countImageInputs(input), + plannedToolName: extractPlannedToolName(events), + }; + requests.push(lastRequest); + if (requests.length > 50) { + requests.splice(0, requests.length - 50); + } + if (body.stream === true) { + writeAnthropicSse(res, streamEvents); + return; + } + writeJson(res, 200, responseBody); + return; + } writeJson(res, 404, { error: "not found" }); }); diff --git a/extensions/qa-lab/src/qa-gateway-config.test.ts b/extensions/qa-lab/src/qa-gateway-config.test.ts index 74ae3bfc26a..62973aeb73e 100644 --- a/extensions/qa-lab/src/qa-gateway-config.test.ts +++ b/extensions/qa-lab/src/qa-gateway-config.test.ts @@ -53,6 +53,11 @@ describe("buildQaGatewayConfig", () => { expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("mock-openai/gpt-5.4"); expect(cfg.models?.providers?.["mock-openai"]?.baseUrl).toBe("http://127.0.0.1:44080/v1"); + expect(cfg.models?.providers?.["mock-openai"]?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.openai?.baseUrl).toBe("http://127.0.0.1:44080/v1"); + expect(cfg.models?.providers?.openai?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080"); + expect(cfg.models?.providers?.anthropic?.request).toEqual({ allowPrivateNetwork: true }); expect(cfg.plugins?.allow).toEqual(["memory-core", "qa-channel"]); expect(cfg.plugins?.entries?.["memory-core"]).toEqual({ enabled: true }); expect(cfg.plugins?.entries?.["qa-channel"]).toEqual({ enabled: true }); @@ -66,6 +71,31 @@ describe("buildQaGatewayConfig", () => { expect(cfg.messages?.groupChat?.mentionPatterns).toEqual(["\\b@?openclaw\\b"]); }); + it("maps provider-qualified openai and anthropic 
refs through the mock provider lane", () => { + const cfg = buildQaGatewayConfig({ + bind: "loopback", + gatewayPort: 18789, + gatewayToken: "token", + providerBaseUrl: "http://127.0.0.1:44080/v1", + workspaceDir: "/tmp/qa-workspace", + providerMode: "mock-openai", + primaryModel: "openai/gpt-5.4", + alternateModel: "anthropic/claude-opus-4-6", + }); + + expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("openai/gpt-5.4"); + expect(cfg.models?.providers?.openai?.api).toBe("openai-responses"); + expect(cfg.models?.providers?.openai?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.openai?.models.map((model) => model.id)).toContain("gpt-5.4"); + expect(cfg.models?.providers?.anthropic?.api).toBe("anthropic-messages"); + expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080"); + expect(cfg.models?.providers?.anthropic?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.anthropic?.models.map((model) => model.id)).toContain( + "claude-opus-4-6", + ); + expect(cfg.plugins?.allow).toEqual(["memory-core"]); + }); + it("can omit qa-channel for live transport gateway children", () => { const cfg = buildQaGatewayConfig({ bind: "loopback", diff --git a/extensions/qa-lab/src/qa-gateway-config.ts b/extensions/qa-lab/src/qa-gateway-config.ts index 18f3b9e4a3a..6cc5c2832de 100644 --- a/extensions/qa-lab/src/qa-gateway-config.ts +++ b/extensions/qa-lab/src/qa-gateway-config.ts @@ -45,6 +45,10 @@ export function normalizeQaThinkingLevel(input: unknown): QaThinkingLevel | unde return undefined; } +function trimTrailingApiV1(baseUrl: string) { + return baseUrl.replace(/\/v1\/?$/i, ""); +} + export function mergeQaControlUiAllowedOrigins(extraOrigins?: string[]) { const normalizedExtra = (extraOrigins ?? 
[]) .map((origin) => origin.trim()) @@ -74,10 +78,14 @@ export function buildQaGatewayConfig(params: { thinkingDefault?: QaThinkingLevel; }): OpenClawConfig { const mockProviderBaseUrl = params.providerBaseUrl ?? "http://127.0.0.1:44080/v1"; + const mockAnthropicBaseUrl = trimTrailingApiV1(mockProviderBaseUrl); const mockOpenAiProvider: ModelProviderConfig = { baseUrl: mockProviderBaseUrl, apiKey: "test", api: "openai-responses", + request: { + allowPrivateNetwork: true, + }, models: [ { id: "gpt-5.4", @@ -126,6 +134,50 @@ export function buildQaGatewayConfig(params: { }, ], }; + const mockNamedOpenAiProvider: ModelProviderConfig = { + ...mockOpenAiProvider, + models: mockOpenAiProvider.models.map((model) => ({ ...model })), + }; + const mockAnthropicProvider: ModelProviderConfig = { + baseUrl: mockAnthropicBaseUrl, + apiKey: "test", + api: "anthropic-messages", + request: { + allowPrivateNetwork: true, + }, + models: [ + { + id: "claude-opus-4-6", + name: "claude-opus-4-6", + api: "anthropic-messages", + reasoning: false, + input: ["text", "image"], + cost: { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + }, + contextWindow: 200_000, + maxTokens: 4096, + }, + { + id: "claude-sonnet-4-6", + name: "claude-sonnet-4-6", + api: "anthropic-messages", + reasoning: false, + input: ["text", "image"], + cost: { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + }, + contextWindow: 200_000, + maxTokens: 4096, + }, + ], + }; const providerMode = normalizeQaProviderMode(params.providerMode ?? "mock-openai"); const primaryModel = params.primaryModel ?? 
defaultQaModelForMode(providerMode); const alternateModel = @@ -273,6 +325,8 @@ export function buildQaGatewayConfig(params: { mode: "replace", providers: { "mock-openai": mockOpenAiProvider, + openai: mockNamedOpenAiProvider, + anthropic: mockAnthropicProvider, }, }, } diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index 658194fb382..fa25fc272bb 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -118,6 +118,50 @@ describe("qa scenario catalog", () => { ); }); + it("keeps mock-only image debug assertions guarded in live-frontier runs", () => { + const scenario = readQaScenarioPack().scenarios.find( + (candidate) => candidate.id === "image-understanding-attachment", + ); + const imageRequestAction = scenario?.execution.flow?.steps + .flatMap((step) => step.actions ?? []) + .find( + ( + action, + ): action is { + set: string; + value?: { expr?: string }; + } => + typeof action === "object" && + action !== null && + "set" in action && + action.set === "imageRequest", + ); + const imageRequestExpr = imageRequestAction?.value?.expr; + + expect(imageRequestExpr).toContain("env.mock ?"); + expect(imageRequestExpr).toContain("/debug/requests"); + }); + + it("adds a repo-instruction followthrough scenario to the parity pack", () => { + const scenario = readQaScenarioById("instruction-followthrough-repo-contract"); + const config = readQaScenarioExecutionConfig("instruction-followthrough-repo-contract") as + | { + workspaceFiles?: Record; + prompt?: string; + expectedReplyAll?: string[]; + } + | undefined; + + expect(config?.workspaceFiles?.["AGENT.md"]).toContain("Step order:"); + expect(config?.workspaceFiles?.["SOUL.md"]).toContain("action-first"); + expect(config?.workspaceFiles?.["FOLLOWTHROUGH_INPUT.md"]).toContain( + "Mission: prove you followed the repo contract.", + ); + expect(config?.prompt).toContain("Repo contract followthrough check."); 
+ expect(config?.expectedReplyAll).toEqual(["read:", "wrote:", "status:"]); + expect(scenario.title).toBe("Instruction followthrough repo contract"); + }); + it("rejects malformed string matcher lists before running a flow", () => { expect(() => validateQaScenarioExecutionConfig({ diff --git a/extensions/qa-lab/src/suite.summary-json.test.ts b/extensions/qa-lab/src/suite.summary-json.test.ts new file mode 100644 index 00000000000..5db4a6646f4 --- /dev/null +++ b/extensions/qa-lab/src/suite.summary-json.test.ts @@ -0,0 +1,101 @@ +import { describe, expect, it } from "vitest"; +import { buildQaSuiteSummaryJson } from "./suite.js"; + +describe("buildQaSuiteSummaryJson", () => { + const baseParams = { + // Test scenarios include a `steps: []` field to match the real suite + // scenario-result shape so downstream consumers that rely on the shape + // (parity gate, report render) stay aligned. + scenarios: [ + { name: "Scenario A", status: "pass" as const, steps: [] }, + { name: "Scenario B", status: "fail" as const, details: "something broke", steps: [] }, + ], + startedAt: new Date("2026-04-11T00:00:00.000Z"), + finishedAt: new Date("2026-04-11T00:05:00.000Z"), + providerMode: "mock-openai" as const, + primaryModel: "openai/gpt-5.4", + alternateModel: "openai/gpt-5.4-alt", + fastMode: true, + concurrency: 2, + }; + + it("records provider/model/mode so parity gates can verify labels", () => { + const json = buildQaSuiteSummaryJson(baseParams); + expect(json.run).toMatchObject({ + startedAt: "2026-04-11T00:00:00.000Z", + finishedAt: "2026-04-11T00:05:00.000Z", + providerMode: "mock-openai", + primaryModel: "openai/gpt-5.4", + primaryProvider: "openai", + primaryModelName: "gpt-5.4", + alternateModel: "openai/gpt-5.4-alt", + alternateProvider: "openai", + alternateModelName: "gpt-5.4-alt", + fastMode: true, + concurrency: 2, + scenarioIds: null, + }); + }); + + it("includes scenarioIds in run metadata when provided", () => { + const scenarioIds = 
["approval-turn-tool-followthrough", "subagent-handoff", "memory-recall"]; + const json = buildQaSuiteSummaryJson({ + ...baseParams, + scenarioIds, + }); + expect(json.run.scenarioIds).toEqual(scenarioIds); + }); + + it("treats an empty scenarioIds array as unspecified (no filter)", () => { + // A CLI path that omits --scenario passes an empty array to runQaSuite. + // The summary must encode that as null so downstream parity/report + // tooling doesn't interpret a full run as an explicit empty selection. + const json = buildQaSuiteSummaryJson({ + ...baseParams, + scenarioIds: [], + }); + expect(json.run.scenarioIds).toBeNull(); + }); + + it("records an Anthropic baseline lane cleanly for parity runs", () => { + const json = buildQaSuiteSummaryJson({ + ...baseParams, + primaryModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-sonnet-4-6", + }); + expect(json.run).toMatchObject({ + primaryModel: "anthropic/claude-opus-4-6", + primaryProvider: "anthropic", + primaryModelName: "claude-opus-4-6", + alternateModel: "anthropic/claude-sonnet-4-6", + alternateProvider: "anthropic", + alternateModelName: "claude-sonnet-4-6", + }); + }); + + it("leaves split fields null when a model ref is malformed", () => { + const json = buildQaSuiteSummaryJson({ + ...baseParams, + primaryModel: "not-a-real-ref", + alternateModel: "", + }); + expect(json.run).toMatchObject({ + primaryModel: "not-a-real-ref", + primaryProvider: null, + primaryModelName: null, + alternateModel: "", + alternateProvider: null, + alternateModelName: null, + }); + }); + + it("keeps scenarios and counts alongside the run metadata", () => { + const json = buildQaSuiteSummaryJson(baseParams); + expect(json.scenarios).toHaveLength(2); + expect(json.counts).toEqual({ + total: 2, + passed: 1, + failed: 1, + }); + }); +}); diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index bde9b6786dd..d9f916eac21 100644 --- a/extensions/qa-lab/src/suite.ts +++ 
b/extensions/qa-lab/src/suite.ts @@ -81,7 +81,7 @@ type QaSuiteStep = { run: () => Promise; }; -type QaSuiteScenarioResult = { +export type QaSuiteScenarioResult = { name: string; status: "pass" | "fail"; steps: QaReportCheck[]; @@ -1365,17 +1365,105 @@ function createQaSuiteReportNotes(params: { return params.transport.createReportNotes(params); } +export type QaSuiteSummaryJsonParams = { + scenarios: QaSuiteScenarioResult[]; + startedAt: Date; + finishedAt: Date; + providerMode: QaProviderMode; + primaryModel: string; + alternateModel: string; + fastMode: boolean; + concurrency: number; + scenarioIds?: readonly string[]; +}; + +/** + * Strongly-typed shape of `qa-suite-summary.json`. The GPT-5.4 parity gate + * (agentic-parity-report.ts, #64441) and any future parity wrapper can + * import this type instead of re-declaring the shape, so changes to the + * summary schema propagate through to every consumer at type-check time. + */ +export type QaSuiteSummaryJson = { + scenarios: QaSuiteScenarioResult[]; + counts: { + total: number; + passed: number; + failed: number; + }; + run: { + startedAt: string; + finishedAt: string; + providerMode: QaProviderMode; + primaryModel: string; + primaryProvider: string | null; + primaryModelName: string | null; + alternateModel: string; + alternateProvider: string | null; + alternateModelName: string | null; + fastMode: boolean; + concurrency: number; + scenarioIds: string[] | null; + }; +}; + +/** + * Pure-ish JSON builder for qa-suite-summary.json. Exported so the GPT-5.4 + * parity gate (agentic-parity-report.ts, #64441) and any future parity + * runner can assert-and-trust the provider/model that produced a given + * summary instead of blindly accepting the caller's candidateLabel / + * baselineLabel. Without the `run` block, a maintainer who swaps candidate + * and baseline summary paths could silently produce a mislabeled verdict. 
+ * + * `scenarioIds` is only recorded when the caller passed a non-empty array + * (an explicit scenario selection). A missing or empty array means "no + * filter, full lane-selected catalog", which the summary encodes as `null` + * so parity/report tooling doesn't mistake a full run for an explicit + * empty selection. + */ +export function buildQaSuiteSummaryJson(params: QaSuiteSummaryJsonParams): QaSuiteSummaryJson { + const primarySplit = splitModelRef(params.primaryModel); + const alternateSplit = splitModelRef(params.alternateModel); + return { + scenarios: params.scenarios, + counts: { + total: params.scenarios.length, + passed: params.scenarios.filter((scenario) => scenario.status === "pass").length, + failed: params.scenarios.filter((scenario) => scenario.status === "fail").length, + }, + run: { + startedAt: params.startedAt.toISOString(), + finishedAt: params.finishedAt.toISOString(), + providerMode: params.providerMode, + primaryModel: params.primaryModel, + primaryProvider: primarySplit?.provider ?? null, + primaryModelName: primarySplit?.model ?? null, + alternateModel: params.alternateModel, + alternateProvider: alternateSplit?.provider ?? null, + alternateModelName: alternateSplit?.model ?? null, + fastMode: params.fastMode, + concurrency: params.concurrency, + scenarioIds: + params.scenarioIds && params.scenarioIds.length > 0 ? [...params.scenarioIds] : null, + }, + }; +} + async function writeQaSuiteArtifacts(params: { outputDir: string; startedAt: Date; finishedAt: Date; scenarios: QaSuiteScenarioResult[]; transport: QaTransportAdapter; - providerMode: "mock-openai" | "live-frontier"; + // Reuse the canonical QaProviderMode union instead of re-declaring it + // inline. Loop 6 already unified `QaSuiteSummaryJsonParams.providerMode` + // on this type; keeping the writer in sync prevents drift when model- + // selection.ts adds a new provider mode. 
+ providerMode: QaProviderMode; primaryModel: string; alternateModel: string; fastMode: boolean; concurrency: number; + scenarioIds?: readonly string[]; }) { const report = renderQaMarkdownReport({ title: "OpenClaw QA Scenario Suite", @@ -1395,18 +1483,7 @@ async function writeQaSuiteArtifacts(params: { await fs.writeFile(reportPath, report, "utf8"); await fs.writeFile( summaryPath, - `${JSON.stringify( - { - scenarios: params.scenarios, - counts: { - total: params.scenarios.length, - passed: params.scenarios.filter((scenario) => scenario.status === "pass").length, - failed: params.scenarios.filter((scenario) => scenario.status === "fail").length, - }, - }, - null, - 2, - )}\n`, + `${JSON.stringify(buildQaSuiteSummaryJson(params), null, 2)}\n`, "utf8", ); return { report, reportPath, summaryPath }; @@ -1576,6 +1653,16 @@ export async function runQaSuite(params?: QaSuiteRunParams): Promise 0 + ? selectedCatalogScenarios.map((scenario) => scenario.id) + : undefined, }); lab.setLatestReport({ outputPath: reportPath, @@ -1737,6 +1824,12 @@ export async function runQaSuite(params?: QaSuiteRunParams): Promise 0 + ? selectedCatalogScenarios.map((scenario) => scenario.id) + : undefined, }); const latestReport = { outputPath: reportPath, diff --git a/qa/scenarios/config-restart-capability-flip.md b/qa/scenarios/config-restart-capability-flip.md index 1717941e804..b25cb5e40b7 100644 --- a/qa/scenarios/config-restart-capability-flip.md +++ b/qa/scenarios/config-restart-capability-flip.md @@ -151,6 +151,20 @@ steps: ref: imageStartedAtMs timeoutMs: expr: liveTurnTimeoutMs(env, 45000) + # Tool-call assertion (criterion 2 of the parity completion + # gate in #64227): the restored `image_generate` capability + # must have actually fired as a real tool call. 
Without this + # assertion, a prose reply that just mentions a MEDIA path + # could satisfy the scenario, so strengthen it by requiring + # the mock to have recorded `plannedToolName: "image_generate"` + # against a post-restart request. The `!env.mock || ...` + # guard means this check only runs in mock mode (where + # `/debug/requests` is available); live-frontier runs skip + # it and still pass the rest of the scenario. + - assert: + expr: "!env.mock || [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].some((request) => String(request.allInputText ?? '').toLowerCase().includes('capability flip image check') && request.plannedToolName === 'image_generate')" + message: + expr: "`expected image_generate tool call during capability flip scenario, saw plannedToolNames=${JSON.stringify([...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].filter((request) => String(request.allInputText ?? '').toLowerCase().includes('capability flip image check')).map((request) => request.plannedToolName ?? null))}`" finally: - call: patchConfig args: diff --git a/qa/scenarios/image-understanding-attachment.md b/qa/scenarios/image-understanding-attachment.md index ed508d8aba0..31801ee207f 100644 --- a/qa/scenarios/image-understanding-attachment.md +++ b/qa/scenarios/image-understanding-attachment.md @@ -64,9 +64,26 @@ steps: expr: "!missingColorGroup" message: expr: "`missing expected colors in image description: ${outbound.text}`" + # Image-processing assertion: verify the mock actually received an + # image on the scenario-unique prompt. This is as strong as a + # tool-call assertion for this scenario — unlike the + # `source-docs-discovery-report` / `subagent-handoff` / + # `config-restart-capability-flip` scenarios that rely on a real + # tool call to satisfy the parity criterion, image understanding + # is handled inside the provider's vision capability and does NOT + # emit a tool call the mock can record as `plannedToolName`. 
The + # `imageInputCount` field IS the tool-call evidence for vision + # scenarios: it proves the attachment reached the provider, which + # is the only thing an external harness can verify in mock mode. + # Match on the scenario-unique prompt substring so the assertion + # can't be accidentally satisfied by some other scenario's image + # request that happens to share a debug log with this one. + - set: imageRequest + value: + expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].find((request) => String(request.prompt ?? '').includes('Image understanding check')) : null" - assert: - expr: "!env.mock || (((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.prompt ?? '').includes('Image understanding check'))?.imageInputCount ?? 0) >= 1)" + expr: "!env.mock || (imageRequest && (imageRequest.imageInputCount ?? 0) >= 1)" message: - expr: "`expected at least one input image, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.prompt ?? '').includes('Image understanding check'))?.imageInputCount ?? 0)}`" + expr: "`expected at least one input image on the Image understanding check request, got imageInputCount=${String(imageRequest?.imageInputCount ?? 0)}`" detailsExpr: outbound.text ``` diff --git a/qa/scenarios/instruction-followthrough-repo-contract.md b/qa/scenarios/instruction-followthrough-repo-contract.md new file mode 100644 index 00000000000..4bbee26e63f --- /dev/null +++ b/qa/scenarios/instruction-followthrough-repo-contract.md @@ -0,0 +1,127 @@ +# Instruction followthrough repo contract + +```yaml qa-scenario +id: instruction-followthrough-repo-contract +title: Instruction followthrough repo contract +surface: repo-contract +objective: Verify the agent reads repo instruction files first, follows the required tool order, and completes the first feasible action instead of stopping at a plan. 
+successCriteria: + - Agent reads the seeded instruction files before writing the requested artifact. + - Agent writes the requested artifact in the same run instead of returning only a plan. + - Agent does not ask for permission before the first feasible action. + - Final reply makes the completed read/write sequence explicit. +docsRefs: + - docs/help/testing.md + - docs/channels/qa-channel.md +codeRefs: + - src/agents/system-prompt.ts + - src/agents/pi-embedded-runner/run/incomplete-turn.ts + - extensions/qa-lab/src/mock-openai-server.ts +execution: + kind: flow + summary: Verify the agent reads repo instructions first, then completes the first bounded followthrough task without stalling. + config: + workspaceFiles: + AGENT.md: |- + # Repo contract + + Step order: + 1. Read AGENT.md. + 2. Read SOUL.md. + 3. Read FOLLOWTHROUGH_INPUT.md. + 4. Write ./repo-contract-summary.txt. + 5. Reply with three labeled lines exactly once: Read, Wrote, Status. + + Do not stop after planning. + Do not ask for permission before the first feasible action. + SOUL.md: |- + # Execution style + + Stay brief, honest, and action-first. + If the next tool action is feasible, do it before replying. + FOLLOWTHROUGH_INPUT.md: |- + Mission: prove you followed the repo contract. + Evidence path: AGENT.md -> SOUL.md -> FOLLOWTHROUGH_INPUT.md -> repo-contract-summary.txt + prompt: |- + Repo contract followthrough check. Read AGENT.md, SOUL.md, and FOLLOWTHROUGH_INPUT.md first. + Then follow the repo contract exactly, write ./repo-contract-summary.txt, and reply with + three labeled lines: Read, Wrote, Status. + Do not stop after planning and do not ask for permission before the first feasible action. 
+ expectedReplyAll: + - "read:" + - "wrote:" + - "status:" + forbiddenNeedles: + - need permission + - need your approval + - can you approve + - i would + - i can + - next i would +``` + +```yaml qa-flow +steps: + - name: follows repo instructions instead of stopping at a plan + actions: + - call: reset + - forEach: + items: + expr: "Object.entries(config.workspaceFiles ?? {})" + item: workspaceFile + actions: + - call: fs.writeFile + args: + - expr: "path.join(env.gateway.workspaceDir, String(workspaceFile[0]))" + - expr: "`${String(workspaceFile[1] ?? '').trimEnd()}\\n`" + - utf8 + - set: artifactPath + value: + expr: "path.join(env.gateway.workspaceDir, 'repo-contract-summary.txt')" + - call: runAgentPrompt + args: + - ref: env + - sessionKey: agent:qa:repo-contract + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 40000) + - call: waitForCondition + saveAs: artifact + args: + - lambda: + async: true + expr: "((await fs.readFile(artifactPath, 'utf8').catch(() => null))?.includes('Mission: prove you followed the repo contract.') ? await fs.readFile(artifactPath, 'utf8').catch(() => null) : undefined)" + - expr: liveTurnTimeoutMs(env, 30000) + - expr: "env.providerMode === 'mock-openai' ? 100 : 250" + - set: expectedReplyAll + value: + expr: config.expectedReplyAll.map(normalizeLowercaseStringOrEmpty) + - call: waitForCondition + saveAs: outbound + args: + - lambda: + expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && expectedReplyAll.every((needle) => normalizeLowercaseStringOrEmpty(candidate.text).includes(needle))).at(-1)" + - expr: liveTurnTimeoutMs(env, 30000) + - expr: "env.providerMode === 'mock-openai' ? 
100 : 250" + - assert: + expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(outbound.text).includes(needle))" + message: + expr: "`repo contract followthrough bounced for permission or stalled: ${outbound.text}`" + - set: followthroughDebugRequests + value: + expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].filter((request) => /repo contract followthrough check/i.test(String(request.allInputText ?? ''))) : []" + - assert: + expr: "!env.mock || followthroughDebugRequests.filter((request) => request.plannedToolName === 'read').length >= 3" + message: + expr: "`expected three read tool calls before write, saw plannedToolNames=${JSON.stringify(followthroughDebugRequests.map((request) => request.plannedToolName ?? null))}`" + - assert: + expr: "!env.mock || followthroughDebugRequests.some((request) => request.plannedToolName === 'write')" + message: + expr: "`expected write tool call during repo contract followthrough, saw plannedToolNames=${JSON.stringify(followthroughDebugRequests.map((request) => request.plannedToolName ?? null))}`" + - assert: + expr: "!env.mock || (() => { const readIndices = followthroughDebugRequests.map((r, i) => r.plannedToolName === 'read' ? i : -1).filter(i => i >= 0); const firstWrite = followthroughDebugRequests.findIndex((r) => r.plannedToolName === 'write'); return readIndices.length >= 3 && firstWrite >= 0 && readIndices[2] < firstWrite; })()" + message: + expr: "`expected all 3 reads before any write during repo contract followthrough, saw plannedToolNames=${JSON.stringify(followthroughDebugRequests.map((request) => request.plannedToolName ?? 
null))}`" + detailsExpr: outbound.text +``` diff --git a/qa/scenarios/memory-recall.md b/qa/scenarios/memory-recall.md index a6886afcd01..908cbdca72c 100644 --- a/qa/scenarios/memory-recall.md +++ b/qa/scenarios/memory-recall.md @@ -1,5 +1,36 @@ # Memory recall after context switch + + ```yaml qa-scenario id: memory-recall title: Memory recall after context switch diff --git a/qa/scenarios/model-switch-tool-continuity.md b/qa/scenarios/model-switch-tool-continuity.md index cbee28b6bbd..7e162b2e331 100644 --- a/qa/scenarios/model-switch-tool-continuity.md +++ b/qa/scenarios/model-switch-tool-continuity.md @@ -69,13 +69,22 @@ steps: expr: hasModelSwitchContinuityEvidence(outbound.text) message: expr: "`switch reply missed kickoff continuity: ${outbound.text}`" - - assert: - expr: "!env.mock || (((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName) === 'read')" - message: - expr: "`expected read after switch, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName ?? '')}`" - - assert: - expr: "!env.mock || (((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.model) === 'gpt-5.4-alt')" - message: - expr: "`expected alternate model, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.model ?? '')}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: switchDebugRequests + value: + expr: "await fetchJson(`${env.mock.baseUrl}/debug/requests`)" + - set: switchRequest + value: + expr: "switchDebugRequests.find((request) => String(request.allInputText ?? 
'').includes(config.promptSnippet))" + - assert: + expr: "switchRequest?.plannedToolName === 'read'" + message: + expr: "`expected read after switch, got ${String(switchRequest?.plannedToolName ?? '')}`" + - assert: + expr: "String(switchRequest?.model ?? '') === String(alternate?.model ?? '')" + message: + expr: "`expected alternate model, got ${String(switchRequest?.model ?? '')}`" detailsExpr: outbound.text ``` diff --git a/qa/scenarios/source-docs-discovery-report.md b/qa/scenarios/source-docs-discovery-report.md index 2de07b14fcd..8a4f999478a 100644 --- a/qa/scenarios/source-docs-discovery-report.md +++ b/qa/scenarios/source-docs-discovery-report.md @@ -56,5 +56,20 @@ steps: expr: "!reportsDiscoveryScopeLeak(outbound.text)" message: expr: "`discovery report drifted beyond scope: ${outbound.text}`" + # Parity gate criterion 2 (no fake progress / fake tool completion): + # require an actual read tool call before the prose report. Without this, + # a model could fabricate a plausible Worked/Failed/Blocked/Follow-up + # report without ever touching the repo files the prompt names. The + # debug request log is fetched once and reused for both the assertion + # and its failure-message diagnostic. Each request's allInputText is + # lowercased inline at match time (the real prompt writes it as + # "Worked, Failed, Blocked") so the contains check is case-insensitive. + - set: discoveryDebugRequests + value: + expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))] : []" + - assert: + expr: "!env.mock || discoveryDebugRequests.some((request) => String(request.allInputText ?? '').toLowerCase().includes('worked, failed, blocked') && request.plannedToolName === 'read')" + message: + expr: "`expected at least one read tool call during discovery report scenario, saw plannedToolNames=${JSON.stringify(discoveryDebugRequests.map((request) => request.plannedToolName ?? 
null))}`"
       detailsExpr: outbound.text
 ```
diff --git a/qa/scenarios/subagent-fanout-synthesis.md b/qa/scenarios/subagent-fanout-synthesis.md
index 0f4f6501852..4d142151620 100644
--- a/qa/scenarios/subagent-fanout-synthesis.md
+++ b/qa/scenarios/subagent-fanout-synthesis.md
@@ -113,6 +113,28 @@ steps:
           expr: "sawAlpha && sawBeta"
         message:
           expr: "`fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`"
+      # Tool-call assertion (criterion 2 of the
+      # parity completion gate in #64227): the
+      # scenario must have actually invoked
+      # `sessions_spawn` at least twice, not
+      # just ended up with two rows in the
+      # session store through prose trickery.
+      # The session store alone can be populated
+      # by other flows or by a model that
+      # fabricates "delegation" narration.
+      # `plannedToolName` on the mock's
+      # `/debug/requests` log is the tool-call
+      # ground truth: two recorded sessions_spawn
+      # requests prove both subagents were really
+      # dispatched. Skipped in live mode, where
+      # env.mock (and its request log) is absent.
+      - set: fanoutSpawnRequests
+        value:
+          expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].filter((request) => request.plannedToolName === 'sessions_spawn' && /subagent fanout synthesis check/i.test(String(request.allInputText ?? 
+            ''))) : []"
+      - assert:
+          expr: "!env.mock || fanoutSpawnRequests.length >= 2"
+          message:
+            expr: "`expected at least two sessions_spawn tool calls during subagent fanout scenario, saw ${fanoutSpawnRequests.length}`"
     - set: details
       value:
         expr: "outbound.text"
diff --git a/qa/scenarios/subagent-handoff.md b/qa/scenarios/subagent-handoff.md
index 1b61cd61f7b..74853aa65d9 100644
--- a/qa/scenarios/subagent-handoff.md
+++ b/qa/scenarios/subagent-handoff.md
@@ -46,5 +46,25 @@ steps:
           expr: "!['failed to delegate','could not delegate','subagent unavailable'].some((needle) => normalizeLowercaseStringOrEmpty(outbound.text).includes(needle))"
         message:
           expr: "`subagent handoff reported failure: ${outbound.text}`"
+      # Parity gate criterion 2 (no fake progress / fake tool completion):
+      # require an actual sessions_spawn tool call. Without this, a model
+      # could produce the three labeled sections ("Delegated task", "Result",
+      # "Evidence") as free-form prose without ever delegating to a real
+      # subagent. The assertion is pinned to THIS scenario by matching the
+      # scenario-unique prompt substring "Delegate one bounded QA task"
+      # (not a broad /delegate|subagent/ regex) so the earlier
+      # subagent-fanout-synthesis scenario — which also contains "delegate"
+      # and produces its own pre-tool sessions_spawn request — cannot
+      # satisfy the assertion here. The match is also constrained to
+      # pre-tool requests (no toolOutput) because the mock only plans
+      # sessions_spawn on requests with no toolOutput; the follow-up
+      # request after the tool runs has plannedToolName unset.
+      - set: subagentDebugRequests
+        value:
+          expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))] : []"
+      - assert:
+          expr: "!env.mock || subagentDebugRequests.some((request) => !request.toolOutput && /delegate one bounded qa task/i.test(String(request.allInputText ?? 
'')) && request.plannedToolName === 'sessions_spawn')" + message: + expr: "`expected sessions_spawn tool call during subagent handoff scenario, saw plannedToolNames=${JSON.stringify(subagentDebugRequests.map((request) => request.plannedToolName ?? null))}`" detailsExpr: outbound.text ``` diff --git a/src/canvas-host/a2ui/.bundle.hash b/src/canvas-host/a2ui/.bundle.hash index a06dc954c6a..6d2e0ab9c39 100644 --- a/src/canvas-host/a2ui/.bundle.hash +++ b/src/canvas-host/a2ui/.bundle.hash @@ -1 +1 @@ -b92daceecab88cdb1ceeab30a7321399850a1fd13773af22dbb2035d39cdd5f8 +1d087c0991987824d78c8ac4ec2c0e66d661f4bd4afd12b193d66634c69d75a0