diff --git a/.github/workflows/parity-gate.yml b/.github/workflows/parity-gate.yml new file mode 100644 index 00000000000..a96958eb106 --- /dev/null +++ b/.github/workflows/parity-gate.yml @@ -0,0 +1,93 @@ +name: Parity gate + +on: + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + paths: + - "extensions/qa-lab/**" + - "extensions/qa-channel/**" + - "extensions/openai/**" + - "qa/scenarios/**" + - "src/agents/**" + - "src/context-engine/**" + - "src/gateway/**" + - "src/media/**" + - ".github/workflows/parity-gate.yml" + +permissions: + contents: read + +concurrency: + group: parity-gate-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + +jobs: + parity-gate: + name: Run the GPT-5.4 / Opus 4.6 parity gate against the qa-lab mock + if: ${{ github.event.pull_request.draft != true }} + runs-on: blacksmith-8vcpu-ubuntu-2404 + timeout-minutes: 20 + env: + # Fence the gate off from any real provider credentials. The qa-lab + # mock server + auth staging (PR N) should be enough to produce a + # meaningful verdict without touching a real API. If any of these + # leak into the job env, fail hard instead of silently running + # against a live provider and burning real budget. 
+ OPENAI_API_KEY: "" + ANTHROPIC_API_KEY: "" + OPENCLAW_LIVE_OPENAI_KEY: "" + OPENCLAW_LIVE_ANTHROPIC_KEY: "" + OPENCLAW_LIVE_GEMINI_KEY: "" + OPENCLAW_LIVE_SETUP_TOKEN_VALUE: "" + steps: + - name: Checkout PR + uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: "22.14.0" + cache: "pnpm" + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Run GPT-5.4 lane + run: | + pnpm openclaw qa suite \ + --provider-mode mock-openai \ + --parity-pack agentic \ + --model openai/gpt-5.4 \ + --alt-model openai/gpt-5.4-alt \ + --output-dir .artifacts/qa-e2e/gpt54 + + - name: Run Opus 4.6 lane + run: | + pnpm openclaw qa suite \ + --provider-mode mock-openai \ + --parity-pack agentic \ + --model anthropic/claude-opus-4-6 \ + --alt-model anthropic/claude-sonnet-4-6 \ + --output-dir .artifacts/qa-e2e/opus46 + + - name: Generate parity report + run: | + pnpm openclaw qa parity-report \ + --repo-root . 
\ + --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \ + --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \ + --candidate-label openai/gpt-5.4 \ + --baseline-label anthropic/claude-opus-4-6 \ + --output-dir .artifacts/qa-e2e/parity + + - name: Upload parity artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: parity-gate-${{ github.event.pull_request.number || github.sha }} + path: .artifacts/qa-e2e/ + retention-days: 14 + if-no-files-found: warn diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts index a8291027ac4..8fcee784939 100644 --- a/extensions/qa-lab/src/agentic-parity-report.test.ts +++ b/extensions/qa-lab/src/agentic-parity-report.test.ts @@ -2,16 +2,42 @@ import { describe, expect, it } from "vitest"; import { buildQaAgenticParityComparison, computeQaAgenticParityMetrics, + QaParityLabelMismatchError, renderQaAgenticParityMarkdownReport, + type QaParityReportScenario, type QaParitySuiteSummary, } from "./agentic-parity-report.js"; +const FULL_PARITY_PASS_SCENARIOS: QaParityReportScenario[] = [ + { name: "Approval turn tool followthrough", status: "pass" as const }, + { name: "Compaction retry after mutating tool", status: "pass" as const }, + { name: "Model switch with tool continuity", status: "pass" as const }, + { name: "Source and docs discovery report", status: "pass" as const }, + { name: "Image understanding from attachment", status: "pass" as const }, + { name: "Subagent handoff", status: "pass" as const }, + { name: "Subagent fanout synthesis", status: "pass" as const }, + { name: "Memory recall after context switch", status: "pass" as const }, + { name: "Thread memory isolation", status: "pass" as const }, + { name: "Config restart capability flip", status: "pass" as const }, + { name: "Instruction followthrough repo contract", status: "pass" as const }, +]; + +function withScenarioOverride(name: string, override: Partial) 
{ + return FULL_PARITY_PASS_SCENARIOS.map((scenario) => + scenario.name === name ? { ...scenario, ...override } : scenario, + ); +} + describe("qa agentic parity report", () => { it("computes first-wave parity metrics from suite summaries", () => { const summary: QaParitySuiteSummary = { scenarios: [ - { name: "Scenario A", status: "pass" }, - { name: "Scenario B", status: "fail", details: "incomplete turn detected" }, + { name: "Approval turn tool followthrough", status: "pass" }, + { + name: "Compaction retry after mutating tool", + status: "fail", + details: "incomplete turn detected", + }, ], }; @@ -28,6 +54,23 @@ describe("qa agentic parity report", () => { }); }); + it("keeps non-tool scenarios out of the valid-tool-call metric", () => { + const summary: QaParitySuiteSummary = { + scenarios: [ + { name: "Approval turn tool followthrough", status: "pass" }, + { name: "Memory recall after context switch", status: "pass" }, + { name: "Image understanding from attachment", status: "pass" }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary)).toMatchObject({ + totalScenarios: 3, + passedScenarios: 3, + validToolCallCount: 1, + validToolCallRate: 1, + }); + }); + it("fails the parity gate when the candidate regresses against baseline", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.4", @@ -207,33 +250,70 @@ describe("qa agentic parity report", () => { ); }); + it("fails the parity gate when a required parity scenario fails on both sides", () => { + // Regression for the loop-7 Codex-connector P1 finding: without this + // check, a required parity scenario that fails on both candidate and + // baseline still produces pass=true because the downstream metric + // comparisons are purely relative (candidate vs baseline). Cover the + // whole parity pack as pass on both sides except the one scenario we + // deliberately fail on both sides, so the assertion can pin the + // isolated gate failure under test. 
+ const scenariosWithBothFail = withScenarioOverride("Approval turn tool followthrough", { + status: "fail", + }); + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { scenarios: scenariosWithBothFail }, + baselineSummary: { scenarios: scenariosWithBothFail }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(false); + expect(comparison.failures).toContain( + "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.4=fail, anthropic/claude-opus-4-6=fail.", + ); + // Metric comparisons are relative, so a same-on-both-sides failure + // must not appear as a relative metric failure. The required-scenario + // failure line is the only thing keeping the gate honest here. + expect(comparison.failures.some((failure) => failure.includes("completion rate"))).toBe(false); + }); + + it("fails the parity gate when a required parity scenario fails on the candidate only", () => { + // A candidate regression below a passing baseline is already caught + // by the relative completion-rate comparison, but surface it as a + // named required-scenario failure too so operators see a concrete + // scenario name alongside the rate differential. 
+ const candidateWithOneFail = withScenarioOverride("Approval turn tool followthrough", { + status: "fail", + }); + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { scenarios: candidateWithOneFail }, + baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(false); + expect(comparison.failures).toContain( + "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.4=fail, anthropic/claude-opus-4-6=pass.", + ); + }); + it("fails the parity gate when the baseline contains suspicious pass results", () => { - // Cover the full first-wave pack on both sides so the suspicious-pass assertion + // Cover the full second-wave pack on both sides so the suspicious-pass assertion // below is the isolated gate failure under test (no coverage-gap noise). const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.4", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { - scenarios: [ - { name: "Approval turn tool followthrough", status: "pass" }, - { name: "Compaction retry after mutating tool", status: "pass" }, - { name: "Model switch with tool continuity", status: "pass" }, - { name: "Source and docs discovery report", status: "pass" }, - { name: "Image understanding from attachment", status: "pass" }, - ], + scenarios: FULL_PARITY_PASS_SCENARIOS, }, baselineSummary: { - scenarios: [ - { - name: "Approval turn tool followthrough", - status: "pass", - details: "timed out before it continued", - }, - { name: "Compaction retry after mutating tool", status: "pass" }, - { name: "Model switch with tool continuity", status: "pass" }, - { name: "Source and docs discovery report", status: "pass" }, - { name: "Image understanding from attachment", status: "pass" }, - ], + scenarios: withScenarioOverride("Approval turn tool followthrough", { + 
details: "timed out before it continued", + }), }, comparedAt: "2026-04-11T00:00:00.000Z", }); @@ -303,36 +383,333 @@ Follow-up: expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(1); }); - it("renders a readable markdown parity report", () => { + it("does not flag positive-tone prose as fake success (positive-tone detection removed)", () => { + // Positive-tone detection was removed because for passing runs the + // `details` field is the model's prose, which never contains tool-call + // evidence. Criterion 2 is enforced by per-scenario tool-call assertions. + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Subagent handoff", + status: "pass", + details: "Successfully completed the delegation. The subagent returned its result.", + }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0); + }); + + it("does not flag bare 'Done.' prose as fake success", () => { + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Approval turn tool followthrough", + status: "pass", + details: "Done.", + }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0); + }); + + it("does not flag structured status lines that end in `done`", () => { + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Compaction retry after mutating tool", + status: "pass", + details: `Confirmed, replay unsafe after write. +compactionCount=0 +status=done`, + }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0); + }); + + it("does not flag positive-tone passes when the scenario shows real tool-call evidence", () => { + // A legitimate tool-mediated pass that happens to include + // "successfully" in its prose must not be flagged. The + // `plannedToolName` evidence (or any of the other tool-call + // evidence patterns) exempts the scenario from positive-tone + // detection. 
Without this exemption, real tool-backed passes with + // self-congratulatory prose would count as fake successes and break + // the gate. + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Source and docs discovery report", + status: "pass", + details: + "Successfully completed the report. plannedToolName=read recorded via /debug/requests.", + }, + ], + }; + + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0); + }); + + it("only flags failure-tone passes, not positive-tone", () => { + const summary: QaParitySuiteSummary = { + scenarios: [ + { + name: "Approval turn tool followthrough", + status: "pass", + details: "Task executed successfully without errors.", + }, + { + name: "Subagent handoff", + status: "pass", + details: "Tool call completed, but an error occurred mid-turn.", + }, + ], + }; + + // Only the failure-tone scenario ("error occurred") counts. + // The positive-tone one ("successfully") is not flagged. + expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(1); + }); + + it("throws QaParityLabelMismatchError when the candidate run.primaryProvider does not match the label", () => { + // Regression for the gate footgun: if an operator swaps the + // --candidate-summary and --baseline-summary paths, the gate would + // silently produce a reversed verdict. PR L #64789 ships the `run` + // block on every summary so the parity report can verify it against + // the caller-supplied label; this test pins the precondition check. 
+ const parityPassScenarios = [ + { name: "Approval turn tool followthrough", status: "pass" as const }, + { name: "Compaction retry after mutating tool", status: "pass" as const }, + { name: "Model switch with tool continuity", status: "pass" as const }, + { name: "Source and docs discovery report", status: "pass" as const }, + { name: "Image understanding from attachment", status: "pass" as const }, + ]; + + expect(() => + buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { + scenarios: parityPassScenarios, + run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-6" }, + }, + baselineSummary: { + scenarios: parityPassScenarios, + run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-6" }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }), + ).toThrow(QaParityLabelMismatchError); + }); + + it("throws QaParityLabelMismatchError when the baseline run.primaryProvider does not match the label", () => { + const parityPassScenarios = [ + { name: "Approval turn tool followthrough", status: "pass" as const }, + ]; + + expect(() => + buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { + scenarios: parityPassScenarios, + run: { primaryProvider: "openai" }, + }, + baselineSummary: { + scenarios: parityPassScenarios, + run: { primaryProvider: "openai", primaryModel: "gpt-5.4" }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }), + ).toThrow( + /baseline summary run\.primaryProvider=openai and run\.primaryModel=gpt-5\.4 do not match --baseline-label/, + ); + }); + + it("accepts matching run.primaryProvider labels without throwing", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.4", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { - scenarios: [ - { name: "Approval turn tool followthrough", status: "pass" }, - { name: 
"Compaction retry after mutating tool", status: "pass" }, - { name: "Model switch with tool continuity", status: "pass" }, - { name: "Source and docs discovery report", status: "pass" }, - { name: "Image understanding from attachment", status: "pass" }, - ], + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4", + primaryModelName: "gpt-5.4", + }, }, baselineSummary: { - scenarios: [ - { name: "Approval turn tool followthrough", status: "pass" }, - { name: "Compaction retry after mutating tool", status: "pass" }, - { name: "Model switch with tool continuity", status: "pass" }, - { name: "Source and docs discovery report", status: "pass" }, - { name: "Image understanding from attachment", status: "pass" }, - ], + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, }, comparedAt: "2026-04-11T00:00:00.000Z", }); + expect(comparison.pass).toBe(true); + }); + + it("skips run.primaryProvider verification when the summary is missing a run block (legacy summaries)", () => { + // Pre-PR-L summaries don't carry a `run` block. The gate must still + // work against those, trusting the caller-supplied label. 
+ const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + expect(comparison.pass).toBe(true); + }); + + it("skips provider verification for arbitrary display labels when run metadata is present", () => { + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "GPT-5.4 candidate", + baselineLabel: "Opus 4.6 baseline", + candidateSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4", + primaryModelName: "gpt-5.4", + }, + }, + baselineSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(true); + }); + + it("skips provider verification for mixed-case or decorated display labels", () => { + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "Candidate: GPT-5.4", + baselineLabel: "Opus 4.6 / baseline", + candidateSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4", + primaryModelName: "gpt-5.4", + }, + }, + baselineSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(true); + }); + + it("throws when a structured label mismatches the recorded model even if the provider matches", () => { + expect(() => + buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + 
candidateSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4-alt", + primaryModelName: "gpt-5.4-alt", + }, + }, + baselineSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }), + ).toThrow( + /candidate summary run\.primaryProvider=openai and run\.primaryModel=openai\/gpt-5\.4-alt do not match --candidate-label=openai\/gpt-5\.4/, + ); + }); + + it("accepts colon-delimited structured labels when provider and model both match", () => { + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai:gpt-5.4", + baselineLabel: "anthropic:claude-opus-4-6", + candidateSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "openai", + primaryModel: "openai/gpt-5.4", + primaryModelName: "gpt-5.4", + }, + }, + baselineSummary: { + scenarios: FULL_PARITY_PASS_SCENARIOS, + run: { + primaryProvider: "anthropic", + primaryModel: "anthropic/claude-opus-4-6", + primaryModelName: "claude-opus-4-6", + }, + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(true); + }); + + it("renders a readable markdown parity report", () => { + // Cover the full parity pack on both sides so the pass + // verdict is not disrupted by required-scenario coverage failures + // added by the second-wave expansion. 
+ const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); const report = renderQaAgenticParityMarkdownReport(comparison); - expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report"); + expect(report).toContain( + "# OpenClaw Agentic Parity Report — openai/gpt-5.4 vs anthropic/claude-opus-4-6", + ); expect(report).toContain("| Completion rate | 100.0% | 100.0% |"); expect(report).toContain("### Approval turn tool followthrough"); expect(report).toContain("- Verdict: pass"); }); + + it("parametrizes the markdown header from the comparison labels", () => { + // Regression for the loop-7 Copilot finding: callers that configure + // non-gpt-5.4 / non-opus labels (for example an internal candidate vs + // another candidate) must see the labels in the rendered H1 instead of + // the hardcoded "GPT-5.4 / Opus 4.6" title that would otherwise confuse + // readers of saved reports. 
+ const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4-alt", + baselineLabel: "openai/gpt-5.4", + candidateSummary: { scenarios: [] }, + baselineSummary: { scenarios: [] }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + const report = renderQaAgenticParityMarkdownReport(comparison); + expect(report).toContain( + "# OpenClaw Agentic Parity Report — openai/gpt-5.4-alt vs openai/gpt-5.4", + ); + }); }); diff --git a/extensions/qa-lab/src/agentic-parity-report.ts b/extensions/qa-lab/src/agentic-parity-report.ts index 0e900253b0d..d057ceffb6c 100644 --- a/extensions/qa-lab/src/agentic-parity-report.ts +++ b/extensions/qa-lab/src/agentic-parity-report.ts @@ -1,4 +1,7 @@ -import { QA_AGENTIC_PARITY_SCENARIO_TITLES } from "./agentic-parity.js"; +import { + QA_AGENTIC_PARITY_SCENARIO_TITLES, + QA_AGENTIC_PARITY_TOOL_BACKED_SCENARIO_TITLES, +} from "./agentic-parity.js"; export type QaParityReportStep = { name: string; @@ -13,6 +16,21 @@ export type QaParityReportScenario = { steps?: QaParityReportStep[]; }; +/** + * Optional self-describing run metadata written by PR L (#64789). Before + * that PR merges, older summaries only have `scenarios` + `counts`; the + * parity report treats a missing `run` block as "unknown provenance" and + * skips the label-match verification for backwards compatibility + * with legacy summaries that predate the run metadata block. + */ +export type QaParityRunBlock = { + primaryProvider?: string; + primaryModel?: string; + primaryModelName?: string; + providerMode?: string; + scenarioIds?: readonly string[] | null; +}; + export type QaParitySuiteSummary = { scenarios: QaParityReportScenario[]; counts?: { @@ -20,6 +38,8 @@ export type QaParitySuiteSummary = { passed?: number; failed?: number; }; + /** Self-describing run metadata — see PR L #64789 for the writer side. 
*/ + run?: QaParityRunBlock; }; export type QaAgenticParityMetrics = { @@ -64,7 +84,11 @@ const UNINTENDED_STOP_PATTERNS = [ /did not continue/i, ] as const; -const SUSPICIOUS_PASS_PATTERNS = [ +// Failure-tone patterns: a passing scenario whose details text matches any +// of these is treated as a "fake success" — the scenario is marked pass but +// the supporting text reveals something went wrong. Adding new patterns here +// widens the net for bad prose that correlates with runtime failure modes. +const SUSPICIOUS_PASS_FAILURE_TONE_PATTERNS = [ /incomplete turn/i, /\btimed out\b/i, /\btimeout\b/i, @@ -76,6 +100,13 @@ const SUSPICIOUS_PASS_PATTERNS = [ /an error was/i, ] as const; +// Positive-tone patterns (e.g. "Successfully completed", "Done.") are NOT +// checked in fakeSuccessCount. For passing runs, `details` is the model's +// outbound prose, which never contains tool-call evidence strings, so a +// tool-call-evidence exemption would false-positive on every legitimate +// pass. Criterion 2 ("no fake progress") is enforced by per-scenario +// `/debug/requests` tool-call assertions in the YAML flows (PR J) instead. + function normalizeScenarioStatus(status: string | undefined): "pass" | "fail" | "skip" { return status === "pass" || status === "fail" || status === "skip" ? status : "fail"; } @@ -103,6 +134,9 @@ export function computeQaAgenticParityMetrics( ...scenario, status: normalizeScenarioStatus(scenario.status), })); + const toolBackedTitleSet: ReadonlySet = new Set( + QA_AGENTIC_PARITY_TOOL_BACKED_SCENARIO_TITLES, + ); const totalScenarios = summary.counts?.total ?? scenarios.length; const passedScenarios = summary.counts?.passed ?? 
scenarios.filter((scenario) => scenario.status === "pass").length; @@ -112,16 +146,40 @@ export function computeQaAgenticParityMetrics( (scenario) => scenario.status !== "pass" && scenarioHasPattern(scenario, UNINTENDED_STOP_PATTERNS), ).length; - const fakeSuccessCount = scenarios.filter( - (scenario) => - scenario.status === "pass" && scenarioHasPattern(scenario, SUSPICIOUS_PASS_PATTERNS), + const fakeSuccessCount = scenarios.filter((scenario) => { + if (scenario.status !== "pass") { + return false; + } + // Failure-tone patterns catch obviously-broken passes regardless of + // whether the scenario shows tool-call evidence — "timed out" under a + // pass is always fake. + if (scenarioHasPattern(scenario, SUSPICIOUS_PASS_FAILURE_TONE_PATTERNS)) { + return true; + } + // Positive-tone patterns (like "Successfully completed") are NOT checked + // here because for passing runs the `details` field is the model's + // outbound prose, which never contains tool-call evidence strings. + // The `scenarioLacksToolCallEvidence` check would return true for ALL + // passes and false-positive on legitimate completions. Criterion 2 + // ("no fake tool completion") is instead enforced by the per-scenario + // `/debug/requests` tool-call assertions from the scenario YAML flows. + return false; + }).length; + + // Count only the scenarios that are supposed to exercise a real tool, + // subagent, or capability invocation. Memory recall and image-only + // understanding lanes stay in the parity pack, but they should not inflate + // the tool-call metric just by passing. + const toolBackedScenarioCount = scenarios.filter((scenario) => + toolBackedTitleSet.has(scenario.name), + ).length; + const validToolCallCount = scenarios.filter( + (scenario) => toolBackedTitleSet.has(scenario.name) && scenario.status === "pass", ).length; - // First-wave parity scenarios are all tool-mediated tasks, so a passing scenario is our - // verified unit of valid tool-backed execution in this harness. 
- const validToolCallCount = passedScenarios; - const rate = (value: number) => (totalScenarios > 0 ? value / totalScenarios : 0); + const rate = (value: number) => (totalScenarios > 0 ? value / totalScenarios : 0); + const toolRate = (value: number) => (toolBackedScenarioCount > 0 ? value / toolBackedScenarioCount : 0); return { totalScenarios, passedScenarios, @@ -130,7 +188,7 @@ unintendedStopCount, unintendedStopRate: rate(unintendedStopCount), validToolCallCount, - validToolCallRate: rate(validToolCallCount), + validToolCallRate: toolRate(validToolCallCount), fakeSuccessCount, }; } @@ -149,14 +207,116 @@ function scopeSummaryToParityPack( summary: QaParitySuiteSummary, parityTitleSet: ReadonlySet<string>, ): QaParitySuiteSummary { - // The parity verdict must only consider the declared first-wave parity scenarios. - // Drop `counts` so the metric helper recomputes totals from the filtered scenario - // list instead of inheriting the caller's full-suite counters. + // The parity verdict must only consider the declared parity scenarios + // (the full first-wave + second-wave pack from QA_AGENTIC_PARITY_SCENARIOS). + // Drop `counts` so the metric helper recomputes totals from the filtered + // scenario list instead of inheriting the caller's full-suite counters. return { scenarios: summary.scenarios.filter((scenario) => parityTitleSet.has(scenario.name)), + ...(summary.run ? { run: summary.run } : {}), }; } +type StructuredQaParityLabel = { + provider: string; + model: string; +}; + +/** + * Only treat caller labels as provenance-checked identifiers when they are + * exact lower-case provider/model refs. Human-facing display labels like + * "GPT-5.4 candidate" or "Candidate: GPT-5.4" should render in the report + * without being misread as structured provider ids. 
+ */ +function parseStructuredLabelRef(label: string): StructuredQaParityLabel | null { + const trimmed = label.trim(); + if (trimmed.length === 0) { + return null; + } + if (trimmed !== trimmed.toLowerCase()) { + return null; + } + const separatorMatch = /^([a-z0-9][a-z0-9-]*)[/:]([a-z0-9][a-z0-9._-]*)$/.exec(trimmed); + if (!separatorMatch) { + return null; + } + return { + provider: separatorMatch[1] ?? "", + model: separatorMatch[2] ?? "", + }; +} + +/** + * Verify the `run.primaryProvider` + `run.primaryModel` fields on a summary + * match the caller-supplied label when that label is a structured + * `provider/model` or `provider:model` ref. PR L #64789 ships the `run` + * block; before it lands, older summaries don't have the field and this check + * is a no-op. + * + * Throws `QaParityLabelMismatchError` when the summary reports a different + * provider/model than the caller claimed — this catches the "swapped + * candidate and baseline summary paths" footgun the earlier adversarial + * review flagged. Returns silently when the fields are absent (legacy + * summaries) or when the fields match. 
+ */ +function verifySummaryLabelMatch(params: { + summary: QaParitySuiteSummary; + label: string; + role: "candidate" | "baseline"; +}): void { + const runProvider = params.summary.run?.primaryProvider?.trim(); + const runModel = params.summary.run?.primaryModel?.trim(); + const runModelName = params.summary.run?.primaryModelName?.trim(); + if (!runProvider || !runModel) { + return; + } + const labelRef = parseStructuredLabelRef(params.label); + if (!labelRef) { + return; + } + const normalizedRunModel = runModel.toLowerCase(); + const normalizedRunModelName = runModelName?.toLowerCase(); + const normalizedLabelModel = labelRef.model; + if ( + runProvider.toLowerCase() === labelRef.provider && + (normalizedRunModel === normalizedLabelModel || + normalizedRunModelName === normalizedLabelModel || + normalizedRunModel === `${labelRef.provider}/${normalizedLabelModel}`) + ) { + return; + } + throw new QaParityLabelMismatchError({ + role: params.role, + label: params.label, + runProvider, + runModel, + }); +} + +export class QaParityLabelMismatchError extends Error { + readonly role: "candidate" | "baseline"; + readonly label: string; + readonly runProvider: string; + readonly runModel: string; + + constructor(params: { + role: "candidate" | "baseline"; + label: string; + runProvider: string; + runModel: string; + }) { + super( + `${params.role} summary run.primaryProvider=${params.runProvider} and run.primaryModel=${params.runModel} do not match --${params.role}-label=${params.label}. 
` + + `Check that the --candidate-summary / --baseline-summary paths weren't swapped.`, + ); + this.name = "QaParityLabelMismatchError"; + this.role = params.role; + this.label = params.label; + this.runProvider = params.runProvider; + this.runModel = params.runModel; + } +} + export function buildQaAgenticParityComparison(params: { candidateLabel: string; baselineLabel: string; @@ -164,6 +324,22 @@ export function buildQaAgenticParityComparison(params: { baselineSummary: QaParitySuiteSummary; comparedAt?: string; }): QaAgenticParityComparison { + // Precondition: verify the `run.primaryProvider` field on each summary + // matches the caller-supplied label (when the `run` block is present). + // Throws `QaParityLabelMismatchError` on mismatch so the release gate + // fails loudly instead of silently producing a reversed verdict when an + // operator swaps the --candidate-summary and --baseline-summary paths. + // Legacy summaries without a `run` block are accepted as-is. + verifySummaryLabelMatch({ + summary: params.candidateSummary, + label: params.candidateLabel, + role: "candidate", + }); + verifySummaryLabelMatch({ + summary: params.baselineSummary, + label: params.baselineLabel, + role: "baseline", + }); const parityTitleSet: ReadonlySet = new Set(QA_AGENTIC_PARITY_SCENARIO_TITLES); // Rates and fake-success counts are computed from the parity-scoped summaries only, // so extra non-parity scenarios in the input (for example when a caller feeds a full @@ -203,7 +379,7 @@ export function buildQaAgenticParityComparison(params: { }); const failures: string[] = []; - const requiredScenarioCoverage = QA_AGENTIC_PARITY_SCENARIO_TITLES.map((name) => { + const requiredScenarioStatuses = QA_AGENTIC_PARITY_SCENARIO_TITLES.map((name) => { const candidate = candidateByName.get(name); const baseline = baselineByName.get(name); return { @@ -211,7 +387,8 @@ export function buildQaAgenticParityComparison(params: { candidateStatus: requiredCoverageStatus(candidate), 
baselineStatus: requiredCoverageStatus(baseline), }; - }).filter( + }); + const requiredScenarioCoverage = requiredScenarioStatuses.filter( (scenario) => scenario.candidateStatus === "missing" || scenario.baselineStatus === "missing" || @@ -223,6 +400,26 @@ export function buildQaAgenticParityComparison(params: { `Missing required parity scenario coverage for ${scenario.name}: ${params.candidateLabel}=${scenario.candidateStatus}, ${params.baselineLabel}=${scenario.baselineStatus}.`, ); } + // Required parity scenarios that ran on both sides but FAILED also fail + // the gate. Without this check, a run where both models fail the same + // required scenarios still produced pass=true, because the downstream + // metric comparisons are purely relative (candidate vs baseline) and + // the suspicious-pass fake-success check only catches passes that carry + // failure-sounding details. Excluding missing/skip here keeps operator + // output from double-counting the same scenario with two lines. 
+ const requiredScenarioFailures = requiredScenarioStatuses.filter( + (scenario) => + scenario.candidateStatus !== "missing" && + scenario.baselineStatus !== "missing" && + scenario.candidateStatus !== "skip" && + scenario.baselineStatus !== "skip" && + (scenario.candidateStatus === "fail" || scenario.baselineStatus === "fail"), + ); + for (const scenario of requiredScenarioFailures) { + failures.push( + `Required parity scenario ${scenario.name} failed: ${params.candidateLabel}=${scenario.candidateStatus}, ${params.baselineLabel}=${scenario.baselineStatus}.`, + ); + } // Required parity scenarios are already reported via `requiredScenarioCoverage` // above; excluding them here keeps the operator-facing failure list from // double-counting the same missing scenario (one "Missing required parity scenario @@ -281,8 +478,13 @@ export function buildQaAgenticParityComparison(params: { } export function renderQaAgenticParityMarkdownReport(comparison: QaAgenticParityComparison): string { + // Title is parametrized from the candidate / baseline labels so reports + // for any candidate/baseline pair (not only gpt-5.4 vs opus 4.6) render + // with an accurate header. The default CLI labels are still + // openai/gpt-5.4 vs anthropic/claude-opus-4-6, but the helper works for + // any parity comparison a caller configures. 
const lines = [ - "# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report", + `# OpenClaw Agentic Parity Report — ${comparison.candidateLabel} vs ${comparison.baselineLabel}`, "", `- Compared at: ${comparison.comparedAt}`, `- Candidate: ${comparison.candidateLabel}`, diff --git a/extensions/qa-lab/src/agentic-parity.ts b/extensions/qa-lab/src/agentic-parity.ts index e2972c92e17..d997778e85f 100644 --- a/extensions/qa-lab/src/agentic-parity.ts +++ b/extensions/qa-lab/src/agentic-parity.ts @@ -4,22 +4,57 @@ export const QA_AGENTIC_PARITY_SCENARIOS = [ { id: "approval-turn-tool-followthrough", title: "Approval turn tool followthrough", + countsTowardValidToolCallRate: true, }, { id: "model-switch-tool-continuity", title: "Model switch with tool continuity", + countsTowardValidToolCallRate: true, }, { id: "source-docs-discovery-report", title: "Source and docs discovery report", + countsTowardValidToolCallRate: true, }, { id: "image-understanding-attachment", title: "Image understanding from attachment", + countsTowardValidToolCallRate: false, }, { id: "compaction-retry-mutating-tool", title: "Compaction retry after mutating tool", + countsTowardValidToolCallRate: true, + }, + { + id: "subagent-handoff", + title: "Subagent handoff", + countsTowardValidToolCallRate: true, + }, + { + id: "subagent-fanout-synthesis", + title: "Subagent fanout synthesis", + countsTowardValidToolCallRate: true, + }, + { + id: "memory-recall", + title: "Memory recall after context switch", + countsTowardValidToolCallRate: false, + }, + { + id: "thread-memory-isolation", + title: "Thread memory isolation", + countsTowardValidToolCallRate: true, + }, + { + id: "config-restart-capability-flip", + title: "Config restart capability flip", + countsTowardValidToolCallRate: true, + }, + { + id: "instruction-followthrough-repo-contract", + title: "Instruction followthrough repo contract", + countsTowardValidToolCallRate: true, }, ] as const; @@ -27,6 +62,9 @@ export const QA_AGENTIC_PARITY_SCENARIO_IDS 
= QA_AGENTIC_PARITY_SCENARIOS.map(({ export const QA_AGENTIC_PARITY_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.map( ({ title }) => title, ); +export const QA_AGENTIC_PARITY_TOOL_BACKED_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.filter( + ({ countsTowardValidToolCallRate }) => countsTowardValidToolCallRate, +).map(({ title }) => title); export function resolveQaParityPackScenarioIds(params: { parityPack?: string; diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index 6c0efab1755..75629b1afec 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -338,6 +338,12 @@ describe("qa cli runtime", () => { "source-docs-discovery-report", "image-understanding-attachment", "compaction-retry-mutating-tool", + "subagent-handoff", + "subagent-fanout-synthesis", + "memory-recall", + "thread-memory-isolation", + "config-restart-capability-flip", + "instruction-followthrough-repo-contract", ], }), ); @@ -566,6 +572,39 @@ describe("qa cli runtime", () => { ); }); + it("passes provider-qualified mock parity suite selection through to the host runner", async () => { + await runQaSuiteCommand({ + repoRoot: "/tmp/openclaw-repo", + providerMode: "mock-openai", + parityPack: "agentic", + primaryModel: "openai/gpt-5.4", + alternateModel: "anthropic/claude-opus-4-6", + }); + + expect(runQaSuiteFromRuntime).toHaveBeenCalledWith({ + repoRoot: path.resolve("/tmp/openclaw-repo"), + outputDir: undefined, + transportId: "qa-channel", + providerMode: "mock-openai", + primaryModel: "openai/gpt-5.4", + alternateModel: "anthropic/claude-opus-4-6", + fastMode: undefined, + scenarioIds: [ + "approval-turn-tool-followthrough", + "model-switch-tool-continuity", + "source-docs-discovery-report", + "image-understanding-attachment", + "compaction-retry-mutating-tool", + "subagent-handoff", + "subagent-fanout-synthesis", + "memory-recall", + "thread-memory-isolation", + "config-restart-capability-flip", 
+ "instruction-followthrough-repo-contract", + ], + }); + }); + it("rejects multipass-only suite flags on the host runner", async () => { await expect( runQaSuiteCommand({ diff --git a/extensions/qa-lab/src/gateway-child.test.ts b/extensions/qa-lab/src/gateway-child.test.ts index 5f340ae238e..c70559457b3 100644 --- a/extensions/qa-lab/src/gateway-child.test.ts +++ b/extensions/qa-lab/src/gateway-child.test.ts @@ -64,6 +64,11 @@ describe("buildQaRuntimeEnv", () => { expect(env.GEMINI_API_KEY).toBe("gemini-live"); }); + it("defaults gateway-child provider mode to mock-openai when omitted", () => { + expect(__testing.resolveQaGatewayChildProviderMode(undefined)).toBe("mock-openai"); + expect(__testing.resolveQaGatewayChildProviderMode("live-frontier")).toBe("live-frontier"); + }); + it("keeps explicit provider env vars over live aliases", () => { const env = buildQaRuntimeEnv({ ...createParams({ @@ -299,6 +304,88 @@ describe("buildQaRuntimeEnv", () => { }); }); + it("stages placeholder mock auth profiles per agent dir so mock-openai runs can resolve credentials", async () => { + const stateDir = await mkdtemp(path.join(os.tmpdir(), "qa-mock-auth-")); + cleanups.push(async () => { + await rm(stateDir, { recursive: true, force: true }); + }); + + const cfg = await __testing.stageQaMockAuthProfiles({ + cfg: {}, + stateDir, + }); + + // Config side: both providers should have a profile entry with mode + // "api_key" so the runtime picks up the staging without any further + // config mutation. + expect(cfg.auth?.profiles?.["qa-mock-openai"]).toMatchObject({ + provider: "openai", + mode: "api_key", + displayName: "QA mock openai credential", + }); + expect(cfg.auth?.profiles?.["qa-mock-anthropic"]).toMatchObject({ + provider: "anthropic", + mode: "api_key", + displayName: "QA mock anthropic credential", + }); + + // Store side: each agent dir should have its own auth-profiles.json + // containing the placeholder credential for each staged provider. 
This + // is what the scenario runner actually reads when it resolves auth + // before calling the mock. + for (const agentId of ["main", "qa"]) { + const storeRaw = await readFile( + path.join(stateDir, "agents", agentId, "agent", "auth-profiles.json"), + "utf8", + ); + const parsed = JSON.parse(storeRaw) as { + profiles: Record; + }; + expect(parsed.profiles["qa-mock-openai"]).toMatchObject({ + type: "api_key", + provider: "openai", + key: "qa-mock-not-a-real-key", + }); + expect(parsed.profiles["qa-mock-anthropic"]).toMatchObject({ + type: "api_key", + provider: "anthropic", + key: "qa-mock-not-a-real-key", + }); + } + }); + + it("stages mock profiles only for the requested agents and providers when callers override the defaults", async () => { + const stateDir = await mkdtemp(path.join(os.tmpdir(), "qa-mock-auth-override-")); + cleanups.push(async () => { + await rm(stateDir, { recursive: true, force: true }); + }); + + const cfg = await __testing.stageQaMockAuthProfiles({ + cfg: {}, + stateDir, + agentIds: ["qa"], + providers: ["openai"], + }); + + expect(cfg.auth?.profiles?.["qa-mock-openai"]).toMatchObject({ + provider: "openai", + mode: "api_key", + }); + // Anthropic should NOT be staged when the caller restricts providers. + expect(cfg.auth?.profiles?.["qa-mock-anthropic"]).toBeUndefined(); + + const qaStore = JSON.parse( + await readFile(path.join(stateDir, "agents", "qa", "agent", "auth-profiles.json"), "utf8"), + ) as { profiles: Record }; + expect(qaStore.profiles["qa-mock-openai"]).toBeDefined(); + expect(qaStore.profiles["qa-mock-anthropic"]).toBeUndefined(); + + // main/agent should not exist because it wasn't in the agentIds list. 
+ await expect( + readFile(path.join(stateDir, "agents", "main", "agent", "auth-profiles.json"), "utf8"), + ).rejects.toThrow(/ENOENT/); + }); + it("allows loopback gateway health probes through the SSRF guard", async () => { const release = vi.fn(async () => {}); fetchWithSsrFGuardMock.mockResolvedValue({ diff --git a/extensions/qa-lab/src/gateway-child.ts b/extensions/qa-lab/src/gateway-child.ts index 3ab565ba015..94c6e69edf7 100644 --- a/extensions/qa-lab/src/gateway-child.ts +++ b/extensions/qa-lab/src/gateway-child.ts @@ -222,6 +222,12 @@ export function normalizeQaProviderModeEnv( return env; } +export function resolveQaGatewayChildProviderMode( + providerMode?: "mock-openai" | "live-frontier", +): "mock-openai" | "live-frontier" { + return providerMode ?? "mock-openai"; +} + function resolveQaLiveCliAuthEnv( baseEnv: NodeJS.ProcessEnv, opts?: { @@ -395,6 +401,72 @@ export async function stageQaLiveAnthropicSetupToken(params: { }); } +/** Providers the mock-openai harness stages placeholder credentials for. */ +export const QA_MOCK_AUTH_PROVIDERS = Object.freeze(["openai", "anthropic"] as const); + +/** Agent IDs the mock-openai harness stages credentials under. */ +export const QA_MOCK_AUTH_AGENT_IDS = Object.freeze(["main", "qa"] as const); + +export function buildQaMockProfileId(provider: string): string { + return `qa-mock-${provider}`; +} + +/** + * In mock-openai mode the qa suite runs against the embedded mock server + * instead of a real provider API. The mock does not validate credentials, but + * the agent auth layer still needs a matching `api_key` auth profile in + * `auth-profiles.json` before it will route the request through + * `providerBaseUrl`. Without this staging step, every scenario fails with + * `FailoverError: No API key found for provider "openai"` before the mock + * server ever sees a request. 
+ * + * Stages a placeholder `api_key` profile per provider in each of the agent + * dirs the qa suite uses (`main` for the runtime config, `qa` for scenario + * runs) and returns a config with matching `auth.profiles` entries so the + * runtime accepts the profile on the first lookup. + * + * The placeholder value `qa-mock-not-a-real-key` is intentionally not + * shaped like a real API key (no `sk-` prefix that would trip secret + * scanners). It only needs to be non-empty to pass the credential + * serializer; anything beyond that is ignored by the mock. + */ +export async function stageQaMockAuthProfiles(params: { + cfg: OpenClawConfig; + stateDir: string; + agentIds?: readonly string[]; + providers?: readonly string[]; +}): Promise { + const agentIds = [...new Set(params.agentIds ?? QA_MOCK_AUTH_AGENT_IDS)]; + const providers = [...new Set(params.providers ?? QA_MOCK_AUTH_PROVIDERS)]; + let next = params.cfg; + for (const agentId of agentIds) { + const agentDir = path.join(params.stateDir, "agents", agentId, "agent"); + await fs.mkdir(agentDir, { recursive: true }); + for (const provider of providers) { + const profileId = buildQaMockProfileId(provider); + upsertAuthProfile({ + profileId, + credential: { + type: "api_key", + provider, + key: "qa-mock-not-a-real-key", + displayName: `QA mock ${provider} credential`, + }, + agentDir, + }); + } + } + for (const provider of providers) { + next = applyAuthProfileConfig(next, { + profileId: buildQaMockProfileId(provider), + provider, + mode: "api_key", + displayName: `QA mock ${provider} credential`, + }); + } + return next; +} + function isRetryableGatewayCallError(details: string): boolean { return ( details.includes("handshake timeout") || @@ -440,8 +512,10 @@ export const __testing = { preserveQaGatewayDebugArtifacts, redactQaGatewayDebugText, readQaLiveProviderConfigOverrides, + resolveQaGatewayChildProviderMode, resolveQaLiveAnthropicSetupToken, stageQaLiveAnthropicSetupToken, + stageQaMockAuthProfiles, 
resolveQaLiveCliAuthEnv, resolveQaOwnerPluginIdsForProviderIds, resolveQaBundledPluginsSourceRoot, @@ -868,8 +942,9 @@ export async function startQaGatewayChild(params: { fs.mkdir(xdgDataHome, { recursive: true }), fs.mkdir(xdgCacheHome, { recursive: true }), ]); + const providerMode = resolveQaGatewayChildProviderMode(params.providerMode); const liveProviderIds = - params.providerMode === "live-frontier" + providerMode === "live-frontier" ? [params.primaryModel, params.alternateModel] .map((modelRef) => typeof modelRef === "string" ? splitQaModelRef(modelRef)?.provider : undefined, @@ -902,7 +977,7 @@ export async function startQaGatewayChild(params: { controlUiEnabled: params.controlUiEnabled, }), controlUiAllowedOrigins: params.controlUiAllowedOrigins, - providerMode: params.providerMode, + providerMode, primaryModel: params.primaryModel, alternateModel: params.alternateModel, enabledPluginIds, @@ -921,6 +996,12 @@ export async function startQaGatewayChild(params: { cfg, stateDir, }); + if (providerMode === "mock-openai") { + cfg = await stageQaMockAuthProfiles({ + cfg, + stateDir, + }); + } return params.mutateConfig ? 
params.mutateConfig(cfg) : cfg; }; const stdout: Buffer[] = []; @@ -981,7 +1062,7 @@ export async function startQaGatewayChild(params: { xdgCacheHome, bundledPluginsDir, compatibilityHostVersion: runtimeHostVersion, - providerMode: params.providerMode, + providerMode, forwardHostHomeForClaudeCli: liveProviderIds.includes("claude-cli"), claudeCliAuthMode: params.claudeCliAuthMode, }); diff --git a/extensions/qa-lab/src/mock-openai-server.test.ts b/extensions/qa-lab/src/mock-openai-server.test.ts index 578569f09fc..5e598f7949c 100644 --- a/extensions/qa-lab/src/mock-openai-server.test.ts +++ b/extensions/qa-lab/src/mock-openai-server.test.ts @@ -1,5 +1,5 @@ import { afterEach, describe, expect, it } from "vitest"; -import { startQaMockOpenAiServer } from "./mock-openai-server.js"; +import { resolveProviderVariant, startQaMockOpenAiServer } from "./mock-openai-server.js"; const cleanups: Array<() => Promise> = []; const QA_IMAGE_PNG_BASE64 = @@ -11,42 +11,15 @@ afterEach(async () => { } }); -async function startMockServer() { - const server = await startQaMockOpenAiServer({ - host: "127.0.0.1", - port: 0, - }); - cleanups.push(async () => { - await server.stop(); - }); - return server; -} - -async function postResponses(server: { baseUrl: string }, body: unknown) { - return fetch(`${server.baseUrl}/v1/responses`, { - method: "POST", - headers: { - "content-type": "application/json", - }, - body: JSON.stringify(body), - }); -} - -async function expectResponsesText(server: { baseUrl: string }, body: unknown) { - const response = await postResponses(server, body); - expect(response.status).toBe(200); - return response.text(); -} - -async function expectResponsesJson(server: { baseUrl: string }, body: unknown) { - const response = await postResponses(server, body); - expect(response.status).toBe(200); - return (await response.json()) as T; -} - describe("qa mock openai server", () => { it("serves health and streamed responses", async () => { - const server = await 
startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); const health = await fetch(`${server.baseUrl}/healthz`); expect(health.status).toBe(200); @@ -75,22 +48,36 @@ describe("qa mock openai server", () => { }); it("prefers path-like refs over generic quoted keys in prompts", async () => { - const server = await startMockServer(); - - const body = await expectResponsesText(server, { - stream: true, - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: 'Please inspect "message_id" metadata first, then read `./QA_KICKOFF_TASK.md`.', - }, - ], - }, - ], + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: 'Please inspect "message_id" metadata first, then read `./QA_KICKOFF_TASK.md`.', + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = await response.text(); expect(body).toContain('"arguments":"{\\"path\\":\\"QA_KICKOFF_TASK.md\\"}"'); const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); @@ -103,7 +90,13 @@ describe("qa mock openai server", () => { }); it("drives the Lobster Invaders write flow and memory recall responses", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); const lobster = await fetch(`${server.baseUrl}/v1/responses`, { method: "POST", @@ -132,32 +125,40 @@ describe("qa mock openai server", () => { expect(lobsterBody).toContain('"name":"write"'); 
expect(lobsterBody).toContain("lobster-invaders.html"); - const payload = await expectResponsesJson<{ - output?: Array<{ content?: Array<{ text?: string }> }>; - }>(server, { - stream: false, - model: "gpt-5.4-alt", - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: "Please remember this fact for later: the QA canary code is ALPHA-7.", - }, - ], - }, - { - role: "user", - content: [ - { - type: "input_text", - text: "What was the QA canary code I asked you to remember earlier?", - }, - ], - }, - ], + const recall = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + model: "gpt-5.4-alt", + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Please remember this fact for later: the QA canary code is ALPHA-7.", + }, + ], + }, + { + role: "user", + content: [ + { + type: "input_text", + text: "What was the QA canary code I asked you to remember earlier?", + }, + ], + }, + ], + }), }); + expect(recall.status).toBe(200); + const payload = (await recall.json()) as { + output?: Array<{ content?: Array<{ text?: string }> }>; + }; expect(payload.output?.[0]?.content?.[0]?.text).toContain("ALPHA-7"); const requests = await fetch(`${server.baseUrl}/debug/requests`); @@ -168,8 +169,157 @@ describe("qa mock openai server", () => { ]); }); + it("keeps remember prompts prose-only even when they mention repo cleanup", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Please remember this fact for later: the QA canary code is 
ALPHA-7. Use your normal memory mechanism, avoid manual repo cleanup, and reply exactly `Remembered ALPHA-7.` once stored.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = await response.text(); + expect(body).toContain("Remembered ALPHA-7."); + expect(body).not.toContain('"name":"read"'); + }); + + it("drives repo-contract followthrough as read-read-read-write-then-report", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const prompt = + "Repo contract followthrough check. Read AGENT.md, SOUL.md, and FOLLOWTHROUGH_INPUT.md first. Then follow the repo contract exactly, write ./repo-contract-summary.txt, and reply with three labeled lines: Read, Wrote, Status."; + + const first = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [{ role: "user", content: [{ type: "input_text", text: prompt }] }], + }), + }); + expect(first.status).toBe(200); + expect(await first.text()).toContain('"arguments":"{\\"path\\":\\"AGENT.md\\"}"'); + + const second = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [ + { role: "user", content: [{ type: "input_text", text: prompt }] }, + { + type: "function_call_output", + output: + "# Repo contract\n\nStep order:\n1. Read AGENT.md.\n2. Read SOUL.md.\n3. Read FOLLOWTHROUGH_INPUT.md.\n4. 
Write ./repo-contract-summary.txt.\n", + }, + ], + }), + }); + expect(second.status).toBe(200); + expect(await second.text()).toContain('"arguments":"{\\"path\\":\\"SOUL.md\\"}"'); + + const third = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [ + { role: "user", content: [{ type: "input_text", text: prompt }] }, + { + type: "function_call_output", + output: "# Execution style\n\nStay brief, honest, and action-first.\n", + }, + ], + }), + }); + expect(third.status).toBe(200); + expect(await third.text()).toContain('"arguments":"{\\"path\\":\\"FOLLOWTHROUGH_INPUT.md\\"}"'); + + const fourth = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + model: "gpt-5.4", + input: [ + { role: "user", content: [{ type: "input_text", text: prompt }] }, + { + type: "function_call_output", + output: + "Mission: prove you followed the repo contract.\nEvidence path: AGENT.md -> SOUL.md -> FOLLOWTHROUGH_INPUT.md -> repo-contract-summary.txt\n", + }, + ], + }), + }); + expect(fourth.status).toBe(200); + const fourthBody = await fourth.text(); + expect(fourthBody).toContain('"name":"write"'); + expect(fourthBody).toContain("repo-contract-summary.txt"); + + const fifth = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: false, + model: "gpt-5.4", + input: [ + { role: "user", content: [{ type: "input_text", text: prompt }] }, + { + type: "function_call_output", + output: + "Successfully wrote repo-contract-summary.txt\nMission: prove you followed the repo contract.\nStatus: complete\n", + }, + ], + }), + }); + expect(fifth.status).toBe(200); + const payload = (await fifth.json()) as { + output?: Array<{ content?: Array<{ text?: string }> }>; 
+ }; + expect(payload.output?.[0]?.content?.[0]?.text).toContain("Read: AGENT.md, SOUL.md"); + expect(payload.output?.[0]?.content?.[0]?.text).toContain("Wrote: repo-contract-summary.txt"); + expect(payload.output?.[0]?.content?.[0]?.text).toContain("Status: complete"); + }); + it("drives the compaction retry mutating tool parity flow", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); const writePlan = await fetch(`${server.baseUrl}/v1/responses`, { method: "POST", @@ -201,27 +351,35 @@ describe("qa mock openai server", () => { expect(writePlanBody).toContain('"name":"write"'); expect(writePlanBody).toContain("compaction-retry-summary.txt"); - const finalPayload = await expectResponsesJson<{ - output?: Array<{ content?: Array<{ text?: string }> }>; - }>(server, { - stream: false, - model: "gpt-5.4", - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.", - }, - ], - }, - { - type: "function_call_output", - output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.", - }, - ], + const finalReply = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + model: "gpt-5.4", + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.", + }, + ], + }, + { + type: "function_call_output", + output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.", + }, + ], + }), }); + expect(finalReply.status).toBe(200); + const finalPayload = (await finalReply.json()) as { + 
output?: Array<{ content?: Array<{ text?: string }> }>; + }; expect(finalPayload.output?.[0]?.content?.[0]?.text).toContain("replay unsafe after write"); }); @@ -282,22 +440,36 @@ describe("qa mock openai server", () => { }); it("requests non-threaded subagent handoff for QA channel runs", async () => { - const server = await startMockServer(); - - const body = await expectResponsesText(server, { - stream: true, - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: "Delegate a bounded QA task to a subagent, then summarize the delegated result clearly.", - }, - ], - }, - ], + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Delegate a bounded QA task to a subagent, then summarize the delegated result clearly.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = await response.text(); expect(body).toContain('"name":"sessions_spawn"'); expect(body).toContain('\\"label\\":\\"qa-sidecar\\"'); expect(body).toContain('\\"thread\\":false'); @@ -672,11 +844,63 @@ describe("qa mock openai server", () => { }); }); - it("answers heartbeat prompts without spawning extra subagents", async () => { - const server = await startMockServer(); + it("keeps subagent fanout state isolated per mock server instance", async () => { + const serverA = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await serverA.stop(); + }); + const serverB = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await serverB.stop(); + }); - expect( - await expectResponsesJson(server, { + const prompt = 
+ "Subagent fanout synthesis check: delegate two bounded subagents sequentially, then report both results together."; + + const firstA = await fetch(`${serverA.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + input: [{ role: "user", content: [{ type: "input_text", text: prompt }] }], + }), + }); + expect(firstA.status).toBe(200); + expect(await firstA.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"'); + + const firstB = await fetch(`${serverB.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: true, + input: [{ role: "user", content: [{ type: "input_text", text: prompt }] }], + }), + }); + expect(firstB.status).toBe(200); + expect(await firstB.text()).toContain('\\"label\\":\\"qa-fanout-alpha\\"'); + }); + + it("answers heartbeat prompts without spawning extra subagents", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ stream: false, input: [ { @@ -690,7 +914,10 @@ describe("qa mock openai server", () => { }, ], }), - ).toMatchObject({ + }); + + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ output: [ { content: [{ text: "HEARTBEAT_OK" }], @@ -768,10 +995,20 @@ describe("qa mock openai server", () => { }); it("uses the latest exact marker directive from conversation history", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); - expect( - await expectResponsesJson(server, { + const response = await fetch(`${server.baseUrl}/v1/responses`, { 
+ method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ stream: false, input: [ { @@ -794,7 +1031,10 @@ describe("qa mock openai server", () => { }, ], }), - ).toMatchObject({ + }); + + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ output: [ { content: [{ text: "NEW_TOKEN" }], @@ -854,33 +1094,45 @@ describe("qa mock openai server", () => { }); it("describes reattached generated images in the roundtrip flow", async () => { - const server = await startMockServer(); - - const payload = await expectResponsesJson<{ - output?: Array<{ content?: Array<{ text?: string }> }>; - }>(server, { - stream: false, - model: "mock-openai/gpt-5.4", - input: [ - { - role: "user", - content: [ - { - type: "input_text", - text: "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.", - }, - { - type: "input_image", - source: { - type: "base64", - mime_type: "image/png", - data: QA_IMAGE_PNG_BASE64, - }, - }, - ], - }, - ], + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + stream: false, + model: "mock-openai/gpt-5.4", + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Roundtrip image inspection check: describe the generated lighthouse attachment in one short sentence.", + }, + { + type: "input_image", + source: { + type: "base64", + mime_type: "image/png", + data: QA_IMAGE_PNG_BASE64, + }, + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const payload = (await response.json()) as { + output?: Array<{ content?: Array<{ text?: string }> }>; + }; const text = payload.output?.[0]?.content?.[0]?.text ?? 
""; expect(text.toLowerCase()).toContain("lighthouse"); }); @@ -927,10 +1179,20 @@ describe("qa mock openai server", () => { }); it("returns continuity language after the model-switch reread completes", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); - expect( - await expectResponsesJson(server, { + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ stream: false, model: "gpt-5.4-alt", input: [ @@ -949,7 +1211,10 @@ describe("qa mock openai server", () => { }, ], }), - ).toMatchObject({ + }); + + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ output: [ { content: [ @@ -963,10 +1228,20 @@ describe("qa mock openai server", () => { }); it("returns NO_REPLY for unmentioned group chatter", async () => { - const server = await startMockServer(); + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); - expect( - await expectResponsesJson(server, { + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ stream: false, input: [ { @@ -980,7 +1255,9 @@ describe("qa mock openai server", () => { }, ], }), - ).toMatchObject({ + }); + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ output: [ { content: [{ text: "NO_REPLY" }], @@ -988,4 +1265,605 @@ describe("qa mock openai server", () => { ], }); }); + + it("advertises Anthropic claude-opus-4-6 baseline model on /v1/models", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response 
= await fetch(`${server.baseUrl}/v1/models`); + expect(response.status).toBe(200); + const body = (await response.json()) as { data: Array<{ id: string }> }; + const ids = body.data.map((entry) => entry.id); + expect(ids).toContain("claude-opus-4-6"); + expect(ids).toContain("gpt-5.4"); + }); + + it("dispatches an Anthropic /v1/messages read tool call for source discovery prompts", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Read the seeded docs and report worked, failed, blocked, and follow-up items.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = (await response.json()) as { + type: string; + role: string; + model: string; + stop_reason: string; + content: Array>; + }; + expect(body.type).toBe("message"); + expect(body.role).toBe("assistant"); + expect(body.model).toBe("claude-opus-4-6"); + expect(body.stop_reason).toBe("tool_use"); + const toolUseBlock = body.content.find((block) => block.type === "tool_use") as + | { name: string; input: Record } + | undefined; + expect(toolUseBlock?.name).toBe("read"); + expect(toolUseBlock?.input).toEqual({ path: "QA_SCENARIO_PLAN.md" }); + + const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); + expect(debugResponse.status).toBe(200); + expect(await debugResponse.json()).toMatchObject({ + model: "claude-opus-4-6", + plannedToolName: "read", + }); + }); + + it("dispatches Anthropic /v1/messages tool_result follow-ups through the shared scenario logic", async () => { + // This verifies the Anthropic adapter correctly feeds tool_result + // content blocks into the shared 
scenario dispatcher so downstream + // "has this scenario already called a tool?" logic fires the same way + // it does on the OpenAI /v1/responses route. The subagent handoff + // scenario is ideal because the mock has a two-stage flow: first + // delegate prompt → sessions_spawn tool_use, then tool_result → + // "Delegated task: ..." prose summary. + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Delegate one bounded QA task to a subagent, wait for it to finish, then reply with Delegated task, Result, and Evidence sections.", + }, + ], + }, + { + role: "assistant", + content: [ + { + type: "tool_use", + id: "toolu_mock_spawn_1", + name: "sessions_spawn", + input: { task: "Inspect the QA workspace", label: "qa-sidecar", thread: false }, + }, + ], + }, + { + role: "user", + content: [ + { + type: "tool_result", + tool_use_id: "toolu_mock_spawn_1", + content: "SUBAGENT-OK", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = (await response.json()) as { + stop_reason: string; + content: Array<{ type: string; text?: string }>; + }; + expect(body.stop_reason).toBe("end_turn"); + const textBlock = body.content.find((block) => block.type === "text") as + | { text: string } + | undefined; + // The mock's subagent-handoff branch echoes "Delegated task", a + // tool-output evidence line, and a folded-back "Evidence" marker. 
+ expect(textBlock?.text).toContain("Delegated task"); + expect(textBlock?.text).toContain("Evidence"); + }); + + it("places tool_result after the parent user message even in mixed-content turns", async () => { + // Regression for the loop-6 Copilot / Greptile finding: a user message + // that mixes a tool_result block with fresh text blocks must still land + // the function_call_output AFTER the parent user message in the + // converted ResponsesInputItem[], otherwise extractToolOutput (which + // scans AFTER the last user-role index) fails to see the tool output + // and the downstream scenario dispatcher behaves as if no tool output + // was returned. We verify the conversion directly via the snapshot + // that /debug/last-request exposes: the last-request `toolOutput` + // field should be the stringified tool_result content, and `prompt` + // should be the trailing fresh-text block. + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Delegate one bounded QA task to a subagent.", + }, + ], + }, + { + role: "assistant", + content: [ + { + type: "tool_use", + id: "toolu_mock_spawn_mixed", + name: "sessions_spawn", + input: { task: "Inspect the QA workspace", label: "qa-sidecar", thread: false }, + }, + ], + }, + { + role: "user", + content: [ + { + type: "tool_result", + tool_use_id: "toolu_mock_spawn_mixed", + content: "SUBAGENT-OK", + }, + // A trailing fresh text block in the same user turn. 
Before + // the loop-6 fix, the tool_result was pushed BEFORE the + // parent user message, so extractToolOutput saw the text + // turn as the last user-role item and found no + // function_call_output after it → returned "". The + // downstream dispatcher then behaved as if no tool output + // was present at all. + { + type: "text", + text: "Keep going with the fanout.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + + const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); + expect(debugResponse.status).toBe(200); + const debug = (await debugResponse.json()) as { + prompt: string; + allInputText: string; + toolOutput: string; + }; + // extractToolOutput should surface the tool_result content because + // the function_call_output item is placed AFTER the parent user + // message in the converted input array. + expect(debug.toolOutput).toBe("SUBAGENT-OK"); + // extractLastUserText should surface the fresh-text block (the parent + // user message that was pushed BEFORE the function_call_output). + expect(debug.prompt).toBe("Keep going with the fanout."); + // The converted history still records both turns, including the + // original delegate prompt from the first user turn. 
+ expect(debug.allInputText).toContain("Delegate one bounded QA task"); + }); + + it("streams Anthropic /v1/messages tool_use responses as SSE", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + stream: true, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Read the seeded docs and report worked, failed, blocked, and follow-up items.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + expect(response.headers.get("content-type")).toContain("text/event-stream"); + const body = await response.text(); + expect(body).toContain("event: message_start"); + expect(body).toContain("event: content_block_start"); + expect(body).toContain('"type":"tool_use"'); + expect(body).toContain('"name":"read"'); + expect(body).toContain("QA_SCENARIO_PLAN.md"); + expect(body).toContain("event: message_delta"); + expect(body).toContain("event: message_stop"); + }); + + it("streams Anthropic /v1/messages tool_result follow-ups as text deltas", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + stream: true, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Delegate one bounded QA task to a subagent, wait for it to finish, then reply with Delegated task, Result, and Evidence sections.", + }, + ], + }, + { + role: "assistant", + content: [ + { + type: "tool_use", + id: "toolu_mock_spawn_1", 
+ name: "sessions_spawn", + input: { task: "Inspect the QA workspace", label: "qa-sidecar", thread: false }, + }, + ], + }, + { + role: "user", + content: [ + { + type: "tool_result", + tool_use_id: "toolu_mock_spawn_1", + content: "SUBAGENT-OK", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + expect(response.headers.get("content-type")).toContain("text/event-stream"); + const body = await response.text(); + expect(body).toContain("event: content_block_delta"); + expect(body).toContain('"type":"text_delta"'); + expect(body).toContain("Delegated task"); + expect(body).toContain("Evidence"); + }); + + it("keeps Anthropic remember prompts on the prose branch even when system text mentions HEARTBEAT", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + stream: true, + system: [ + { + type: "text", + text: "Read HEARTBEAT.md if it exists (workspace context). Follow it strictly. If nothing needs attention, reply HEARTBEAT_OK.", + }, + ], + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Please remember this fact for later: the QA canary code is ALPHA-7. 
Use your normal memory mechanism, avoid manual repo cleanup, and reply exactly `Remembered ALPHA-7.` once stored.", + }, + ], + }, + ], + }), + }); + + expect(response.status).toBe(200); + const body = await response.text(); + expect(body).toContain("Remembered ALPHA-7."); + expect(body).not.toContain("HEARTBEAT_OK"); + expect(body).not.toContain('"name":"read"'); + }); + + it("prefers the prompt-local exact reply directive over heartbeat context", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + stream: true, + system: [ + { + type: "text", + text: [ + "Read HEARTBEAT.md if it exists (workspace context). Follow it strictly.", + "If the current user message is a heartbeat poll and nothing needs attention, reply exactly:", + "HEARTBEAT_OK", + ].join("\n"), + }, + ], + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Please remember this fact for later: the QA canary code is ALPHA-7. 
Use your normal memory mechanism, avoid manual repo cleanup, and reply exactly `Remembered ALPHA-7.` once stored.", + }, + ], + }, + ], + }), + }); + + expect(response.status).toBe(200); + const body = await response.text(); + expect(body).toContain("Remembered ALPHA-7."); + expect(body).not.toContain("HEARTBEAT_OK"); + }); + + it("rejects malformed Anthropic /v1/messages JSON with an invalid_request_error", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: '{"model":"claude-opus-4-6","messages":[', + }); + + expect(response.status).toBe(400); + const body = (await response.json()) as { + type: string; + error: { type: string; message: string }; + }; + expect(body.type).toBe("error"); + expect(body.error.type).toBe("invalid_request_error"); + expect(body.error.message).toContain("Malformed JSON body"); + }); + + it("defaults empty-string Anthropic /v1/messages model to claude-opus-4-6", async () => { + // Regression for the loop-7 Copilot finding: a bare `typeof + // body.model === "string"` check lets an empty-string model leak + // through to `lastRequest.model` and `responseBody.model`. Empty + // strings must be treated the same as absent and default to + // `"claude-opus-4-6"` so parity consumers can trust the echoed label. 
+ const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "", + max_tokens: 256, + messages: [ + { + role: "user", + content: "Read the plan", + }, + ], + }), + }); + expect(response.status).toBe(200); + const body = (await response.json()) as { model: string }; + expect(body.model).toBe("claude-opus-4-6"); + + const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); + expect(debugResponse.status).toBe(200); + const debug = (await debugResponse.json()) as { model: string }; + expect(debug.model).toBe("claude-opus-4-6"); + }); +}); + +describe("resolveProviderVariant", () => { + it("tags prefix-qualified openai models", () => { + expect(resolveProviderVariant("openai/gpt-5.4")).toBe("openai"); + expect(resolveProviderVariant("openai:gpt-5.4")).toBe("openai"); + expect(resolveProviderVariant("openai-codex/gpt-5.4")).toBe("openai"); + }); + + it("tags prefix-qualified anthropic models", () => { + expect(resolveProviderVariant("anthropic/claude-opus-4-6")).toBe("anthropic"); + expect(resolveProviderVariant("anthropic:claude-opus-4-6")).toBe("anthropic"); + expect(resolveProviderVariant("claude-cli/claude-opus-4-6")).toBe("anthropic"); + }); + + it("tags bare model names by prefix", () => { + expect(resolveProviderVariant("gpt-5.4")).toBe("openai"); + expect(resolveProviderVariant("gpt-5.4-alt")).toBe("openai"); + expect(resolveProviderVariant("gpt-4.5")).toBe("openai"); + expect(resolveProviderVariant("o1-preview")).toBe("openai"); + expect(resolveProviderVariant("claude-opus-4-6")).toBe("anthropic"); + expect(resolveProviderVariant("claude-sonnet-4-6")).toBe("anthropic"); + }); + + it("handles case drift and whitespace", () => { + expect(resolveProviderVariant(" OpenAI/GPT-5.4 ")).toBe("openai"); 
+ expect(resolveProviderVariant("ANTHROPIC/CLAUDE-OPUS-4-6")).toBe("anthropic"); + }); + + it("falls through to unknown for unrecognized providers", () => { + expect(resolveProviderVariant("")).toBe("unknown"); + expect(resolveProviderVariant(undefined)).toBe("unknown"); + expect(resolveProviderVariant("mistral/mistral-large")).toBe("unknown"); + expect(resolveProviderVariant("some-random-model")).toBe("unknown"); + }); +}); + +describe("qa mock openai server provider variant tagging", () => { + it("records providerVariant on /debug/last-request for openai requests", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "openai/gpt-5.4", + stream: false, + input: [{ role: "user", content: [{ type: "input_text", text: "Heartbeat check" }] }], + }), + }); + + const debug = (await (await fetch(`${server.baseUrl}/debug/last-request`)).json()) as { + model: string; + providerVariant: string; + }; + expect(debug.model).toBe("openai/gpt-5.4"); + expect(debug.providerVariant).toBe("openai"); + }); + + it("records providerVariant=anthropic on /v1/messages requests", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + await fetch(`${server.baseUrl}/v1/messages`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "claude-opus-4-6", + max_tokens: 256, + messages: [{ role: "user", content: "Heartbeat check" }], + }), + }); + + const debug = (await (await fetch(`${server.baseUrl}/debug/last-request`)).json()) as { + model: string; + providerVariant: string; + }; + expect(debug.model).toBe("claude-opus-4-6"); + expect(debug.providerVariant).toBe("anthropic"); + 
}); + + it("records providerVariant=unknown for unrecognized models", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + model: "mistral/mistral-large", + stream: false, + input: [{ role: "user", content: [{ type: "input_text", text: "Heartbeat check" }] }], + }), + }); + + const debug = (await (await fetch(`${server.baseUrl}/debug/last-request`)).json()) as { + providerVariant: string; + }; + expect(debug.providerVariant).toBe("unknown"); + }); }); diff --git a/extensions/qa-lab/src/mock-openai-server.ts b/extensions/qa-lab/src/mock-openai-server.ts index 1c3e7863f5c..c0e9b6fdcab 100644 --- a/extensions/qa-lab/src/mock-openai-server.ts +++ b/extensions/qa-lab/src/mock-openai-server.ts @@ -22,6 +22,58 @@ type StreamEvent = }; }; +/** + * Provider variant tag for `body.model`. The mock previously ignored + * `body.model` for dispatch and only echoed it in the prose output, which + * made the parity gate tautological when run against the mock alone + * (both providers produced identical scenario plans by construction). + * Tagging requests with a normalized variant lets individual scenario + * branches opt into provider-specific behavior while the rest of the + * dispatcher stays shared, and lets `/debug/requests` consumers verify + * which provider lane a given request came from without re-parsing the + * raw model string. + * + * Policy: + * - `openai/*`, `gpt-*`, `o1-*`, anything starting with `gpt-` → `"openai"` + * - `anthropic/*`, `claude-*` → `"anthropic"` + * - Everything else (including empty strings) → `"unknown"` + * + * The `/v1/messages` route always feeds `body.model` straight through, + * so an Anthropic request with an `openai/gpt-5.4` model string is still + * classified as `"openai"`. 
That matches the parity program's convention + * where the provider label is the source of truth, not the HTTP route. + */ +export type MockOpenAiProviderVariant = "openai" | "anthropic" | "unknown"; + +export function resolveProviderVariant(model: string | undefined): MockOpenAiProviderVariant { + if (typeof model !== "string") { + return "unknown"; + } + const trimmed = model.trim().toLowerCase(); + if (trimmed.length === 0) { + return "unknown"; + } + // Prefer the explicit `provider/model` or `provider:model` prefix when + // the caller supplied one — that's the most reliable signal. + const separatorMatch = /^([^/:]+)[/:]/.exec(trimmed); + const provider = separatorMatch?.[1] ?? trimmed; + if (provider === "openai" || provider === "openai-codex") { + return "openai"; + } + if (provider === "anthropic" || provider === "claude-cli") { + return "anthropic"; + } + // Fall back to model-name prefix matching for bare model strings like + // `gpt-5.4` or `claude-opus-4-6`. + if (/^(?:gpt-|o1-|openai-)/.test(trimmed)) { + return "openai"; + } + if (/^(?:claude-|anthropic-)/.test(trimmed)) { + return "anthropic"; + } + return "unknown"; +} + type MockOpenAiRequestSnapshot = { raw: string; body: Record; @@ -30,13 +82,52 @@ type MockOpenAiRequestSnapshot = { instructions?: string; toolOutput: string; model: string; + providerVariant: MockOpenAiProviderVariant; imageInputCount: number; plannedToolName?: string; }; +// Anthropic /v1/messages request/response shapes the mock actually needs. +// This is a subset of the real Anthropic Messages API — just enough so the +// QA suite can run its parity pack against a "baseline" Anthropic provider +// without needing real API keys. The scenarios drive their dispatch through +// the shared mock scenario logic (buildResponsesPayload), so whatever +// behavior the OpenAI mock exposes is automatically mirrored on this route. 
+type AnthropicMessageContentBlock = + | { type: "text"; text: string } + | { + type: "tool_use"; + id: string; + name: string; + input: Record; + } + | { + type: "tool_result"; + tool_use_id: string; + content: string | Array<{ type: "text"; text: string }>; + } + | { type: "image"; source: Record }; + +type AnthropicMessage = { + role: "user" | "assistant"; + content: string | AnthropicMessageContentBlock[]; +}; + +type AnthropicMessagesRequest = { + model?: string; + max_tokens?: number; + system?: string | Array<{ type: "text"; text: string }>; + messages?: AnthropicMessage[]; + tools?: Array>; + stream?: boolean; +}; + const TINY_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII="; -let subagentFanoutPhase = 0; + +type MockScenarioState = { + subagentFanoutPhase: number; +}; function readBody(req: IncomingMessage): Promise { return new Promise((resolve, reject) => { @@ -68,6 +159,23 @@ function writeSse(res: ServerResponse, events: StreamEvent[]) { res.end(body); } +type AnthropicStreamEvent = Record & { + type: string; +}; + +function writeAnthropicSse(res: ServerResponse, events: AnthropicStreamEvent[]) { + const body = events + .map((event) => `event: ${event.type}\ndata: ${JSON.stringify(event)}\n\n`) + .join(""); + res.writeHead(200, { + "content-type": "text/event-stream", + "cache-control": "no-store", + connection: "keep-alive", + "content-length": Buffer.byteLength(body), + }); + res.end(body); +} + function countApproxTokens(text: string) { const trimmed = text.trim(); if (!trimmed) { @@ -376,11 +484,11 @@ function extractLastCapture(text: string, pattern: RegExp) { } function extractExactReplyDirective(text: string) { - const colonMatch = extractLastCapture(text, /reply(?: with)? exactly:\s*([^\n]+)/i); - if (colonMatch) { - return colonMatch; + const backtickedMatch = extractLastCapture(text, /reply(?: with)? 
exactly\s+`([^`]+)`/i); + if (backtickedMatch) { + return backtickedMatch; } - return extractLastCapture(text, /reply(?: with)? exactly\s+`([^`]+)`/i); + return extractLastCapture(text, /reply(?: with)? exactly:\s*([^\n]+)/i); } function extractExactMarkerDirective(text: string) { @@ -392,10 +500,18 @@ function extractExactMarkerDirective(text: string) { } function isHeartbeatPrompt(text: string) { - return /Read HEARTBEAT\.md if it exists/i.test(text); + const trimmed = text.trim(); + if (!trimmed || /remember this fact/i.test(trimmed)) { + return false; + } + return /(?:^|\n)Read HEARTBEAT\.md if it exists\b/i.test(trimmed); } -function buildAssistantText(input: ResponsesInputItem[], body: Record) { +function buildAssistantText( + input: ResponsesInputItem[], + body: Record, + scenarioState: MockScenarioState, +) { const prompt = extractLastUserText(input); const toolOutput = extractToolOutput(input); const toolJson = parseToolOutputJson(toolOutput); @@ -411,8 +527,10 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record= 2) { + if ( + /subagent fanout synthesis check/i.test(prompt) && + toolOutput && + scenarioState.subagentFanoutPhase >= 2 + ) { return "Protocol note: delegated fanout complete. Alpha=ALPHA-OK. Beta=BETA-OK."; } if (toolOutput && (/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt))) { @@ -579,7 +718,10 @@ function buildAssistantEvents(text: string): StreamEvent[] { ]; } -async function buildResponsesPayload(body: Record) { +async function buildResponsesPayload( + body: Record, + scenarioState: MockScenarioState, +) { const input = Array.isArray(body.input) ? 
(body.input as ResponsesInputItem[]) : []; const prompt = extractLastUserText(input); const toolOutput = extractToolOutput(input); @@ -587,6 +729,9 @@ async function buildResponsesPayload(body: Record) { const allInputText = extractAllRequestTexts(input, body); const isGroupChat = allInputText.includes('"is_group_chat": true'); const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt); + if (/remember this fact/i.test(prompt)) { + return buildAssistantEvents(buildAssistantText(input, body, scenarioState)); + } if (isHeartbeatPrompt(prompt)) { return buildAssistantEvents("HEARTBEAT_OK"); } @@ -756,16 +901,16 @@ async function buildResponsesPayload(body: Record) { }); } if (/subagent fanout synthesis check/i.test(prompt)) { - if (!toolOutput && subagentFanoutPhase === 0) { - subagentFanoutPhase = 1; + if (!toolOutput && scenarioState.subagentFanoutPhase === 0) { + scenarioState.subagentFanoutPhase = 1; return buildToolCallEventsWithArgs("sessions_spawn", { task: "Fanout worker alpha: inspect the QA workspace and finish with exactly ALPHA-OK.", label: "qa-fanout-alpha", thread: false, }); } - if (toolOutput && subagentFanoutPhase === 1) { - subagentFanoutPhase = 2; + if (toolOutput && scenarioState.subagentFanoutPhase === 1) { + scenarioState.subagentFanoutPhase = 2; return buildToolCallEventsWithArgs("sessions_spawn", { task: "Fanout worker beta: inspect the QA workspace and finish with exactly BETA-OK.", label: "qa-fanout-beta", @@ -776,6 +921,30 @@ async function buildResponsesPayload(body: Record) { if (/tool continuity check/i.test(prompt) && !toolOutput) { return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" }); } + if (/repo contract followthrough check/i.test(prompt)) { + if (!toolOutput) { + return buildToolCallEventsWithArgs("read", { path: "AGENT.md" }); + } + if (toolOutput.includes("# Repo contract")) { + return buildToolCallEventsWithArgs("read", { path: "SOUL.md" }); + } + if (toolOutput.includes("# Execution 
style")) { + return buildToolCallEventsWithArgs("read", { path: "FOLLOWTHROUGH_INPUT.md" }); + } + if ( + toolOutput.includes("Mission: prove you followed the repo contract.") && + toolOutput.includes("Evidence path: AGENT.md -> SOUL.md -> FOLLOWTHROUGH_INPUT.md") + ) { + return buildToolCallEventsWithArgs("write", { + path: "repo-contract-summary.txt", + content: [ + "Mission: prove you followed the repo contract.", + "Evidence: AGENT.md -> SOUL.md -> FOLLOWTHROUGH_INPUT.md", + "Status: complete", + ].join("\n"), + }); + } + } if ((/\bdelegate\b/i.test(prompt) || /subagent handoff/i.test(prompt)) && !toolOutput) { return buildToolCallEventsWithArgs("sessions_spawn", { task: "Inspect the QA workspace and return one concise protocol note.", @@ -807,12 +976,390 @@ async function buildResponsesPayload(body: Record) { ) { await sleep(60_000); } - return buildAssistantEvents(buildAssistantText(input, body)); + return buildAssistantEvents(buildAssistantText(input, body, scenarioState)); +} + +// --------------------------------------------------------------------------- +// Anthropic /v1/messages adapter +// --------------------------------------------------------------------------- +// +// The QA parity gate needs two comparable scenario runs: one against the +// "candidate" (openai/gpt-5.4) and one against the "baseline" +// (anthropic/claude-opus-4-6). The OpenAI mock above already dispatches all +// the scenario prompt branches we care about. Rather than duplicating that +// machinery, the /v1/messages route below translates Anthropic request +// shapes into the shared ResponsesInputItem[] format, calls the same +// buildResponsesPayload() dispatcher, and then re-serializes the resulting +// events into an Anthropic response. This gives the parity harness a +// baseline lane that exercises the same scenario logic without requiring +// real Anthropic API keys. 
+// +// Scope: handles Anthropic Messages requests with text and tool_result +// content blocks, supporting both non-streaming JSON responses and the +// streaming SSE path used by the parity harness. + +function normalizeAnthropicSystemToString( + system: AnthropicMessagesRequest["system"], +): string | undefined { + if (typeof system === "string") { + return system.trim() || undefined; + } + if (Array.isArray(system)) { + const joined = system + .map((block) => (block?.type === "text" ? block.text : "")) + .filter(Boolean) + .join("\n") + .trim(); + return joined || undefined; + } + return undefined; +} + +function stringifyToolResultContent( + content: Extract["content"], +): string { + if (typeof content === "string") { + return content; + } + if (Array.isArray(content)) { + return content + .map((block) => (block?.type === "text" ? block.text : "")) + .filter(Boolean) + .join("\n"); + } + return ""; +} + +function convertAnthropicMessagesToResponsesInput(params: { + system?: AnthropicMessagesRequest["system"]; + messages: AnthropicMessage[]; +}): ResponsesInputItem[] { + const items: ResponsesInputItem[] = []; + const systemText = normalizeAnthropicSystemToString(params.system); + if (systemText) { + items.push({ + role: "system", + content: [{ type: "input_text", text: systemText }], + }); + } + for (const message of params.messages) { + const content = message.content; + if (typeof content === "string") { + items.push({ + role: message.role, + content: [ + message.role === "assistant" + ? { type: "output_text", text: content } + : { type: "input_text", text: content }, + ], + }); + continue; + } + if (!Array.isArray(content)) { + continue; + } + // Buffer each block type so we can push in OpenAI-Responses order instead + // of the order they appear in the Anthropic content array. 
The parent + // role message must precede any function_call_output items from the same + // turn, otherwise extractToolOutput() (which scans for + // function_call_output AFTER the last user-role index) will not see the + // output and the downstream scenario dispatcher will behave as if no + // tool output was returned. Similarly, assistant tool_use blocks become + // function_call items that must follow the assistant text message they + // narrate. + const textPieces: Array<{ type: "input_text" | "output_text"; text: string }> = []; + const imagePieces: Array<{ type: "input_image"; image_url: string }> = []; + const toolResultItems: ResponsesInputItem[] = []; + const toolUseItems: ResponsesInputItem[] = []; + for (const block of content) { + if (!block || typeof block !== "object") { + continue; + } + if (block.type === "text") { + textPieces.push({ + type: message.role === "assistant" ? "output_text" : "input_text", + text: block.text ?? "", + }); + continue; + } + if (block.type === "image") { + // Mock only needs to count image inputs; a placeholder URL is fine. + imagePieces.push({ type: "input_image", image_url: "anthropic-mock:image" }); + continue; + } + if (block.type === "tool_result") { + const output = stringifyToolResultContent(block.content); + if (output.trim()) { + toolResultItems.push({ type: "function_call_output", output }); + } + continue; + } + if (block.type === "tool_use") { + // Mirror OpenAI's function_call output_item shape so downstream + // prompt extraction still sees "the assistant just emitted a tool + // call". The scenario dispatcher looks for tool_output on the next + // user turn, not the assistant's prior tool_use, so a minimal + // placeholder is enough. + toolUseItems.push({ + type: "function_call", + name: block.name, + arguments: JSON.stringify(block.input ?? 
{}), + call_id: block.id, + }); + continue; + } + } + if (textPieces.length > 0 || imagePieces.length > 0) { + const combinedContent: Array> = [...textPieces, ...imagePieces]; + items.push({ role: message.role, content: combinedContent }); + } + // Emit tool_use (assistant prior calls) and tool_result (user-side + // returns) AFTER the parent role message so extractLastUserText and + // extractToolOutput walk the array in the order they expect. For a + // tool_result-only user turn with no text/image blocks, the parent + // message is intentionally omitted — the function_call_output itself + // represents the user's "return the tool output" turn. + for (const toolUse of toolUseItems) { + items.push(toolUse); + } + for (const toolResult of toolResultItems) { + items.push(toolResult); + } + } + return items; +} + +type ExtractedAssistantOutput = { + text: string; + toolCalls: Array<{ id: string; name: string; input: Record }>; +}; + +function extractFinalAssistantOutputFromEvents(events: StreamEvent[]): ExtractedAssistantOutput { + const toolCalls: ExtractedAssistantOutput["toolCalls"] = []; + let text = ""; + for (const event of events) { + if (event.type !== "response.output_item.done") { + continue; + } + const item = event.item as { + type?: unknown; + name?: unknown; + call_id?: unknown; + id?: unknown; + arguments?: unknown; + content?: unknown; + }; + if (item.type === "function_call" && typeof item.name === "string") { + let input: Record = {}; + if (typeof item.arguments === "string" && item.arguments.trim()) { + try { + const parsed = JSON.parse(item.arguments) as unknown; + if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) { + input = parsed as Record; + } + } catch { + // keep empty input on malformed args — mock dispatcher owns arg shape + } + } + toolCalls.push({ + id: typeof item.call_id === "string" ? 
item.call_id : `toolu_mock_${toolCalls.length + 1}`, + name: item.name, + input, + }); + continue; + } + if (item.type === "message" && Array.isArray(item.content)) { + for (const piece of item.content as Array<{ type?: unknown; text?: unknown }>) { + if (piece?.type === "output_text" && typeof piece.text === "string") { + text = piece.text; + } + } + } + } + return { text, toolCalls }; +} + +function buildAnthropicMessageResponse(params: { + model: string; + extracted: ExtractedAssistantOutput; +}): Record { + const content: Array> = []; + if (params.extracted.text) { + content.push({ type: "text", text: params.extracted.text }); + } + for (const call of params.extracted.toolCalls) { + content.push({ + type: "tool_use", + id: call.id, + name: call.name, + input: call.input, + }); + } + if (content.length === 0) { + content.push({ type: "text", text: "" }); + } + const stopReason = params.extracted.toolCalls.length > 0 ? "tool_use" : "end_turn"; + const approxInputTokens = 64; + const approxOutputTokens = Math.max( + 16, + countApproxTokens(params.extracted.text) + params.extracted.toolCalls.length * 16, + ); + return { + id: `msg_mock_${Math.floor(Math.random() * 1_000_000).toString(16)}`, + type: "message", + role: "assistant", + model: params.model || "claude-opus-4-6", + content, + stop_reason: stopReason, + stop_sequence: null, + usage: { + input_tokens: approxInputTokens, + output_tokens: approxOutputTokens, + }, + }; +} + +function buildAnthropicMessageStreamEvents(params: { + model: string; + extracted: ExtractedAssistantOutput; +}): AnthropicStreamEvent[] { + const approxInputTokens = 64; + const approxOutputTokens = Math.max( + 16, + countApproxTokens(params.extracted.text) + params.extracted.toolCalls.length * 16, + ); + const messageId = `msg_mock_${Math.floor(Math.random() * 1_000_000).toString(16)}`; + const events: AnthropicStreamEvent[] = [ + { + type: "message_start", + message: { + id: messageId, + type: "message", + role: "assistant", + model: 
params.model || "claude-opus-4-6", + content: [], + stop_reason: null, + stop_sequence: null, + usage: { + input_tokens: approxInputTokens, + output_tokens: 0, + }, + }, + }, + ]; + let index = 0; + if (params.extracted.text || params.extracted.toolCalls.length === 0) { + events.push({ + type: "content_block_start", + index, + content_block: { + type: "text", + text: "", + }, + }); + if (params.extracted.text) { + events.push({ + type: "content_block_delta", + index, + delta: { + type: "text_delta", + text: params.extracted.text, + }, + }); + } + events.push({ + type: "content_block_stop", + index, + }); + index += 1; + } + for (const call of params.extracted.toolCalls) { + events.push({ + type: "content_block_start", + index, + content_block: { + type: "tool_use", + id: call.id, + name: call.name, + input: {}, + }, + }); + events.push({ + type: "content_block_delta", + index, + delta: { + type: "input_json_delta", + partial_json: JSON.stringify(call.input ?? {}), + }, + }); + events.push({ + type: "content_block_stop", + index, + }); + index += 1; + } + events.push({ + type: "message_delta", + delta: { + stop_reason: params.extracted.toolCalls.length > 0 ? "tool_use" : "end_turn", + }, + usage: { + input_tokens: approxInputTokens, + output_tokens: approxOutputTokens, + }, + }); + events.push({ + type: "message_stop", + }); + return events; +} + +async function buildMessagesPayload( + body: AnthropicMessagesRequest, + scenarioState: MockScenarioState, +): Promise<{ + events: StreamEvent[]; + input: ResponsesInputItem[]; + extracted: ExtractedAssistantOutput; + responseBody: Record; + streamEvents: AnthropicStreamEvent[]; + model: string; +}> { + const messages = Array.isArray(body.messages) ? body.messages : []; + const input = convertAnthropicMessagesToResponsesInput({ + system: body.system, + messages, + }); + // Treat empty-string model the same as absent. 
A bare typeof check lets + // `""` leak through to `responseBody.model` and `lastRequest.model`, + // which then confuses parity consumers that assume the mock always + // echoes the real provider label. Normalize once and reuse everywhere. + const normalizedModel = + typeof body.model === "string" && body.model.trim() !== "" ? body.model : "claude-opus-4-6"; + // Dispatch through the same scenario logic the /v1/responses route uses. + // The mock dispatcher only reads `body.input`, `body.model`, and + // `body.stream`, so a synthetic shim body is sufficient. + const dispatchBody: Record = { + input, + model: normalizedModel, + stream: false, + }; + const events = await buildResponsesPayload(dispatchBody, scenarioState); + const extracted = extractFinalAssistantOutputFromEvents(events); + const responseBody = buildAnthropicMessageResponse({ + model: normalizedModel, + extracted, + }); + const streamEvents = buildAnthropicMessageStreamEvents({ + model: normalizedModel, + extracted, + }); + return { events, input, extracted, responseBody, streamEvents, model: normalizedModel }; } export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) { const host = params?.host ?? "127.0.0.1"; - subagentFanoutPhase = 0; + const scenarioState: MockScenarioState = { subagentFanoutPhase: 0 }; let lastRequest: MockOpenAiRequestSnapshot | null = null; const requests: MockOpenAiRequestSnapshot[] = []; const imageGenerationRequests: Array> = []; @@ -829,6 +1376,8 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n { id: "gpt-5.4-alt", object: "model" }, { id: "gpt-image-1", object: "model" }, { id: "text-embedding-3-small", object: "model" }, + { id: "claude-opus-4-6", object: "model" }, + { id: "claude-sonnet-4-6", object: "model" }, ], }); return; @@ -888,7 +1437,8 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n const raw = await readBody(req); const body = raw ? 
(JSON.parse(raw) as Record) : {}; const input = Array.isArray(body.input) ? (body.input as ResponsesInputItem[]) : []; - const events = await buildResponsesPayload(body); + const events = await buildResponsesPayload(body, scenarioState); + const resolvedModel = typeof body.model === "string" ? body.model : ""; lastRequest = { raw, body, @@ -896,7 +1446,8 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n allInputText: extractAllRequestTexts(input, body), instructions: extractInstructionsText(body) || undefined, toolOutput: extractToolOutput(input), - model: typeof body.model === "string" ? body.model : "", + model: resolvedModel, + providerVariant: resolveProviderVariant(resolvedModel), imageInputCount: countImageInputs(input), plannedToolName: extractPlannedToolName(events), }; @@ -916,6 +1467,56 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n writeSse(res, events); return; } + if (req.method === "POST" && url.pathname === "/v1/messages") { + const raw = await readBody(req); + let body: AnthropicMessagesRequest = {}; + try { + body = raw ? (JSON.parse(raw) as AnthropicMessagesRequest) : {}; + } catch { + writeJson(res, 400, { + type: "error", + error: { + type: "invalid_request_error", + message: "Malformed JSON body for Anthropic Messages request.", + }, + }); + return; + } + const { + events, + input, + responseBody, + streamEvents, + model: normalizedModel, + } = await buildMessagesPayload(body, scenarioState); + // Record the adapted request snapshot so /debug/requests gives the QA + // suite the same plannedToolName / allInputText / toolOutput signals + // on the Anthropic route that the OpenAI route already exposes. This + // is what lets a single parity run diff assertions across both lanes. + // Reuse the normalized model so an empty-string body.model no longer + // leaks through to `lastRequest.model`. 
+ lastRequest = { + raw, + body: body as Record, + prompt: extractLastUserText(input), + allInputText: extractAllInputTexts(input), + toolOutput: extractToolOutput(input), + model: normalizedModel, + providerVariant: resolveProviderVariant(normalizedModel), + imageInputCount: countImageInputs(input), + plannedToolName: extractPlannedToolName(events), + }; + requests.push(lastRequest); + if (requests.length > 50) { + requests.splice(0, requests.length - 50); + } + if (body.stream === true) { + writeAnthropicSse(res, streamEvents); + return; + } + writeJson(res, 200, responseBody); + return; + } writeJson(res, 404, { error: "not found" }); }); diff --git a/extensions/qa-lab/src/qa-gateway-config.test.ts b/extensions/qa-lab/src/qa-gateway-config.test.ts index 74ae3bfc26a..62973aeb73e 100644 --- a/extensions/qa-lab/src/qa-gateway-config.test.ts +++ b/extensions/qa-lab/src/qa-gateway-config.test.ts @@ -53,6 +53,11 @@ describe("buildQaGatewayConfig", () => { expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("mock-openai/gpt-5.4"); expect(cfg.models?.providers?.["mock-openai"]?.baseUrl).toBe("http://127.0.0.1:44080/v1"); + expect(cfg.models?.providers?.["mock-openai"]?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.openai?.baseUrl).toBe("http://127.0.0.1:44080/v1"); + expect(cfg.models?.providers?.openai?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080"); + expect(cfg.models?.providers?.anthropic?.request).toEqual({ allowPrivateNetwork: true }); expect(cfg.plugins?.allow).toEqual(["memory-core", "qa-channel"]); expect(cfg.plugins?.entries?.["memory-core"]).toEqual({ enabled: true }); expect(cfg.plugins?.entries?.["qa-channel"]).toEqual({ enabled: true }); @@ -66,6 +71,31 @@ describe("buildQaGatewayConfig", () => { expect(cfg.messages?.groupChat?.mentionPatterns).toEqual(["\\b@?openclaw\\b"]); }); + it("maps provider-qualified openai and anthropic 
refs through the mock provider lane", () => { + const cfg = buildQaGatewayConfig({ + bind: "loopback", + gatewayPort: 18789, + gatewayToken: "token", + providerBaseUrl: "http://127.0.0.1:44080/v1", + workspaceDir: "/tmp/qa-workspace", + providerMode: "mock-openai", + primaryModel: "openai/gpt-5.4", + alternateModel: "anthropic/claude-opus-4-6", + }); + + expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("openai/gpt-5.4"); + expect(cfg.models?.providers?.openai?.api).toBe("openai-responses"); + expect(cfg.models?.providers?.openai?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.openai?.models.map((model) => model.id)).toContain("gpt-5.4"); + expect(cfg.models?.providers?.anthropic?.api).toBe("anthropic-messages"); + expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080"); + expect(cfg.models?.providers?.anthropic?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.anthropic?.models.map((model) => model.id)).toContain( + "claude-opus-4-6", + ); + expect(cfg.plugins?.allow).toEqual(["memory-core"]); + }); + it("can omit qa-channel for live transport gateway children", () => { const cfg = buildQaGatewayConfig({ bind: "loopback", diff --git a/extensions/qa-lab/src/qa-gateway-config.ts b/extensions/qa-lab/src/qa-gateway-config.ts index 18f3b9e4a3a..6cc5c2832de 100644 --- a/extensions/qa-lab/src/qa-gateway-config.ts +++ b/extensions/qa-lab/src/qa-gateway-config.ts @@ -45,6 +45,10 @@ export function normalizeQaThinkingLevel(input: unknown): QaThinkingLevel | unde return undefined; } +function trimTrailingApiV1(baseUrl: string) { + return baseUrl.replace(/\/v1\/?$/i, ""); +} + export function mergeQaControlUiAllowedOrigins(extraOrigins?: string[]) { const normalizedExtra = (extraOrigins ?? 
[]) .map((origin) => origin.trim()) @@ -74,10 +78,14 @@ export function buildQaGatewayConfig(params: { thinkingDefault?: QaThinkingLevel; }): OpenClawConfig { const mockProviderBaseUrl = params.providerBaseUrl ?? "http://127.0.0.1:44080/v1"; + const mockAnthropicBaseUrl = trimTrailingApiV1(mockProviderBaseUrl); const mockOpenAiProvider: ModelProviderConfig = { baseUrl: mockProviderBaseUrl, apiKey: "test", api: "openai-responses", + request: { + allowPrivateNetwork: true, + }, models: [ { id: "gpt-5.4", @@ -126,6 +134,50 @@ export function buildQaGatewayConfig(params: { }, ], }; + const mockNamedOpenAiProvider: ModelProviderConfig = { + ...mockOpenAiProvider, + models: mockOpenAiProvider.models.map((model) => ({ ...model })), + }; + const mockAnthropicProvider: ModelProviderConfig = { + baseUrl: mockAnthropicBaseUrl, + apiKey: "test", + api: "anthropic-messages", + request: { + allowPrivateNetwork: true, + }, + models: [ + { + id: "claude-opus-4-6", + name: "claude-opus-4-6", + api: "anthropic-messages", + reasoning: false, + input: ["text", "image"], + cost: { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + }, + contextWindow: 200_000, + maxTokens: 4096, + }, + { + id: "claude-sonnet-4-6", + name: "claude-sonnet-4-6", + api: "anthropic-messages", + reasoning: false, + input: ["text", "image"], + cost: { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + }, + contextWindow: 200_000, + maxTokens: 4096, + }, + ], + }; const providerMode = normalizeQaProviderMode(params.providerMode ?? "mock-openai"); const primaryModel = params.primaryModel ?? 
defaultQaModelForMode(providerMode); const alternateModel = @@ -273,6 +325,8 @@ export function buildQaGatewayConfig(params: { mode: "replace", providers: { "mock-openai": mockOpenAiProvider, + openai: mockNamedOpenAiProvider, + anthropic: mockAnthropicProvider, }, }, } diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index 658194fb382..fa25fc272bb 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -118,6 +118,50 @@ describe("qa scenario catalog", () => { ); }); + it("keeps mock-only image debug assertions guarded in live-frontier runs", () => { + const scenario = readQaScenarioPack().scenarios.find( + (candidate) => candidate.id === "image-understanding-attachment", + ); + const imageRequestAction = scenario?.execution.flow?.steps + .flatMap((step) => step.actions ?? []) + .find( + ( + action, + ): action is { + set: string; + value?: { expr?: string }; + } => + typeof action === "object" && + action !== null && + "set" in action && + action.set === "imageRequest", + ); + const imageRequestExpr = imageRequestAction?.value?.expr; + + expect(imageRequestExpr).toContain("env.mock ?"); + expect(imageRequestExpr).toContain("/debug/requests"); + }); + + it("adds a repo-instruction followthrough scenario to the parity pack", () => { + const scenario = readQaScenarioById("instruction-followthrough-repo-contract"); + const config = readQaScenarioExecutionConfig("instruction-followthrough-repo-contract") as + | { + workspaceFiles?: Record; + prompt?: string; + expectedReplyAll?: string[]; + } + | undefined; + + expect(config?.workspaceFiles?.["AGENT.md"]).toContain("Step order:"); + expect(config?.workspaceFiles?.["SOUL.md"]).toContain("action-first"); + expect(config?.workspaceFiles?.["FOLLOWTHROUGH_INPUT.md"]).toContain( + "Mission: prove you followed the repo contract.", + ); + expect(config?.prompt).toContain("Repo contract followthrough check."); 
+ expect(config?.expectedReplyAll).toEqual(["read:", "wrote:", "status:"]); + expect(scenario.title).toBe("Instruction followthrough repo contract"); + }); + it("rejects malformed string matcher lists before running a flow", () => { expect(() => validateQaScenarioExecutionConfig({ diff --git a/extensions/qa-lab/src/suite.summary-json.test.ts b/extensions/qa-lab/src/suite.summary-json.test.ts new file mode 100644 index 00000000000..5db4a6646f4 --- /dev/null +++ b/extensions/qa-lab/src/suite.summary-json.test.ts @@ -0,0 +1,101 @@ +import { describe, expect, it } from "vitest"; +import { buildQaSuiteSummaryJson } from "./suite.js"; + +describe("buildQaSuiteSummaryJson", () => { + const baseParams = { + // Test scenarios include a `steps: []` field to match the real suite + // scenario-result shape so downstream consumers that rely on the shape + // (parity gate, report render) stay aligned. + scenarios: [ + { name: "Scenario A", status: "pass" as const, steps: [] }, + { name: "Scenario B", status: "fail" as const, details: "something broke", steps: [] }, + ], + startedAt: new Date("2026-04-11T00:00:00.000Z"), + finishedAt: new Date("2026-04-11T00:05:00.000Z"), + providerMode: "mock-openai" as const, + primaryModel: "openai/gpt-5.4", + alternateModel: "openai/gpt-5.4-alt", + fastMode: true, + concurrency: 2, + }; + + it("records provider/model/mode so parity gates can verify labels", () => { + const json = buildQaSuiteSummaryJson(baseParams); + expect(json.run).toMatchObject({ + startedAt: "2026-04-11T00:00:00.000Z", + finishedAt: "2026-04-11T00:05:00.000Z", + providerMode: "mock-openai", + primaryModel: "openai/gpt-5.4", + primaryProvider: "openai", + primaryModelName: "gpt-5.4", + alternateModel: "openai/gpt-5.4-alt", + alternateProvider: "openai", + alternateModelName: "gpt-5.4-alt", + fastMode: true, + concurrency: 2, + scenarioIds: null, + }); + }); + + it("includes scenarioIds in run metadata when provided", () => { + const scenarioIds = 
["approval-turn-tool-followthrough", "subagent-handoff", "memory-recall"]; + const json = buildQaSuiteSummaryJson({ + ...baseParams, + scenarioIds, + }); + expect(json.run.scenarioIds).toEqual(scenarioIds); + }); + + it("treats an empty scenarioIds array as unspecified (no filter)", () => { + // A CLI path that omits --scenario passes an empty array to runQaSuite. + // The summary must encode that as null so downstream parity/report + // tooling doesn't interpret a full run as an explicit empty selection. + const json = buildQaSuiteSummaryJson({ + ...baseParams, + scenarioIds: [], + }); + expect(json.run.scenarioIds).toBeNull(); + }); + + it("records an Anthropic baseline lane cleanly for parity runs", () => { + const json = buildQaSuiteSummaryJson({ + ...baseParams, + primaryModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-sonnet-4-6", + }); + expect(json.run).toMatchObject({ + primaryModel: "anthropic/claude-opus-4-6", + primaryProvider: "anthropic", + primaryModelName: "claude-opus-4-6", + alternateModel: "anthropic/claude-sonnet-4-6", + alternateProvider: "anthropic", + alternateModelName: "claude-sonnet-4-6", + }); + }); + + it("leaves split fields null when a model ref is malformed", () => { + const json = buildQaSuiteSummaryJson({ + ...baseParams, + primaryModel: "not-a-real-ref", + alternateModel: "", + }); + expect(json.run).toMatchObject({ + primaryModel: "not-a-real-ref", + primaryProvider: null, + primaryModelName: null, + alternateModel: "", + alternateProvider: null, + alternateModelName: null, + }); + }); + + it("keeps scenarios and counts alongside the run metadata", () => { + const json = buildQaSuiteSummaryJson(baseParams); + expect(json.scenarios).toHaveLength(2); + expect(json.counts).toEqual({ + total: 2, + passed: 1, + failed: 1, + }); + }); +}); diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index bde9b6786dd..d9f916eac21 100644 --- a/extensions/qa-lab/src/suite.ts +++ 
b/extensions/qa-lab/src/suite.ts @@ -81,7 +81,7 @@ type QaSuiteStep = { run: () => Promise; }; -type QaSuiteScenarioResult = { +export type QaSuiteScenarioResult = { name: string; status: "pass" | "fail"; steps: QaReportCheck[]; @@ -1365,17 +1365,105 @@ function createQaSuiteReportNotes(params: { return params.transport.createReportNotes(params); } +export type QaSuiteSummaryJsonParams = { + scenarios: QaSuiteScenarioResult[]; + startedAt: Date; + finishedAt: Date; + providerMode: QaProviderMode; + primaryModel: string; + alternateModel: string; + fastMode: boolean; + concurrency: number; + scenarioIds?: readonly string[]; +}; + +/** + * Strongly-typed shape of `qa-suite-summary.json`. The GPT-5.4 parity gate + * (agentic-parity-report.ts, #64441) and any future parity wrapper can + * import this type instead of re-declaring the shape, so changes to the + * summary schema propagate through to every consumer at type-check time. + */ +export type QaSuiteSummaryJson = { + scenarios: QaSuiteScenarioResult[]; + counts: { + total: number; + passed: number; + failed: number; + }; + run: { + startedAt: string; + finishedAt: string; + providerMode: QaProviderMode; + primaryModel: string; + primaryProvider: string | null; + primaryModelName: string | null; + alternateModel: string; + alternateProvider: string | null; + alternateModelName: string | null; + fastMode: boolean; + concurrency: number; + scenarioIds: string[] | null; + }; +}; + +/** + * Pure-ish JSON builder for qa-suite-summary.json. Exported so the GPT-5.4 + * parity gate (agentic-parity-report.ts, #64441) and any future parity + * runner can assert-and-trust the provider/model that produced a given + * summary instead of blindly accepting the caller's candidateLabel / + * baselineLabel. Without the `run` block, a maintainer who swaps candidate + * and baseline summary paths could silently produce a mislabeled verdict. 
+ * + * `scenarioIds` is only recorded when the caller passed a non-empty array + * (an explicit scenario selection). A missing or empty array means "no + * filter, full lane-selected catalog", which the summary encodes as `null` + * so parity/report tooling doesn't mistake a full run for an explicit + * empty selection. + */ +export function buildQaSuiteSummaryJson(params: QaSuiteSummaryJsonParams): QaSuiteSummaryJson { + const primarySplit = splitModelRef(params.primaryModel); + const alternateSplit = splitModelRef(params.alternateModel); + return { + scenarios: params.scenarios, + counts: { + total: params.scenarios.length, + passed: params.scenarios.filter((scenario) => scenario.status === "pass").length, + failed: params.scenarios.filter((scenario) => scenario.status === "fail").length, + }, + run: { + startedAt: params.startedAt.toISOString(), + finishedAt: params.finishedAt.toISOString(), + providerMode: params.providerMode, + primaryModel: params.primaryModel, + primaryProvider: primarySplit?.provider ?? null, + primaryModelName: primarySplit?.model ?? null, + alternateModel: params.alternateModel, + alternateProvider: alternateSplit?.provider ?? null, + alternateModelName: alternateSplit?.model ?? null, + fastMode: params.fastMode, + concurrency: params.concurrency, + scenarioIds: + params.scenarioIds && params.scenarioIds.length > 0 ? [...params.scenarioIds] : null, + }, + }; +} + async function writeQaSuiteArtifacts(params: { outputDir: string; startedAt: Date; finishedAt: Date; scenarios: QaSuiteScenarioResult[]; transport: QaTransportAdapter; - providerMode: "mock-openai" | "live-frontier"; + // Reuse the canonical QaProviderMode union instead of re-declaring it + // inline. Loop 6 already unified `QaSuiteSummaryJsonParams.providerMode` + // on this type; keeping the writer in sync prevents drift when model- + // selection.ts adds a new provider mode. 
+ providerMode: QaProviderMode; primaryModel: string; alternateModel: string; fastMode: boolean; concurrency: number; + scenarioIds?: readonly string[]; }) { const report = renderQaMarkdownReport({ title: "OpenClaw QA Scenario Suite", @@ -1395,18 +1483,7 @@ async function writeQaSuiteArtifacts(params: { await fs.writeFile(reportPath, report, "utf8"); await fs.writeFile( summaryPath, - `${JSON.stringify( - { - scenarios: params.scenarios, - counts: { - total: params.scenarios.length, - passed: params.scenarios.filter((scenario) => scenario.status === "pass").length, - failed: params.scenarios.filter((scenario) => scenario.status === "fail").length, - }, - }, - null, - 2, - )}\n`, + `${JSON.stringify(buildQaSuiteSummaryJson(params), null, 2)}\n`, "utf8", ); return { report, reportPath, summaryPath }; @@ -1576,6 +1653,16 @@ export async function runQaSuite(params?: QaSuiteRunParams): Promise 0 + ? selectedCatalogScenarios.map((scenario) => scenario.id) + : undefined, }); lab.setLatestReport({ outputPath: reportPath, @@ -1737,6 +1824,12 @@ export async function runQaSuite(params?: QaSuiteRunParams): Promise 0 + ? selectedCatalogScenarios.map((scenario) => scenario.id) + : undefined, }); const latestReport = { outputPath: reportPath, diff --git a/qa/scenarios/config-restart-capability-flip.md b/qa/scenarios/config-restart-capability-flip.md index 1717941e804..b25cb5e40b7 100644 --- a/qa/scenarios/config-restart-capability-flip.md +++ b/qa/scenarios/config-restart-capability-flip.md @@ -151,6 +151,20 @@ steps: ref: imageStartedAtMs timeoutMs: expr: liveTurnTimeoutMs(env, 45000) + # Tool-call assertion (criterion 2 of the parity completion + # gate in #64227): the restored `image_generate` capability + # must have actually fired as a real tool call. 
Without this + # assertion, a prose reply that just mentions a MEDIA path + # could satisfy the scenario, so strengthen it by requiring + # the mock to have recorded `plannedToolName: "image_generate"` + # against a post-restart request. The `!env.mock || ...` + # guard means this check only runs in mock mode (where + # `/debug/requests` is available); live-frontier runs skip + # it and still pass the rest of the scenario. + - assert: + expr: "!env.mock || [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].some((request) => String(request.allInputText ?? '').toLowerCase().includes('capability flip image check') && request.plannedToolName === 'image_generate')" + message: + expr: "`expected image_generate tool call during capability flip scenario, saw plannedToolNames=${JSON.stringify([...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].filter((request) => String(request.allInputText ?? '').toLowerCase().includes('capability flip image check')).map((request) => request.plannedToolName ?? null))}`" finally: - call: patchConfig args: diff --git a/qa/scenarios/image-understanding-attachment.md b/qa/scenarios/image-understanding-attachment.md index ed508d8aba0..31801ee207f 100644 --- a/qa/scenarios/image-understanding-attachment.md +++ b/qa/scenarios/image-understanding-attachment.md @@ -64,9 +64,26 @@ steps: expr: "!missingColorGroup" message: expr: "`missing expected colors in image description: ${outbound.text}`" + # Image-processing assertion: verify the mock actually received an + # image on the scenario-unique prompt. This is as strong as a + # tool-call assertion for this scenario — unlike the + # `source-docs-discovery-report` / `subagent-handoff` / + # `config-restart-capability-flip` scenarios that rely on a real + # tool call to satisfy the parity criterion, image understanding + # is handled inside the provider's vision capability and does NOT + # emit a tool call the mock can record as `plannedToolName`. 
The + # `imageInputCount` field IS the tool-call evidence for vision + # scenarios: it proves the attachment reached the provider, which + # is the only thing an external harness can verify in mock mode. + # Match on the scenario-unique prompt substring so the assertion + # can't be accidentally satisfied by some other scenario's image + # request that happens to share a debug log with this one. + - set: imageRequest + value: + expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].find((request) => String(request.prompt ?? '').includes('Image understanding check')) : null" - assert: - expr: "!env.mock || (((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.prompt ?? '').includes('Image understanding check'))?.imageInputCount ?? 0) >= 1)" + expr: "!env.mock || (imageRequest && (imageRequest.imageInputCount ?? 0) >= 1)" message: - expr: "`expected at least one input image, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.prompt ?? '').includes('Image understanding check'))?.imageInputCount ?? 0)}`" + expr: "`expected at least one input image on the Image understanding check request, got imageInputCount=${String(imageRequest?.imageInputCount ?? 0)}`" detailsExpr: outbound.text ``` diff --git a/qa/scenarios/instruction-followthrough-repo-contract.md b/qa/scenarios/instruction-followthrough-repo-contract.md new file mode 100644 index 00000000000..4bbee26e63f --- /dev/null +++ b/qa/scenarios/instruction-followthrough-repo-contract.md @@ -0,0 +1,127 @@ +# Instruction followthrough repo contract + +```yaml qa-scenario +id: instruction-followthrough-repo-contract +title: Instruction followthrough repo contract +surface: repo-contract +objective: Verify the agent reads repo instruction files first, follows the required tool order, and completes the first feasible action instead of stopping at a plan. 
+successCriteria: + - Agent reads the seeded instruction files before writing the requested artifact. + - Agent writes the requested artifact in the same run instead of returning only a plan. + - Agent does not ask for permission before the first feasible action. + - Final reply makes the completed read/write sequence explicit. +docsRefs: + - docs/help/testing.md + - docs/channels/qa-channel.md +codeRefs: + - src/agents/system-prompt.ts + - src/agents/pi-embedded-runner/run/incomplete-turn.ts + - extensions/qa-lab/src/mock-openai-server.ts +execution: + kind: flow + summary: Verify the agent reads repo instructions first, then completes the first bounded followthrough task without stalling. + config: + workspaceFiles: + AGENT.md: |- + # Repo contract + + Step order: + 1. Read AGENT.md. + 2. Read SOUL.md. + 3. Read FOLLOWTHROUGH_INPUT.md. + 4. Write ./repo-contract-summary.txt. + 5. Reply with three labeled lines exactly once: Read, Wrote, Status. + + Do not stop after planning. + Do not ask for permission before the first feasible action. + SOUL.md: |- + # Execution style + + Stay brief, honest, and action-first. + If the next tool action is feasible, do it before replying. + FOLLOWTHROUGH_INPUT.md: |- + Mission: prove you followed the repo contract. + Evidence path: AGENT.md -> SOUL.md -> FOLLOWTHROUGH_INPUT.md -> repo-contract-summary.txt + prompt: |- + Repo contract followthrough check. Read AGENT.md, SOUL.md, and FOLLOWTHROUGH_INPUT.md first. + Then follow the repo contract exactly, write ./repo-contract-summary.txt, and reply with + three labeled lines: Read, Wrote, Status. + Do not stop after planning and do not ask for permission before the first feasible action. 
+ expectedReplyAll: + - "read:" + - "wrote:" + - "status:" + forbiddenNeedles: + - need permission + - need your approval + - can you approve + - i would + - i can + - next i would +``` + +```yaml qa-flow +steps: + - name: follows repo instructions instead of stopping at a plan + actions: + - call: reset + - forEach: + items: + expr: "Object.entries(config.workspaceFiles ?? {})" + item: workspaceFile + actions: + - call: fs.writeFile + args: + - expr: "path.join(env.gateway.workspaceDir, String(workspaceFile[0]))" + - expr: "`${String(workspaceFile[1] ?? '').trimEnd()}\\n`" + - utf8 + - set: artifactPath + value: + expr: "path.join(env.gateway.workspaceDir, 'repo-contract-summary.txt')" + - call: runAgentPrompt + args: + - ref: env + - sessionKey: agent:qa:repo-contract + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 40000) + - call: waitForCondition + saveAs: artifact + args: + - lambda: + async: true + expr: "((await fs.readFile(artifactPath, 'utf8').catch(() => null))?.includes('Mission: prove you followed the repo contract.') ? await fs.readFile(artifactPath, 'utf8').catch(() => null) : undefined)" + - expr: liveTurnTimeoutMs(env, 30000) + - expr: "env.providerMode === 'mock-openai' ? 100 : 250" + - set: expectedReplyAll + value: + expr: config.expectedReplyAll.map(normalizeLowercaseStringOrEmpty) + - call: waitForCondition + saveAs: outbound + args: + - lambda: + expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && expectedReplyAll.every((needle) => normalizeLowercaseStringOrEmpty(candidate.text).includes(needle))).at(-1)" + - expr: liveTurnTimeoutMs(env, 30000) + - expr: "env.providerMode === 'mock-openai' ? 
100 : 250" + - assert: + expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(outbound.text).includes(needle))" + message: + expr: "`repo contract followthrough bounced for permission or stalled: ${outbound.text}`" + - set: followthroughDebugRequests + value: + expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].filter((request) => /repo contract followthrough check/i.test(String(request.allInputText ?? ''))) : []" + - assert: + expr: "!env.mock || followthroughDebugRequests.filter((request) => request.plannedToolName === 'read').length >= 3" + message: + expr: "`expected three read tool calls before write, saw plannedToolNames=${JSON.stringify(followthroughDebugRequests.map((request) => request.plannedToolName ?? null))}`" + - assert: + expr: "!env.mock || followthroughDebugRequests.some((request) => request.plannedToolName === 'write')" + message: + expr: "`expected write tool call during repo contract followthrough, saw plannedToolNames=${JSON.stringify(followthroughDebugRequests.map((request) => request.plannedToolName ?? null))}`" + - assert: + expr: "!env.mock || (() => { const readIndices = followthroughDebugRequests.map((r, i) => r.plannedToolName === 'read' ? i : -1).filter(i => i >= 0); const firstWrite = followthroughDebugRequests.findIndex((r) => r.plannedToolName === 'write'); return readIndices.length >= 3 && firstWrite >= 0 && readIndices[2] < firstWrite; })()" + message: + expr: "`expected all 3 reads before any write during repo contract followthrough, saw plannedToolNames=${JSON.stringify(followthroughDebugRequests.map((request) => request.plannedToolName ?? 
null))}`" + detailsExpr: outbound.text +``` diff --git a/qa/scenarios/memory-recall.md b/qa/scenarios/memory-recall.md index a6886afcd01..908cbdca72c 100644 --- a/qa/scenarios/memory-recall.md +++ b/qa/scenarios/memory-recall.md @@ -1,5 +1,36 @@ # Memory recall after context switch + + ```yaml qa-scenario id: memory-recall title: Memory recall after context switch diff --git a/qa/scenarios/model-switch-tool-continuity.md b/qa/scenarios/model-switch-tool-continuity.md index cbee28b6bbd..7e162b2e331 100644 --- a/qa/scenarios/model-switch-tool-continuity.md +++ b/qa/scenarios/model-switch-tool-continuity.md @@ -69,13 +69,22 @@ steps: expr: hasModelSwitchContinuityEvidence(outbound.text) message: expr: "`switch reply missed kickoff continuity: ${outbound.text}`" - - assert: - expr: "!env.mock || (((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName) === 'read')" - message: - expr: "`expected read after switch, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.plannedToolName ?? '')}`" - - assert: - expr: "!env.mock || (((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.model) === 'gpt-5.4-alt')" - message: - expr: "`expected alternate model, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/requests`)).find((request) => String(request.allInputText ?? '').includes(config.promptSnippet))?.model ?? '')}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: switchDebugRequests + value: + expr: "await fetchJson(`${env.mock.baseUrl}/debug/requests`)" + - set: switchRequest + value: + expr: "switchDebugRequests.find((request) => String(request.allInputText ?? 
'').includes(config.promptSnippet))" + - assert: + expr: "switchRequest?.plannedToolName === 'read'" + message: + expr: "`expected read after switch, got ${String(switchRequest?.plannedToolName ?? '')}`" + - assert: + expr: "String(switchRequest?.model ?? '') === String(alternate?.model ?? '')" + message: + expr: "`expected alternate model, got ${String(switchRequest?.model ?? '')}`" detailsExpr: outbound.text ``` diff --git a/qa/scenarios/source-docs-discovery-report.md b/qa/scenarios/source-docs-discovery-report.md index 2de07b14fcd..8a4f999478a 100644 --- a/qa/scenarios/source-docs-discovery-report.md +++ b/qa/scenarios/source-docs-discovery-report.md @@ -56,5 +56,20 @@ steps: expr: "!reportsDiscoveryScopeLeak(outbound.text)" message: expr: "`discovery report drifted beyond scope: ${outbound.text}`" + # Parity gate criterion 2 (no fake progress / fake tool completion): + # require an actual read tool call before the prose report. Without this, + # a model could fabricate a plausible Worked/Failed/Blocked/Follow-up + # report without ever touching the repo files the prompt names. The + # debug request log is fetched once and reused for both the assertion + # and its failure-message diagnostic. Each request's allInputText is + # lowercased inline at match time (the real prompt writes it as + # "Worked, Failed, Blocked") so the contains check is case-insensitive. + - set: discoveryDebugRequests + value: + expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))] : []" + - assert: + expr: "!env.mock || discoveryDebugRequests.some((request) => String(request.allInputText ?? '').toLowerCase().includes('worked, failed, blocked') && request.plannedToolName === 'read')" + message: + expr: "`expected at least one read tool call during discovery report scenario, saw plannedToolNames=${JSON.stringify(discoveryDebugRequests.map((request) => request.plannedToolName ?? 
null))}`"
       detailsExpr: outbound.text
 ```
diff --git a/qa/scenarios/subagent-fanout-synthesis.md b/qa/scenarios/subagent-fanout-synthesis.md
index 0f4f6501852..4d142151620 100644
--- a/qa/scenarios/subagent-fanout-synthesis.md
+++ b/qa/scenarios/subagent-fanout-synthesis.md
@@ -113,6 +113,28 @@ steps:
           expr: "sawAlpha && sawBeta"
         message:
           expr: "`fanout child sessions missing (alpha=${String(sawAlpha)} beta=${String(sawBeta)})`"
+      # Tool-call assertion (criterion 2 of the
+      # parity completion gate in #64227): the
+      # scenario must have actually invoked
+      # `sessions_spawn` at least twice, not
+      # just ended up with two rows in the
+      # session store through prose trickery.
+      # The session store alone can be populated
+      # by other flows or by a model that
+      # fabricates "delegation" narration.
+      # `plannedToolName` on the mock's
+      # `/debug/requests` log is the tool-call
+      # ground truth: two recorded sessions_spawn
+      # requests prove both subagents were really
+      # dispatched. Skipped in live mode, where
+      # env.mock (and its request log) is absent.
+      - set: fanoutSpawnRequests
+        value:
+          expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].filter((request) => request.plannedToolName === 'sessions_spawn' && /subagent fanout synthesis check/i.test(String(request.allInputText ?? 
+            ''))) : []"
+      - assert:
+          expr: "!env.mock || fanoutSpawnRequests.length >= 2"
+          message:
+            expr: "`expected at least two sessions_spawn tool calls during subagent fanout scenario, saw ${fanoutSpawnRequests.length}`"
     - set: details
       value:
         expr: "outbound.text"
diff --git a/qa/scenarios/subagent-handoff.md b/qa/scenarios/subagent-handoff.md
index 1b61cd61f7b..74853aa65d9 100644
--- a/qa/scenarios/subagent-handoff.md
+++ b/qa/scenarios/subagent-handoff.md
@@ -46,5 +46,25 @@ steps:
           expr: "!['failed to delegate','could not delegate','subagent unavailable'].some((needle) => normalizeLowercaseStringOrEmpty(outbound.text).includes(needle))"
         message:
           expr: "`subagent handoff reported failure: ${outbound.text}`"
+      # Parity gate criterion 2 (no fake progress / fake tool completion):
+      # require an actual sessions_spawn tool call. Without this, a model
+      # could produce the three labeled sections ("Delegated task", "Result",
+      # "Evidence") as free-form prose without ever delegating to a real
+      # subagent. The assertion is pinned to THIS scenario by matching the
+      # scenario-unique prompt substring "Delegate one bounded QA task"
+      # (not a broad /delegate|subagent/ regex) so the earlier
+      # subagent-fanout-synthesis scenario — which also contains "delegate"
+      # and produces its own pre-tool sessions_spawn request — cannot
+      # satisfy the assertion here. The match is also constrained to
+      # pre-tool requests (no toolOutput) because the mock only plans
+      # sessions_spawn on requests with no toolOutput; the follow-up
+      # request after the tool runs has plannedToolName unset.
+      - set: subagentDebugRequests
+        value:
+          expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))] : []"
+      - assert:
+          expr: "!env.mock || subagentDebugRequests.some((request) => !request.toolOutput && /delegate one bounded qa task/i.test(String(request.allInputText ?? 
'')) && request.plannedToolName === 'sessions_spawn')" + message: + expr: "`expected sessions_spawn tool call during subagent handoff scenario, saw plannedToolNames=${JSON.stringify(subagentDebugRequests.map((request) => request.plannedToolName ?? null))}`" detailsExpr: outbound.text ``` diff --git a/src/canvas-host/a2ui/.bundle.hash b/src/canvas-host/a2ui/.bundle.hash index a06dc954c6a..6d2e0ab9c39 100644 --- a/src/canvas-host/a2ui/.bundle.hash +++ b/src/canvas-host/a2ui/.bundle.hash @@ -1 +1 @@ -b92daceecab88cdb1ceeab30a7321399850a1fd13773af22dbb2035d39cdd5f8 +1d087c0991987824d78c8ac4ec2c0e66d661f4bd4afd12b193d66634c69d75a0