diff --git a/scripts/check-live-cache.ts b/scripts/check-live-cache.ts index e83a8cc3246..67306ed2174 100644 --- a/scripts/check-live-cache.ts +++ b/scripts/check-live-cache.ts @@ -7,6 +7,12 @@ if (!LIVE_CACHE_TEST_ENABLED) { } const result = await runLiveCacheRegression(); +if (result.warnings.length > 0) { + process.stderr.write("\n[live-cache] non-blocking cache observations:\n"); + for (const warning of result.warnings) { + process.stderr.write(`- ${warning}\n`); + } +} if (result.regressions.length > 0) { process.stderr.write("\n[live-cache] regressions detected:\n"); for (const regression of result.regressions) { diff --git a/src/agents/live-cache-regression-baseline.ts b/src/agents/live-cache-regression-baseline.ts index 77c30b1488e..6c76212e632 100644 --- a/src/agents/live-cache-regression-baseline.ts +++ b/src/agents/live-cache-regression-baseline.ts @@ -8,6 +8,7 @@ export type LiveCacheFloor = { minHitRate?: number; maxCacheRead?: number; maxCacheWrite?: number; + warnOnly?: boolean; }; export const LIVE_CACHE_REGRESSION_BASELINE = { @@ -56,6 +57,7 @@ export const LIVE_CACHE_REGRESSION_BASELINE = { observedHitRate: 0.954, minCacheRead: 3_840, minHitRate: 0.82, + warnOnly: true, }, mcp: { observedCacheRead: 4_608, diff --git a/src/agents/live-cache-regression-runner.test.ts b/src/agents/live-cache-regression-runner.test.ts new file mode 100644 index 00000000000..744336e5b3d --- /dev/null +++ b/src/agents/live-cache-regression-runner.test.ts @@ -0,0 +1,56 @@ +import { describe, expect, it } from "vitest"; +import { __testing } from "./live-cache-regression-runner.js"; + +describe("live cache regression runner", () => { + it("keeps OpenAI image cache floors observable without blocking release validation", () => { + const regressions: string[] = []; + const warnings: string[] = []; + + __testing.assertAgainstBaseline({ + lane: "image", + provider: "openai", + result: { + best: { + hitRate: 0, + suffix: "image-hit", + text: "CACHE-OK image-hit", + usage: { cacheRead: 0, cacheWrite: 0, input: 5_096 }, + }, + }, + regressions, + warnings, + }); + + expect(regressions).toEqual([]); + expect(warnings).toEqual([ + "openai:image cacheRead=0 < min=3840", + "openai:image hitRate=0.000 < min=0.820", + ]); + }); + + it("keeps hard cache floors blocking for required OpenAI lanes", () => { + const regressions: string[] = []; + const warnings: string[] = []; + + __testing.assertAgainstBaseline({ + lane: "stable", + provider: "openai", + result: { + best: { + hitRate: 0, + suffix: "stable-hit", + text: "CACHE-OK stable-hit", + usage: { cacheRead: 0, cacheWrite: 0, input: 5_034 }, + }, + }, + regressions, + warnings, + }); + + expect(regressions).toEqual([ + "openai:stable cacheRead=0 < min=4608", + "openai:stable hitRate=0.000 < min=0.900", + ]); + expect(warnings).toEqual([]); + }); +}); diff --git a/src/agents/live-cache-regression-runner.ts b/src/agents/live-cache-regression-runner.ts index 8bed81c072c..227fe229aea 100644 --- a/src/agents/live-cache-regression-runner.ts +++ b/src/agents/live-cache-regression-runner.ts @@ -51,6 +51,7 @@ type LaneResult = { export type LiveCacheRegressionResult = { regressions: string[]; summary: Record>; + warnings: string[]; }; const NOOP_TOOL: Tool = { @@ -358,8 +359,16 @@ function assertAgainstBaseline(params: { provider: ProviderKey; result: LaneResult; regressions: string[]; + warnings: string[]; }) { const floor = resolveBaselineFloor(params.provider, params.lane); + const recordRegression = (message: string) => { + if (floor?.warnOnly) { + params.warnings.push(message); + } else { + params.regressions.push(message); + } + }; if (!floor) { params.regressions.push(`${params.provider}:${params.lane} missing baseline entry`); return; @@ -370,17 +379,17 @@ function assertAgainstBaseline(params: { if (floor.minCacheReadOrWrite !== undefined) { const cacheReadOrWrite = Math.max(usage.cacheRead ?? 0, usage.cacheWrite ?? 0); if (cacheReadOrWrite < floor.minCacheReadOrWrite) { - params.regressions.push( + recordRegression( `${params.provider}:${params.lane} cacheReadOrWrite=${cacheReadOrWrite} < min=${floor.minCacheReadOrWrite}`, ); } } else if ((usage.cacheRead ?? 0) < (floor.minCacheRead ?? 0)) { - params.regressions.push( + recordRegression( `${params.provider}:${params.lane} cacheRead=${usage.cacheRead ?? 0} < min=${floor.minCacheRead}`, ); } if (params.result.best.hitRate < (floor.minHitRate ?? 0)) { - params.regressions.push( + recordRegression( `${params.provider}:${params.lane} hitRate=${params.result.best.hitRate.toFixed(3)} < min=${floor.minHitRate?.toFixed(3)}`, ); } @@ -389,7 +398,7 @@ function assertAgainstBaseline(params: { if (params.result.warmup) { const warmupUsage = params.result.warmup.usage; if ((warmupUsage.cacheWrite ?? 0) < (floor.minCacheWrite ?? 0)) { - params.regressions.push( + recordRegression( `${params.provider}:${params.lane} warmup cacheWrite=${warmupUsage.cacheWrite ?? 0} < min=${floor.minCacheWrite}`, ); } @@ -398,18 +407,22 @@ function assertAgainstBaseline(params: { if (params.result.disabled) { const usage = params.result.disabled.usage; if ((usage.cacheRead ?? 0) > (floor.maxCacheRead ?? Number.POSITIVE_INFINITY)) { - params.regressions.push( + recordRegression( `${params.provider}:${params.lane} cacheRead=${usage.cacheRead ?? 0} > max=${floor.maxCacheRead}`, ); } if ((usage.cacheWrite ?? 0) > (floor.maxCacheWrite ?? Number.POSITIVE_INFINITY)) { - params.regressions.push( + recordRegression( `${params.provider}:${params.lane} cacheWrite=${usage.cacheWrite ?? 0} > max=${floor.maxCacheWrite}`, ); } } } +export const __testing = { + assertAgainstBaseline, +}; + export async function runLiveCacheRegression(): Promise { const pngBase64 = (await fs.readFile(LIVE_TEST_PNG_URL)).toString("base64"); const runToken = randomUUID().slice(0, 13); @@ -427,6 +440,7 @@ export async function runLiveCacheRegression(): Promise> = { anthropic: {}, openai: {}, @@ -457,6 +471,7 @@ export async function runLiveCacheRegression(): Promise 0) { + logLiveCache(`cache regression warnings ${JSON.stringify(warnings)}`); + } + return { regressions, summary, warnings }; }