test(agents): keep openai image cache probe non-blocking

This commit is contained in:
Peter Steinberger
2026-04-27 11:34:09 +01:00
parent 169d33ded2
commit ca88daad1e
4 changed files with 91 additions and 7 deletions

View File

@@ -7,6 +7,12 @@ if (!LIVE_CACHE_TEST_ENABLED) {
}
const result = await runLiveCacheRegression();
// Print any warnings returned by the live cache run to stderr, one per line,
// under a "non-blocking cache observations" header. This branch only writes
// output; it does not exit or throw.
if (result.warnings.length > 0) {
process.stderr.write("\n[live-cache] non-blocking cache observations:\n");
for (const warning of result.warnings) {
process.stderr.write(`- ${warning}\n`);
}
}
if (result.regressions.length > 0) {
process.stderr.write("\n[live-cache] regressions detected:\n");
for (const regression of result.regressions) {

View File

@@ -8,6 +8,7 @@ export type LiveCacheFloor = {
minHitRate?: number;
maxCacheRead?: number;
maxCacheWrite?: number;
warnOnly?: boolean;
};
export const LIVE_CACHE_REGRESSION_BASELINE = {
@@ -56,6 +57,7 @@ export const LIVE_CACHE_REGRESSION_BASELINE = {
observedHitRate: 0.954,
minCacheRead: 3_840,
minHitRate: 0.82,
warnOnly: true,
},
mcp: {
observedCacheRead: 4_608,

View File

@@ -0,0 +1,56 @@
import { describe, expect, it } from "vitest";
import { __testing } from "./live-cache-regression-runner.js";
describe("live cache regression runner", () => {
  /** Argument shape accepted by the baseline assertion under test. */
  type BaselineCheck = Parameters<typeof __testing.assertAgainstBaseline>[0];

  /**
   * Runs the baseline assertion for an OpenAI lane using only a "best" sample
   * and hands back the message lists the assertion appended to.
   */
  function collectOpenAiFindings(
    lane: BaselineCheck["lane"],
    best: BaselineCheck["result"]["best"],
  ): { regressions: string[]; warnings: string[] } {
    const regressions: string[] = [];
    const warnings: string[] = [];
    __testing.assertAgainstBaseline({
      lane,
      provider: "openai",
      result: { best },
      regressions,
      warnings,
    });
    return { regressions, warnings };
  }

  it("keeps OpenAI image cache floors observable without blocking release validation", () => {
    // A sample with no cache reads must be reported as warnings only.
    const { regressions, warnings } = collectOpenAiFindings("image", {
      hitRate: 0,
      suffix: "image-hit",
      text: "CACHE-OK image-hit",
      usage: { cacheRead: 0, cacheWrite: 0, input: 5_096 },
    });

    expect(regressions).toEqual([]);
    expect(warnings).toEqual([
      "openai:image cacheRead=0 < min=3840",
      "openai:image hitRate=0.000 < min=0.820",
    ]);
  });

  it("keeps hard cache floors blocking for required OpenAI lanes", () => {
    // The same kind of sample on the stable lane must land in regressions.
    const { regressions, warnings } = collectOpenAiFindings("stable", {
      hitRate: 0,
      suffix: "stable-hit",
      text: "CACHE-OK stable-hit",
      usage: { cacheRead: 0, cacheWrite: 0, input: 5_034 },
    });

    expect(regressions).toEqual([
      "openai:stable cacheRead=0 < min=4608",
      "openai:stable hitRate=0.000 < min=0.900",
    ]);
    expect(warnings).toEqual([]);
  });
});

View File

@@ -51,6 +51,7 @@ type LaneResult = {
/** Value resolved by `runLiveCacheRegression()`. */
export type LiveCacheRegressionResult = {
/** Messages for baseline checks that failed on floors not marked `warnOnly`, plus missing-baseline entries. */
regressions: string[];
/** Nested record that the runner seeds with `anthropic` and `openai` keys and logs as JSON. */
summary: Record<string, Record<string, unknown>>;
/** Messages for baseline checks that failed on floors marked `warnOnly`. */
warnings: string[];
};
const NOOP_TOOL: Tool = {
@@ -358,8 +359,16 @@ function assertAgainstBaseline(params: {
provider: ProviderKey;
result: LaneResult;
regressions: string[];
warnings: string[];
}) {
const floor = resolveBaselineFloor(params.provider, params.lane);
const recordRegression = (message: string) => {
if (floor?.warnOnly) {
params.warnings.push(message);
} else {
params.regressions.push(message);
}
};
if (!floor) {
params.regressions.push(`${params.provider}:${params.lane} missing baseline entry`);
return;
@@ -370,17 +379,17 @@ function assertAgainstBaseline(params: {
if (floor.minCacheReadOrWrite !== undefined) {
const cacheReadOrWrite = Math.max(usage.cacheRead ?? 0, usage.cacheWrite ?? 0);
if (cacheReadOrWrite < floor.minCacheReadOrWrite) {
params.regressions.push(
recordRegression(
`${params.provider}:${params.lane} cacheReadOrWrite=${cacheReadOrWrite} < min=${floor.minCacheReadOrWrite}`,
);
}
} else if ((usage.cacheRead ?? 0) < (floor.minCacheRead ?? 0)) {
params.regressions.push(
recordRegression(
`${params.provider}:${params.lane} cacheRead=${usage.cacheRead ?? 0} < min=${floor.minCacheRead}`,
);
}
if (params.result.best.hitRate < (floor.minHitRate ?? 0)) {
params.regressions.push(
recordRegression(
`${params.provider}:${params.lane} hitRate=${params.result.best.hitRate.toFixed(3)} < min=${floor.minHitRate?.toFixed(3)}`,
);
}
@@ -389,7 +398,7 @@ function assertAgainstBaseline(params: {
if (params.result.warmup) {
const warmupUsage = params.result.warmup.usage;
if ((warmupUsage.cacheWrite ?? 0) < (floor.minCacheWrite ?? 0)) {
params.regressions.push(
recordRegression(
`${params.provider}:${params.lane} warmup cacheWrite=${warmupUsage.cacheWrite ?? 0} < min=${floor.minCacheWrite}`,
);
}
@@ -398,18 +407,22 @@ function assertAgainstBaseline(params: {
if (params.result.disabled) {
const usage = params.result.disabled.usage;
if ((usage.cacheRead ?? 0) > (floor.maxCacheRead ?? Number.POSITIVE_INFINITY)) {
params.regressions.push(
recordRegression(
`${params.provider}:${params.lane} cacheRead=${usage.cacheRead ?? 0} > max=${floor.maxCacheRead}`,
);
}
if ((usage.cacheWrite ?? 0) > (floor.maxCacheWrite ?? Number.POSITIVE_INFINITY)) {
params.regressions.push(
recordRegression(
`${params.provider}:${params.lane} cacheWrite=${usage.cacheWrite ?? 0} > max=${floor.maxCacheWrite}`,
);
}
}
}
/**
 * Hook object that exposes `assertAgainstBaseline` so it can be called directly,
 * e.g. from unit tests via `__testing.assertAgainstBaseline(...)`.
 * NOTE(review): the function does not appear to be exported on its own — confirm.
 */
export const __testing = {
assertAgainstBaseline,
};
export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResult> {
const pngBase64 = (await fs.readFile(LIVE_TEST_PNG_URL)).toString("base64");
const runToken = randomUUID().slice(0, 13);
@@ -427,6 +440,7 @@ export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResul
});
const regressions: string[] = [];
const warnings: string[] = [];
const summary: Record<string, Record<string, unknown>> = {
anthropic: {},
openai: {},
@@ -457,6 +471,7 @@ export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResul
provider: "openai",
result: openaiResult,
regressions,
warnings,
});
const anthropicResult = await runRepeatedLane({
@@ -483,6 +498,7 @@ export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResul
provider: "anthropic",
result: anthropicResult,
regressions,
warnings,
});
}
@@ -500,8 +516,12 @@ export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResul
provider: "anthropic",
result: disabled,
regressions,
warnings,
});
logLiveCache(`cache regression summary ${JSON.stringify(summary)}`);
return { regressions, summary };
if (warnings.length > 0) {
logLiveCache(`cache regression warnings ${JSON.stringify(warnings)}`);
}
return { regressions, summary, warnings };
}