From f6047aafe8a673d8268fd2c76e0684cab75c6116 Mon Sep 17 00:00:00 2001
From: JzoNg
Date: Mon, 13 Apr 2026 14:13:04 +0800
Subject: [PATCH] feat(web): metric descriptions

---
 .../default-metric-descriptions.spec.ts       | 34 ++++++++
 .../use-metric-selector-data.ts               | 18 ++++-
 .../components/metric-selector/utils.ts       |  3 +-
 .../pipeline/pipeline-metric-item.tsx         |  4 +-
 .../evaluation/default-metric-descriptions.ts | 79 +++++++++++++++++++
 web/app/components/evaluation/mock.ts         | 13 +--
 web/app/components/evaluation/store-utils.ts  |  3 +-
 web/i18n/en-US/evaluation.json                |  9 +++
 web/i18n/zh-Hans/evaluation.json              |  9 +++
 9 files changed, 159 insertions(+), 13 deletions(-)
 create mode 100644 web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts
 create mode 100644 web/app/components/evaluation/default-metric-descriptions.ts

diff --git a/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts b/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts
new file mode 100644
index 0000000000..9671220334
--- /dev/null
+++ b/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts
@@ -0,0 +1,34 @@
+import { getDefaultMetricDescription, getDefaultMetricDescriptionI18nKey, getTranslatedMetricDescription } from '../default-metric-descriptions'
+
+describe('default metric descriptions', () => {
+  it('should resolve descriptions for kebab-case metric ids', () => {
+    expect(getDefaultMetricDescription('context-precision')).toContain('retrieval pipeline returns little noise')
+    expect(getDefaultMetricDescription('answer-correctness')).toContain('factual accuracy and completeness')
+  })
+
+  it('should normalize snake_case metric ids from backend payloads', () => {
+    expect(getDefaultMetricDescription('CONTEXT_RECALL')).toContain('does not miss important supporting evidence')
+    expect(getDefaultMetricDescription('TOOL_CORRECTNESS')).toContain('tool-use strategy matches the expected behavior')
+  })
+
+  it('should support the legacy relevance alias', () => {
+    expect(getDefaultMetricDescription('relevance')).toContain('addresses the user\'s question')
+  })
+
+  it('should resolve i18n keys for builtin metrics', () => {
+    expect(getDefaultMetricDescriptionI18nKey('context-precision')).toBe('metrics.builtin.description.contextPrecision')
+    expect(getDefaultMetricDescriptionI18nKey('ANSWER_RELEVANCY')).toBe('metrics.builtin.description.answerRelevancy')
+  })
+
+  it('should use translated content when translation key exists', () => {
+    const t = vi.fn((key: string, options?: { defaultValue?: string }) => {
+      if (key === 'metrics.builtin.description.faithfulness')
+        return '忠实性中文文案'
+
+      return options?.defaultValue ?? key
+    })
+
+    expect(getTranslatedMetricDescription(t as never, 'faithfulness')).toBe('忠实性中文文案')
+    expect(getTranslatedMetricDescription(t as never, 'latency', 'Latency fallback')).toBe('Latency fallback')
+  })
+})
diff --git a/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts b/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
index d2ad11bc12..24e1b494bc 100644
--- a/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
+++ b/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
@@ -1,7 +1,9 @@
 import type { BuiltinMetricMap, MetricSelectorSection } from './types'
 import type { NodeInfo } from '@/types/evaluation'
 import { useEffect, useMemo } from 'react'
+import { useTranslation } from 'react-i18next'
 import { useAvailableEvaluationMetrics, useEvaluationNodeInfoMutation } from '@/service/use-evaluation'
+import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
 import { getEvaluationMockConfig } from '../../mock'
 import { useEvaluationResource, useEvaluationStore } from '../../store'
 import {
@@ -34,6 +36,7 @@ export const useMetricSelectorData = ({
   nodeInfoMap,
   setNodeInfoMap,
 }: UseMetricSelectorDataOptions): UseMetricSelectorDataResult => {
+  const { t } = useTranslation('evaluation')
   const config = getEvaluationMockConfig(resourceType)
   const metrics = useEvaluationResource(resourceType, resourceId).metrics
   const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
@@ -102,9 +105,10 @@
     const keyword = query.trim().toLowerCase()
 
     return resolvedMetrics.map((metric) => {
+      const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
       const metricMatches = !keyword
         || metric.label.toLowerCase().includes(keyword)
-        || metric.description.toLowerCase().includes(keyword)
+        || metricDescription.toLowerCase().includes(keyword)
       const metricNodes = nodeInfoMap[metric.id] ?? []
       const supportsNodeSelection = resourceType !== 'datasets'
       const hasNoNodeInfo = supportsNodeSelection && metricNodes.length === 0
@@ -114,7 +118,10 @@
         return null
 
       return {
-        metric,
+        metric: {
+          ...metric,
+          description: metricDescription,
+        },
         hasNoNodeInfo: true,
         visibleNodes: [] as NodeInfo[],
       }
@@ -132,12 +139,15 @@
         return null
 
       return {
-        metric,
+        metric: {
+          ...metric,
+          description: metricDescription,
+        },
         hasNoNodeInfo: false,
         visibleNodes,
       }
     }).filter(section => !!section)
-  }, [nodeInfoMap, query, resolvedMetrics, resourceType])
+  }, [nodeInfoMap, query, resolvedMetrics, resourceType, t])
 
   const toggleNodeSelection = (metricId: string, nodeInfo: NodeInfo) => {
     const addedMetric = builtinMetricMap.get(metricId)
diff --git a/web/app/components/evaluation/components/metric-selector/utils.ts b/web/app/components/evaluation/components/metric-selector/utils.ts
index 274737127e..2a6ca53158 100644
--- a/web/app/components/evaluation/components/metric-selector/utils.ts
+++ b/web/app/components/evaluation/components/metric-selector/utils.ts
@@ -1,6 +1,7 @@
 import type { MetricOption } from '../../types'
 import type { MetricVisualTone } from './types'
 import type { EvaluationTargetType, NodeInfo } from '@/types/evaluation'
+import { getDefaultMetricDescription } from '../../default-metric-descriptions'
 
 export const toEvaluationTargetType = (resourceType: 'apps' | 'snippets'): EvaluationTargetType => {
   return resourceType === 'snippets' ? 'snippets' : 'apps'
@@ -17,7 +18,7 @@ const humanizeMetricId = (metricId: string) => {
 export const buildMetricOption = (metricId: string): MetricOption => ({
   id: metricId,
   label: humanizeMetricId(metricId),
-  description: '',
+  description: getDefaultMetricDescription(metricId),
   valueType: 'number',
 })
 
diff --git a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
index e535da2204..314d3cfae8 100644
--- a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
+++ b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
@@ -6,6 +6,7 @@ import Checkbox from '@/app/components/base/checkbox'
 import Input from '@/app/components/base/input'
 import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
 import { cn } from '@/utils/classnames'
+import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
 import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'
 
 type PipelineMetricItemProps = {
@@ -26,6 +27,7 @@ const PipelineMetricItem = ({
   onThresholdChange,
 }: PipelineMetricItemProps) => {
   const { t } = useTranslation('evaluation')
+  const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
 
   return (
     
@@ -45,7 +47,7 @@ const PipelineMetricItem = ({
         )}
       />
       
-        {metric.description}
+        {metricDescription}
       
 
diff --git a/web/app/components/evaluation/default-metric-descriptions.ts b/web/app/components/evaluation/default-metric-descriptions.ts
new file mode 100644
index 0000000000..c989a45d62
--- /dev/null
+++ b/web/app/components/evaluation/default-metric-descriptions.ts
@@ -0,0 +1,79 @@
+import type { TFunction } from 'i18next'
+
+const DEFAULT_METRIC_DESCRIPTION = {
+  FAITHFULNESS: 'Measures whether every claim in the model\'s response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.',
+  ANSWER_RELEVANCY: 'Measures how well the model\'s response addresses the user\'s question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.',
+  ANSWER_CORRECTNESS: 'Measures the factual accuracy and completeness of the model\'s answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.',
+  SEMANTIC_SIMILARITY: 'Measures the cosine similarity between the model\'s response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.',
+  CONTEXT_PRECISION: 'Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.',
+  CONTEXT_RECALL: 'Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.',
+  CONTEXT_RELEVANCE: 'Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.',
+  TOOL_CORRECTNESS: 'Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent\'s tool-use strategy matches the expected behavior.',
+  TASK_COMPLETION: 'Measures whether the agent ultimately achieves the user\'s stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.',
+} as const
+
+type DefaultMetricDescription = typeof DEFAULT_METRIC_DESCRIPTION[keyof typeof DEFAULT_METRIC_DESCRIPTION]
+
+const DEFAULT_METRIC_DESCRIPTION_KEYS = {
+  FAITHFULNESS: 'metrics.builtin.description.faithfulness',
+  ANSWER_RELEVANCY: 'metrics.builtin.description.answerRelevancy',
+  ANSWER_CORRECTNESS: 'metrics.builtin.description.answerCorrectness',
+  SEMANTIC_SIMILARITY: 'metrics.builtin.description.semanticSimilarity',
+  CONTEXT_PRECISION: 'metrics.builtin.description.contextPrecision',
+  CONTEXT_RECALL: 'metrics.builtin.description.contextRecall',
+  CONTEXT_RELEVANCE: 'metrics.builtin.description.contextRelevance',
+  TOOL_CORRECTNESS: 'metrics.builtin.description.toolCorrectness',
+  TASK_COMPLETION: 'metrics.builtin.description.taskCompletion',
+} as const
+
+type DefaultMetricDescriptionKey = typeof DEFAULT_METRIC_DESCRIPTION_KEYS[keyof typeof DEFAULT_METRIC_DESCRIPTION_KEYS]
+
+const DEFAULT_METRIC_DESCRIPTIONS: Record<string, DefaultMetricDescription> = {
+  'faithfulness': DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS,
+  'answer-relevancy': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
+  'answer-correctness': DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS,
+  'semantic-similarity': DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY,
+  'context-precision': DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION,
+  'context-recall': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL,
+  'context-relevance': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE,
+  'tool-correctness': DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS,
+  'task-completion': DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION,
+  'relevance': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
+}
+
+const DEFAULT_METRIC_DESCRIPTION_I18N_KEYS: Record<string, DefaultMetricDescriptionKey> = {
+  'faithfulness': DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS,
+  'answer-relevancy': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
+  'answer-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS,
+  'semantic-similarity': DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY,
+  'context-precision': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION,
+  'context-recall': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL,
+  'context-relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE,
+  'tool-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS,
+  'task-completion': DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION,
+  'relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
+}
+
+const normalizeMetricId = (metricId: string) => metricId.trim().toLowerCase().replace(/_/g, '-')
+
+export const getDefaultMetricDescription = (metricId: string) => {
+  return DEFAULT_METRIC_DESCRIPTIONS[normalizeMetricId(metricId)] ?? ''
+}
+
+export const getDefaultMetricDescriptionI18nKey = (metricId: string) => {
+  return DEFAULT_METRIC_DESCRIPTION_I18N_KEYS[normalizeMetricId(metricId)] ?? null
+}
+
+export const getTranslatedMetricDescription = (
+  t: TFunction<'evaluation'>,
+  metricId: string,
+  fallbackDescription = '',
+) => {
+  const defaultDescription = fallbackDescription || getDefaultMetricDescription(metricId)
+  const descriptionI18nKey = getDefaultMetricDescriptionI18nKey(metricId)
+
+  if (!descriptionI18nKey)
+    return defaultDescription
+
+  return t(descriptionI18nKey, { defaultValue: defaultDescription })
+}
diff --git a/web/app/components/evaluation/mock.ts b/web/app/components/evaluation/mock.ts
index 9425672b93..61f6d57b39 100644
--- a/web/app/components/evaluation/mock.ts
+++ b/web/app/components/evaluation/mock.ts
@@ -4,6 +4,7 @@ import type {
   EvaluationResourceType,
   MetricOption,
 } from './types'
+import { getDefaultMetricDescription } from './default-metric-descriptions'
 
 const judgeModels = [
   {
@@ -27,19 +28,19 @@ const builtinMetrics: MetricOption[] = [
   {
     id: 'answer-correctness',
     label: 'Answer Correctness',
-    description: 'Compares the response with the expected answer and scores factual alignment.',
+    description: getDefaultMetricDescription('answer-correctness'),
     valueType: 'number',
   },
   {
     id: 'faithfulness',
     label: 'Faithfulness',
-    description: 'Checks whether the answer stays grounded in the retrieved evidence.',
+    description: getDefaultMetricDescription('faithfulness'),
     valueType: 'number',
   },
   {
     id: 'relevance',
     label: 'Relevance',
-    description: 'Evaluates how directly the answer addresses the original request.',
+    description: getDefaultMetricDescription('relevance'),
     valueType: 'number',
   },
   {
@@ -66,19 +67,19 @@ const pipelineBuiltinMetrics: MetricOption[] = [
   {
     id: 'context-precision',
     label: 'Context Precision',
-    description: 'Measures whether retrieved chunks stay tightly aligned to the request.',
+    description: getDefaultMetricDescription('context-precision'),
     valueType: 'number',
   },
   {
     id: 'context-recall',
     label: 'Context Recall',
-    description: 'Checks whether the retrieval result includes the evidence needed to answer.',
+    description: getDefaultMetricDescription('context-recall'),
     valueType: 'number',
   },
   {
     id: 'context-relevance',
     label: 'Context Relevance',
-    description: 'Scores how useful the retrieved context is for downstream generation.',
+    description: getDefaultMetricDescription('context-relevance'),
     valueType: 'number',
   },
 ]
diff --git a/web/app/components/evaluation/store-utils.ts b/web/app/components/evaluation/store-utils.ts
index c34e643baa..b8903442d8 100644
--- a/web/app/components/evaluation/store-utils.ts
+++ b/web/app/components/evaluation/store-utils.ts
@@ -20,6 +20,7 @@ import type {
   EvaluationRunRequest,
   NodeInfo,
 } from '@/types/evaluation'
+import { getDefaultMetricDescription } from './default-metric-descriptions'
 import { getEvaluationMockConfig } from './mock'
 import {
   buildConditionMetricOptions,
@@ -51,7 +52,7 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str
   return config.builtinMetrics.find(metric => metric.id === metricId) ?? {
     id: metricId,
     label: humanizeMetricId(metricId),
-    description: '',
+    description: getDefaultMetricDescription(metricId),
     valueType: 'number',
   }
 }
diff --git a/web/i18n/en-US/evaluation.json b/web/i18n/en-US/evaluation.json
index 57535ee836..9802bc904c 100644
--- a/web/i18n/en-US/evaluation.json
+++ b/web/i18n/en-US/evaluation.json
@@ -83,6 +83,15 @@
   "metrics.addCustom": "Add Custom Metrics",
   "metrics.addNode": "Add Node",
   "metrics.added": "Added",
+  "metrics.builtin.description.answerCorrectness": "Measures the factual accuracy and completeness of the model's answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.",
+  "metrics.builtin.description.answerRelevancy": "Measures how well the model's response addresses the user's question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.",
+  "metrics.builtin.description.contextPrecision": "Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.",
+  "metrics.builtin.description.contextRecall": "Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.",
+  "metrics.builtin.description.contextRelevance": "Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.",
+  "metrics.builtin.description.faithfulness": "Measures whether every claim in the model's response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.",
+  "metrics.builtin.description.semanticSimilarity": "Measures the cosine similarity between the model's response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.",
+  "metrics.builtin.description.taskCompletion": "Measures whether the agent ultimately achieves the user's stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.",
+  "metrics.builtin.description.toolCorrectness": "Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent's tool-use strategy matches the expected behavior.",
   "metrics.collapseNodes": "Collapse nodes",
   "metrics.custom.description": "Select an evaluation workflow and map your variables before running tests.",
   "metrics.custom.footerDescription": "Connect your published evaluation workflows",
diff --git a/web/i18n/zh-Hans/evaluation.json b/web/i18n/zh-Hans/evaluation.json
index e02558387f..0f499426e4 100644
--- a/web/i18n/zh-Hans/evaluation.json
+++ b/web/i18n/zh-Hans/evaluation.json
@@ -83,6 +83,15 @@
   "metrics.addCustom": "添加自定义指标",
   "metrics.addNode": "添加节点",
   "metrics.added": "已添加",
+  "metrics.builtin.description.answerCorrectness": "衡量模型回答相对于标准答案的事实准确性与完整性。它结合了语义相似度与关键信息覆盖情况,因此不仅关注表达含义,也关注内容是否完整准确。",
+  "metrics.builtin.description.answerRelevancy": "衡量模型回答与用户问题的贴合程度。高分表示回答始终围绕问题展开;低分表示内容偏题,或没有真正回答用户的实际问题。",
+  "metrics.builtin.description.contextPrecision": "衡量检索出的上下文片段中,实际与问题相关的内容占比(Precision)。高分表示检索流程带回的噪声较少。",
+  "metrics.builtin.description.contextRecall": "衡量标准答案所需的真实信息,有多少被检索出的上下文片段覆盖到(Recall)。高分表示检索流程没有遗漏重要的支撑证据。",
+  "metrics.builtin.description.contextRelevance": "衡量每一个被检索出的上下文片段与查询的相关程度。它与 CONTEXT_PRECISION 类似,但评估粒度在单个 chunk 层面,而不是相对于参考答案整体评估。",
+  "metrics.builtin.description.faithfulness": "衡量模型回答中的每一个陈述,是否都能从提供的检索上下文中找到依据。高分表示回答中没有幻觉内容,每一条表述都可以追溯到上下文中的某个片段。",
+  "metrics.builtin.description.semanticSimilarity": "衡量模型回答与参考答案在向量语义空间中的余弦相似度。它评估的是两段文本是否表达了相同含义,而不直接判断事实是否正确。",
+  "metrics.builtin.description.taskCompletion": "衡量 Agent 是否最终完成了用户明确提出的目标。它会整体评估推理链路、中间步骤和最终输出;高分表示任务已被完整达成。",
+  "metrics.builtin.description.toolCorrectness": "衡量 Agent 在任务执行过程中发起的工具调用是否正确,包括工具选择本身以及传入参数是否合理。高分表示 Agent 的工具使用策略符合预期行为。",
   "metrics.custom.description": "选择评测工作流并完成变量映射后即可运行测试。",
   "metrics.custom.mappingTitle": "变量映射",
   "metrics.custom.mappingWarning": "请先完成工作流选择和所有变量映射,再运行批量测试。",
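
Usage note (a minimal sketch, not part of the patch): getTranslatedMetricDescription resolves copy in this order: the locale string for metrics.builtin.description.* wins when the active language defines it; otherwise the explicit per-call fallback, and failing that the builtin English default for the normalized id, is passed to t() as defaultValue. The snippet below assumes i18next is already initialized with the 'evaluation' namespace (components would use useTranslation('evaluation') instead, as the patch does), and assumes '@/app/components/evaluation/default-metric-descriptions' resolves through the repo's '@/' alias; 'latency' is taken from the spec as an id with no builtin entry.

    import i18next from 'i18next'
    import {
      getDefaultMetricDescription,
      getTranslatedMetricDescription,
    } from '@/app/components/evaluation/default-metric-descriptions'

    // A t function bound to the 'evaluation' namespace, outside React.
    const t = i18next.getFixedT(null, 'evaluation')

    // Ids are normalized (trimmed, lowercased, '_' replaced by '-'), so
    // backend SNAKE_CASE ids and kebab-case ids hit the same table entry.
    getDefaultMetricDescription('CONTEXT_RECALL') === getDefaultMetricDescription('context-recall') // true

    // Locale copy wins when metrics.builtin.description.contextRecall exists
    // in the active language; otherwise t() returns the English defaultValue.
    const description = getTranslatedMetricDescription(t, 'context-recall')

    // Ids with no builtin entry and no i18n key return the explicit fallback.
    getTranslatedMetricDescription(t, 'latency', 'Latency fallback') // 'Latency fallback'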