diff --git a/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts b/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts
new file mode 100644
index 0000000000..9671220334
--- /dev/null
+++ b/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts
@@ -0,0 +1,35 @@
+import { getDefaultMetricDescription, getDefaultMetricDescriptionI18nKey, getTranslatedMetricDescription } from '../default-metric-descriptions'
+
+describe('default metric descriptions', () => {
+ it('should resolve descriptions for kebab-case metric ids', () => {
+ expect(getDefaultMetricDescription('context-precision')).toContain('retrieval pipeline returns little noise')
+ expect(getDefaultMetricDescription('answer-correctness')).toContain('factual accuracy and completeness')
+ })
+
+ it('should normalize snake_case metric ids from backend payloads', () => {
+ expect(getDefaultMetricDescription('CONTEXT_RECALL')).toContain('does not miss important supporting evidence')
+ expect(getDefaultMetricDescription('TOOL_CORRECTNESS')).toContain('tool-use strategy matches the expected behavior')
+ })
+
+ it('should support the legacy relevance alias', () => {
+ expect(getDefaultMetricDescription('relevance')).toContain('addresses the user\'s question')
+ })
+
+ it('should resolve i18n keys for builtin metrics', () => {
+ expect(getDefaultMetricDescriptionI18nKey('context-precision')).toBe('metrics.builtin.description.contextPrecision')
+ expect(getDefaultMetricDescriptionI18nKey('ANSWER_RELEVANCY')).toBe('metrics.builtin.description.answerRelevancy')
+ })
+
+ it('should use translated content when translation key exists', () => {
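+ // Stub translator: returns the Chinese copy for the faithfulness key, otherwise the provided defaultValue (or the key itself).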
+ const t = vi.fn((key: string, options?: { defaultValue?: string }) => {
+ if (key === 'metrics.builtin.description.faithfulness')
+ return '忠实性中文文案'
+
+ return options?.defaultValue ?? key
+ })
+
+ expect(getTranslatedMetricDescription(t as never, 'faithfulness')).toBe('忠实性中文文案')
+ expect(getTranslatedMetricDescription(t as never, 'latency', 'Latency fallback')).toBe('Latency fallback')
+ })
+})
diff --git a/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts b/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
index d2ad11bc12..24e1b494bc 100644
--- a/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
+++ b/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
@@ -1,7 +1,9 @@
import type { BuiltinMetricMap, MetricSelectorSection } from './types'
import type { NodeInfo } from '@/types/evaluation'
import { useEffect, useMemo } from 'react'
+import { useTranslation } from 'react-i18next'
import { useAvailableEvaluationMetrics, useEvaluationNodeInfoMutation } from '@/service/use-evaluation'
+import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
import { getEvaluationMockConfig } from '../../mock'
import { useEvaluationResource, useEvaluationStore } from '../../store'
import {
@@ -34,6 +36,7 @@ export const useMetricSelectorData = ({
nodeInfoMap,
setNodeInfoMap,
}: UseMetricSelectorDataOptions): UseMetricSelectorDataResult => {
+ const { t } = useTranslation('evaluation')
const config = getEvaluationMockConfig(resourceType)
const metrics = useEvaluationResource(resourceType, resourceId).metrics
const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
@@ -102,9 +105,10 @@ export const useMetricSelectorData = ({
const keyword = query.trim().toLowerCase()
return resolvedMetrics.map((metric) => {
+ const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
const metricMatches = !keyword
|| metric.label.toLowerCase().includes(keyword)
- || metric.description.toLowerCase().includes(keyword)
+ || metricDescription.toLowerCase().includes(keyword)
const metricNodes = nodeInfoMap[metric.id] ?? []
const supportsNodeSelection = resourceType !== 'datasets'
const hasNoNodeInfo = supportsNodeSelection && metricNodes.length === 0
@@ -114,7 +118,10 @@ export const useMetricSelectorData = ({
return null
return {
- metric,
+ metric: {
+ ...metric,
+ description: metricDescription,
+ },
hasNoNodeInfo: true,
visibleNodes: [] as NodeInfo[],
}
@@ -132,12 +139,15 @@ export const useMetricSelectorData = ({
return null
return {
- metric,
+ metric: {
+ ...metric,
+ description: metricDescription,
+ },
hasNoNodeInfo: false,
visibleNodes,
}
}).filter(section => !!section)
- }, [nodeInfoMap, query, resolvedMetrics, resourceType])
+ }, [nodeInfoMap, query, resolvedMetrics, resourceType, t])
const toggleNodeSelection = (metricId: string, nodeInfo: NodeInfo) => {
const addedMetric = builtinMetricMap.get(metricId)
diff --git a/web/app/components/evaluation/components/metric-selector/utils.ts b/web/app/components/evaluation/components/metric-selector/utils.ts
index 274737127e..2a6ca53158 100644
--- a/web/app/components/evaluation/components/metric-selector/utils.ts
+++ b/web/app/components/evaluation/components/metric-selector/utils.ts
@@ -1,6 +1,7 @@
import type { MetricOption } from '../../types'
import type { MetricVisualTone } from './types'
import type { EvaluationTargetType, NodeInfo } from '@/types/evaluation'
+import { getDefaultMetricDescription } from '../../default-metric-descriptions'
export const toEvaluationTargetType = (resourceType: 'apps' | 'snippets'): EvaluationTargetType => {
return resourceType === 'snippets' ? 'snippets' : 'apps'
@@ -17,7 +18,7 @@ const humanizeMetricId = (metricId: string) => {
export const buildMetricOption = (metricId: string): MetricOption => ({
id: metricId,
label: humanizeMetricId(metricId),
- description: '',
+ description: getDefaultMetricDescription(metricId),
valueType: 'number',
})
diff --git a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
index e535da2204..314d3cfae8 100644
--- a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
+++ b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
@@ -6,6 +6,7 @@ import Checkbox from '@/app/components/base/checkbox'
import Input from '@/app/components/base/input'
import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
import { cn } from '@/utils/classnames'
+import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'
type PipelineMetricItemProps = {
@@ -26,6 +27,7 @@ const PipelineMetricItem = ({
onThresholdChange,
}: PipelineMetricItemProps) => {
const { t } = useTranslation('evaluation')
+ const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
return (
@@ -45,7 +47,7 @@ const PipelineMetricItem = ({
)}
/>
- {metric.description}
+ {metricDescription}
diff --git a/web/app/components/evaluation/default-metric-descriptions.ts b/web/app/components/evaluation/default-metric-descriptions.ts
new file mode 100644
index 0000000000..c989a45d62
--- /dev/null
+++ b/web/app/components/evaluation/default-metric-descriptions.ts
@@ -0,0 +1,84 @@
+import type { TFunction } from 'i18next'
+
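+// English fallback copy for the built-in evaluation metrics, keyed by canonical metric name.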
+const DEFAULT_METRIC_DESCRIPTION = {
+ FAITHFULNESS: 'Measures whether every claim in the model\'s response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.',
+ ANSWER_RELEVANCY: 'Measures how well the model\'s response addresses the user\'s question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.',
+ ANSWER_CORRECTNESS: 'Measures the factual accuracy and completeness of the model\'s answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.',
+ SEMANTIC_SIMILARITY: 'Measures the cosine similarity between the model\'s response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.',
+ CONTEXT_PRECISION: 'Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.',
+ CONTEXT_RECALL: 'Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.',
+ CONTEXT_RELEVANCE: 'Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.',
+ TOOL_CORRECTNESS: 'Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent\'s tool-use strategy matches the expected behavior.',
+ TASK_COMPLETION: 'Measures whether the agent ultimately achieves the user\'s stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.',
+} as const
+
+type DefaultMetricDescription = typeof DEFAULT_METRIC_DESCRIPTION[keyof typeof DEFAULT_METRIC_DESCRIPTION]
+
+const DEFAULT_METRIC_DESCRIPTION_KEYS = {
+ FAITHFULNESS: 'metrics.builtin.description.faithfulness',
+ ANSWER_RELEVANCY: 'metrics.builtin.description.answerRelevancy',
+ ANSWER_CORRECTNESS: 'metrics.builtin.description.answerCorrectness',
+ SEMANTIC_SIMILARITY: 'metrics.builtin.description.semanticSimilarity',
+ CONTEXT_PRECISION: 'metrics.builtin.description.contextPrecision',
+ CONTEXT_RECALL: 'metrics.builtin.description.contextRecall',
+ CONTEXT_RELEVANCE: 'metrics.builtin.description.contextRelevance',
+ TOOL_CORRECTNESS: 'metrics.builtin.description.toolCorrectness',
+ TASK_COMPLETION: 'metrics.builtin.description.taskCompletion',
+} as const
+
+type DefaultMetricDescriptionKey = typeof DEFAULT_METRIC_DESCRIPTION_KEYS[keyof typeof DEFAULT_METRIC_DESCRIPTION_KEYS]
+
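+// Descriptions keyed by normalized (kebab-case) metric id; 'relevance' is kept as a legacy alias for answer relevancy.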
+const DEFAULT_METRIC_DESCRIPTIONS: Record<string, DefaultMetricDescription> = {
+ 'faithfulness': DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS,
+ 'answer-relevancy': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
+ 'answer-correctness': DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS,
+ 'semantic-similarity': DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY,
+ 'context-precision': DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION,
+ 'context-recall': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL,
+ 'context-relevance': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE,
+ 'tool-correctness': DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS,
+ 'task-completion': DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION,
+ 'relevance': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
+}
+
+const DEFAULT_METRIC_DESCRIPTION_I18N_KEYS: Record<string, DefaultMetricDescriptionKey> = {
+ 'faithfulness': DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS,
+ 'answer-relevancy': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
+ 'answer-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS,
+ 'semantic-similarity': DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY,
+ 'context-precision': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION,
+ 'context-recall': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL,
+ 'context-relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE,
+ 'tool-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS,
+ 'task-completion': DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION,
+ 'relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
+}
+
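+// Normalizes ids coming from backend payloads (e.g. CONTEXT_RECALL) to the kebab-case keys used above.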
+const normalizeMetricId = (metricId: string) => metricId.trim().toLowerCase().replace(/_/g, '-')
+
+export const getDefaultMetricDescription = (metricId: string) => {
+ return DEFAULT_METRIC_DESCRIPTIONS[normalizeMetricId(metricId)] ?? ''
+}
+
+export const getDefaultMetricDescriptionI18nKey = (metricId: string) => {
+ return DEFAULT_METRIC_DESCRIPTION_I18N_KEYS[normalizeMetricId(metricId)] ?? null
+}
+
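+// Returns the translated description when the metric has an i18n key and a translation exists;
+// otherwise falls back to the caller-supplied description or the built-in English copy.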
+export const getTranslatedMetricDescription = (
+ t: TFunction<'evaluation'>,
+ metricId: string,
+ fallbackDescription = '',
+) => {
+ const defaultDescription = fallbackDescription || getDefaultMetricDescription(metricId)
+ const descriptionI18nKey = getDefaultMetricDescriptionI18nKey(metricId)
+
+ if (!descriptionI18nKey)
+ return defaultDescription
+
+ return t(descriptionI18nKey, { defaultValue: defaultDescription })
+}
diff --git a/web/app/components/evaluation/mock.ts b/web/app/components/evaluation/mock.ts
index 9425672b93..61f6d57b39 100644
--- a/web/app/components/evaluation/mock.ts
+++ b/web/app/components/evaluation/mock.ts
@@ -4,6 +4,7 @@ import type {
EvaluationResourceType,
MetricOption,
} from './types'
+import { getDefaultMetricDescription } from './default-metric-descriptions'
const judgeModels = [
{
@@ -27,19 +28,19 @@ const builtinMetrics: MetricOption[] = [
{
id: 'answer-correctness',
label: 'Answer Correctness',
- description: 'Compares the response with the expected answer and scores factual alignment.',
+ description: getDefaultMetricDescription('answer-correctness'),
valueType: 'number',
},
{
id: 'faithfulness',
label: 'Faithfulness',
- description: 'Checks whether the answer stays grounded in the retrieved evidence.',
+ description: getDefaultMetricDescription('faithfulness'),
valueType: 'number',
},
{
id: 'relevance',
label: 'Relevance',
- description: 'Evaluates how directly the answer addresses the original request.',
+ description: getDefaultMetricDescription('relevance'),
valueType: 'number',
},
{
@@ -66,19 +67,19 @@ const pipelineBuiltinMetrics: MetricOption[] = [
{
id: 'context-precision',
label: 'Context Precision',
- description: 'Measures whether retrieved chunks stay tightly aligned to the request.',
+ description: getDefaultMetricDescription('context-precision'),
valueType: 'number',
},
{
id: 'context-recall',
label: 'Context Recall',
- description: 'Checks whether the retrieval result includes the evidence needed to answer.',
+ description: getDefaultMetricDescription('context-recall'),
valueType: 'number',
},
{
id: 'context-relevance',
label: 'Context Relevance',
- description: 'Scores how useful the retrieved context is for downstream generation.',
+ description: getDefaultMetricDescription('context-relevance'),
valueType: 'number',
},
]
diff --git a/web/app/components/evaluation/store-utils.ts b/web/app/components/evaluation/store-utils.ts
index c34e643baa..b8903442d8 100644
--- a/web/app/components/evaluation/store-utils.ts
+++ b/web/app/components/evaluation/store-utils.ts
@@ -20,6 +20,7 @@ import type {
EvaluationRunRequest,
NodeInfo,
} from '@/types/evaluation'
+import { getDefaultMetricDescription } from './default-metric-descriptions'
import { getEvaluationMockConfig } from './mock'
import {
buildConditionMetricOptions,
@@ -51,7 +52,7 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str
return config.builtinMetrics.find(metric => metric.id === metricId) ?? {
id: metricId,
label: humanizeMetricId(metricId),
- description: '',
+ description: getDefaultMetricDescription(metricId),
valueType: 'number',
}
}
diff --git a/web/i18n/en-US/evaluation.json b/web/i18n/en-US/evaluation.json
index 57535ee836..9802bc904c 100644
--- a/web/i18n/en-US/evaluation.json
+++ b/web/i18n/en-US/evaluation.json
@@ -83,6 +83,15 @@
"metrics.addCustom": "Add Custom Metrics",
"metrics.addNode": "Add Node",
"metrics.added": "Added",
+ "metrics.builtin.description.answerCorrectness": "Measures the factual accuracy and completeness of the model's answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.",
+ "metrics.builtin.description.answerRelevancy": "Measures how well the model's response addresses the user's question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.",
+ "metrics.builtin.description.contextPrecision": "Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.",
+ "metrics.builtin.description.contextRecall": "Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.",
+ "metrics.builtin.description.contextRelevance": "Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.",
+ "metrics.builtin.description.faithfulness": "Measures whether every claim in the model's response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.",
+ "metrics.builtin.description.semanticSimilarity": "Measures the cosine similarity between the model's response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.",
+ "metrics.builtin.description.taskCompletion": "Measures whether the agent ultimately achieves the user's stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.",
+ "metrics.builtin.description.toolCorrectness": "Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent's tool-use strategy matches the expected behavior.",
"metrics.collapseNodes": "Collapse nodes",
"metrics.custom.description": "Select an evaluation workflow and map your variables before running tests.",
"metrics.custom.footerDescription": "Connect your published evaluation workflows",
diff --git a/web/i18n/zh-Hans/evaluation.json b/web/i18n/zh-Hans/evaluation.json
index e02558387f..0f499426e4 100644
--- a/web/i18n/zh-Hans/evaluation.json
+++ b/web/i18n/zh-Hans/evaluation.json
@@ -83,6 +83,15 @@
"metrics.addCustom": "添加自定义指标",
"metrics.addNode": "添加节点",
"metrics.added": "已添加",
+ "metrics.builtin.description.answerCorrectness": "衡量模型回答相对于标准答案的事实准确性与完整性。它结合了语义相似度与关键信息覆盖情况,因此不仅关注表达含义,也关注内容是否完整准确。",
+ "metrics.builtin.description.answerRelevancy": "衡量模型回答与用户问题的贴合程度。高分表示回答始终围绕问题展开;低分表示内容偏题,或没有真正回答用户的实际问题。",
+ "metrics.builtin.description.contextPrecision": "衡量检索出的上下文片段中,实际与问题相关的内容占比(Precision)。高分表示检索流程带回的噪声较少。",
+ "metrics.builtin.description.contextRecall": "衡量标准答案所需的真实信息,有多少被检索出的上下文片段覆盖到(Recall)。高分表示检索流程没有遗漏重要的支撑证据。",
+ "metrics.builtin.description.contextRelevance": "衡量每一个被检索出的上下文片段与查询的相关程度。它与 CONTEXT_PRECISION 类似,但评估粒度在单个 chunk 层面,而不是相对于参考答案整体评估。",
+ "metrics.builtin.description.faithfulness": "衡量模型回答中的每一个陈述,是否都能从提供的检索上下文中找到依据。高分表示回答中没有幻觉内容,每一条表述都可以追溯到上下文中的某个片段。",
+ "metrics.builtin.description.semanticSimilarity": "衡量模型回答与参考答案在向量语义空间中的余弦相似度。它评估的是两段文本是否表达了相同含义,而不直接判断事实是否正确。",
+ "metrics.builtin.description.taskCompletion": "衡量 Agent 是否最终完成了用户明确提出的目标。它会整体评估推理链路、中间步骤和最终输出;高分表示任务已被完整达成。",
+ "metrics.builtin.description.toolCorrectness": "衡量 Agent 在任务执行过程中发起的工具调用是否正确,包括工具选择本身以及传入参数是否合理。高分表示 Agent 的工具使用策略符合预期行为。",
"metrics.custom.description": "选择评测工作流并完成变量映射后即可运行测试。",
"metrics.custom.mappingTitle": "变量映射",
"metrics.custom.mappingWarning": "请先完成工作流选择和所有变量映射,再运行批量测试。",