From f6047aafe8a673d8268fd2c76e0684cab75c6116 Mon Sep 17 00:00:00 2001
From: JzoNg
Date: Mon, 13 Apr 2026 14:13:04 +0800
Subject: [PATCH] feat(web): metric descriptions

---
 .../default-metric-descriptions.spec.ts       | 34 ++++++++
 .../use-metric-selector-data.ts               | 18 ++++-
 .../components/metric-selector/utils.ts       |  3 +-
 .../pipeline/pipeline-metric-item.tsx         |  4 +-
 .../evaluation/default-metric-descriptions.ts | 79 +++++++++++++++++++
 web/app/components/evaluation/mock.ts         | 13 +--
 web/app/components/evaluation/store-utils.ts  |  3 +-
 web/i18n/en-US/evaluation.json                |  9 +++
 web/i18n/zh-Hans/evaluation.json              |  9 +++
 9 files changed, 159 insertions(+), 13 deletions(-)
 create mode 100644 web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts
 create mode 100644 web/app/components/evaluation/default-metric-descriptions.ts

diff --git a/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts b/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts
new file mode 100644
index 0000000000..9671220334
--- /dev/null
+++ b/web/app/components/evaluation/__tests__/default-metric-descriptions.spec.ts
@@ -0,0 +1,34 @@
+import { getDefaultMetricDescription, getDefaultMetricDescriptionI18nKey, getTranslatedMetricDescription } from '../default-metric-descriptions'
+
+describe('default metric descriptions', () => {
+  it('should resolve descriptions for kebab-case metric ids', () => {
+    expect(getDefaultMetricDescription('context-precision')).toContain('retrieval pipeline returns little noise')
+    expect(getDefaultMetricDescription('answer-correctness')).toContain('factual accuracy and completeness')
+  })
+
+  it('should normalize snake_case metric ids from backend payloads', () => {
+    expect(getDefaultMetricDescription('CONTEXT_RECALL')).toContain('does not miss important supporting evidence')
+    expect(getDefaultMetricDescription('TOOL_CORRECTNESS')).toContain('tool-use strategy matches the expected behavior')
+  })
+
+  it('should support the legacy relevance alias', () => {
+    expect(getDefaultMetricDescription('relevance')).toContain('addresses the user\'s question')
+  })
+
+  it('should resolve i18n keys for builtin metrics', () => {
+    expect(getDefaultMetricDescriptionI18nKey('context-precision')).toBe('metrics.builtin.description.contextPrecision')
+    expect(getDefaultMetricDescriptionI18nKey('ANSWER_RELEVANCY')).toBe('metrics.builtin.description.answerRelevancy')
+  })
+
+  it('should use translated content when translation key exists', () => {
+    const t = vi.fn((key: string, options?: { defaultValue?: string }) => {
+      if (key === 'metrics.builtin.description.faithfulness')
+        return '忠实性中文文案'
+
+      return options?.defaultValue ?? key
+    })
+
+    expect(getTranslatedMetricDescription(t as never, 'faithfulness')).toBe('忠实性中文文案')
+    expect(getTranslatedMetricDescription(t as never, 'latency', 'Latency fallback')).toBe('Latency fallback')
+  })
+})
diff --git a/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts b/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
index d2ad11bc12..24e1b494bc 100644
--- a/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
+++ b/web/app/components/evaluation/components/metric-selector/use-metric-selector-data.ts
@@ -1,7 +1,9 @@
 import type { BuiltinMetricMap, MetricSelectorSection } from './types'
 import type { NodeInfo } from '@/types/evaluation'
 import { useEffect, useMemo } from 'react'
+import { useTranslation } from 'react-i18next'
 import { useAvailableEvaluationMetrics, useEvaluationNodeInfoMutation } from '@/service/use-evaluation'
+import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
 import { getEvaluationMockConfig } from '../../mock'
 import { useEvaluationResource, useEvaluationStore } from '../../store'
 import {
@@ -34,6 +36,7 @@ export const useMetricSelectorData = ({
   nodeInfoMap,
   setNodeInfoMap,
 }: UseMetricSelectorDataOptions): UseMetricSelectorDataResult => {
+  const { t } = useTranslation('evaluation')
   const config = getEvaluationMockConfig(resourceType)
   const metrics = useEvaluationResource(resourceType, resourceId).metrics
   const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
@@ -102,9 +105,10 @@
     const keyword = query.trim().toLowerCase()
 
     return resolvedMetrics.map((metric) => {
+      const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
       const metricMatches = !keyword
         || metric.label.toLowerCase().includes(keyword)
-        || metric.description.toLowerCase().includes(keyword)
+        || metricDescription.toLowerCase().includes(keyword)
       const metricNodes = nodeInfoMap[metric.id] ?? []
       const supportsNodeSelection = resourceType !== 'datasets'
       const hasNoNodeInfo = supportsNodeSelection && metricNodes.length === 0
@@ -114,7 +118,10 @@
         return null
 
       return {
-        metric,
+        metric: {
+          ...metric,
+          description: metricDescription,
+        },
         hasNoNodeInfo: true,
         visibleNodes: [] as NodeInfo[],
       }
@@ -132,12 +139,15 @@
         return null
 
       return {
-        metric,
+        metric: {
+          ...metric,
+          description: metricDescription,
+        },
         hasNoNodeInfo: false,
         visibleNodes,
       }
     }).filter(section => !!section)
-  }, [nodeInfoMap, query, resolvedMetrics, resourceType])
+  }, [nodeInfoMap, query, resolvedMetrics, resourceType, t])
 
   const toggleNodeSelection = (metricId: string, nodeInfo: NodeInfo) => {
     const addedMetric = builtinMetricMap.get(metricId)
diff --git a/web/app/components/evaluation/components/metric-selector/utils.ts b/web/app/components/evaluation/components/metric-selector/utils.ts
index 274737127e..2a6ca53158 100644
--- a/web/app/components/evaluation/components/metric-selector/utils.ts
+++ b/web/app/components/evaluation/components/metric-selector/utils.ts
@@ -1,6 +1,7 @@
 import type { MetricOption } from '../../types'
 import type { MetricVisualTone } from './types'
 import type { EvaluationTargetType, NodeInfo } from '@/types/evaluation'
+import { getDefaultMetricDescription } from '../../default-metric-descriptions'
 
 export const toEvaluationTargetType = (resourceType: 'apps' | 'snippets'): EvaluationTargetType => {
   return resourceType === 'snippets' ? 'snippets' : 'apps'
@@ -17,7 +18,7 @@ const humanizeMetricId = (metricId: string) => {
 export const buildMetricOption = (metricId: string): MetricOption => ({
   id: metricId,
   label: humanizeMetricId(metricId),
-  description: '',
+  description: getDefaultMetricDescription(metricId),
   valueType: 'number',
 })
 
diff --git a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
index e535da2204..314d3cfae8 100644
--- a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
+++ b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
@@ -6,6 +6,7 @@ import Checkbox from '@/app/components/base/checkbox'
 import Input from '@/app/components/base/input'
 import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
 import { cn } from '@/utils/classnames'
+import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
 import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'
 
 type PipelineMetricItemProps = {
@@ -26,6 +27,7 @@ const PipelineMetricItem = ({
   onThresholdChange,
 }: PipelineMetricItemProps) => {
   const { t } = useTranslation('evaluation')
+  const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
 
   return (
     
@@ -45,7 +47,7 @@ const PipelineMetricItem = ({
         )}
       />
       
-        {metric.description}
+        {metricDescription}
       
 
diff --git a/web/app/components/evaluation/default-metric-descriptions.ts b/web/app/components/evaluation/default-metric-descriptions.ts
new file mode 100644
index 0000000000..c989a45d62
--- /dev/null
+++ b/web/app/components/evaluation/default-metric-descriptions.ts
@@ -0,0 +1,79 @@
+import type { TFunction } from 'i18next'
+
+const DEFAULT_METRIC_DESCRIPTION = {
+  FAITHFULNESS: 'Measures whether every claim in the model\'s response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.',
+  ANSWER_RELEVANCY: 'Measures how well the model\'s response addresses the user\'s question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.',
+  ANSWER_CORRECTNESS: 'Measures the factual accuracy and completeness of the model\'s answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.',
+  SEMANTIC_SIMILARITY: 'Measures the cosine similarity between the model\'s response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.',
+  CONTEXT_PRECISION: 'Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.',
+  CONTEXT_RECALL: 'Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.',
+  CONTEXT_RELEVANCE: 'Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.',
+  TOOL_CORRECTNESS: 'Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent\'s tool-use strategy matches the expected behavior.',
+  TASK_COMPLETION: 'Measures whether the agent ultimately achieves the user\'s stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.',
+} as const
+
+type DefaultMetricDescription = typeof DEFAULT_METRIC_DESCRIPTION[keyof typeof DEFAULT_METRIC_DESCRIPTION]
+
+const DEFAULT_METRIC_DESCRIPTION_KEYS = {
+  FAITHFULNESS: 'metrics.builtin.description.faithfulness',
+  ANSWER_RELEVANCY: 'metrics.builtin.description.answerRelevancy',
+  ANSWER_CORRECTNESS: 'metrics.builtin.description.answerCorrectness',
+  SEMANTIC_SIMILARITY: 'metrics.builtin.description.semanticSimilarity',
+  CONTEXT_PRECISION: 'metrics.builtin.description.contextPrecision',
+  CONTEXT_RECALL: 'metrics.builtin.description.contextRecall',
+  CONTEXT_RELEVANCE: 'metrics.builtin.description.contextRelevance',
+  TOOL_CORRECTNESS: 'metrics.builtin.description.toolCorrectness',
+  TASK_COMPLETION: 'metrics.builtin.description.taskCompletion',
+} as const
+
+type DefaultMetricDescriptionKey = typeof DEFAULT_METRIC_DESCRIPTION_KEYS[keyof typeof DEFAULT_METRIC_DESCRIPTION_KEYS]
+
+const DEFAULT_METRIC_DESCRIPTIONS: Record<string, DefaultMetricDescription> = {
+  'faithfulness': DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS,
+  'answer-relevancy': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
+  'answer-correctness': DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS,
+  'semantic-similarity': DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY,
+  'context-precision': DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION,
+  'context-recall': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL,
+  'context-relevance': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE,
+  'tool-correctness': DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS,
+  'task-completion': DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION,
+  'relevance': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
+}
+
+const DEFAULT_METRIC_DESCRIPTION_I18N_KEYS: Record<string, DefaultMetricDescriptionKey> = {
+  'faithfulness': DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS,
+  'answer-relevancy': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
+  'answer-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS,
+  'semantic-similarity': DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY,
+  'context-precision': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION,
+  'context-recall': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL,
+  'context-relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE,
+  'tool-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS,
+  'task-completion': DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION,
+  'relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
+}
+
+const normalizeMetricId = (metricId: string) => metricId.trim().toLowerCase().replace(/_/g, '-')
+
+export const getDefaultMetricDescription = (metricId: string) => {
+  return DEFAULT_METRIC_DESCRIPTIONS[normalizeMetricId(metricId)] ?? ''
+}
+
+export const getDefaultMetricDescriptionI18nKey = (metricId: string) => {
+  return DEFAULT_METRIC_DESCRIPTION_I18N_KEYS[normalizeMetricId(metricId)] ?? null
+}
+
+export const getTranslatedMetricDescription = (
+  t: TFunction<'evaluation'>,
+  metricId: string,
+  fallbackDescription = '',
+) => {
+  const defaultDescription = fallbackDescription || getDefaultMetricDescription(metricId)
+  const descriptionI18nKey = getDefaultMetricDescriptionI18nKey(metricId)
+
+  if (!descriptionI18nKey)
+    return defaultDescription
+
+  return t(descriptionI18nKey, { defaultValue: defaultDescription })
+}
diff --git a/web/app/components/evaluation/mock.ts b/web/app/components/evaluation/mock.ts
index 9425672b93..61f6d57b39 100644
--- a/web/app/components/evaluation/mock.ts
+++ b/web/app/components/evaluation/mock.ts
@@ -4,6 +4,7 @@ import type {
   EvaluationResourceType,
   MetricOption,
 } from './types'
+import { getDefaultMetricDescription } from './default-metric-descriptions'
 
 const judgeModels = [
   {
@@ -27,19 +28,19 @@ const builtinMetrics: MetricOption[] = [
   {
     id: 'answer-correctness',
     label: 'Answer Correctness',
-    description: 'Compares the response with the expected answer and scores factual alignment.',
+    description: getDefaultMetricDescription('answer-correctness'),
     valueType: 'number',
   },
   {
     id: 'faithfulness',
     label: 'Faithfulness',
-    description: 'Checks whether the answer stays grounded in the retrieved evidence.',
+    description: getDefaultMetricDescription('faithfulness'),
     valueType: 'number',
   },
   {
     id: 'relevance',
     label: 'Relevance',
-    description: 'Evaluates how directly the answer addresses the original request.',
+    description: getDefaultMetricDescription('relevance'),
     valueType: 'number',
   },
   {
@@ -66,19 +67,19 @@ const pipelineBuiltinMetrics: MetricOption[] = [
   {
     id: 'context-precision',
     label: 'Context Precision',
-    description: 'Measures whether retrieved chunks stay tightly aligned to the request.',
+    description: getDefaultMetricDescription('context-precision'),
     valueType: 'number',
   },
   {
     id: 'context-recall',
     label: 'Context Recall',
-    description: 'Checks whether the retrieval result includes the evidence needed to answer.',
+    description: getDefaultMetricDescription('context-recall'),
     valueType: 'number',
   },
   {
     id: 'context-relevance',
     label: 'Context Relevance',
-    description: 'Scores how useful the retrieved context is for downstream generation.',
+    description: getDefaultMetricDescription('context-relevance'),
     valueType: 'number',
   },
 ]
diff --git a/web/app/components/evaluation/store-utils.ts b/web/app/components/evaluation/store-utils.ts
index c34e643baa..b8903442d8 100644
--- a/web/app/components/evaluation/store-utils.ts
+++ b/web/app/components/evaluation/store-utils.ts
@@ -20,6 +20,7 @@ import type {
   EvaluationRunRequest,
   NodeInfo,
 } from '@/types/evaluation'
+import { getDefaultMetricDescription } from './default-metric-descriptions'
 import { getEvaluationMockConfig } from './mock'
 import {
   buildConditionMetricOptions,
@@ -51,7 +52,7 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str
   return config.builtinMetrics.find(metric => metric.id === metricId) ?? {
     id: metricId,
     label: humanizeMetricId(metricId),
-    description: '',
+    description: getDefaultMetricDescription(metricId),
     valueType: 'number',
   }
 }
diff --git a/web/i18n/en-US/evaluation.json b/web/i18n/en-US/evaluation.json
index 57535ee836..9802bc904c 100644
--- a/web/i18n/en-US/evaluation.json
+++ b/web/i18n/en-US/evaluation.json
@@ -83,6 +83,15 @@
   "metrics.addCustom": "Add Custom Metrics",
   "metrics.addNode": "Add Node",
   "metrics.added": "Added",
+  "metrics.builtin.description.answerCorrectness": "Measures the factual accuracy and completeness of the model's answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.",
+  "metrics.builtin.description.answerRelevancy": "Measures how well the model's response addresses the user's question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.",
+  "metrics.builtin.description.contextPrecision": "Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.",
+  "metrics.builtin.description.contextRecall": "Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.",
+  "metrics.builtin.description.contextRelevance": "Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.",
+  "metrics.builtin.description.faithfulness": "Measures whether every claim in the model's response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.",
+  "metrics.builtin.description.semanticSimilarity": "Measures the cosine similarity between the model's response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.",
+  "metrics.builtin.description.taskCompletion": "Measures whether the agent ultimately achieves the user's stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.",
+  "metrics.builtin.description.toolCorrectness": "Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent's tool-use strategy matches the expected behavior.",
   "metrics.collapseNodes": "Collapse nodes",
   "metrics.custom.description": "Select an evaluation workflow and map your variables before running tests.",
   "metrics.custom.footerDescription": "Connect your published evaluation workflows",
diff --git a/web/i18n/zh-Hans/evaluation.json b/web/i18n/zh-Hans/evaluation.json
index e02558387f..0f499426e4 100644
--- a/web/i18n/zh-Hans/evaluation.json
+++ b/web/i18n/zh-Hans/evaluation.json
@@ -83,6 +83,15 @@
   "metrics.addCustom": "添加自定义指标",
   "metrics.addNode": "添加节点",
   "metrics.added": "已添加",
+  "metrics.builtin.description.answerCorrectness": "衡量模型回答相对于标准答案的事实准确性与完整性。它结合了语义相似度与关键信息覆盖情况,因此不仅关注表达含义,也关注内容是否完整准确。",
+  "metrics.builtin.description.answerRelevancy": "衡量模型回答与用户问题的贴合程度。高分表示回答始终围绕问题展开;低分表示内容偏题,或没有真正回答用户的实际问题。",
+  "metrics.builtin.description.contextPrecision": "衡量检索出的上下文片段中,实际与问题相关的内容占比(Precision)。高分表示检索流程带回的噪声较少。",
+  "metrics.builtin.description.contextRecall": "衡量标准答案所需的真实信息,有多少被检索出的上下文片段覆盖到(Recall)。高分表示检索流程没有遗漏重要的支撑证据。",
+  "metrics.builtin.description.contextRelevance": "衡量每一个被检索出的上下文片段与查询的相关程度。它与 CONTEXT_PRECISION 类似,但评估粒度在单个 chunk 层面,而不是相对于参考答案整体评估。",
+  "metrics.builtin.description.faithfulness": "衡量模型回答中的每一个陈述,是否都能从提供的检索上下文中找到依据。高分表示回答中没有幻觉内容,每一条表述都可以追溯到上下文中的某个片段。",
+  "metrics.builtin.description.semanticSimilarity": "衡量模型回答与参考答案在向量语义空间中的余弦相似度。它评估的是两段文本是否表达了相同含义,而不直接判断事实是否正确。",
+  "metrics.builtin.description.taskCompletion": "衡量 Agent 是否最终完成了用户明确提出的目标。它会整体评估推理链路、中间步骤和最终输出;高分表示任务已被完整达成。",
+  "metrics.builtin.description.toolCorrectness": "衡量 Agent 在任务执行过程中发起的工具调用是否正确,包括工具选择本身以及传入参数是否合理。高分表示 Agent 的工具使用策略符合预期行为。",
   "metrics.custom.description": "选择评测工作流并完成变量映射后即可运行测试。",
   "metrics.custom.mappingTitle": "变量映射",
   "metrics.custom.mappingWarning": "请先完成工作流选择和所有变量映射,再运行批量测试。",
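
Usage note (a minimal sketch, not part of the patch): getTranslatedMetricDescription resolves copy in this order: the locale string for metrics.builtin.description.* wins when the active language defines it; otherwise the explicit per-call fallback, and failing that the builtin English default for the normalized id, is passed to t() as defaultValue. The snippet below assumes i18next is already initialized with the 'evaluation' namespace (components would use useTranslation('evaluation') instead, as the patch does), and assumes '@/app/components/evaluation/default-metric-descriptions' resolves through the repo's '@/' alias; 'latency' is taken from the spec as an id with no builtin entry.

    import i18next from 'i18next'
    import {
      getDefaultMetricDescription,
      getTranslatedMetricDescription,
    } from '@/app/components/evaluation/default-metric-descriptions'

    // A t function bound to the 'evaluation' namespace, outside React.
    const t = i18next.getFixedT(null, 'evaluation')

    // Ids are normalized (trimmed, lowercased, '_' replaced by '-'), so
    // backend SNAKE_CASE ids and kebab-case ids hit the same table entry.
    getDefaultMetricDescription('CONTEXT_RECALL') === getDefaultMetricDescription('context-recall') // true

    // Locale copy wins when metrics.builtin.description.contextRecall exists
    // in the active language; otherwise t() returns the English defaultValue.
    const description = getTranslatedMetricDescription(t, 'context-recall')

    // Ids with no builtin entry and no i18n key return the explicit fallback.
    getTranslatedMetricDescription(t, 'latency', 'Latency fallback') // 'Latency fallback'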