feat(web): add default descriptions for builtin evaluation metrics

JzoNg
2026-04-13 14:13:04 +08:00
parent dce5715982
commit f6047aafe8
9 changed files with 159 additions and 13 deletions

View File

@@ -0,0 +1,35 @@
import { describe, expect, it, vi } from 'vitest'
import { getDefaultMetricDescription, getDefaultMetricDescriptionI18nKey, getTranslatedMetricDescription } from '../default-metric-descriptions'
describe('default metric descriptions', () => {
it('should resolve descriptions for kebab-case metric ids', () => {
expect(getDefaultMetricDescription('context-precision')).toContain('retrieval pipeline returns little noise')
expect(getDefaultMetricDescription('answer-correctness')).toContain('factual accuracy and completeness')
})
it('should normalize uppercase snake_case metric ids from backend payloads', () => {
expect(getDefaultMetricDescription('CONTEXT_RECALL')).toContain('does not miss important supporting evidence')
expect(getDefaultMetricDescription('TOOL_CORRECTNESS')).toContain('tool-use strategy matches the expected behavior')
})
it('should support the legacy relevance alias', () => {
expect(getDefaultMetricDescription('relevance')).toContain('addresses the user\'s question')
})
it('should resolve i18n keys for builtin metrics', () => {
expect(getDefaultMetricDescriptionI18nKey('context-precision')).toBe('metrics.builtin.description.contextPrecision')
expect(getDefaultMetricDescriptionI18nKey('ANSWER_RELEVANCY')).toBe('metrics.builtin.description.answerRelevancy')
})
it('should use translated content when translation key exists', () => {
const t = vi.fn((key: string, options?: { defaultValue?: string }) => {
if (key === 'metrics.builtin.description.faithfulness')
return '忠实性中文文案'
return options?.defaultValue ?? key
})
expect(getTranslatedMetricDescription(t as never, 'faithfulness')).toBe('忠实性中文文案')
expect(getTranslatedMetricDescription(t as never, 'latency', 'Latency fallback')).toBe('Latency fallback')
})
})

View File

@@ -1,7 +1,9 @@
import type { BuiltinMetricMap, MetricSelectorSection } from './types'
import type { NodeInfo } from '@/types/evaluation'
import { useEffect, useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import { useAvailableEvaluationMetrics, useEvaluationNodeInfoMutation } from '@/service/use-evaluation'
import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
import { getEvaluationMockConfig } from '../../mock'
import { useEvaluationResource, useEvaluationStore } from '../../store'
import {
@@ -34,6 +36,7 @@ export const useMetricSelectorData = ({
nodeInfoMap,
setNodeInfoMap,
}: UseMetricSelectorDataOptions): UseMetricSelectorDataResult => {
const { t } = useTranslation('evaluation')
const config = getEvaluationMockConfig(resourceType)
const metrics = useEvaluationResource(resourceType, resourceId).metrics
const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
@@ -102,9 +105,10 @@ export const useMetricSelectorData = ({
const keyword = query.trim().toLowerCase()
return resolvedMetrics.map((metric) => {
const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
const metricMatches = !keyword
|| metric.label.toLowerCase().includes(keyword)
|| metric.description.toLowerCase().includes(keyword)
|| metricDescription.toLowerCase().includes(keyword)
const metricNodes = nodeInfoMap[metric.id] ?? []
const supportsNodeSelection = resourceType !== 'datasets'
const hasNoNodeInfo = supportsNodeSelection && metricNodes.length === 0
@@ -114,7 +118,10 @@ export const useMetricSelectorData = ({
return null
return {
metric,
metric: {
...metric,
description: metricDescription,
},
hasNoNodeInfo: true,
visibleNodes: [] as NodeInfo[],
}
@@ -132,12 +139,15 @@ export const useMetricSelectorData = ({
return null
return {
metric,
metric: {
...metric,
description: metricDescription,
},
hasNoNodeInfo: false,
visibleNodes,
}
}).filter(section => !!section)
}, [nodeInfoMap, query, resolvedMetrics, resourceType])
}, [nodeInfoMap, query, resolvedMetrics, resourceType, t])
const toggleNodeSelection = (metricId: string, nodeInfo: NodeInfo) => {
const addedMetric = builtinMetricMap.get(metricId)
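
For clarity, a minimal sketch of the matching rule this hunk changes; the Metric type and matchesKeyword helper below are illustrative, not names from the codebase:

import type { TFunction } from 'i18next'
import { getTranslatedMetricDescription } from '../../default-metric-descriptions'

type Metric = { id: string; label: string; description: string }

// The keyword filter now searches the translated description instead of the
// raw (often empty) backend description, so localized keywords match too.
const matchesKeyword = (metric: Metric, t: TFunction<'evaluation'>, keyword: string) => {
  const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
  return !keyword
    || metric.label.toLowerCase().includes(keyword)
    || metricDescription.toLowerCase().includes(keyword)
}

Because the translated description feeds both the filter and the rendered sections, t also joins the useMemo dependency array, as the hunk above shows.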

View File

@@ -1,6 +1,7 @@
import type { MetricOption } from '../../types'
import type { MetricVisualTone } from './types'
import type { EvaluationTargetType, NodeInfo } from '@/types/evaluation'
import { getDefaultMetricDescription } from '../../default-metric-descriptions'
export const toEvaluationTargetType = (resourceType: 'apps' | 'snippets'): EvaluationTargetType => {
return resourceType === 'snippets' ? 'snippets' : 'apps'
@@ -17,7 +18,7 @@ const humanizeMetricId = (metricId: string) => {
export const buildMetricOption = (metricId: string): MetricOption => ({
id: metricId,
label: humanizeMetricId(metricId),
description: '',
description: getDefaultMetricDescription(metricId),
valueType: 'number',
})
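
A usage sketch (assuming humanizeMetricId, truncated above, title-cases the raw id): options built from bare metric ids now carry the builtin description instead of an empty string.

const option = buildMetricOption('context-precision')
// option.label       -> 'Context Precision' (assumed output of humanizeMetricId)
// option.description -> 'Measures the proportion of retrieved context chunks…' (builtin default)
// option.valueType   -> 'number'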

View File

@@ -6,6 +6,7 @@ import Checkbox from '@/app/components/base/checkbox'
import Input from '@/app/components/base/input'
import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
import { cn } from '@/utils/classnames'
import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'
type PipelineMetricItemProps = {
@@ -26,6 +27,7 @@ const PipelineMetricItem = ({
onThresholdChange,
}: PipelineMetricItemProps) => {
const { t } = useTranslation('evaluation')
const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
return (
<div className="flex items-center justify-between gap-3 px-1 py-1">
@@ -45,7 +47,7 @@ const PipelineMetricItem = ({
)}
/>
<TooltipContent>
{metric.description}
{metricDescription}
</TooltipContent>
</Tooltip>
</button>

View File

@@ -0,0 +1,79 @@
import type { TFunction } from 'i18next'
const DEFAULT_METRIC_DESCRIPTION = {
FAITHFULNESS: 'Measures whether every claim in the model\'s response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.',
ANSWER_RELEVANCY: 'Measures how well the model\'s response addresses the user\'s question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.',
ANSWER_CORRECTNESS: 'Measures the factual accuracy and completeness of the model\'s answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.',
SEMANTIC_SIMILARITY: 'Measures the cosine similarity between the model\'s response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.',
CONTEXT_PRECISION: 'Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.',
CONTEXT_RECALL: 'Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.',
CONTEXT_RELEVANCE: 'Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.',
TOOL_CORRECTNESS: 'Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent\'s tool-use strategy matches the expected behavior.',
TASK_COMPLETION: 'Measures whether the agent ultimately achieves the user\'s stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.',
} as const
type DefaultMetricDescription = typeof DEFAULT_METRIC_DESCRIPTION[keyof typeof DEFAULT_METRIC_DESCRIPTION]
const DEFAULT_METRIC_DESCRIPTION_KEYS = {
FAITHFULNESS: 'metrics.builtin.description.faithfulness',
ANSWER_RELEVANCY: 'metrics.builtin.description.answerRelevancy',
ANSWER_CORRECTNESS: 'metrics.builtin.description.answerCorrectness',
SEMANTIC_SIMILARITY: 'metrics.builtin.description.semanticSimilarity',
CONTEXT_PRECISION: 'metrics.builtin.description.contextPrecision',
CONTEXT_RECALL: 'metrics.builtin.description.contextRecall',
CONTEXT_RELEVANCE: 'metrics.builtin.description.contextRelevance',
TOOL_CORRECTNESS: 'metrics.builtin.description.toolCorrectness',
TASK_COMPLETION: 'metrics.builtin.description.taskCompletion',
} as const
type DefaultMetricDescriptionKey = typeof DEFAULT_METRIC_DESCRIPTION_KEYS[keyof typeof DEFAULT_METRIC_DESCRIPTION_KEYS]
const DEFAULT_METRIC_DESCRIPTIONS: Record<string, DefaultMetricDescription> = {
'faithfulness': DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS,
'answer-relevancy': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
'answer-correctness': DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS,
'semantic-similarity': DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY,
'context-precision': DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION,
'context-recall': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL,
'context-relevance': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE,
'tool-correctness': DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS,
'task-completion': DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION,
'relevance': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
}
const DEFAULT_METRIC_DESCRIPTION_I18N_KEYS: Record<string, DefaultMetricDescriptionKey> = {
'faithfulness': DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS,
'answer-relevancy': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
'answer-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS,
'semantic-similarity': DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY,
'context-precision': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION,
'context-recall': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL,
'context-relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE,
'tool-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS,
'task-completion': DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION,
'relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
}
const normalizeMetricId = (metricId: string) => metricId.trim().toLowerCase().replace(/_/g, '-')
export const getDefaultMetricDescription = (metricId: string) => {
return DEFAULT_METRIC_DESCRIPTIONS[normalizeMetricId(metricId)] ?? ''
}
export const getDefaultMetricDescriptionI18nKey = (metricId: string) => {
return DEFAULT_METRIC_DESCRIPTION_I18N_KEYS[normalizeMetricId(metricId)] ?? null
}
export const getTranslatedMetricDescription = (
t: TFunction<'evaluation'>,
metricId: string,
fallbackDescription = '',
) => {
const defaultDescription = fallbackDescription || getDefaultMetricDescription(metricId)
const descriptionI18nKey = getDefaultMetricDescriptionI18nKey(metricId)
if (!descriptionI18nKey)
return defaultDescription
return t(descriptionI18nKey, { defaultValue: defaultDescription })
}
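
A quick sketch of the lookup chain, outside the commit itself; the t stub below is hypothetical:

// Ids are normalized, so backend payloads and UI ids resolve to the same entry.
getDefaultMetricDescription('CONTEXT_RECALL') === getDefaultMetricDescription('context-recall') // true
getDefaultMetricDescriptionI18nKey('ANSWER_RELEVANCY') // 'metrics.builtin.description.answerRelevancy'

// Resolution order in getTranslatedMetricDescription: a locale translation wins
// when the key resolves; otherwise the backend-provided description, then the
// builtin English default, then ''.
const t = ((key: string, options?: { defaultValue?: string }) => options?.defaultValue ?? key) as never
getTranslatedMetricDescription(t, 'latency', 'Latency fallback') // 'Latency fallback' (no builtin key for 'latency')
getTranslatedMetricDescription(t, 'faithfulness') // builtin English default unless the locale overrides the key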

View File

@@ -4,6 +4,7 @@ import type {
EvaluationResourceType,
MetricOption,
} from './types'
import { getDefaultMetricDescription } from './default-metric-descriptions'
const judgeModels = [
{
@@ -27,19 +28,19 @@ const builtinMetrics: MetricOption[] = [
{
id: 'answer-correctness',
label: 'Answer Correctness',
description: 'Compares the response with the expected answer and scores factual alignment.',
description: getDefaultMetricDescription('answer-correctness'),
valueType: 'number',
},
{
id: 'faithfulness',
label: 'Faithfulness',
description: 'Checks whether the answer stays grounded in the retrieved evidence.',
description: getDefaultMetricDescription('faithfulness'),
valueType: 'number',
},
{
id: 'relevance',
label: 'Relevance',
description: 'Evaluates how directly the answer addresses the original request.',
description: getDefaultMetricDescription('relevance'),
valueType: 'number',
},
{
@@ -66,19 +67,19 @@ const pipelineBuiltinMetrics: MetricOption[] = [
{
id: 'context-precision',
label: 'Context Precision',
description: 'Measures whether retrieved chunks stay tightly aligned to the request.',
description: getDefaultMetricDescription('context-precision'),
valueType: 'number',
},
{
id: 'context-recall',
label: 'Context Recall',
description: 'Checks whether the retrieval result includes the evidence needed to answer.',
description: getDefaultMetricDescription('context-recall'),
valueType: 'number',
},
{
id: 'context-relevance',
label: 'Context Relevance',
description: 'Scores how useful the retrieved context is for downstream generation.',
description: getDefaultMetricDescription('context-relevance'),
valueType: 'number',
},
]

View File

@@ -20,6 +20,7 @@ import type {
EvaluationRunRequest,
NodeInfo,
} from '@/types/evaluation'
import { getDefaultMetricDescription } from './default-metric-descriptions'
import { getEvaluationMockConfig } from './mock'
import {
buildConditionMetricOptions,
@@ -51,7 +52,7 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str
return config.builtinMetrics.find(metric => metric.id === metricId) ?? {
id: metricId,
label: humanizeMetricId(metricId),
description: '',
description: getDefaultMetricDescription(metricId),
valueType: 'number',
}
}
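
The same fallback applies when a metric id is missing from the mock config; the result shape below is illustrative:

resolveMetricOption('apps', 'semantic-similarity')
// When 'semantic-similarity' is absent from config.builtinMetrics, this returns:
// { id: 'semantic-similarity', label: humanizeMetricId('semantic-similarity'),
//   description: getDefaultMetricDescription('semantic-similarity'), valueType: 'number' }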

View File

@@ -83,6 +83,15 @@
"metrics.addCustom": "Add Custom Metrics",
"metrics.addNode": "Add Node",
"metrics.added": "Added",
"metrics.builtin.description.answerCorrectness": "Measures the factual accuracy and completeness of the model's answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.",
"metrics.builtin.description.answerRelevancy": "Measures how well the model's response addresses the user's question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.",
"metrics.builtin.description.contextPrecision": "Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.",
"metrics.builtin.description.contextRecall": "Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.",
"metrics.builtin.description.contextRelevance": "Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.",
"metrics.builtin.description.faithfulness": "Measures whether every claim in the model's response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.",
"metrics.builtin.description.semanticSimilarity": "Measures the cosine similarity between the model's response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.",
"metrics.builtin.description.taskCompletion": "Measures whether the agent ultimately achieves the user's stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.",
"metrics.builtin.description.toolCorrectness": "Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent's tool-use strategy matches the expected behavior.",
"metrics.collapseNodes": "Collapse nodes",
"metrics.custom.description": "Select an evaluation workflow and map your variables before running tests.",
"metrics.custom.footerDescription": "Connect your published evaluation workflows",

View File

@@ -83,6 +83,15 @@
"metrics.addCustom": "添加自定义指标",
"metrics.addNode": "添加节点",
"metrics.added": "已添加",
"metrics.builtin.description.answerCorrectness": "衡量模型回答相对于标准答案的事实准确性与完整性。它结合了语义相似度与关键信息覆盖情况,因此不仅关注表达含义,也关注内容是否完整准确。",
"metrics.builtin.description.answerRelevancy": "衡量模型回答与用户问题的贴合程度。高分表示回答始终围绕问题展开;低分表示内容偏题,或没有真正回答用户的实际问题。",
"metrics.builtin.description.contextPrecision": "衡量检索出的上下文片段中实际与问题相关的内容占比Precision。高分表示检索流程带回的噪声较少。",
"metrics.builtin.description.contextRecall": "衡量标准答案所需的真实信息有多少被检索出的上下文片段覆盖到Recall。高分表示检索流程没有遗漏重要的支撑证据。",
"metrics.builtin.description.contextRelevance": "衡量每一个被检索出的上下文片段与查询的相关程度。它与 CONTEXT_PRECISION 类似,但评估粒度在单个 chunk 层面,而不是相对于参考答案整体评估。",
"metrics.builtin.description.faithfulness": "衡量模型回答中的每一个陈述,是否都能从提供的检索上下文中找到依据。高分表示回答中没有幻觉内容,每一条表述都可以追溯到上下文中的某个片段。",
"metrics.builtin.description.semanticSimilarity": "衡量模型回答与参考答案在向量语义空间中的余弦相似度。它评估的是两段文本是否表达了相同含义,而不直接判断事实是否正确。",
"metrics.builtin.description.taskCompletion": "衡量 Agent 是否最终完成了用户明确提出的目标。它会整体评估推理链路、中间步骤和最终输出;高分表示任务已被完整达成。",
"metrics.builtin.description.toolCorrectness": "衡量 Agent 在任务执行过程中发起的工具调用是否正确,包括工具选择本身以及传入参数是否合理。高分表示 Agent 的工具使用策略符合预期行为。",
"metrics.custom.description": "选择评测工作流并完成变量映射后即可运行测试。",
"metrics.custom.mappingTitle": "变量映射",
"metrics.custom.mappingWarning": "请先完成工作流选择和所有变量映射,再运行批量测试。",