feat(web): add default descriptions for builtin evaluation metrics

JzoNg
2026-04-13 14:13:04 +08:00
parent dce5715982
commit f6047aafe8
9 changed files with 159 additions and 13 deletions

View File

@@ -0,0 +1,35 @@
import { describe, expect, it, vi } from 'vitest'
import { getDefaultMetricDescription, getDefaultMetricDescriptionI18nKey, getTranslatedMetricDescription } from '../default-metric-descriptions'
describe('default metric descriptions', () => {
it('should resolve descriptions for kebab-case metric ids', () => {
expect(getDefaultMetricDescription('context-precision')).toContain('retrieval pipeline returns little noise')
expect(getDefaultMetricDescription('answer-correctness')).toContain('factual accuracy and completeness')
})
it('should normalize uppercase snake_case metric ids from backend payloads', () => {
expect(getDefaultMetricDescription('CONTEXT_RECALL')).toContain('does not miss important supporting evidence')
expect(getDefaultMetricDescription('TOOL_CORRECTNESS')).toContain('tool-use strategy matches the expected behavior')
})
it('should support the legacy relevance alias', () => {
expect(getDefaultMetricDescription('relevance')).toContain('addresses the user\'s question')
})
it('should resolve i18n keys for builtin metrics', () => {
expect(getDefaultMetricDescriptionI18nKey('context-precision')).toBe('metrics.builtin.description.contextPrecision')
expect(getDefaultMetricDescriptionI18nKey('ANSWER_RELEVANCY')).toBe('metrics.builtin.description.answerRelevancy')
})
it('should use translated content when translation key exists', () => {
const t = vi.fn((key: string, options?: { defaultValue?: string }) => {
if (key === 'metrics.builtin.description.faithfulness')
return '忠实性中文文案'
return options?.defaultValue ?? key
})
expect(getTranslatedMetricDescription(t as never, 'faithfulness')).toBe('忠实性中文文案')
expect(getTranslatedMetricDescription(t as never, 'latency', 'Latency fallback')).toBe('Latency fallback')
})
})

View File

@@ -1,7 +1,9 @@
import type { BuiltinMetricMap, MetricSelectorSection } from './types'
import type { NodeInfo } from '@/types/evaluation'
import { useEffect, useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import { useAvailableEvaluationMetrics, useEvaluationNodeInfoMutation } from '@/service/use-evaluation'
import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
import { getEvaluationMockConfig } from '../../mock'
import { useEvaluationResource, useEvaluationStore } from '../../store'
import {
@@ -34,6 +36,7 @@ export const useMetricSelectorData = ({
nodeInfoMap,
setNodeInfoMap,
}: UseMetricSelectorDataOptions): UseMetricSelectorDataResult => {
const { t } = useTranslation('evaluation')
const config = getEvaluationMockConfig(resourceType)
const metrics = useEvaluationResource(resourceType, resourceId).metrics
const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
@@ -102,9 +105,10 @@ export const useMetricSelectorData = ({
const keyword = query.trim().toLowerCase()
return resolvedMetrics.map((metric) => {
const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
const metricMatches = !keyword
|| metric.label.toLowerCase().includes(keyword)
|| metric.description.toLowerCase().includes(keyword)
|| metricDescription.toLowerCase().includes(keyword)
const metricNodes = nodeInfoMap[metric.id] ?? []
const supportsNodeSelection = resourceType !== 'datasets'
const hasNoNodeInfo = supportsNodeSelection && metricNodes.length === 0
@@ -114,7 +118,10 @@ export const useMetricSelectorData = ({
return null
return {
metric,
metric: {
...metric,
description: metricDescription,
},
hasNoNodeInfo: true,
visibleNodes: [] as NodeInfo[],
}
@@ -132,12 +139,15 @@ export const useMetricSelectorData = ({
return null
return {
metric,
metric: {
...metric,
description: metricDescription,
},
hasNoNodeInfo: false,
visibleNodes,
}
}).filter(section => !!section)
}, [nodeInfoMap, query, resolvedMetrics, resourceType])
}, [nodeInfoMap, query, resolvedMetrics, resourceType, t])
const toggleNodeSelection = (metricId: string, nodeInfo: NodeInfo) => {
const addedMetric = builtinMetricMap.get(metricId)
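
For clarity, a minimal sketch of the matching rule this hunk changes; the Metric type and matchesKeyword helper below are illustrative, not names from the codebase:

import type { TFunction } from 'i18next'
import { getTranslatedMetricDescription } from '../../default-metric-descriptions'

type Metric = { id: string; label: string; description: string }

// The keyword filter now searches the translated description instead of the
// raw (often empty) backend description, so localized keywords match too.
const matchesKeyword = (metric: Metric, t: TFunction<'evaluation'>, keyword: string) => {
  const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
  return !keyword
    || metric.label.toLowerCase().includes(keyword)
    || metricDescription.toLowerCase().includes(keyword)
}

Because the translated description feeds both the filter and the rendered sections, t also joins the useMemo dependency array, as the hunk above shows.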

View File

@@ -1,6 +1,7 @@
import type { MetricOption } from '../../types'
import type { MetricVisualTone } from './types'
import type { EvaluationTargetType, NodeInfo } from '@/types/evaluation'
import { getDefaultMetricDescription } from '../../default-metric-descriptions'
export const toEvaluationTargetType = (resourceType: 'apps' | 'snippets'): EvaluationTargetType => {
return resourceType === 'snippets' ? 'snippets' : 'apps'
@@ -17,7 +18,7 @@ const humanizeMetricId = (metricId: string) => {
export const buildMetricOption = (metricId: string): MetricOption => ({
id: metricId,
label: humanizeMetricId(metricId),
description: '',
description: getDefaultMetricDescription(metricId),
valueType: 'number',
})
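
A usage sketch (assuming humanizeMetricId, truncated above, title-cases the raw id): options built from bare metric ids now carry the builtin description instead of an empty string.

const option = buildMetricOption('context-precision')
// option.label       -> 'Context Precision' (assumed output of humanizeMetricId)
// option.description -> 'Measures the proportion of retrieved context chunks…' (builtin default)
// option.valueType   -> 'number'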

View File

@@ -6,6 +6,7 @@ import Checkbox from '@/app/components/base/checkbox'
import Input from '@/app/components/base/input'
import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
import { cn } from '@/utils/classnames'
import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'
type PipelineMetricItemProps = {
@@ -26,6 +27,7 @@ const PipelineMetricItem = ({
onThresholdChange,
}: PipelineMetricItemProps) => {
const { t } = useTranslation('evaluation')
const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
return (
<div className="flex items-center justify-between gap-3 px-1 py-1">
@@ -45,7 +47,7 @@ const PipelineMetricItem = ({
)}
/>
<TooltipContent>
{metric.description}
{metricDescription}
</TooltipContent>
</Tooltip>
</button>

View File

@@ -0,0 +1,79 @@
import type { TFunction } from 'i18next'
const DEFAULT_METRIC_DESCRIPTION = {
FAITHFULNESS: 'Measures whether every claim in the model\'s response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.',
ANSWER_RELEVANCY: 'Measures how well the model\'s response addresses the user\'s question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.',
ANSWER_CORRECTNESS: 'Measures the factual accuracy and completeness of the model\'s answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.',
SEMANTIC_SIMILARITY: 'Measures the cosine similarity between the model\'s response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.',
CONTEXT_PRECISION: 'Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.',
CONTEXT_RECALL: 'Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.',
CONTEXT_RELEVANCE: 'Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.',
TOOL_CORRECTNESS: 'Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent\'s tool-use strategy matches the expected behavior.',
TASK_COMPLETION: 'Measures whether the agent ultimately achieves the user\'s stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.',
} as const
type DefaultMetricDescription = typeof DEFAULT_METRIC_DESCRIPTION[keyof typeof DEFAULT_METRIC_DESCRIPTION]
const DEFAULT_METRIC_DESCRIPTION_KEYS = {
FAITHFULNESS: 'metrics.builtin.description.faithfulness',
ANSWER_RELEVANCY: 'metrics.builtin.description.answerRelevancy',
ANSWER_CORRECTNESS: 'metrics.builtin.description.answerCorrectness',
SEMANTIC_SIMILARITY: 'metrics.builtin.description.semanticSimilarity',
CONTEXT_PRECISION: 'metrics.builtin.description.contextPrecision',
CONTEXT_RECALL: 'metrics.builtin.description.contextRecall',
CONTEXT_RELEVANCE: 'metrics.builtin.description.contextRelevance',
TOOL_CORRECTNESS: 'metrics.builtin.description.toolCorrectness',
TASK_COMPLETION: 'metrics.builtin.description.taskCompletion',
} as const
type DefaultMetricDescriptionKey = typeof DEFAULT_METRIC_DESCRIPTION_KEYS[keyof typeof DEFAULT_METRIC_DESCRIPTION_KEYS]
const DEFAULT_METRIC_DESCRIPTIONS: Record<string, DefaultMetricDescription> = {
'faithfulness': DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS,
'answer-relevancy': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
'answer-correctness': DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS,
'semantic-similarity': DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY,
'context-precision': DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION,
'context-recall': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL,
'context-relevance': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE,
'tool-correctness': DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS,
'task-completion': DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION,
'relevance': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
}
const DEFAULT_METRIC_DESCRIPTION_I18N_KEYS: Record<string, DefaultMetricDescriptionKey> = {
'faithfulness': DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS,
'answer-relevancy': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
'answer-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS,
'semantic-similarity': DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY,
'context-precision': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION,
'context-recall': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL,
'context-relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE,
'tool-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS,
'task-completion': DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION,
'relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
}
const normalizeMetricId = (metricId: string) => metricId.trim().toLowerCase().replace(/_/g, '-')
export const getDefaultMetricDescription = (metricId: string) => {
return DEFAULT_METRIC_DESCRIPTIONS[normalizeMetricId(metricId)] ?? ''
}
export const getDefaultMetricDescriptionI18nKey = (metricId: string) => {
return DEFAULT_METRIC_DESCRIPTION_I18N_KEYS[normalizeMetricId(metricId)] ?? null
}
export const getTranslatedMetricDescription = (
t: TFunction<'evaluation'>,
metricId: string,
fallbackDescription = '',
) => {
const defaultDescription = fallbackDescription || getDefaultMetricDescription(metricId)
const descriptionI18nKey = getDefaultMetricDescriptionI18nKey(metricId)
if (!descriptionI18nKey)
return defaultDescription
return t(descriptionI18nKey, { defaultValue: defaultDescription })
}
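
A quick sketch of the lookup chain, outside the commit itself; the t stub below is hypothetical:

// Ids are normalized, so backend payloads and UI ids resolve to the same entry.
getDefaultMetricDescription('CONTEXT_RECALL') === getDefaultMetricDescription('context-recall') // true
getDefaultMetricDescriptionI18nKey('ANSWER_RELEVANCY') // 'metrics.builtin.description.answerRelevancy'

// Resolution order in getTranslatedMetricDescription: a locale translation wins
// when the key resolves; otherwise the backend-provided description, then the
// builtin English default, then ''.
const t = ((key: string, options?: { defaultValue?: string }) => options?.defaultValue ?? key) as never
getTranslatedMetricDescription(t, 'latency', 'Latency fallback') // 'Latency fallback' (no builtin key for 'latency')
getTranslatedMetricDescription(t, 'faithfulness') // builtin English default unless the locale overrides the key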

View File

@@ -4,6 +4,7 @@ import type {
EvaluationResourceType,
MetricOption,
} from './types'
import { getDefaultMetricDescription } from './default-metric-descriptions'
const judgeModels = [
{
@@ -27,19 +28,19 @@ const builtinMetrics: MetricOption[] = [
{
id: 'answer-correctness',
label: 'Answer Correctness',
description: 'Compares the response with the expected answer and scores factual alignment.',
description: getDefaultMetricDescription('answer-correctness'),
valueType: 'number',
},
{
id: 'faithfulness',
label: 'Faithfulness',
description: 'Checks whether the answer stays grounded in the retrieved evidence.',
description: getDefaultMetricDescription('faithfulness'),
valueType: 'number',
},
{
id: 'relevance',
label: 'Relevance',
description: 'Evaluates how directly the answer addresses the original request.',
description: getDefaultMetricDescription('relevance'),
valueType: 'number',
},
{
@@ -66,19 +67,19 @@ const pipelineBuiltinMetrics: MetricOption[] = [
{
id: 'context-precision',
label: 'Context Precision',
description: 'Measures whether retrieved chunks stay tightly aligned to the request.',
description: getDefaultMetricDescription('context-precision'),
valueType: 'number',
},
{
id: 'context-recall',
label: 'Context Recall',
description: 'Checks whether the retrieval result includes the evidence needed to answer.',
description: getDefaultMetricDescription('context-recall'),
valueType: 'number',
},
{
id: 'context-relevance',
label: 'Context Relevance',
description: 'Scores how useful the retrieved context is for downstream generation.',
description: getDefaultMetricDescription('context-relevance'),
valueType: 'number',
},
]

View File

@@ -20,6 +20,7 @@ import type {
EvaluationRunRequest,
NodeInfo,
} from '@/types/evaluation'
import { getDefaultMetricDescription } from './default-metric-descriptions'
import { getEvaluationMockConfig } from './mock'
import {
buildConditionMetricOptions,
@@ -51,7 +52,7 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str
return config.builtinMetrics.find(metric => metric.id === metricId) ?? {
id: metricId,
label: humanizeMetricId(metricId),
description: '',
description: getDefaultMetricDescription(metricId),
valueType: 'number',
}
}
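
The same fallback applies when a metric id is missing from the mock config; the result shape below is illustrative:

resolveMetricOption('apps', 'semantic-similarity')
// When 'semantic-similarity' is absent from config.builtinMetrics, this returns:
// { id: 'semantic-similarity', label: humanizeMetricId('semantic-similarity'),
//   description: getDefaultMetricDescription('semantic-similarity'), valueType: 'number' }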

View File

@@ -83,6 +83,15 @@
"metrics.addCustom": "Add Custom Metrics",
"metrics.addNode": "Add Node",
"metrics.added": "Added",
"metrics.builtin.description.answerCorrectness": "Measures the factual accuracy and completeness of the model's answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.",
"metrics.builtin.description.answerRelevancy": "Measures how well the model's response addresses the user's question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.",
"metrics.builtin.description.contextPrecision": "Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.",
"metrics.builtin.description.contextRecall": "Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.",
"metrics.builtin.description.contextRelevance": "Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.",
"metrics.builtin.description.faithfulness": "Measures whether every claim in the model's response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.",
"metrics.builtin.description.semanticSimilarity": "Measures the cosine similarity between the model's response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.",
"metrics.builtin.description.taskCompletion": "Measures whether the agent ultimately achieves the user's stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.",
"metrics.builtin.description.toolCorrectness": "Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent's tool-use strategy matches the expected behavior.",
"metrics.collapseNodes": "Collapse nodes",
"metrics.custom.description": "Select an evaluation workflow and map your variables before running tests.",
"metrics.custom.footerDescription": "Connect your published evaluation workflows",

View File

@@ -83,6 +83,15 @@
"metrics.addCustom": "添加自定义指标",
"metrics.addNode": "添加节点",
"metrics.added": "已添加",
"metrics.builtin.description.answerCorrectness": "衡量模型回答相对于标准答案的事实准确性与完整性。它结合了语义相似度与关键信息覆盖情况,因此不仅关注表达含义,也关注内容是否完整准确。",
"metrics.builtin.description.answerRelevancy": "衡量模型回答与用户问题的贴合程度。高分表示回答始终围绕问题展开;低分表示内容偏题,或没有真正回答用户的实际问题。",
"metrics.builtin.description.contextPrecision": "衡量检索出的上下文片段中实际与问题相关的内容占比Precision。高分表示检索流程带回的噪声较少。",
"metrics.builtin.description.contextRecall": "衡量标准答案所需的真实信息有多少被检索出的上下文片段覆盖到Recall。高分表示检索流程没有遗漏重要的支撑证据。",
"metrics.builtin.description.contextRelevance": "衡量每一个被检索出的上下文片段与查询的相关程度。它与 CONTEXT_PRECISION 类似,但评估粒度在单个 chunk 层面,而不是相对于参考答案整体评估。",
"metrics.builtin.description.faithfulness": "衡量模型回答中的每一个陈述,是否都能从提供的检索上下文中找到依据。高分表示回答中没有幻觉内容,每一条表述都可以追溯到上下文中的某个片段。",
"metrics.builtin.description.semanticSimilarity": "衡量模型回答与参考答案在向量语义空间中的余弦相似度。它评估的是两段文本是否表达了相同含义,而不直接判断事实是否正确。",
"metrics.builtin.description.taskCompletion": "衡量 Agent 是否最终完成了用户明确提出的目标。它会整体评估推理链路、中间步骤和最终输出;高分表示任务已被完整达成。",
"metrics.builtin.description.toolCorrectness": "衡量 Agent 在任务执行过程中发起的工具调用是否正确,包括工具选择本身以及传入参数是否合理。高分表示 Agent 的工具使用策略符合预期行为。",
"metrics.custom.description": "选择评测工作流并完成变量映射后即可运行测试。",
"metrics.custom.mappingTitle": "变量映射",
"metrics.custom.mappingWarning": "请先完成工作流选择和所有变量映射,再运行批量测试。",