mirror of
https://mirror.skon.top/github.com/langgenius/dify.git
synced 2026-05-01 03:30:02 +08:00
feat(web): metric descriptions
This commit is contained in:
@@ -0,0 +1,34 @@
|
||||
import { getDefaultMetricDescription, getDefaultMetricDescriptionI18nKey, getTranslatedMetricDescription } from '../default-metric-descriptions'
|
||||
|
||||
// Unit tests for the builtin-metric description helpers in
// default-metric-descriptions.ts: metric-id normalization, legacy aliases,
// i18n key resolution, and translated-description fallback behavior.
// NOTE(review): relies on globally available `describe`/`it`/`expect`/`vi`
// (vitest globals) — no explicit import is visible here; confirm the test
// runner config actually provides them.
describe('default metric descriptions', () => {
  // Canonical metric ids are kebab-case; the lookup should hit directly.
  it('should resolve descriptions for kebab-case metric ids', () => {
    expect(getDefaultMetricDescription('context-precision')).toContain('retrieval pipeline returns little noise')
    expect(getDefaultMetricDescription('answer-correctness')).toContain('factual accuracy and completeness')
  })

  // Backend payloads may send UPPER_SNAKE_CASE ids; lookup should normalize
  // casing and underscores before resolving.
  it('should normalize snake_case metric ids from backend payloads', () => {
    expect(getDefaultMetricDescription('CONTEXT_RECALL')).toContain('does not miss important supporting evidence')
    expect(getDefaultMetricDescription('TOOL_CORRECTNESS')).toContain('tool-use strategy matches the expected behavior')
  })

  // 'relevance' is a legacy alias that should resolve to 'answer-relevancy'.
  it('should support the legacy relevance alias', () => {
    expect(getDefaultMetricDescription('relevance')).toContain('addresses the user\'s question')
  })

  // i18n keys should resolve for both canonical and snake-case ids.
  it('should resolve i18n keys for builtin metrics', () => {
    expect(getDefaultMetricDescriptionI18nKey('context-precision')).toBe('metrics.builtin.description.contextPrecision')
    expect(getDefaultMetricDescriptionI18nKey('ANSWER_RELEVANCY')).toBe('metrics.builtin.description.answerRelevancy')
  })

  // When a translation exists under the metric's i18n key it wins; otherwise
  // the caller-provided fallback (or the builtin default) is used.
  it('should use translated content when translation key exists', () => {
    const t = vi.fn((key: string, options?: { defaultValue?: string }) => {
      if (key === 'metrics.builtin.description.faithfulness')
        return '忠实性中文文案'

      return options?.defaultValue ?? key
    })

    expect(getTranslatedMetricDescription(t as never, 'faithfulness')).toBe('忠实性中文文案')
    expect(getTranslatedMetricDescription(t as never, 'latency', 'Latency fallback')).toBe('Latency fallback')
  })
})
|
||||
@@ -1,7 +1,9 @@
|
||||
import type { BuiltinMetricMap, MetricSelectorSection } from './types'
|
||||
import type { NodeInfo } from '@/types/evaluation'
|
||||
import { useEffect, useMemo } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { useAvailableEvaluationMetrics, useEvaluationNodeInfoMutation } from '@/service/use-evaluation'
|
||||
import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
|
||||
import { getEvaluationMockConfig } from '../../mock'
|
||||
import { useEvaluationResource, useEvaluationStore } from '../../store'
|
||||
import {
|
||||
@@ -34,6 +36,7 @@ export const useMetricSelectorData = ({
|
||||
nodeInfoMap,
|
||||
setNodeInfoMap,
|
||||
}: UseMetricSelectorDataOptions): UseMetricSelectorDataResult => {
|
||||
const { t } = useTranslation('evaluation')
|
||||
const config = getEvaluationMockConfig(resourceType)
|
||||
const metrics = useEvaluationResource(resourceType, resourceId).metrics
|
||||
const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
|
||||
@@ -102,9 +105,10 @@ export const useMetricSelectorData = ({
|
||||
const keyword = query.trim().toLowerCase()
|
||||
|
||||
return resolvedMetrics.map((metric) => {
|
||||
const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
|
||||
const metricMatches = !keyword
|
||||
|| metric.label.toLowerCase().includes(keyword)
|
||||
|| metric.description.toLowerCase().includes(keyword)
|
||||
|| metricDescription.toLowerCase().includes(keyword)
|
||||
const metricNodes = nodeInfoMap[metric.id] ?? []
|
||||
const supportsNodeSelection = resourceType !== 'datasets'
|
||||
const hasNoNodeInfo = supportsNodeSelection && metricNodes.length === 0
|
||||
@@ -114,7 +118,10 @@ export const useMetricSelectorData = ({
|
||||
return null
|
||||
|
||||
return {
|
||||
metric,
|
||||
metric: {
|
||||
...metric,
|
||||
description: metricDescription,
|
||||
},
|
||||
hasNoNodeInfo: true,
|
||||
visibleNodes: [] as NodeInfo[],
|
||||
}
|
||||
@@ -132,12 +139,15 @@ export const useMetricSelectorData = ({
|
||||
return null
|
||||
|
||||
return {
|
||||
metric,
|
||||
metric: {
|
||||
...metric,
|
||||
description: metricDescription,
|
||||
},
|
||||
hasNoNodeInfo: false,
|
||||
visibleNodes,
|
||||
}
|
||||
}).filter(section => !!section)
|
||||
}, [nodeInfoMap, query, resolvedMetrics, resourceType])
|
||||
}, [nodeInfoMap, query, resolvedMetrics, resourceType, t])
|
||||
|
||||
const toggleNodeSelection = (metricId: string, nodeInfo: NodeInfo) => {
|
||||
const addedMetric = builtinMetricMap.get(metricId)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import type { MetricOption } from '../../types'
|
||||
import type { MetricVisualTone } from './types'
|
||||
import type { EvaluationTargetType, NodeInfo } from '@/types/evaluation'
|
||||
import { getDefaultMetricDescription } from '../../default-metric-descriptions'
|
||||
|
||||
export const toEvaluationTargetType = (resourceType: 'apps' | 'snippets'): EvaluationTargetType => {
|
||||
return resourceType === 'snippets' ? 'snippets' : 'apps'
|
||||
@@ -17,7 +18,7 @@ const humanizeMetricId = (metricId: string) => {
|
||||
export const buildMetricOption = (metricId: string): MetricOption => ({
|
||||
id: metricId,
|
||||
label: humanizeMetricId(metricId),
|
||||
description: '',
|
||||
description: getDefaultMetricDescription(metricId),
|
||||
valueType: 'number',
|
||||
})
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import Checkbox from '@/app/components/base/checkbox'
|
||||
import Input from '@/app/components/base/input'
|
||||
import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
|
||||
import { cn } from '@/utils/classnames'
|
||||
import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
|
||||
import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'
|
||||
|
||||
type PipelineMetricItemProps = {
|
||||
@@ -26,6 +27,7 @@ const PipelineMetricItem = ({
|
||||
onThresholdChange,
|
||||
}: PipelineMetricItemProps) => {
|
||||
const { t } = useTranslation('evaluation')
|
||||
const metricDescription = getTranslatedMetricDescription(t, metric.id, metric.description)
|
||||
|
||||
return (
|
||||
<div className="flex items-center justify-between gap-3 px-1 py-1">
|
||||
@@ -45,7 +47,7 @@ const PipelineMetricItem = ({
|
||||
)}
|
||||
/>
|
||||
<TooltipContent>
|
||||
{metric.description}
|
||||
{metricDescription}
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
</button>
|
||||
|
||||
79
web/app/components/evaluation/default-metric-descriptions.ts
Normal file
79
web/app/components/evaluation/default-metric-descriptions.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
import type { TFunction } from 'i18next'
|
||||
|
||||
const DEFAULT_METRIC_DESCRIPTION = {
|
||||
FAITHFULNESS: 'Measures whether every claim in the model\'s response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.',
|
||||
ANSWER_RELEVANCY: 'Measures how well the model\'s response addresses the user\'s question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.',
|
||||
ANSWER_CORRECTNESS: 'Measures the factual accuracy and completeness of the model\'s answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.',
|
||||
SEMANTIC_SIMILARITY: 'Measures the cosine similarity between the model\'s response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.',
|
||||
CONTEXT_PRECISION: 'Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.',
|
||||
CONTEXT_RECALL: 'Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.',
|
||||
CONTEXT_RELEVANCE: 'Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.',
|
||||
TOOL_CORRECTNESS: 'Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent\'s tool-use strategy matches the expected behavior.',
|
||||
TASK_COMPLETION: 'Measures whether the agent ultimately achieves the user\'s stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.',
|
||||
} as const
|
||||
|
||||
type DefaultMetricDescription = typeof DEFAULT_METRIC_DESCRIPTION[keyof typeof DEFAULT_METRIC_DESCRIPTION]
|
||||
|
||||
const DEFAULT_METRIC_DESCRIPTION_KEYS = {
|
||||
FAITHFULNESS: 'metrics.builtin.description.faithfulness',
|
||||
ANSWER_RELEVANCY: 'metrics.builtin.description.answerRelevancy',
|
||||
ANSWER_CORRECTNESS: 'metrics.builtin.description.answerCorrectness',
|
||||
SEMANTIC_SIMILARITY: 'metrics.builtin.description.semanticSimilarity',
|
||||
CONTEXT_PRECISION: 'metrics.builtin.description.contextPrecision',
|
||||
CONTEXT_RECALL: 'metrics.builtin.description.contextRecall',
|
||||
CONTEXT_RELEVANCE: 'metrics.builtin.description.contextRelevance',
|
||||
TOOL_CORRECTNESS: 'metrics.builtin.description.toolCorrectness',
|
||||
TASK_COMPLETION: 'metrics.builtin.description.taskCompletion',
|
||||
} as const
|
||||
|
||||
type DefaultMetricDescriptionKey = typeof DEFAULT_METRIC_DESCRIPTION_KEYS[keyof typeof DEFAULT_METRIC_DESCRIPTION_KEYS]
|
||||
|
||||
const DEFAULT_METRIC_DESCRIPTIONS: Record<string, DefaultMetricDescription> = {
|
||||
'faithfulness': DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS,
|
||||
'answer-relevancy': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
|
||||
'answer-correctness': DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS,
|
||||
'semantic-similarity': DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY,
|
||||
'context-precision': DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION,
|
||||
'context-recall': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL,
|
||||
'context-relevance': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE,
|
||||
'tool-correctness': DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS,
|
||||
'task-completion': DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION,
|
||||
'relevance': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
|
||||
}
|
||||
|
||||
const DEFAULT_METRIC_DESCRIPTION_I18N_KEYS: Record<string, DefaultMetricDescriptionKey> = {
|
||||
'faithfulness': DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS,
|
||||
'answer-relevancy': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
|
||||
'answer-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS,
|
||||
'semantic-similarity': DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY,
|
||||
'context-precision': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION,
|
||||
'context-recall': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL,
|
||||
'context-relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE,
|
||||
'tool-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS,
|
||||
'task-completion': DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION,
|
||||
'relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
|
||||
}
|
||||
|
||||
const normalizeMetricId = (metricId: string) => metricId.trim().toLowerCase().replace(/_/g, '-')
|
||||
|
||||
export const getDefaultMetricDescription = (metricId: string) => {
|
||||
return DEFAULT_METRIC_DESCRIPTIONS[normalizeMetricId(metricId)] ?? ''
|
||||
}
|
||||
|
||||
export const getDefaultMetricDescriptionI18nKey = (metricId: string) => {
|
||||
return DEFAULT_METRIC_DESCRIPTION_I18N_KEYS[normalizeMetricId(metricId)] ?? null
|
||||
}
|
||||
|
||||
export const getTranslatedMetricDescription = (
|
||||
t: TFunction<'evaluation'>,
|
||||
metricId: string,
|
||||
fallbackDescription = '',
|
||||
) => {
|
||||
const defaultDescription = fallbackDescription || getDefaultMetricDescription(metricId)
|
||||
const descriptionI18nKey = getDefaultMetricDescriptionI18nKey(metricId)
|
||||
|
||||
if (!descriptionI18nKey)
|
||||
return defaultDescription
|
||||
|
||||
return t(descriptionI18nKey, { defaultValue: defaultDescription })
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import type {
|
||||
EvaluationResourceType,
|
||||
MetricOption,
|
||||
} from './types'
|
||||
import { getDefaultMetricDescription } from './default-metric-descriptions'
|
||||
|
||||
const judgeModels = [
|
||||
{
|
||||
@@ -27,19 +28,19 @@ const builtinMetrics: MetricOption[] = [
|
||||
{
|
||||
id: 'answer-correctness',
|
||||
label: 'Answer Correctness',
|
||||
description: 'Compares the response with the expected answer and scores factual alignment.',
|
||||
description: getDefaultMetricDescription('answer-correctness'),
|
||||
valueType: 'number',
|
||||
},
|
||||
{
|
||||
id: 'faithfulness',
|
||||
label: 'Faithfulness',
|
||||
description: 'Checks whether the answer stays grounded in the retrieved evidence.',
|
||||
description: getDefaultMetricDescription('faithfulness'),
|
||||
valueType: 'number',
|
||||
},
|
||||
{
|
||||
id: 'relevance',
|
||||
label: 'Relevance',
|
||||
description: 'Evaluates how directly the answer addresses the original request.',
|
||||
description: getDefaultMetricDescription('relevance'),
|
||||
valueType: 'number',
|
||||
},
|
||||
{
|
||||
@@ -66,19 +67,19 @@ const pipelineBuiltinMetrics: MetricOption[] = [
|
||||
{
|
||||
id: 'context-precision',
|
||||
label: 'Context Precision',
|
||||
description: 'Measures whether retrieved chunks stay tightly aligned to the request.',
|
||||
description: getDefaultMetricDescription('context-precision'),
|
||||
valueType: 'number',
|
||||
},
|
||||
{
|
||||
id: 'context-recall',
|
||||
label: 'Context Recall',
|
||||
description: 'Checks whether the retrieval result includes the evidence needed to answer.',
|
||||
description: getDefaultMetricDescription('context-recall'),
|
||||
valueType: 'number',
|
||||
},
|
||||
{
|
||||
id: 'context-relevance',
|
||||
label: 'Context Relevance',
|
||||
description: 'Scores how useful the retrieved context is for downstream generation.',
|
||||
description: getDefaultMetricDescription('context-relevance'),
|
||||
valueType: 'number',
|
||||
},
|
||||
]
|
||||
|
||||
@@ -20,6 +20,7 @@ import type {
|
||||
EvaluationRunRequest,
|
||||
NodeInfo,
|
||||
} from '@/types/evaluation'
|
||||
import { getDefaultMetricDescription } from './default-metric-descriptions'
|
||||
import { getEvaluationMockConfig } from './mock'
|
||||
import {
|
||||
buildConditionMetricOptions,
|
||||
@@ -51,7 +52,7 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str
|
||||
return config.builtinMetrics.find(metric => metric.id === metricId) ?? {
|
||||
id: metricId,
|
||||
label: humanizeMetricId(metricId),
|
||||
description: '',
|
||||
description: getDefaultMetricDescription(metricId),
|
||||
valueType: 'number',
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,6 +83,15 @@
|
||||
"metrics.addCustom": "Add Custom Metrics",
|
||||
"metrics.addNode": "Add Node",
|
||||
"metrics.added": "Added",
|
||||
"metrics.builtin.description.answerCorrectness": "Measures the factual accuracy and completeness of the model's answer relative to a ground-truth reference. It combines semantic similarity with key-fact coverage, so both meaning and content matter.",
|
||||
"metrics.builtin.description.answerRelevancy": "Measures how well the model's response addresses the user's question. A high score means the answer stays on-topic; a low score indicates irrelevant content or a failure to answer the actual question.",
|
||||
"metrics.builtin.description.contextPrecision": "Measures the proportion of retrieved context chunks that are actually relevant to the question (precision). A high score means the retrieval pipeline returns little noise.",
|
||||
"metrics.builtin.description.contextRecall": "Measures the proportion of ground-truth information that is covered by the retrieved context chunks (recall). A high score means the retrieval pipeline does not miss important supporting evidence.",
|
||||
"metrics.builtin.description.contextRelevance": "Measures how relevant each individual retrieved chunk is to the query. Similar to CONTEXT_PRECISION but evaluated at the chunk level rather than against a reference answer.",
|
||||
"metrics.builtin.description.faithfulness": "Measures whether every claim in the model's response is grounded in the provided retrieved context. A high score means the answer contains no hallucinated content; each statement can be traced back to a passage in the context.",
|
||||
"metrics.builtin.description.semanticSimilarity": "Measures the cosine similarity between the model's response and the reference answer in an embedding space. It evaluates whether the two texts convey the same meaning, independent of factual correctness.",
|
||||
"metrics.builtin.description.taskCompletion": "Measures whether the agent ultimately achieves the user's stated goal. It evaluates the reasoning chain, intermediate steps, and final output holistically; a high score means the task was fully accomplished.",
|
||||
"metrics.builtin.description.toolCorrectness": "Measures the correctness of the tool calls made by the agent during task execution: both the choice of tool and the arguments passed. A high score means the agent's tool-use strategy matches the expected behavior.",
|
||||
"metrics.collapseNodes": "Collapse nodes",
|
||||
"metrics.custom.description": "Select an evaluation workflow and map your variables before running tests.",
|
||||
"metrics.custom.footerDescription": "Connect your published evaluation workflows",
|
||||
|
||||
@@ -83,6 +83,15 @@
|
||||
"metrics.addCustom": "添加自定义指标",
|
||||
"metrics.addNode": "添加节点",
|
||||
"metrics.added": "已添加",
|
||||
"metrics.builtin.description.answerCorrectness": "衡量模型回答相对于标准答案的事实准确性与完整性。它结合了语义相似度与关键信息覆盖情况,因此不仅关注表达含义,也关注内容是否完整准确。",
|
||||
"metrics.builtin.description.answerRelevancy": "衡量模型回答与用户问题的贴合程度。高分表示回答始终围绕问题展开;低分表示内容偏题,或没有真正回答用户的实际问题。",
|
||||
"metrics.builtin.description.contextPrecision": "衡量检索出的上下文片段中,实际与问题相关的内容占比(Precision)。高分表示检索流程带回的噪声较少。",
|
||||
"metrics.builtin.description.contextRecall": "衡量标准答案所需的真实信息,有多少被检索出的上下文片段覆盖到(Recall)。高分表示检索流程没有遗漏重要的支撑证据。",
|
||||
"metrics.builtin.description.contextRelevance": "衡量每一个被检索出的上下文片段与查询的相关程度。它与 CONTEXT_PRECISION 类似,但评估粒度在单个 chunk 层面,而不是相对于参考答案整体评估。",
|
||||
"metrics.builtin.description.faithfulness": "衡量模型回答中的每一个陈述,是否都能从提供的检索上下文中找到依据。高分表示回答中没有幻觉内容,每一条表述都可以追溯到上下文中的某个片段。",
|
||||
"metrics.builtin.description.semanticSimilarity": "衡量模型回答与参考答案在向量语义空间中的余弦相似度。它评估的是两段文本是否表达了相同含义,而不直接判断事实是否正确。",
|
||||
"metrics.builtin.description.taskCompletion": "衡量 Agent 是否最终完成了用户明确提出的目标。它会整体评估推理链路、中间步骤和最终输出;高分表示任务已被完整达成。",
|
||||
"metrics.builtin.description.toolCorrectness": "衡量 Agent 在任务执行过程中发起的工具调用是否正确,包括工具选择本身以及传入参数是否合理。高分表示 Agent 的工具使用策略符合预期行为。",
|
||||
"metrics.custom.description": "选择评测工作流并完成变量映射后即可运行测试。",
|
||||
"metrics.custom.mappingTitle": "变量映射",
|
||||
"metrics.custom.mappingWarning": "请先完成工作流选择和所有变量映射,再运行批量测试。",
|
||||
|
||||
Reference in New Issue
Block a user