mirror of
https://mirror.skon.top/github.com/langgenius/dify.git
synced 2026-05-01 03:30:02 +08:00
feat(web): rag-pipeline evaluation configuration
This commit is contained in:
@@ -11,6 +11,7 @@ const mockUseEvaluationConfig = vi.hoisted(() => vi.fn())
|
||||
const mockUseEvaluationNodeInfoMutation = vi.hoisted(() => vi.fn())
|
||||
const mockUseSaveEvaluationConfigMutation = vi.hoisted(() => vi.fn())
|
||||
const mockUseStartEvaluationRunMutation = vi.hoisted(() => vi.fn())
|
||||
const mockUsePublishedPipelineInfo = vi.hoisted(() => vi.fn())
|
||||
|
||||
vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({
|
||||
useModelList: () => ({
|
||||
@@ -55,6 +56,15 @@ vi.mock('@/service/use-evaluation', () => ({
|
||||
useStartEvaluationRunMutation: (...args: unknown[]) => mockUseStartEvaluationRunMutation(...args),
|
||||
}))
|
||||
|
||||
vi.mock('@/service/use-pipeline', () => ({
|
||||
usePublishedPipelineInfo: (...args: unknown[]) => mockUsePublishedPipelineInfo(...args),
|
||||
}))
|
||||
|
||||
vi.mock('@/context/dataset-detail', () => ({
|
||||
useDatasetDetailContextWithSelector: (selector: (state: { dataset: { pipeline_id: string } }) => unknown) =>
|
||||
selector({ dataset: { pipeline_id: 'pipeline-1' } }),
|
||||
}))
|
||||
|
||||
vi.mock('@/service/use-workflow', () => ({
|
||||
useAppWorkflow: () => ({
|
||||
data: {
|
||||
@@ -152,6 +162,20 @@ describe('Evaluation', () => {
|
||||
isPending: false,
|
||||
mutate: vi.fn(),
|
||||
})
|
||||
mockUsePublishedPipelineInfo.mockReturnValue({
|
||||
data: {
|
||||
graph: {
|
||||
nodes: [{
|
||||
id: 'knowledge-node',
|
||||
data: {
|
||||
type: 'knowledge-index',
|
||||
title: 'Knowledge Base',
|
||||
},
|
||||
}],
|
||||
edges: [],
|
||||
},
|
||||
},
|
||||
})
|
||||
mockUpload.mockResolvedValue({
|
||||
id: 'uploaded-file-id',
|
||||
name: 'evaluation.csv',
|
||||
@@ -471,10 +495,19 @@ describe('Evaluation', () => {
|
||||
default_metrics: [{
|
||||
metric: 'context-precision',
|
||||
value_type: 'number',
|
||||
node_info_list: [],
|
||||
node_info_list: [
|
||||
{ node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' },
|
||||
],
|
||||
}],
|
||||
customized_metrics: null,
|
||||
judgment_config: null,
|
||||
judgment_config: {
|
||||
logical_operator: 'and',
|
||||
conditions: [{
|
||||
variable_selector: ['knowledge-node', 'context-precision'],
|
||||
comparison_operator: '≥',
|
||||
value: '0.85',
|
||||
}],
|
||||
},
|
||||
file_id: 'file-1',
|
||||
},
|
||||
}, {
|
||||
|
||||
@@ -338,8 +338,117 @@ describe('evaluation store', () => {
|
||||
},
|
||||
}
|
||||
|
||||
expect(buildEvaluationConfigPayload(resource)).toEqual(expectedPayload)
|
||||
expect(buildEvaluationRunRequest(resource, 'file-1')).toEqual({
|
||||
expect(buildEvaluationConfigPayload(resource, resourceType)).toEqual(expectedPayload)
|
||||
expect(buildEvaluationRunRequest(resource, 'file-1', resourceType)).toEqual({
|
||||
...expectedPayload,
|
||||
file_id: 'file-1',
|
||||
})
|
||||
})
|
||||
|
||||
it('should hydrate pipeline metrics from fixed knowledge-index conditions', () => {
|
||||
const resourceType = 'datasets'
|
||||
const resourceId = 'dataset-hydrate'
|
||||
const store = useEvaluationStore.getState()
|
||||
const config: EvaluationConfig = {
|
||||
evaluation_model: 'gpt-4o-mini',
|
||||
evaluation_model_provider: 'openai',
|
||||
default_metrics: [{
|
||||
metric: 'context-precision',
|
||||
node_info_list: [
|
||||
{ node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' },
|
||||
],
|
||||
}],
|
||||
customized_metrics: {
|
||||
evaluation_workflow_id: 'should-be-ignored',
|
||||
input_fields: {
|
||||
query: 'answer',
|
||||
},
|
||||
output_fields: [{
|
||||
variable: 'score',
|
||||
value_type: 'number',
|
||||
}],
|
||||
},
|
||||
judgment_config: {
|
||||
logical_operator: 'or',
|
||||
conditions: [{
|
||||
variable_selector: ['knowledge-node', 'context-precision'],
|
||||
comparison_operator: '≥',
|
||||
value: '0.92',
|
||||
}],
|
||||
},
|
||||
}
|
||||
|
||||
store.hydrateResource(resourceType, resourceId, config)
|
||||
|
||||
const hydratedState = useEvaluationStore.getState().resources['datasets:dataset-hydrate']
|
||||
|
||||
expect(hydratedState.judgeModelId).toBe('openai::gpt-4o-mini')
|
||||
expect(hydratedState.metrics).toHaveLength(1)
|
||||
expect(hydratedState.metrics[0]).toMatchObject({
|
||||
optionId: 'context-precision',
|
||||
kind: 'builtin',
|
||||
valueType: 'number',
|
||||
threshold: 0.92,
|
||||
nodeInfoList: [
|
||||
{ node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' },
|
||||
],
|
||||
})
|
||||
})
|
||||
|
||||
it('should build pipeline judgment payload from metric thresholds', () => {
|
||||
const resourceType = 'datasets'
|
||||
const resourceId = 'dataset-save-config'
|
||||
const store = useEvaluationStore.getState()
|
||||
const knowledgeNodeInfo = [{ node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' }]
|
||||
|
||||
store.ensureResource(resourceType, resourceId)
|
||||
store.setJudgeModel(resourceType, resourceId, 'openai::gpt-4o-mini')
|
||||
store.addBuiltinMetric(resourceType, resourceId, 'context-precision', knowledgeNodeInfo)
|
||||
store.addBuiltinMetric(resourceType, resourceId, 'context-recall', knowledgeNodeInfo)
|
||||
|
||||
const resourceWithMetrics = useEvaluationStore.getState().resources['datasets:dataset-save-config']
|
||||
const contextPrecisionMetric = resourceWithMetrics.metrics.find(metric => metric.optionId === 'context-precision')!
|
||||
const contextRecallMetric = resourceWithMetrics.metrics.find(metric => metric.optionId === 'context-recall')!
|
||||
|
||||
store.updateMetricThreshold(resourceType, resourceId, contextPrecisionMetric.id, 0.91)
|
||||
store.updateMetricThreshold(resourceType, resourceId, contextRecallMetric.id, 0.88)
|
||||
|
||||
const resource = useEvaluationStore.getState().resources['datasets:dataset-save-config']
|
||||
const expectedPayload = {
|
||||
evaluation_model: 'gpt-4o-mini',
|
||||
evaluation_model_provider: 'openai',
|
||||
default_metrics: [
|
||||
{
|
||||
metric: 'context-precision',
|
||||
value_type: 'number',
|
||||
node_info_list: knowledgeNodeInfo,
|
||||
},
|
||||
{
|
||||
metric: 'context-recall',
|
||||
value_type: 'number',
|
||||
node_info_list: knowledgeNodeInfo,
|
||||
},
|
||||
],
|
||||
customized_metrics: null,
|
||||
judgment_config: {
|
||||
logical_operator: 'and',
|
||||
conditions: [
|
||||
{
|
||||
variable_selector: ['knowledge-node', 'context-precision'],
|
||||
comparison_operator: '≥',
|
||||
value: '0.91',
|
||||
},
|
||||
{
|
||||
variable_selector: ['knowledge-node', 'context-recall'],
|
||||
comparison_operator: '≥',
|
||||
value: '0.88',
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
expect(buildEvaluationConfigPayload(resource, resourceType)).toEqual(expectedPayload)
|
||||
expect(buildEvaluationRunRequest(resource, 'file-1', resourceType)).toEqual({
|
||||
...expectedPayload,
|
||||
file_id: 'file-1',
|
||||
})
|
||||
|
||||
@@ -36,7 +36,7 @@ const BatchTestPanel = ({
|
||||
return
|
||||
}
|
||||
|
||||
const body = buildEvaluationConfigPayload(resource)
|
||||
const body = buildEvaluationConfigPayload(resource, resourceType)
|
||||
|
||||
if (!body) {
|
||||
toast.warning(t('batch.validation'))
|
||||
|
||||
@@ -102,7 +102,7 @@ export const useInputFieldsActions = ({
|
||||
return
|
||||
}
|
||||
|
||||
const body = buildEvaluationRunRequest(resource, uploadedFileId)
|
||||
const body = buildEvaluationRunRequest(resource, uploadedFileId, resourceType)
|
||||
|
||||
if (!body) {
|
||||
toast.warning(t('batch.validation'))
|
||||
|
||||
@@ -1,25 +1,62 @@
|
||||
'use client'
|
||||
|
||||
import type { EvaluationResourceProps } from '../../types'
|
||||
import { useMemo } from 'react'
|
||||
import type { Node } from '@/app/components/workflow/types'
|
||||
import type { NodeInfo } from '@/types/evaluation'
|
||||
import { useEffect, useMemo } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { BlockEnum } from '@/app/components/workflow/types'
|
||||
import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail'
|
||||
import { useAvailableEvaluationMetrics } from '@/service/use-evaluation'
|
||||
import { usePublishedPipelineInfo } from '@/service/use-pipeline'
|
||||
import { getEvaluationMockConfig } from '../../mock'
|
||||
import { useEvaluationResource, useEvaluationStore } from '../../store'
|
||||
import { InlineSectionHeader } from '../section-header'
|
||||
import PipelineMetricItem from './pipeline-metric-item'
|
||||
|
||||
/**
 * Builds the node-info list for the first node in the graph whose
 * `data.type` equals `BlockEnum.KnowledgeBase`.
 *
 * @param nodes - Graph nodes to search; may be `undefined`.
 * @returns A single-element list describing the matched node, or an empty
 *   list when no node matches or the matched node has no id. The title falls
 *   back to the node id when `data.title` is not a non-empty string.
 */
const getKnowledgeIndexNodeInfo = (nodes: Node[] | undefined): NodeInfo[] => {
  const match = nodes?.find(candidate => candidate.data.type === BlockEnum.KnowledgeBase)
  const nodeId = match?.id
  if (!match || !nodeId)
    return []

  const rawTitle = match.data?.title
  // An empty or non-string title is replaced by the node id.
  const title = typeof rawTitle === 'string' && rawTitle ? rawTitle : nodeId

  return [{ node_id: nodeId, title, type: 'knowledge-index' }]
}
|
||||
|
||||
/**
 * Compares two node-info lists position by position.
 *
 * @param left - Possibly missing list; `undefined` is treated as empty.
 * @param right - List to compare against.
 * @returns `true` when both lists have the same length and every entry has
 *   the same `node_id`, `title` and `type` as the entry at the same index.
 */
const isSameNodeInfoList = (left: NodeInfo[] | undefined, right: NodeInfo[]) => {
  const current = left ?? []
  if (current.length !== right.length)
    return false

  // Fields are checked in this order and short-circuit on the first mismatch.
  const comparedFields = ['node_id', 'title', 'type'] as const

  return current.every((entry, position) => {
    const counterpart = right[position]
    return comparedFields.every(field => entry[field] === counterpart?.[field])
  })
}
|
||||
|
||||
const PipelineMetricsSection = ({
|
||||
resourceType,
|
||||
resourceId,
|
||||
}: EvaluationResourceProps) => {
|
||||
const { t } = useTranslation('evaluation')
|
||||
const pipelineId = useDatasetDetailContextWithSelector(state => state.dataset?.pipeline_id)
|
||||
const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
|
||||
const removeMetric = useEvaluationStore(state => state.removeMetric)
|
||||
const updateMetricThreshold = useEvaluationStore(state => state.updateMetricThreshold)
|
||||
const { data: availableMetricsData } = useAvailableEvaluationMetrics()
|
||||
const { data: publishedPipeline } = usePublishedPipelineInfo(pipelineId || '')
|
||||
const resource = useEvaluationResource(resourceType, resourceId)
|
||||
const config = getEvaluationMockConfig(resourceType)
|
||||
const knowledgeIndexNodeInfoList = useMemo(
|
||||
() => getKnowledgeIndexNodeInfo(publishedPipeline?.graph.nodes),
|
||||
[publishedPipeline?.graph.nodes],
|
||||
)
|
||||
const builtinMetricMap = useMemo(() => new Map(
|
||||
resource.metrics
|
||||
.filter(metric => metric.kind === 'builtin')
|
||||
@@ -32,6 +69,18 @@ const PipelineMetricsSection = ({
|
||||
)
|
||||
}, [availableMetricIds, builtinMetricMap, config.builtinMetrics])
|
||||
|
||||
useEffect(() => {
|
||||
if (!knowledgeIndexNodeInfoList.length)
|
||||
return
|
||||
|
||||
resource.metrics.forEach((metric) => {
|
||||
if (metric.kind !== 'builtin' || isSameNodeInfoList(metric.nodeInfoList, knowledgeIndexNodeInfoList))
|
||||
return
|
||||
|
||||
addBuiltinMetric(resourceType, resourceId, metric.optionId, knowledgeIndexNodeInfoList)
|
||||
})
|
||||
}, [addBuiltinMetric, knowledgeIndexNodeInfoList, resource.metrics, resourceId, resourceType])
|
||||
|
||||
const handleToggleMetric = (metricId: string) => {
|
||||
const selectedMetric = builtinMetricMap.get(metricId)
|
||||
if (selectedMetric) {
|
||||
@@ -39,7 +88,7 @@ const PipelineMetricsSection = ({
|
||||
return
|
||||
}
|
||||
|
||||
addBuiltinMetric(resourceType, resourceId, metricId)
|
||||
addBuiltinMetric(resourceType, resourceId, metricId, knowledgeIndexNodeInfoList)
|
||||
}
|
||||
|
||||
return (
|
||||
|
||||
@@ -34,6 +34,8 @@ type EvaluationStoreResources = Record<string, EvaluationResourceState>
|
||||
|
||||
/** Fallback threshold for a pipeline metric when no saved condition value or metric threshold is available. */
export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85
|
||||
|
||||
/** Logical operator written into the judgment config that is built for pipeline resources. */
const PIPELINE_LOGICAL_OPERATOR: JudgmentConfig['logicalOperator'] = 'and'
|
||||
|
||||
/**
 * Creates a random identifier of the form `<prefix>-<8 base-36 characters>`.
 *
 * The suffix comes from `Math.random()`, so ids are not guaranteed unique.
 *
 * @param prefix - Text placed before the dash.
 * @returns The prefixed identifier with a fixed-width 8-character suffix.
 */
const createId = (prefix: string): string => {
  // Math.random().toString(36) may have fewer than 8 fractional digits
  // (e.g. 0.5 -> "0.i"), so pad to keep the suffix width stable.
  const suffix = Math.random().toString(36).slice(2, 10).padEnd(8, '0')
  return `${prefix}-${suffix}`
}
|
||||
|
||||
const humanizeMetricId = (metricId: string) => {
|
||||
@@ -54,6 +56,15 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str
|
||||
}
|
||||
}
|
||||
|
||||
/** Ids of the built-in metrics listed in the `'datasets'` mock config; used to recognise pipeline resource state. */
const pipelineMetricIds = new Set(getEvaluationMockConfig('datasets').builtinMetrics.map(metric => metric.id))
|
||||
|
||||
/**
 * Reports whether pipeline-specific evaluation handling applies to a resource type.
 *
 * @param resourceType - Resource type to check.
 * @returns `true` only for `'datasets'`.
 */
const isPipelineResourceType = (resourceType: EvaluationResourceType) => {
  return resourceType === 'datasets'
}
|
||||
|
||||
/**
 * Detects pipeline resources from their state alone.
 *
 * @param resource - Evaluation resource state to inspect.
 * @returns `true` when the resource has at least one metric and every metric
 *   is a built-in one whose `optionId` is in `pipelineMetricIds`.
 */
const isPipelineResourceState = (resource: EvaluationResourceState) => {
  const { metrics } = resource
  if (metrics.length === 0)
    return false

  // A single custom or non-pipeline metric disqualifies the whole resource.
  return !metrics.some(metric => metric.kind !== 'builtin' || !pipelineMetricIds.has(metric.optionId))
}
|
||||
|
||||
const normalizeNodeInfoList = (value: NodeInfo[] | undefined): NodeInfo[] => {
|
||||
if (!value?.length)
|
||||
return []
|
||||
@@ -164,6 +175,46 @@ const normalizeVariableSelector = (value: string[] | undefined): [string, string
|
||||
: null
|
||||
}
|
||||
|
||||
/**
 * Converts a judgment-condition value into a finite number.
 *
 * @param value - Raw condition value; only numbers and numeric strings are accepted.
 * @returns The finite numeric value, or `null` when the value is not a
 *   number/string, is blank, or does not parse to a finite number.
 */
const getConditionNumericValue = (value: EvaluationJudgmentCondition['value']) => {
  if (typeof value === 'number')
    return Number.isFinite(value) ? value : null

  // Number('') and Number('   ') are 0, which would turn a missing value
  // into a threshold of 0 — treat blank strings as "no value" instead.
  if (typeof value !== 'string' || value.trim() === '')
    return null

  const parsedValue = Number(value)
  return Number.isFinite(parsedValue) ? parsedValue : null
}
|
||||
|
||||
/**
 * Resolves the threshold for a pipeline metric from a saved evaluation config.
 *
 * The first `≥` condition whose selector names this metric is used. When the
 * metric carries node info, the selector's node id must also match one of
 * those nodes.
 *
 * @param metric - Metric whose threshold is being resolved.
 * @param config - Evaluation config holding the saved judgment conditions.
 * @returns The numeric value of the matching condition, otherwise the
 *   metric's own threshold, otherwise `DEFAULT_PIPELINE_METRIC_THRESHOLD`.
 */
const getPipelineMetricThreshold = (
  metric: EvaluationMetric,
  config: EvaluationConfig,
) => {
  const conditions = config.judgment_config?.conditions ?? []

  const appliesToMetric = (condition: (typeof conditions)[number]) => {
    const selector = normalizeVariableSelector(condition.variable_selector)
    if (!selector)
      return false

    const [nodeId, metricId] = selector
    if (metricId !== metric.optionId || condition.comparison_operator !== '≥')
      return false

    // Without node info the metric id alone is enough to match.
    if (!metric.nodeInfoList?.length)
      return true

    return metric.nodeInfoList.some(nodeInfo => nodeInfo.node_id === nodeId)
  }

  const savedThreshold = getConditionNumericValue(conditions.find(appliesToMetric)?.value)

  return savedThreshold ?? metric.threshold ?? DEFAULT_PIPELINE_METRIC_THRESHOLD
}
|
||||
|
||||
/**
 * Normalizes metrics for a pipeline resource.
 *
 * @param config - Evaluation config used to look up saved thresholds.
 * @param metrics - Metrics to normalize.
 * @returns Copies of the metrics with `valueType` forced to `'number'` and
 *   `threshold` set from `getPipelineMetricThreshold`.
 */
const normalizePipelineMetrics = (
  config: EvaluationConfig,
  metrics: EvaluationMetric[],
) => {
  return metrics.map((metric) => {
    const threshold = getPipelineMetricThreshold(metric, config)

    return {
      ...metric,
      valueType: 'number' as const,
      threshold,
    }
  })
}
|
||||
|
||||
const getNormalizedConditionValue = (
|
||||
operator: ComparisonOperator,
|
||||
previousValue: EvaluationJudgmentConditionValue | string | number | boolean | null | undefined,
|
||||
@@ -404,8 +455,10 @@ export const buildStateFromEvaluationConfig = (
|
||||
config: EvaluationConfig,
|
||||
): EvaluationResourceState => {
|
||||
const defaultMetrics = normalizeDefaultMetrics(resourceType, config.default_metrics)
|
||||
const customMetrics = normalizeCustomMetric(config.customized_metrics)
|
||||
const metrics = [...defaultMetrics, ...customMetrics]
|
||||
const customMetrics = isPipelineResourceType(resourceType) ? [] : normalizeCustomMetric(config.customized_metrics)
|
||||
const metrics = isPipelineResourceType(resourceType)
|
||||
? normalizePipelineMetrics(config, defaultMetrics)
|
||||
: [...defaultMetrics, ...customMetrics]
|
||||
|
||||
return {
|
||||
...buildInitialState(resourceType),
|
||||
@@ -458,7 +511,40 @@ const buildCustomizedMetricsPayload = (metrics: EvaluationMetric[]): EvaluationC
|
||||
}
|
||||
}
|
||||
|
||||
const buildJudgmentConfigPayload = (resource: EvaluationResourceState): EvaluationConfigData['judgment_config'] => {
|
||||
/**
 * Builds the judgment config payload for a pipeline resource from its
 * metric thresholds.
 *
 * Each built-in metric that has at least one node-info entry yields one
 * `≥` condition keyed by `[first node id, metric option id]`. Metrics
 * without node info and non-built-in metrics are skipped.
 *
 * @param resource - Evaluation resource state whose metrics are converted.
 * @returns The judgment config joined with `PIPELINE_LOGICAL_OPERATOR`, or
 *   `null` when no condition could be built.
 */
const buildPipelineJudgmentConfigPayload = (
  resource: EvaluationResourceState,
): EvaluationConfigData['judgment_config'] => {
  type PipelineCondition = {
    variable_selector: string[]
    comparison_operator: string
    value: string
  }

  const conditions = resource.metrics.reduce<PipelineCondition[]>((collected, metric) => {
    // Only the first node-info entry of a built-in metric is used for the selector.
    const primaryNode = metric.kind === 'builtin' ? metric.nodeInfoList?.[0] : undefined
    if (primaryNode) {
      collected.push({
        variable_selector: [primaryNode.node_id, metric.optionId],
        comparison_operator: '≥',
        value: String(metric.threshold ?? DEFAULT_PIPELINE_METRIC_THRESHOLD),
      })
    }

    return collected
  }, [])

  if (conditions.length === 0)
    return null

  return {
    logical_operator: PIPELINE_LOGICAL_OPERATOR,
    conditions,
  }
}
|
||||
|
||||
const buildJudgmentConfigPayload = (
|
||||
resource: EvaluationResourceState,
|
||||
resourceType?: EvaluationResourceType,
|
||||
): EvaluationConfigData['judgment_config'] => {
|
||||
if ((resourceType && isPipelineResourceType(resourceType)) || isPipelineResourceState(resource))
|
||||
return buildPipelineJudgmentConfigPayload(resource)
|
||||
|
||||
const conditions = resource.judgmentConfig.conditions
|
||||
.filter(condition => !!condition.variableSelector)
|
||||
.map((condition) => {
|
||||
@@ -488,6 +574,7 @@ const buildJudgmentConfigPayload = (resource: EvaluationResourceState): Evaluati
|
||||
|
||||
export const buildEvaluationConfigPayload = (
|
||||
resource: EvaluationResourceState,
|
||||
resourceType?: EvaluationResourceType,
|
||||
): EvaluationConfigData | null => {
|
||||
const selectedModel = decodeModelSelection(resource.judgeModelId)
|
||||
|
||||
@@ -504,16 +591,19 @@ export const buildEvaluationConfigPayload = (
|
||||
value_type: metric.valueType,
|
||||
node_info_list: metric.nodeInfoList ?? [],
|
||||
})),
|
||||
customized_metrics: buildCustomizedMetricsPayload(resource.metrics),
|
||||
judgment_config: buildJudgmentConfigPayload(resource),
|
||||
customized_metrics: (resourceType && isPipelineResourceType(resourceType)) || isPipelineResourceState(resource)
|
||||
? null
|
||||
: buildCustomizedMetricsPayload(resource.metrics),
|
||||
judgment_config: buildJudgmentConfigPayload(resource, resourceType),
|
||||
}
|
||||
}
|
||||
|
||||
export const buildEvaluationRunRequest = (
|
||||
resource: EvaluationResourceState,
|
||||
fileId: string,
|
||||
resourceType?: EvaluationResourceType,
|
||||
): EvaluationRunRequest | null => {
|
||||
const configPayload = buildEvaluationConfigPayload(resource)
|
||||
const configPayload = buildEvaluationConfigPayload(resource, resourceType)
|
||||
|
||||
if (!configPayload)
|
||||
return null
|
||||
|
||||
Reference in New Issue
Block a user