diff --git a/web/app/components/evaluation/__tests__/index.spec.tsx b/web/app/components/evaluation/__tests__/index.spec.tsx index 8a1055326b..4f8db5a32d 100644 --- a/web/app/components/evaluation/__tests__/index.spec.tsx +++ b/web/app/components/evaluation/__tests__/index.spec.tsx @@ -11,6 +11,7 @@ const mockUseEvaluationConfig = vi.hoisted(() => vi.fn()) const mockUseEvaluationNodeInfoMutation = vi.hoisted(() => vi.fn()) const mockUseSaveEvaluationConfigMutation = vi.hoisted(() => vi.fn()) const mockUseStartEvaluationRunMutation = vi.hoisted(() => vi.fn()) +const mockUsePublishedPipelineInfo = vi.hoisted(() => vi.fn()) vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({ useModelList: () => ({ @@ -55,6 +56,15 @@ vi.mock('@/service/use-evaluation', () => ({ useStartEvaluationRunMutation: (...args: unknown[]) => mockUseStartEvaluationRunMutation(...args), })) +vi.mock('@/service/use-pipeline', () => ({ + usePublishedPipelineInfo: (...args: unknown[]) => mockUsePublishedPipelineInfo(...args), +})) + +vi.mock('@/context/dataset-detail', () => ({ + useDatasetDetailContextWithSelector: (selector: (state: { dataset: { pipeline_id: string } }) => unknown) => + selector({ dataset: { pipeline_id: 'pipeline-1' } }), +})) + vi.mock('@/service/use-workflow', () => ({ useAppWorkflow: () => ({ data: { @@ -152,6 +162,20 @@ describe('Evaluation', () => { isPending: false, mutate: vi.fn(), }) + mockUsePublishedPipelineInfo.mockReturnValue({ + data: { + graph: { + nodes: [{ + id: 'knowledge-node', + data: { + type: 'knowledge-index', + title: 'Knowledge Base', + }, + }], + edges: [], + }, + }, + }) mockUpload.mockResolvedValue({ id: 'uploaded-file-id', name: 'evaluation.csv', @@ -471,10 +495,19 @@ describe('Evaluation', () => { default_metrics: [{ metric: 'context-precision', value_type: 'number', - node_info_list: [], + node_info_list: [ + { node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' }, + ], }], customized_metrics: null, - judgment_config: null, + judgment_config: { + logical_operator: 'and', + conditions: [{ + variable_selector: ['knowledge-node', 'context-precision'], + comparison_operator: '≥', + value: '0.85', + }], + }, file_id: 'file-1', }, }, { diff --git a/web/app/components/evaluation/__tests__/store.spec.ts b/web/app/components/evaluation/__tests__/store.spec.ts index c3b122c18a..150f285b52 100644 --- a/web/app/components/evaluation/__tests__/store.spec.ts +++ b/web/app/components/evaluation/__tests__/store.spec.ts @@ -338,8 +338,117 @@ describe('evaluation store', () => { }, } - expect(buildEvaluationConfigPayload(resource)).toEqual(expectedPayload) - expect(buildEvaluationRunRequest(resource, 'file-1')).toEqual({ + expect(buildEvaluationConfigPayload(resource, resourceType)).toEqual(expectedPayload) + expect(buildEvaluationRunRequest(resource, 'file-1', resourceType)).toEqual({ + ...expectedPayload, + file_id: 'file-1', + }) + }) + + it('should hydrate pipeline metrics from fixed knowledge-index conditions', () => { + const resourceType = 'datasets' + const resourceId = 'dataset-hydrate' + const store = useEvaluationStore.getState() + const config: EvaluationConfig = { + evaluation_model: 'gpt-4o-mini', + evaluation_model_provider: 'openai', + default_metrics: [{ + metric: 'context-precision', + node_info_list: [ + { node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' }, + ], + }], + customized_metrics: { + evaluation_workflow_id: 'should-be-ignored', + input_fields: { + query: 'answer', + }, + output_fields: [{ + variable: 'score', + value_type: 'number', + }], + }, + judgment_config: { + logical_operator: 'or', + conditions: [{ + variable_selector: ['knowledge-node', 'context-precision'], + comparison_operator: '≥', + value: '0.92', + }], + }, + } + + store.hydrateResource(resourceType, resourceId, config) + + const hydratedState = useEvaluationStore.getState().resources['datasets:dataset-hydrate'] + + expect(hydratedState.judgeModelId).toBe('openai::gpt-4o-mini') + expect(hydratedState.metrics).toHaveLength(1) + expect(hydratedState.metrics[0]).toMatchObject({ + optionId: 'context-precision', + kind: 'builtin', + valueType: 'number', + threshold: 0.92, + nodeInfoList: [ + { node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' }, + ], + }) + }) + + it('should build pipeline judgment payload from metric thresholds', () => { + const resourceType = 'datasets' + const resourceId = 'dataset-save-config' + const store = useEvaluationStore.getState() + const knowledgeNodeInfo = [{ node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' }] + + store.ensureResource(resourceType, resourceId) + store.setJudgeModel(resourceType, resourceId, 'openai::gpt-4o-mini') + store.addBuiltinMetric(resourceType, resourceId, 'context-precision', knowledgeNodeInfo) + store.addBuiltinMetric(resourceType, resourceId, 'context-recall', knowledgeNodeInfo) + + const resourceWithMetrics = useEvaluationStore.getState().resources['datasets:dataset-save-config'] + const contextPrecisionMetric = resourceWithMetrics.metrics.find(metric => metric.optionId === 'context-precision')! + const contextRecallMetric = resourceWithMetrics.metrics.find(metric => metric.optionId === 'context-recall')! + + store.updateMetricThreshold(resourceType, resourceId, contextPrecisionMetric.id, 0.91) + store.updateMetricThreshold(resourceType, resourceId, contextRecallMetric.id, 0.88) + + const resource = useEvaluationStore.getState().resources['datasets:dataset-save-config'] + const expectedPayload = { + evaluation_model: 'gpt-4o-mini', + evaluation_model_provider: 'openai', + default_metrics: [ + { + metric: 'context-precision', + value_type: 'number', + node_info_list: knowledgeNodeInfo, + }, + { + metric: 'context-recall', + value_type: 'number', + node_info_list: knowledgeNodeInfo, + }, + ], + customized_metrics: null, + judgment_config: { + logical_operator: 'and', + conditions: [ + { + variable_selector: ['knowledge-node', 'context-precision'], + comparison_operator: '≥', + value: '0.91', + }, + { + variable_selector: ['knowledge-node', 'context-recall'], + comparison_operator: '≥', + value: '0.88', + }, + ], + }, + } + + expect(buildEvaluationConfigPayload(resource, resourceType)).toEqual(expectedPayload) + expect(buildEvaluationRunRequest(resource, 'file-1', resourceType)).toEqual({ ...expectedPayload, file_id: 'file-1', }) diff --git a/web/app/components/evaluation/components/batch-test-panel/index.tsx b/web/app/components/evaluation/components/batch-test-panel/index.tsx index 0f7226af00..11af19f4d5 100644 --- a/web/app/components/evaluation/components/batch-test-panel/index.tsx +++ b/web/app/components/evaluation/components/batch-test-panel/index.tsx @@ -36,7 +36,7 @@ const BatchTestPanel = ({ return } - const body = buildEvaluationConfigPayload(resource) + const body = buildEvaluationConfigPayload(resource, resourceType) if (!body) { toast.warning(t('batch.validation')) diff --git a/web/app/components/evaluation/components/batch-test-panel/input-fields/use-input-fields-actions.ts b/web/app/components/evaluation/components/batch-test-panel/input-fields/use-input-fields-actions.ts index f7ffe49813..8db1b0fbdd 100644 --- a/web/app/components/evaluation/components/batch-test-panel/input-fields/use-input-fields-actions.ts +++ b/web/app/components/evaluation/components/batch-test-panel/input-fields/use-input-fields-actions.ts @@ -102,7 +102,7 @@ export const useInputFieldsActions = ({ return } - const body = buildEvaluationRunRequest(resource, uploadedFileId) + const body = buildEvaluationRunRequest(resource, uploadedFileId, resourceType) if (!body) { toast.warning(t('batch.validation')) diff --git a/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx b/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx index f9eb852f89..2457b081c3 100644 --- a/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx +++ b/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx @@ -1,25 +1,62 @@ 'use client' import type { EvaluationResourceProps } from '../../types' -import { useMemo } from 'react' +import type { Node } from '@/app/components/workflow/types' +import type { NodeInfo } from '@/types/evaluation' +import { useEffect, useMemo } from 'react' import { useTranslation } from 'react-i18next' +import { BlockEnum } from '@/app/components/workflow/types' +import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail' import { useAvailableEvaluationMetrics } from '@/service/use-evaluation' +import { usePublishedPipelineInfo } from '@/service/use-pipeline' import { getEvaluationMockConfig } from '../../mock' import { useEvaluationResource, useEvaluationStore } from '../../store' import { InlineSectionHeader } from '../section-header' import PipelineMetricItem from './pipeline-metric-item' +const getKnowledgeIndexNodeInfo = (nodes: Node[] | undefined): NodeInfo[] => { + const knowledgeIndexNode = nodes?.find(node => node.data.type === BlockEnum.KnowledgeBase) + if (!knowledgeIndexNode?.id) + return [] + + return [{ + node_id: knowledgeIndexNode.id, + title: typeof knowledgeIndexNode.data?.title === 'string' && knowledgeIndexNode.data.title + ? knowledgeIndexNode.data.title + : knowledgeIndexNode.id, + type: 'knowledge-index', + }] +} + +const isSameNodeInfoList = (left: NodeInfo[] | undefined, right: NodeInfo[]) => { + if ((left?.length ?? 0) !== right.length) + return false + + return (left ?? []).every((nodeInfo, index) => { + const target = right[index] + return nodeInfo.node_id === target?.node_id + && nodeInfo.title === target?.title + && nodeInfo.type === target?.type + }) +} + const PipelineMetricsSection = ({ resourceType, resourceId, }: EvaluationResourceProps) => { const { t } = useTranslation('evaluation') + const pipelineId = useDatasetDetailContextWithSelector(state => state.dataset?.pipeline_id) const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric) const removeMetric = useEvaluationStore(state => state.removeMetric) const updateMetricThreshold = useEvaluationStore(state => state.updateMetricThreshold) const { data: availableMetricsData } = useAvailableEvaluationMetrics() + const { data: publishedPipeline } = usePublishedPipelineInfo(pipelineId || '') const resource = useEvaluationResource(resourceType, resourceId) const config = getEvaluationMockConfig(resourceType) + const knowledgeIndexNodeInfoList = useMemo( + () => getKnowledgeIndexNodeInfo(publishedPipeline?.graph.nodes), + [publishedPipeline?.graph.nodes], + ) const builtinMetricMap = useMemo(() => new Map( resource.metrics .filter(metric => metric.kind === 'builtin') @@ -32,6 +69,18 @@ const PipelineMetricsSection = ({ ) }, [availableMetricIds, builtinMetricMap, config.builtinMetrics]) + useEffect(() => { + if (!knowledgeIndexNodeInfoList.length) + return + + resource.metrics.forEach((metric) => { + if (metric.kind !== 'builtin' || isSameNodeInfoList(metric.nodeInfoList, knowledgeIndexNodeInfoList)) + return + + addBuiltinMetric(resourceType, resourceId, metric.optionId, knowledgeIndexNodeInfoList) + }) + }, [addBuiltinMetric, knowledgeIndexNodeInfoList, resource.metrics, resourceId, resourceType]) + const handleToggleMetric = (metricId: string) => { const selectedMetric = builtinMetricMap.get(metricId) if (selectedMetric) { @@ -39,7 +88,7 @@ const PipelineMetricsSection = ({ return } - addBuiltinMetric(resourceType, resourceId, metricId) + addBuiltinMetric(resourceType, resourceId, metricId, knowledgeIndexNodeInfoList) } return ( diff --git a/web/app/components/evaluation/store-utils.ts b/web/app/components/evaluation/store-utils.ts index 2942bfb4b6..c34e643baa 100644 --- a/web/app/components/evaluation/store-utils.ts +++ b/web/app/components/evaluation/store-utils.ts @@ -34,6 +34,8 @@ type EvaluationStoreResources = Record export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85 +const PIPELINE_LOGICAL_OPERATOR: JudgmentConfig['logicalOperator'] = 'and' + const createId = (prefix: string) => `${prefix}-${Math.random().toString(36).slice(2, 10)}` const humanizeMetricId = (metricId: string) => { @@ -54,6 +56,15 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str } } +const pipelineMetricIds = new Set(getEvaluationMockConfig('datasets').builtinMetrics.map(metric => metric.id)) + +const isPipelineResourceType = (resourceType: EvaluationResourceType) => resourceType === 'datasets' + +const isPipelineResourceState = (resource: EvaluationResourceState) => { + return resource.metrics.length > 0 + && resource.metrics.every(metric => metric.kind === 'builtin' && pipelineMetricIds.has(metric.optionId)) +} + const normalizeNodeInfoList = (value: NodeInfo[] | undefined): NodeInfo[] => { if (!value?.length) return [] @@ -164,6 +175,46 @@ const normalizeVariableSelector = (value: string[] | undefined): [string, string : null } +const getConditionNumericValue = (value: EvaluationJudgmentCondition['value']) => { + if (typeof value === 'number') + return value + + if (typeof value !== 'string') + return null + + const parsedValue = Number(value) + return Number.isFinite(parsedValue) ? parsedValue : null +} + +const getPipelineMetricThreshold = ( + metric: EvaluationMetric, + config: EvaluationConfig, +) => { + const matchingCondition = (config.judgment_config?.conditions ?? []).find((condition) => { + const variableSelector = normalizeVariableSelector(condition.variable_selector) + if (!variableSelector || variableSelector[1] !== metric.optionId || condition.comparison_operator !== '≥') + return false + + if (!metric.nodeInfoList?.length) + return true + + return metric.nodeInfoList.some(nodeInfo => nodeInfo.node_id === variableSelector[0]) + }) + + return getConditionNumericValue(matchingCondition?.value) ?? metric.threshold ?? DEFAULT_PIPELINE_METRIC_THRESHOLD +} + +const normalizePipelineMetrics = ( + config: EvaluationConfig, + metrics: EvaluationMetric[], +) => { + return metrics.map(metric => ({ + ...metric, + valueType: 'number' as const, + threshold: getPipelineMetricThreshold(metric, config), + })) +} + const getNormalizedConditionValue = ( operator: ComparisonOperator, previousValue: EvaluationJudgmentConditionValue | string | number | boolean | null | undefined, @@ -404,8 +455,10 @@ export const buildStateFromEvaluationConfig = ( config: EvaluationConfig, ): EvaluationResourceState => { const defaultMetrics = normalizeDefaultMetrics(resourceType, config.default_metrics) - const customMetrics = normalizeCustomMetric(config.customized_metrics) - const metrics = [...defaultMetrics, ...customMetrics] + const customMetrics = isPipelineResourceType(resourceType) ? [] : normalizeCustomMetric(config.customized_metrics) + const metrics = isPipelineResourceType(resourceType) + ? normalizePipelineMetrics(config, defaultMetrics) + : [...defaultMetrics, ...customMetrics] return { ...buildInitialState(resourceType), @@ -458,7 +511,40 @@ const buildCustomizedMetricsPayload = (metrics: EvaluationMetric[]): EvaluationC } } -const buildJudgmentConfigPayload = (resource: EvaluationResourceState): EvaluationConfigData['judgment_config'] => { +const buildPipelineJudgmentConfigPayload = ( + resource: EvaluationResourceState, +): EvaluationConfigData['judgment_config'] => { + const conditions = resource.metrics + .filter((metric): metric is EvaluationMetric & { kind: 'builtin' } => metric.kind === 'builtin') + .map((metric) => { + const nodeInfo = metric.nodeInfoList?.[0] + if (!nodeInfo) + return null + + return { + variable_selector: [nodeInfo.node_id, metric.optionId], + comparison_operator: '≥', + value: String(metric.threshold ?? DEFAULT_PIPELINE_METRIC_THRESHOLD), + } + }) + .filter((condition): condition is NonNullable => !!condition) + + if (!conditions.length) + return null + + return { + logical_operator: PIPELINE_LOGICAL_OPERATOR, + conditions, + } +} + +const buildJudgmentConfigPayload = ( + resource: EvaluationResourceState, + resourceType?: EvaluationResourceType, +): EvaluationConfigData['judgment_config'] => { + if ((resourceType && isPipelineResourceType(resourceType)) || isPipelineResourceState(resource)) + return buildPipelineJudgmentConfigPayload(resource) + const conditions = resource.judgmentConfig.conditions .filter(condition => !!condition.variableSelector) .map((condition) => { @@ -488,6 +574,7 @@ const buildJudgmentConfigPayload = (resource: EvaluationResourceState): Evaluati export const buildEvaluationConfigPayload = ( resource: EvaluationResourceState, + resourceType?: EvaluationResourceType, ): EvaluationConfigData | null => { const selectedModel = decodeModelSelection(resource.judgeModelId) @@ -504,16 +591,19 @@ export const buildEvaluationConfigPayload = ( value_type: metric.valueType, node_info_list: metric.nodeInfoList ?? [], })), - customized_metrics: buildCustomizedMetricsPayload(resource.metrics), - judgment_config: buildJudgmentConfigPayload(resource), + customized_metrics: (resourceType && isPipelineResourceType(resourceType)) || isPipelineResourceState(resource) + ? null + : buildCustomizedMetricsPayload(resource.metrics), + judgment_config: buildJudgmentConfigPayload(resource, resourceType), } } export const buildEvaluationRunRequest = ( resource: EvaluationResourceState, fileId: string, + resourceType?: EvaluationResourceType, ): EvaluationRunRequest | null => { - const configPayload = buildEvaluationConfigPayload(resource) + const configPayload = buildEvaluationConfigPayload(resource, resourceType) if (!configPayload) return null