feat(web): rag-pipeline evaluation configuration

2026-05-01 03:30:02 +08:00 · 2026-04-12 10:36:20 +08:00
parent e4c056a57a
commit 627fbd2e86
6 changed files with 295 additions and 14 deletions
--- a/web/app/components/evaluation/tests/index.spec.tsx
+++ b/web/app/components/evaluation/tests/index.spec.tsx
@@ -11,6 +11,7 @@ const mockUseEvaluationConfig = vi.hoisted(() => vi.fn())
 const mockUseEvaluationNodeInfoMutation = vi.hoisted(() => vi.fn())
 const mockUseSaveEvaluationConfigMutation = vi.hoisted(() => vi.fn())
 const mockUseStartEvaluationRunMutation = vi.hoisted(() => vi.fn())
+const mockUsePublishedPipelineInfo = vi.hoisted(() => vi.fn())

 vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({
  useModelList: () => ({
@@ -55,6 +56,15 @@ vi.mock('@/service/use-evaluation', () => ({
  useStartEvaluationRunMutation: (...args: unknown[]) => mockUseStartEvaluationRunMutation(...args),
 }))

+vi.mock('@/service/use-pipeline', () => ({ // forwards the hook to the hoisted mock so the suite controls its return value
+  usePublishedPipelineInfo: (...args: unknown[]) => mockUsePublishedPipelineInfo(...args),
+}))
+
+vi.mock('@/context/dataset-detail', () => ({ // the selector always receives a dataset whose pipeline_id is 'pipeline-1'
+  useDatasetDetailContextWithSelector: (selector: (state: { dataset: { pipeline_id: string } }) => unknown) =>
+    selector({ dataset: { pipeline_id: 'pipeline-1' } }),
+}))
+
 vi.mock('@/service/use-workflow', () => ({
  useAppWorkflow: () => ({
    data: {
@@ -152,6 +162,20 @@ describe('Evaluation', () => {
      isPending: false,
      mutate: vi.fn(),
    })
+    mockUsePublishedPipelineInfo.mockReturnValue({ // published pipeline graph containing a single node and no edges
+      data: {
+        graph: {
+          nodes: [{
+            id: 'knowledge-node', // reused as node_id / variable_selector[0] in the expected payload
+            data: {
+              type: 'knowledge-index', // NOTE(review): presumably the value of BlockEnum.KnowledgeBase — confirm
+              title: 'Knowledge Base',
+            },
+          }],
+          edges: [],
+        },
+      },
+    })
    mockUpload.mockResolvedValue({
      id: 'uploaded-file-id',
      name: 'evaluation.csv',
@@ -471,10 +495,19 @@ describe('Evaluation', () => {
          default_metrics: [{
            metric: 'context-precision',
            value_type: 'number',
-            node_info_list: [],
+            node_info_list: [
+              { node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' },
+            ],
          }],
          customized_metrics: null,
-          judgment_config: null,
+          judgment_config: {
+            logical_operator: 'and',
+            conditions: [{
+              variable_selector: ['knowledge-node', 'context-precision'],
+              comparison_operator: '≥',
+              value: '0.85',
+            }],
+          },
          file_id: 'file-1',
        },
      }, {
--- a/web/app/components/evaluation/tests/store.spec.ts
+++ b/web/app/components/evaluation/tests/store.spec.ts
@@ -338,8 +338,117 @@ describe('evaluation store', () => {
      },
    }

-    expect(buildEvaluationConfigPayload(resource)).toEqual(expectedPayload)
-    expect(buildEvaluationRunRequest(resource, 'file-1')).toEqual({
+    expect(buildEvaluationConfigPayload(resource, resourceType)).toEqual(expectedPayload)
+    expect(buildEvaluationRunRequest(resource, 'file-1', resourceType)).toEqual({
+      ...expectedPayload,
+      file_id: 'file-1',
+    })
+  })
+
+  it('should hydrate pipeline metrics from fixed knowledge-index conditions', () => {
+    const resourceType = 'datasets'
+    const resourceId = 'dataset-hydrate'
+    const store = useEvaluationStore.getState()
+    const config: EvaluationConfig = {
+      evaluation_model: 'gpt-4o-mini',
+      evaluation_model_provider: 'openai',
+      default_metrics: [{
+        metric: 'context-precision',
+        node_info_list: [
+          { node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' },
+        ],
+      }],
+      customized_metrics: { // present on purpose: the assertions below expect it to contribute no metric
+        evaluation_workflow_id: 'should-be-ignored',
+        input_fields: {
+          query: 'answer',
+        },
+        output_fields: [{
+          variable: 'score',
+          value_type: 'number',
+        }],
+      },
+      judgment_config: {
+        logical_operator: 'or',
+        conditions: [{
+          variable_selector: ['knowledge-node', 'context-precision'], // [node id, metric id] of the default metric above
+          comparison_operator: '≥',
+          value: '0.92', // string form of the threshold asserted below
+        }],
+      },
+    }
+
+    store.hydrateResource(resourceType, resourceId, config)
+
+    const hydratedState = useEvaluationStore.getState().resources['datasets:dataset-hydrate'] // key is `${resourceType}:${resourceId}`
+
+    expect(hydratedState.judgeModelId).toBe('openai::gpt-4o-mini') // provider and model joined with '::'
+    expect(hydratedState.metrics).toHaveLength(1) // only the built-in metric; nothing from customized_metrics
+    expect(hydratedState.metrics[0]).toMatchObject({
+      optionId: 'context-precision',
+      kind: 'builtin',
+      valueType: 'number',
+      threshold: 0.92, // expected to come from the '≥' condition value in judgment_config
+      nodeInfoList: [
+        { node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' },
+      ],
+    })
+  })
+
+  it('should build pipeline judgment payload from metric thresholds', () => {
+    const resourceType = 'datasets'
+    const resourceId = 'dataset-save-config'
+    const store = useEvaluationStore.getState()
+    const knowledgeNodeInfo = [{ node_id: 'knowledge-node', title: 'Knowledge Base', type: 'knowledge-index' }]
+
+    store.ensureResource(resourceType, resourceId)
+    store.setJudgeModel(resourceType, resourceId, 'openai::gpt-4o-mini')
+    store.addBuiltinMetric(resourceType, resourceId, 'context-precision', knowledgeNodeInfo)
+    store.addBuiltinMetric(resourceType, resourceId, 'context-recall', knowledgeNodeInfo)
+
+    const resourceWithMetrics = useEvaluationStore.getState().resources['datasets:dataset-save-config']
+    const contextPrecisionMetric = resourceWithMetrics.metrics.find(metric => metric.optionId === 'context-precision')!
+    const contextRecallMetric = resourceWithMetrics.metrics.find(metric => metric.optionId === 'context-recall')!
+
+    store.updateMetricThreshold(resourceType, resourceId, contextPrecisionMetric.id, 0.91)
+    store.updateMetricThreshold(resourceType, resourceId, contextRecallMetric.id, 0.88)
+
+    const resource = useEvaluationStore.getState().resources['datasets:dataset-save-config']
+    const expectedPayload = {
+      evaluation_model: 'gpt-4o-mini',
+      evaluation_model_provider: 'openai',
+      default_metrics: [
+        {
+          metric: 'context-precision',
+          value_type: 'number',
+          node_info_list: knowledgeNodeInfo,
+        },
+        {
+          metric: 'context-recall',
+          value_type: 'number',
+          node_info_list: knowledgeNodeInfo,
+        },
+      ],
+      customized_metrics: null,
+      judgment_config: {
+        logical_operator: 'and',
+        conditions: [
+          {
+            variable_selector: ['knowledge-node', 'context-precision'],
+            comparison_operator: '≥',
+            value: '0.91',
+          },
+          {
+            variable_selector: ['knowledge-node', 'context-recall'],
+            comparison_operator: '≥',
+            value: '0.88',
+          },
+        ],
+      },
+    }
+
+    expect(buildEvaluationConfigPayload(resource, resourceType)).toEqual(expectedPayload)
+    expect(buildEvaluationRunRequest(resource, 'file-1', resourceType)).toEqual({
      ...expectedPayload,
      file_id: 'file-1',
    })
--- a/web/app/components/evaluation/components/batch-test-panel/index.tsx
+++ b/web/app/components/evaluation/components/batch-test-panel/index.tsx
@@ -36,7 +36,7 @@ const BatchTestPanel = ({
      return
    }

-    const body = buildEvaluationConfigPayload(resource)
+    const body = buildEvaluationConfigPayload(resource, resourceType)

    if (!body) {
      toast.warning(t('batch.validation'))
--- a/web/app/components/evaluation/components/batch-test-panel/input-fields/use-input-fields-actions.ts
+++ b/web/app/components/evaluation/components/batch-test-panel/input-fields/use-input-fields-actions.ts
@@ -102,7 +102,7 @@ export const useInputFieldsActions = ({
      return
    }

-    const body = buildEvaluationRunRequest(resource, uploadedFileId)
+    const body = buildEvaluationRunRequest(resource, uploadedFileId, resourceType)

    if (!body) {
      toast.warning(t('batch.validation'))
--- a/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx
+++ b/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx
@@ -1,25 +1,62 @@
 'use client'

 import type { EvaluationResourceProps } from '../../types'
-import { useMemo } from 'react'
+import type { Node } from '@/app/components/workflow/types'
+import type { NodeInfo } from '@/types/evaluation'
+import { useEffect, useMemo } from 'react'
 import { useTranslation } from 'react-i18next'
+import { BlockEnum } from '@/app/components/workflow/types'
+import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail'
 import { useAvailableEvaluationMetrics } from '@/service/use-evaluation'
+import { usePublishedPipelineInfo } from '@/service/use-pipeline'
 import { getEvaluationMockConfig } from '../../mock'
 import { useEvaluationResource, useEvaluationStore } from '../../store'
 import { InlineSectionHeader } from '../section-header'
 import PipelineMetricItem from './pipeline-metric-item'

+/** Describes the first knowledge-base node of the graph as a one-entry NodeInfo list; [] when none has an id. */
+const getKnowledgeIndexNodeInfo = (nodes: Node[] | undefined): NodeInfo[] => {
+  const match = nodes?.find(({ data }) => data.type === BlockEnum.KnowledgeBase)
+  if (!match?.id)
+    return []
+
+  // An empty or non-string title falls back to the node id.
+  const rawTitle = match.data?.title
+  const title = typeof rawTitle === 'string' && rawTitle ? rawTitle : match.id
+
+  const nodeInfo: NodeInfo = { node_id: match.id, title, type: 'knowledge-index' }
+  return [nodeInfo]
+}
+
+/** Order-sensitive equality of two NodeInfo lists on node_id, title and type; an undefined left list counts as empty. */
+const isSameNodeInfoList = (left: NodeInfo[] | undefined, right: NodeInfo[]) => {
+  const current = left ?? []
+  if (current.length !== right.length)
+    return false
+
+  return current.every((entry, position) => {
+    const counterpart = right[position]
+    return !(entry.node_id !== counterpart?.node_id || entry.title !== counterpart?.title || entry.type !== counterpart?.type)
+  })
+}
+
 const PipelineMetricsSection = ({
  resourceType,
  resourceId,
 }: EvaluationResourceProps) => {
  const { t } = useTranslation('evaluation')
+  const pipelineId = useDatasetDetailContextWithSelector(state => state.dataset?.pipeline_id)
  const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
  const removeMetric = useEvaluationStore(state => state.removeMetric)
  const updateMetricThreshold = useEvaluationStore(state => state.updateMetricThreshold)
  const { data: availableMetricsData } = useAvailableEvaluationMetrics()
+  const { data: publishedPipeline } = usePublishedPipelineInfo(pipelineId || '')
  const resource = useEvaluationResource(resourceType, resourceId)
  const config = getEvaluationMockConfig(resourceType)
+  const knowledgeIndexNodeInfoList = useMemo(
+    () => getKnowledgeIndexNodeInfo(publishedPipeline?.graph.nodes),
+    [publishedPipeline?.graph.nodes],
+  )
  const builtinMetricMap = useMemo(() => new Map(
    resource.metrics
      .filter(metric => metric.kind === 'builtin')
@@ -32,6 +69,18 @@ const PipelineMetricsSection = ({
    )
  }, [availableMetricIds, builtinMetricMap, config.builtinMetrics])

+  useEffect(() => {
+    if (!knowledgeIndexNodeInfoList.length)
+      return
+    // Re-submit every built-in metric whose node info differs from the published knowledge-base node info.
+    resource.metrics.forEach((metric) => {
+      if (metric.kind !== 'builtin' || isSameNodeInfoList(metric.nodeInfoList, knowledgeIndexNodeInfoList))
+        return
+      // NOTE(review): assumes addBuiltinMetric updates an existing metric's node info instead of adding a duplicate — confirm.
+      addBuiltinMetric(resourceType, resourceId, metric.optionId, knowledgeIndexNodeInfoList)
+    })
+  }, [addBuiltinMetric, knowledgeIndexNodeInfoList, resource.metrics, resourceId, resourceType])
+
  const handleToggleMetric = (metricId: string) => {
    const selectedMetric = builtinMetricMap.get(metricId)
    if (selectedMetric) {
@@ -39,7 +88,7 @@ const PipelineMetricsSection = ({
      return
    }

-    addBuiltinMetric(resourceType, resourceId, metricId)
+    addBuiltinMetric(resourceType, resourceId, metricId, knowledgeIndexNodeInfoList)
  }

  return (
--- a/web/app/components/evaluation/store-utils.ts
+++ b/web/app/components/evaluation/store-utils.ts
@@ -34,6 +34,8 @@ type EvaluationStoreResources = Record<string, EvaluationResourceState>

 export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85

+const PIPELINE_LOGICAL_OPERATOR: JudgmentConfig['logicalOperator'] = 'and' // operator written into every pipeline judgment payload
+
 const createId = (prefix: string) => `${prefix}-${Math.random().toString(36).slice(2, 10)}`

 const humanizeMetricId = (metricId: string) => {
@@ -54,6 +56,15 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str
  }
 }

+// Built-in metric ids listed by the mock config for the `datasets` resource type.
+const pipelineMetricIds = new Set(getEvaluationMockConfig('datasets').builtinMetrics.map(({ id }) => id))
+// `datasets` is the only resource type handled as a pipeline.
+const isPipelineResourceType = (resourceType: EvaluationResourceType) => resourceType === 'datasets'
+// A state is pipeline-shaped only when it has metrics and every one is a built-in metric from the set above.
+const isPipelineResourceState = (resource: EvaluationResourceState) => {
+  return resource.metrics.length > 0 && !resource.metrics.some(metric => metric.kind !== 'builtin' || !pipelineMetricIds.has(metric.optionId))
+}
+
 const normalizeNodeInfoList = (value: NodeInfo[] | undefined): NodeInfo[] => {
  if (!value?.length)
    return []
@@ -164,6 +175,46 @@ const normalizeVariableSelector = (value: string[] | undefined): [string, string
    : null
 }

+/** Converts a judgment-condition value to a finite number, or null when it cannot serve as a threshold. */
+const getConditionNumericValue = (value: EvaluationJudgmentCondition['value']) => {
+  // Number('') and Number('   ') evaluate to 0, so blank strings are rejected before conversion.
+  const isBlankString = typeof value === 'string' && value.trim() === ''
+  if (isBlankString || (typeof value !== 'number' && typeof value !== 'string'))
+    return null
+  // Number() returns numbers unchanged, so NaN and ±Infinity inputs fail the same finiteness check.
+  const parsedValue = Number(value)
+  return Number.isFinite(parsedValue) ? parsedValue : null
+}
+
+/** Resolves a metric's threshold: first matching '≥' condition value, then metric.threshold, then the default. */
+const getPipelineMetricThreshold = (
+  metric: EvaluationMetric,
+  config: EvaluationConfig,
+) => {
+  const boundNodes = metric.nodeInfoList ?? []
+  const conditions = config.judgment_config?.conditions ?? []
+  const matched = conditions.find(({ variable_selector, comparison_operator }) => {
+    const selector = normalizeVariableSelector(variable_selector)
+    if (!selector || selector[1] !== metric.optionId || comparison_operator !== '≥')
+      return false
+    // A metric without node info accepts any node; otherwise selector[0] must name one of its nodes.
+    return boundNodes.length === 0 || boundNodes.some(({ node_id }) => node_id === selector[0])
+  })
+
+  return getConditionNumericValue(matched?.value) ?? metric.threshold ?? DEFAULT_PIPELINE_METRIC_THRESHOLD
+}
+
+/** Copies each metric with valueType forced to 'number' and its threshold resolved against the config. */
+const normalizePipelineMetrics = (
+  config: EvaluationConfig,
+  metrics: EvaluationMetric[],
+) => metrics.map((metric) => {
+  const threshold = getPipelineMetricThreshold(metric, config)
+  const valueType = 'number' as const
+
+  return { ...metric, valueType, threshold }
+})
+
 const getNormalizedConditionValue = (
  operator: ComparisonOperator,
  previousValue: EvaluationJudgmentConditionValue | string | number | boolean | null | undefined,
@@ -404,8 +455,10 @@ export const buildStateFromEvaluationConfig = (
  config: EvaluationConfig,
 ): EvaluationResourceState => {
  const defaultMetrics = normalizeDefaultMetrics(resourceType, config.default_metrics)
-  const customMetrics = normalizeCustomMetric(config.customized_metrics)
-  const metrics = [...defaultMetrics, ...customMetrics]
+  const customMetrics = isPipelineResourceType(resourceType) ? [] : normalizeCustomMetric(config.customized_metrics)
+  const metrics = isPipelineResourceType(resourceType)
+    ? normalizePipelineMetrics(config, defaultMetrics)
+    : [...defaultMetrics, ...customMetrics]

  return {
    ...buildInitialState(resourceType),
@@ -458,7 +511,40 @@ const buildCustomizedMetricsPayload = (metrics: EvaluationMetric[]): EvaluationC
  }
 }

-const buildJudgmentConfigPayload = (resource: EvaluationResourceState): EvaluationConfigData['judgment_config'] => {
+/** Builds the pipeline judgment config: one '≥' threshold condition per built-in metric that has node info, else null. */
+const buildPipelineJudgmentConfigPayload = (
+  resource: EvaluationResourceState,
+): EvaluationConfigData['judgment_config'] => {
+  const toCondition = (metric: EvaluationMetric) => {
+    // Only the first node info entry feeds the selector; a metric without one yields no condition.
+    const [firstNode] = metric.nodeInfoList ?? []
+    if (!firstNode)
+      return null
+
+    return {
+      variable_selector: [firstNode.node_id, metric.optionId],
+      comparison_operator: '≥',
+      value: String(metric.threshold ?? DEFAULT_PIPELINE_METRIC_THRESHOLD),
+    }
+  }
+
+  const conditions = resource.metrics
+    .filter(metric => metric.kind === 'builtin')
+    .map(toCondition)
+    .filter((condition): condition is NonNullable<typeof condition> => condition !== null)
+
+  return conditions.length > 0
+    ? { logical_operator: PIPELINE_LOGICAL_OPERATOR, conditions }
+    : null
+}
+
+const buildJudgmentConfigPayload = (
+  resource: EvaluationResourceState,
+  resourceType?: EvaluationResourceType,
+): EvaluationConfigData['judgment_config'] => {
+  if ((resourceType && isPipelineResourceType(resourceType)) || isPipelineResourceState(resource))
+    return buildPipelineJudgmentConfigPayload(resource)
+
  const conditions = resource.judgmentConfig.conditions
    .filter(condition => !!condition.variableSelector)
    .map((condition) => {
@@ -488,6 +574,7 @@ const buildJudgmentConfigPayload = (resource: EvaluationResourceState): Evaluati

 export const buildEvaluationConfigPayload = (
  resource: EvaluationResourceState,
+  resourceType?: EvaluationResourceType,
 ): EvaluationConfigData | null => {
  const selectedModel = decodeModelSelection(resource.judgeModelId)

@@ -504,16 +591,19 @@ export const buildEvaluationConfigPayload = (
        value_type: metric.valueType,
        node_info_list: metric.nodeInfoList ?? [],
      })),
-    customized_metrics: buildCustomizedMetricsPayload(resource.metrics),
-    judgment_config: buildJudgmentConfigPayload(resource),
+    customized_metrics: (resourceType && isPipelineResourceType(resourceType)) || isPipelineResourceState(resource)
+      ? null
+      : buildCustomizedMetricsPayload(resource.metrics),
+    judgment_config: buildJudgmentConfigPayload(resource, resourceType),
  }
 }

 export const buildEvaluationRunRequest = (
  resource: EvaluationResourceState,
  fileId: string,
+  resourceType?: EvaluationResourceType,
 ): EvaluationRunRequest | null => {
-  const configPayload = buildEvaluationConfigPayload(resource)
+  const configPayload = buildEvaluationConfigPayload(resource, resourceType)

  if (!configPayload)
    return null