From ef7ff3356d8e0174d806d6bf0a11b57d1b50499f Mon Sep 17 00:00:00 2001 From: Asuka Minato Date: Sun, 26 Apr 2026 09:59:22 +0900 Subject: [PATCH] refactor: port ChildChunk (#30920) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- api/core/rag/datasource/retrieval_service.py | 1 + api/core/rag/docstore/dataset_docstore.py | 7 ++- api/models/dataset.py | 49 ++++++++++++------- api/services/dataset_service.py | 6 ++- api/services/vector_service.py | 6 ++- .../services/dataset_service_test_helpers.py | 1 - .../services/test_dataset_service_segment.py | 5 +- 7 files changed, 48 insertions(+), 27 deletions(-) diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py index 2997710daf..c60d19045a 100644 --- a/api/core/rag/datasource/retrieval_service.py +++ b/api/core/rag/datasource/retrieval_service.py @@ -551,6 +551,7 @@ class RetrievalService: child_index_nodes = session.execute(child_chunk_stmt).scalars().all() for i in child_index_nodes: + assert i.index_node_id segment_ids.append(i.segment_id) if i.segment_id in child_chunk_map: child_chunk_map[i.segment_id].append(i) diff --git a/api/core/rag/docstore/dataset_docstore.py b/api/core/rag/docstore/dataset_docstore.py index f4699f6869..78305a6ac0 100644 --- a/api/core/rag/docstore/dataset_docstore.py +++ b/api/core/rag/docstore/dataset_docstore.py @@ -11,6 +11,7 @@ from core.rag.models.document import AttachmentDocument, Document from extensions.ext_database import db from graphon.model_runtime.entities.model_entities import ModelType from models.dataset import ChildChunk, Dataset, DocumentSegment, SegmentAttachmentBinding +from models.enums import SegmentType class DatasetDocumentStore: @@ -127,6 +128,7 @@ class DatasetDocumentStore: if save_child: if doc.children: for position, child in enumerate(doc.children, start=1): + assert self._document_id child_segment = ChildChunk( tenant_id=self._dataset.tenant_id, dataset_id=self._dataset.id, @@ -137,7 +139,7 @@ class DatasetDocumentStore: index_node_hash=child.metadata.get("doc_hash"), content=child.page_content, word_count=len(child.page_content), - type="automatic", + type=SegmentType.AUTOMATIC, created_by=self._user_id, ) db.session.add(child_segment) @@ -163,6 +165,7 @@ class DatasetDocumentStore: ) # add new child chunks for position, child in enumerate(doc.children, start=1): + assert self._document_id child_segment = ChildChunk( tenant_id=self._dataset.tenant_id, dataset_id=self._dataset.id, @@ -173,7 +176,7 @@ class DatasetDocumentStore: index_node_hash=child.metadata.get("doc_hash"), content=child.page_content, word_count=len(child.page_content), - type="automatic", + type=SegmentType.AUTOMATIC, created_by=self._user_id, ) db.session.add(child_segment) diff --git a/api/models/dataset.py b/api/models/dataset.py index eee5c39a0e..a00e9f7640 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -1036,7 +1036,7 @@ class DocumentSegment(Base): return attachment_list -class ChildChunk(Base): +class ChildChunk(TypeBase): __tablename__ = "child_chunks" __table_args__ = ( sa.PrimaryKeyConstraint("id", name="child_chunk_pkey"), @@ -1046,29 +1046,42 @@ class ChildChunk(Base): ) # initial fields - id = mapped_column(StringUUID, nullable=False, default=lambda: str(uuid4())) - tenant_id = mapped_column(StringUUID, nullable=False) - dataset_id = mapped_column(StringUUID, nullable=False) - document_id = mapped_column(StringUUID, nullable=False) - segment_id = mapped_column(StringUUID, nullable=False) + id: Mapped[str] = mapped_column(StringUUID, nullable=False, default_factory=lambda: str(uuid4()), init=False) + tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + dataset_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + document_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + segment_id: Mapped[str] = mapped_column(StringUUID, nullable=False) position: Mapped[int] = mapped_column(sa.Integer, nullable=False) - content = mapped_column(LongText, nullable=False) + content: Mapped[str] = mapped_column(LongText, nullable=False) word_count: Mapped[int] = mapped_column(sa.Integer, nullable=False) # indexing fields - index_node_id = mapped_column(String(255), nullable=True) - index_node_hash = mapped_column(String(255), nullable=True) - type: Mapped[SegmentType] = mapped_column( - EnumText(SegmentType, length=255), nullable=False, server_default=sa.text("'automatic'") + created_by: Mapped[str] = mapped_column(StringUUID, nullable=False) + created_at: Mapped[datetime] = mapped_column( + DateTime, nullable=False, server_default=sa.func.current_timestamp(), init=False ) - created_by = mapped_column(StringUUID, nullable=False) - created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=sa.func.current_timestamp()) - updated_by = mapped_column(StringUUID, nullable=True) + updated_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True, init=False) updated_at: Mapped[datetime] = mapped_column( - DateTime, nullable=False, server_default=sa.func.current_timestamp(), onupdate=func.current_timestamp() + DateTime, + nullable=False, + server_default=sa.func.current_timestamp(), + onupdate=func.current_timestamp(), + init=False, ) - indexing_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) - completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) - error = mapped_column(LongText, nullable=True) + indexing_at: Mapped[datetime | None] = mapped_column( + DateTime, nullable=True, insert_default=None, server_default=None, init=False + ) + completed_at: Mapped[datetime | None] = mapped_column( + DateTime, nullable=True, insert_default=None, server_default=None, init=False + ) + index_node_id: Mapped[str | None] = mapped_column(String(255), nullable=True, default=None) + index_node_hash: Mapped[str | None] = mapped_column(String(255), nullable=True, default=None) + type: Mapped[SegmentType] = mapped_column( + EnumText(SegmentType, length=255), + nullable=False, + server_default=sa.text("'automatic'"), + default=SegmentType.AUTOMATIC, + ) + error: Mapped[str | None] = mapped_column(LongText, nullable=True, init=False) @property def dataset(self): diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 894cb05687..eef38f1ce2 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -3748,6 +3748,7 @@ class SegmentService: ChildChunk.segment_id == segment.id, ) ) + assert current_user.current_tenant_id child_chunk = ChildChunk( tenant_id=current_user.current_tenant_id, dataset_id=dataset.id, @@ -3758,7 +3759,7 @@ class SegmentService: index_node_hash=index_node_hash, content=content, word_count=len(content), - type="customized", + type=SegmentType.CUSTOMIZED, created_by=current_user.id, ) db.session.add(child_chunk) @@ -3818,6 +3819,7 @@ class SegmentService: if new_child_chunks_args: child_chunk_count = len(child_chunks) for position, args in enumerate(new_child_chunks_args, start=child_chunk_count + 1): + assert current_user.current_tenant_id index_node_id = str(uuid.uuid4()) index_node_hash = helper.generate_text_hash(args.content) child_chunk = ChildChunk( @@ -3830,7 +3832,7 @@ class SegmentService: index_node_hash=index_node_hash, content=args.content, word_count=len(args.content), - type="customized", + type=SegmentType.CUSTOMIZED, created_by=current_user.id, ) diff --git a/api/services/vector_service.py b/api/services/vector_service.py index 58193d75a9..7e689af35d 100644 --- a/api/services/vector_service.py +++ b/api/services/vector_service.py @@ -16,6 +16,7 @@ from graphon.model_runtime.entities.model_entities import ModelType from models import UploadFile from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment, SegmentAttachmentBinding from models.dataset import Document as DatasetDocument +from models.enums import SegmentType logger = logging.getLogger(__name__) @@ -178,7 +179,7 @@ class VectorService: index_node_hash=child_chunk.metadata["doc_hash"], content=child_chunk.page_content, word_count=len(child_chunk.page_content), - type="automatic", + type=SegmentType.AUTOMATIC, created_by=dataset_document.created_by, ) db.session.add(child_segment) @@ -222,6 +223,7 @@ class VectorService: ) documents.append(new_child_document) for update_child_chunk in update_child_chunks: + assert update_child_chunk.index_node_id child_document = Document( page_content=update_child_chunk.content, metadata={ @@ -234,6 +236,7 @@ class VectorService: documents.append(child_document) delete_node_ids.append(update_child_chunk.index_node_id) for delete_child_chunk in delete_child_chunks: + assert delete_child_chunk.index_node_id delete_node_ids.append(delete_child_chunk.index_node_id) if dataset.indexing_technique == IndexTechniqueType.HIGH_QUALITY: # update vector index @@ -246,6 +249,7 @@ class VectorService: @classmethod def delete_child_chunk_vector(cls, child_chunk: ChildChunk, dataset: Dataset): vector = Vector(dataset=dataset) + assert child_chunk.index_node_id vector.delete_by_ids([child_chunk.index_node_id]) @classmethod diff --git a/api/tests/unit_tests/services/dataset_service_test_helpers.py b/api/tests/unit_tests/services/dataset_service_test_helpers.py index 3349c1fd8c..806f1e8d91 100644 --- a/api/tests/unit_tests/services/dataset_service_test_helpers.py +++ b/api/tests/unit_tests/services/dataset_service_test_helpers.py @@ -365,7 +365,6 @@ def _make_segment( def _make_child_chunk() -> ChildChunk: return ChildChunk( - id="child-a", tenant_id="tenant-1", dataset_id="dataset-1", document_id="doc-1", diff --git a/api/tests/unit_tests/services/test_dataset_service_segment.py b/api/tests/unit_tests/services/test_dataset_service_segment.py index 5cfef76719..6330e53765 100644 --- a/api/tests/unit_tests/services/test_dataset_service_segment.py +++ b/api/tests/unit_tests/services/test_dataset_service_segment.py @@ -89,7 +89,6 @@ class TestSegmentServiceChildChunks: document = _make_document() segment = _make_segment() existing_a = ChildChunk( - id="child-a", tenant_id="tenant-1", dataset_id="dataset-1", document_id="doc-1", @@ -100,7 +99,6 @@ class TestSegmentServiceChildChunks: created_by="user-1", ) existing_b = ChildChunk( - id="child-b", tenant_id="tenant-1", dataset_id="dataset-1", document_id="doc-1", @@ -110,7 +108,8 @@ class TestSegmentServiceChildChunks: word_count=9, created_by="user-1", ) - + existing_a.id = "child-a" + existing_b.id = "child-b" with ( patch("services.dataset_service.db") as mock_db, patch("services.dataset_service.uuid.uuid4", return_value="node-new"),