From ef7ff3356d8e0174d806d6bf0a11b57d1b50499f Mon Sep 17 00:00:00 2001
From: Asuka Minato <i@asukaminato.eu.org>
Date: Sun, 26 Apr 2026 09:59:22 +0900
Subject: [PATCH] refactor: port ChildChunk to TypeBase (#30920)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
 api/core/rag/datasource/retrieval_service.py  |  1 +
 api/core/rag/docstore/dataset_docstore.py     |  7 ++-
 api/models/dataset.py                         | 49 ++++++++++++-------
 api/services/dataset_service.py               |  6 ++-
 api/services/vector_service.py                |  6 ++-
 .../services/dataset_service_test_helpers.py  |  1 -
 .../services/test_dataset_service_segment.py  |  5 +-
 7 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py
index 2997710daf..c60d19045a 100644
--- a/api/core/rag/datasource/retrieval_service.py
+++ b/api/core/rag/datasource/retrieval_service.py
@@ -551,6 +551,7 @@ class RetrievalService:
                 child_index_nodes = session.execute(child_chunk_stmt).scalars().all()
 
                 for i in child_index_nodes:
+                    assert i.index_node_id
                     segment_ids.append(i.segment_id)
                     if i.segment_id in child_chunk_map:
                         child_chunk_map[i.segment_id].append(i)
diff --git a/api/core/rag/docstore/dataset_docstore.py b/api/core/rag/docstore/dataset_docstore.py
index f4699f6869..78305a6ac0 100644
--- a/api/core/rag/docstore/dataset_docstore.py
+++ b/api/core/rag/docstore/dataset_docstore.py
@@ -11,6 +11,7 @@ from core.rag.models.document import AttachmentDocument, Document
 from extensions.ext_database import db
 from graphon.model_runtime.entities.model_entities import ModelType
 from models.dataset import ChildChunk, Dataset, DocumentSegment, SegmentAttachmentBinding
+from models.enums import SegmentType
 
 
 class DatasetDocumentStore:
@@ -127,6 +128,7 @@ class DatasetDocumentStore:
                 if save_child:
                     if doc.children:
                         for position, child in enumerate(doc.children, start=1):
+                            assert self._document_id
                             child_segment = ChildChunk(
                                 tenant_id=self._dataset.tenant_id,
                                 dataset_id=self._dataset.id,
@@ -137,7 +139,7 @@ class DatasetDocumentStore:
                                 index_node_hash=child.metadata.get("doc_hash"),
                                 content=child.page_content,
                                 word_count=len(child.page_content),
-                                type="automatic",
+                                type=SegmentType.AUTOMATIC,
                                 created_by=self._user_id,
                             )
                             db.session.add(child_segment)
@@ -163,6 +165,7 @@ class DatasetDocumentStore:
                     )
                     # add new child chunks
                     for position, child in enumerate(doc.children, start=1):
+                        assert self._document_id
                         child_segment = ChildChunk(
                             tenant_id=self._dataset.tenant_id,
                             dataset_id=self._dataset.id,
@@ -173,7 +176,7 @@ class DatasetDocumentStore:
                             index_node_hash=child.metadata.get("doc_hash"),
                             content=child.page_content,
                             word_count=len(child.page_content),
-                            type="automatic",
+                            type=SegmentType.AUTOMATIC,
                             created_by=self._user_id,
                         )
                         db.session.add(child_segment)
diff --git a/api/models/dataset.py b/api/models/dataset.py
index eee5c39a0e..a00e9f7640 100644
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -1036,7 +1036,7 @@ class DocumentSegment(Base):
         return attachment_list
 
 
-class ChildChunk(Base):
+class ChildChunk(TypeBase):
     __tablename__ = "child_chunks"
     __table_args__ = (
         sa.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
@@ -1046,29 +1046,42 @@ class ChildChunk(Base):
     )
 
     # initial fields
-    id = mapped_column(StringUUID, nullable=False, default=lambda: str(uuid4()))
-    tenant_id = mapped_column(StringUUID, nullable=False)
-    dataset_id = mapped_column(StringUUID, nullable=False)
-    document_id = mapped_column(StringUUID, nullable=False)
-    segment_id = mapped_column(StringUUID, nullable=False)
+    id: Mapped[str] = mapped_column(StringUUID, nullable=False, default_factory=lambda: str(uuid4()), init=False)
+    tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    dataset_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    document_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    segment_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
     position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
-    content = mapped_column(LongText, nullable=False)
+    content: Mapped[str] = mapped_column(LongText, nullable=False)
     word_count: Mapped[int] = mapped_column(sa.Integer, nullable=False)
     # indexing fields
-    index_node_id = mapped_column(String(255), nullable=True)
-    index_node_hash = mapped_column(String(255), nullable=True)
-    type: Mapped[SegmentType] = mapped_column(
-        EnumText(SegmentType, length=255), nullable=False, server_default=sa.text("'automatic'")
+    created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime, nullable=False, server_default=sa.func.current_timestamp(), init=False
     )
-    created_by = mapped_column(StringUUID, nullable=False)
-    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=sa.func.current_timestamp())
-    updated_by = mapped_column(StringUUID, nullable=True)
+    updated_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True, init=False)
     updated_at: Mapped[datetime] = mapped_column(
-        DateTime, nullable=False, server_default=sa.func.current_timestamp(), onupdate=func.current_timestamp()
+        DateTime,
+        nullable=False,
+        server_default=sa.func.current_timestamp(),
+        onupdate=func.current_timestamp(),
+        init=False,
     )
-    indexing_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
-    completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
-    error = mapped_column(LongText, nullable=True)
+    indexing_at: Mapped[datetime | None] = mapped_column(
+        DateTime, nullable=True, insert_default=None, server_default=None, init=False
+    )
+    completed_at: Mapped[datetime | None] = mapped_column(
+        DateTime, nullable=True, insert_default=None, server_default=None, init=False
+    )
+    index_node_id: Mapped[str | None] = mapped_column(String(255), nullable=True, default=None)
+    index_node_hash: Mapped[str | None] = mapped_column(String(255), nullable=True, default=None)
+    type: Mapped[SegmentType] = mapped_column(
+        EnumText(SegmentType, length=255),
+        nullable=False,
+        server_default=sa.text("'automatic'"),
+        default=SegmentType.AUTOMATIC,
+    )
+    error: Mapped[str | None] = mapped_column(LongText, nullable=True, init=False)
 
     @property
     def dataset(self):
diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py
index 894cb05687..eef38f1ce2 100644
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -3748,6 +3748,7 @@ class SegmentService:
                     ChildChunk.segment_id == segment.id,
                 )
             )
+            assert current_user.current_tenant_id
             child_chunk = ChildChunk(
                 tenant_id=current_user.current_tenant_id,
                 dataset_id=dataset.id,
@@ -3758,7 +3759,7 @@ class SegmentService:
                 index_node_hash=index_node_hash,
                 content=content,
                 word_count=len(content),
-                type="customized",
+                type=SegmentType.CUSTOMIZED,
                 created_by=current_user.id,
             )
             db.session.add(child_chunk)
@@ -3818,6 +3819,7 @@ class SegmentService:
             if new_child_chunks_args:
                 child_chunk_count = len(child_chunks)
                 for position, args in enumerate(new_child_chunks_args, start=child_chunk_count + 1):
+                    assert current_user.current_tenant_id
                     index_node_id = str(uuid.uuid4())
                     index_node_hash = helper.generate_text_hash(args.content)
                     child_chunk = ChildChunk(
@@ -3830,7 +3832,7 @@ class SegmentService:
                         index_node_hash=index_node_hash,
                         content=args.content,
                         word_count=len(args.content),
-                        type="customized",
+                        type=SegmentType.CUSTOMIZED,
                         created_by=current_user.id,
                     )
 
diff --git a/api/services/vector_service.py b/api/services/vector_service.py
index 58193d75a9..7e689af35d 100644
--- a/api/services/vector_service.py
+++ b/api/services/vector_service.py
@@ -16,6 +16,7 @@ from graphon.model_runtime.entities.model_entities import ModelType
 from models import UploadFile
 from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment, SegmentAttachmentBinding
 from models.dataset import Document as DatasetDocument
+from models.enums import SegmentType
 
 logger = logging.getLogger(__name__)
 
@@ -178,7 +179,7 @@ class VectorService:
                     index_node_hash=child_chunk.metadata["doc_hash"],
                     content=child_chunk.page_content,
                     word_count=len(child_chunk.page_content),
-                    type="automatic",
+                    type=SegmentType.AUTOMATIC,
                     created_by=dataset_document.created_by,
                 )
                 db.session.add(child_segment)
@@ -222,6 +223,7 @@ class VectorService:
             )
             documents.append(new_child_document)
         for update_child_chunk in update_child_chunks:
+            assert update_child_chunk.index_node_id
             child_document = Document(
                 page_content=update_child_chunk.content,
                 metadata={
@@ -234,6 +236,7 @@ class VectorService:
             documents.append(child_document)
             delete_node_ids.append(update_child_chunk.index_node_id)
         for delete_child_chunk in delete_child_chunks:
+            assert delete_child_chunk.index_node_id
             delete_node_ids.append(delete_child_chunk.index_node_id)
         if dataset.indexing_technique == IndexTechniqueType.HIGH_QUALITY:
             # update vector index
@@ -246,6 +249,7 @@ class VectorService:
     @classmethod
     def delete_child_chunk_vector(cls, child_chunk: ChildChunk, dataset: Dataset):
         vector = Vector(dataset=dataset)
+        assert child_chunk.index_node_id
         vector.delete_by_ids([child_chunk.index_node_id])
 
     @classmethod
diff --git a/api/tests/unit_tests/services/dataset_service_test_helpers.py b/api/tests/unit_tests/services/dataset_service_test_helpers.py
index 3349c1fd8c..806f1e8d91 100644
--- a/api/tests/unit_tests/services/dataset_service_test_helpers.py
+++ b/api/tests/unit_tests/services/dataset_service_test_helpers.py
@@ -365,7 +365,6 @@ def _make_segment(
 
 def _make_child_chunk() -> ChildChunk:
     return ChildChunk(
-        id="child-a",
         tenant_id="tenant-1",
         dataset_id="dataset-1",
         document_id="doc-1",
diff --git a/api/tests/unit_tests/services/test_dataset_service_segment.py b/api/tests/unit_tests/services/test_dataset_service_segment.py
index 5cfef76719..6330e53765 100644
--- a/api/tests/unit_tests/services/test_dataset_service_segment.py
+++ b/api/tests/unit_tests/services/test_dataset_service_segment.py
@@ -89,7 +89,6 @@ class TestSegmentServiceChildChunks:
         document = _make_document()
         segment = _make_segment()
         existing_a = ChildChunk(
-            id="child-a",
             tenant_id="tenant-1",
             dataset_id="dataset-1",
             document_id="doc-1",
@@ -100,7 +99,6 @@ class TestSegmentServiceChildChunks:
             created_by="user-1",
         )
         existing_b = ChildChunk(
-            id="child-b",
             tenant_id="tenant-1",
             dataset_id="dataset-1",
             document_id="doc-1",
@@ -110,7 +108,8 @@ class TestSegmentServiceChildChunks:
             word_count=9,
             created_by="user-1",
         )
-
+        existing_a.id = "child-a"
+        existing_b.id = "child-b"
         with (
             patch("services.dataset_service.db") as mock_db,
             patch("services.dataset_service.uuid.uuid4", return_value="node-new"),