Mirror of https://mirror.skon.top/github.com/langgenius/dify.git, synced 2026-05-01 03:30:02 +08:00
refactor: port ChildChunk (#30920)
Some checks failed
autofix.ci / autofix (push) Has been cancelled
Build and Push API & Web / build (api, {{defaultContext}}:api, Dockerfile, DIFY_API_IMAGE_NAME, linux/amd64, ubuntu-latest, build-api-amd64) (push) Has been cancelled
Build and Push API & Web / build (api, {{defaultContext}}:api, Dockerfile, DIFY_API_IMAGE_NAME, linux/arm64, ubuntu-24.04-arm, build-api-arm64) (push) Has been cancelled
Build and Push API & Web / build (web, {{defaultContext}}, web/Dockerfile, DIFY_WEB_IMAGE_NAME, linux/amd64, ubuntu-latest, build-web-amd64) (push) Has been cancelled
Build and Push API & Web / build (web, {{defaultContext}}, web/Dockerfile, DIFY_WEB_IMAGE_NAME, linux/arm64, ubuntu-24.04-arm, build-web-arm64) (push) Has been cancelled
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Has been cancelled
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Has been cancelled
Main CI Pipeline / Skip Duplicate Checks (push) Has been cancelled
Main CI Pipeline / Check Changed Files (push) Has been cancelled
Main CI Pipeline / Run API Tests (push) Has been cancelled
Main CI Pipeline / Skip API Tests (push) Has been cancelled
Main CI Pipeline / API Tests (push) Has been cancelled
Main CI Pipeline / Run Web Tests (push) Has been cancelled
Main CI Pipeline / Skip Web Tests (push) Has been cancelled
Main CI Pipeline / Web Tests (push) Has been cancelled
Main CI Pipeline / Run Web Full-Stack E2E (push) Has been cancelled
Main CI Pipeline / Skip Web Full-Stack E2E (push) Has been cancelled
Main CI Pipeline / Web Full-Stack E2E (push) Has been cancelled
Main CI Pipeline / Style Check (push) Has been cancelled
Main CI Pipeline / Run VDB Tests (push) Has been cancelled
Main CI Pipeline / Skip VDB Tests (push) Has been cancelled
Main CI Pipeline / VDB Tests (push) Has been cancelled
Main CI Pipeline / Run DB Migration Test (push) Has been cancelled
Main CI Pipeline / Skip DB Migration Test (push) Has been cancelled
Main CI Pipeline / DB Migration Test (push) Has been cancelled
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
@@ -551,6 +551,7 @@ class RetrievalService:
             child_index_nodes = session.execute(child_chunk_stmt).scalars().all()
 
             for i in child_index_nodes:
+                assert i.index_node_id
                 segment_ids.append(i.segment_id)
                 if i.segment_id in child_chunk_map:
                     child_chunk_map[i.segment_id].append(i)
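The assert added in this hunk follows a pattern repeated throughout the port: index_node_id becomes an optional column on the typed model, so call sites narrow it before relying on it. A minimal, self-contained sketch of that narrowing (the Node class here is illustrative, not the real ORM model):

from typing import Optional


class Node:
    """Illustrative stand-in for a row whose index_node_id column is nullable."""

    def __init__(self, index_node_id: Optional[str], segment_id: str) -> None:
        self.index_node_id = index_node_id
        self.segment_id = segment_id


def collect_segment_ids(nodes: list[Node]) -> list[str]:
    segment_ids: list[str] = []
    for node in nodes:
        # assert narrows Optional[str] to str for the type checker and
        # fails fast if an indexed row unexpectedly lacks a node id
        assert node.index_node_id
        segment_ids.append(node.segment_id)
    return segment_ids


print(collect_segment_ids([Node(index_node_id="node-1", segment_id="segment-1")]))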
@@ -11,6 +11,7 @@ from core.rag.models.document import AttachmentDocument, Document
 from extensions.ext_database import db
 from graphon.model_runtime.entities.model_entities import ModelType
 from models.dataset import ChildChunk, Dataset, DocumentSegment, SegmentAttachmentBinding
+from models.enums import SegmentType
 
 
 class DatasetDocumentStore:
@@ -127,6 +128,7 @@ class DatasetDocumentStore:
             if save_child:
                 if doc.children:
                     for position, child in enumerate(doc.children, start=1):
+                        assert self._document_id
                         child_segment = ChildChunk(
                             tenant_id=self._dataset.tenant_id,
                             dataset_id=self._dataset.id,
@@ -137,7 +139,7 @@ class DatasetDocumentStore:
                             index_node_hash=child.metadata.get("doc_hash"),
                             content=child.page_content,
                             word_count=len(child.page_content),
-                            type="automatic",
+                            type=SegmentType.AUTOMATIC,
                             created_by=self._user_id,
                         )
                         db.session.add(child_segment)
@@ -163,6 +165,7 @@ class DatasetDocumentStore:
                         )
                         # add new child chunks
                         for position, child in enumerate(doc.children, start=1):
+                            assert self._document_id
                             child_segment = ChildChunk(
                                 tenant_id=self._dataset.tenant_id,
                                 dataset_id=self._dataset.id,
@@ -173,7 +176,7 @@ class DatasetDocumentStore:
                                 index_node_hash=child.metadata.get("doc_hash"),
                                 content=child.page_content,
                                 word_count=len(child.page_content),
-                                type="automatic",
+                                type=SegmentType.AUTOMATIC,
                                 created_by=self._user_id,
                             )
                             db.session.add(child_segment)
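Swapping the "automatic" literal for SegmentType.AUTOMATIC keeps the persisted value unchanged as long as the enum member carries the same string, which matches the model's server_default of 'automatic'. A rough sketch of that equivalence, assuming SegmentType is a string-valued enum (the real definition lives in models.enums):

from enum import Enum


class SegmentType(str, Enum):
    """Illustrative subset; the real enum is defined in models.enums."""

    AUTOMATIC = "automatic"
    CUSTOMIZED = "customized"


# if EnumText stores the member's value (as the unchanged server_default suggests),
# rows still hold the plain string
assert SegmentType.AUTOMATIC.value == "automatic"
assert SegmentType("customized") is SegmentType.CUSTOMIZED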
@@ -1036,7 +1036,7 @@ class DocumentSegment(Base):
         return attachment_list
 
 
-class ChildChunk(Base):
+class ChildChunk(TypeBase):
     __tablename__ = "child_chunks"
     __table_args__ = (
         sa.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
@@ -1046,29 +1046,42 @@ class ChildChunk(Base):
     )
 
     # initial fields
-    id = mapped_column(StringUUID, nullable=False, default=lambda: str(uuid4()))
-    tenant_id = mapped_column(StringUUID, nullable=False)
-    dataset_id = mapped_column(StringUUID, nullable=False)
-    document_id = mapped_column(StringUUID, nullable=False)
-    segment_id = mapped_column(StringUUID, nullable=False)
+    id: Mapped[str] = mapped_column(StringUUID, nullable=False, default_factory=lambda: str(uuid4()), init=False)
+    tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    dataset_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    document_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    segment_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
     position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
-    content = mapped_column(LongText, nullable=False)
+    content: Mapped[str] = mapped_column(LongText, nullable=False)
    word_count: Mapped[int] = mapped_column(sa.Integer, nullable=False)
     # indexing fields
-    index_node_id = mapped_column(String(255), nullable=True)
-    index_node_hash = mapped_column(String(255), nullable=True)
-    type: Mapped[SegmentType] = mapped_column(
-        EnumText(SegmentType, length=255), nullable=False, server_default=sa.text("'automatic'")
+    created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime, nullable=False, server_default=sa.func.current_timestamp(), init=False
     )
-    created_by = mapped_column(StringUUID, nullable=False)
-    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=sa.func.current_timestamp())
-    updated_by = mapped_column(StringUUID, nullable=True)
+    updated_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True, init=False)
     updated_at: Mapped[datetime] = mapped_column(
-        DateTime, nullable=False, server_default=sa.func.current_timestamp(), onupdate=func.current_timestamp()
+        DateTime,
+        nullable=False,
+        server_default=sa.func.current_timestamp(),
+        onupdate=func.current_timestamp(),
+        init=False,
     )
-    indexing_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
-    completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
-    error = mapped_column(LongText, nullable=True)
+    indexing_at: Mapped[datetime | None] = mapped_column(
+        DateTime, nullable=True, insert_default=None, server_default=None, init=False
+    )
+    completed_at: Mapped[datetime | None] = mapped_column(
+        DateTime, nullable=True, insert_default=None, server_default=None, init=False
+    )
+    index_node_id: Mapped[str | None] = mapped_column(String(255), nullable=True, default=None)
+    index_node_hash: Mapped[str | None] = mapped_column(String(255), nullable=True, default=None)
+    type: Mapped[SegmentType] = mapped_column(
+        EnumText(SegmentType, length=255),
+        nullable=False,
+        server_default=sa.text("'automatic'"),
+        default=SegmentType.AUTOMATIC,
+    )
+    error: Mapped[str | None] = mapped_column(LongText, nullable=True, init=False)
 
     @property
     def dataset(self):
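The init=False and default_factory arguments in the new column definitions only exist in SQLAlchemy's dataclass mapping, so TypeBase is presumably a MappedAsDataclass-style declarative base: generated fields such as id, created_at, updated_at and error drop out of the constructor and are filled client-side or by the database. A standalone sketch of the same pattern with a simplified table (not the real child_chunks schema):

from datetime import datetime
from uuid import uuid4

from sqlalchemy import DateTime, String, func
from sqlalchemy.orm import DeclarativeBase, Mapped, MappedAsDataclass, mapped_column


class Base(MappedAsDataclass, DeclarativeBase):
    """Simplified stand-in for the project's TypeBase."""


class Chunk(Base):
    __tablename__ = "chunks_sketch"

    # init=False keeps generated fields out of __init__; default_factory fills the id client-side
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default_factory=lambda: str(uuid4()), init=False)
    content: Mapped[str] = mapped_column(String(255))
    created_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.current_timestamp(), init=False)


# callers only pass the fields that remain in the generated __init__
chunk = Chunk(content="hello")

Constructing Chunk(content=...) without an id mirrors how ChildChunk is built in the service code below.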
@@ -3748,6 +3748,7 @@ class SegmentService:
                     ChildChunk.segment_id == segment.id,
                 )
             )
+            assert current_user.current_tenant_id
             child_chunk = ChildChunk(
                 tenant_id=current_user.current_tenant_id,
                 dataset_id=dataset.id,
@@ -3758,7 +3759,7 @@ class SegmentService:
                 index_node_hash=index_node_hash,
                 content=content,
                 word_count=len(content),
-                type="customized",
+                type=SegmentType.CUSTOMIZED,
                 created_by=current_user.id,
             )
             db.session.add(child_chunk)
@@ -3818,6 +3819,7 @@ class SegmentService:
             if new_child_chunks_args:
                 child_chunk_count = len(child_chunks)
                 for position, args in enumerate(new_child_chunks_args, start=child_chunk_count + 1):
+                    assert current_user.current_tenant_id
                     index_node_id = str(uuid.uuid4())
                     index_node_hash = helper.generate_text_hash(args.content)
                     child_chunk = ChildChunk(
@@ -3830,7 +3832,7 @@ class SegmentService:
                         index_node_hash=index_node_hash,
                         content=args.content,
                         word_count=len(args.content),
-                        type="customized",
+                        type=SegmentType.CUSTOMIZED,
                         created_by=current_user.id,
                     )
 
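New child chunks are appended after the ones already stored, so their position numbering continues from the current count via enumerate's start argument. A tiny sketch with placeholder data in place of the service's real new_child_chunks_args:

existing_chunks = ["chunk one", "chunk two"]        # stand-ins for the already-saved child chunks
new_chunk_contents = ["chunk three", "chunk four"]  # stand-ins for new_child_chunks_args

child_chunk_count = len(existing_chunks)
# positions continue after the existing chunks: 3, 4, ...
for position, content in enumerate(new_chunk_contents, start=child_chunk_count + 1):
    print(position, content)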
@@ -16,6 +16,7 @@ from graphon.model_runtime.entities.model_entities import ModelType
 from models import UploadFile
 from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment, SegmentAttachmentBinding
 from models.dataset import Document as DatasetDocument
+from models.enums import SegmentType
 
 logger = logging.getLogger(__name__)
 
@@ -178,7 +179,7 @@ class VectorService:
                     index_node_hash=child_chunk.metadata["doc_hash"],
                     content=child_chunk.page_content,
                     word_count=len(child_chunk.page_content),
-                    type="automatic",
+                    type=SegmentType.AUTOMATIC,
                     created_by=dataset_document.created_by,
                 )
                 db.session.add(child_segment)
@@ -222,6 +223,7 @@ class VectorService:
                     )
                     documents.append(new_child_document)
             for update_child_chunk in update_child_chunks:
+                assert update_child_chunk.index_node_id
                 child_document = Document(
                     page_content=update_child_chunk.content,
                     metadata={
@@ -234,6 +236,7 @@ class VectorService:
                 documents.append(child_document)
                 delete_node_ids.append(update_child_chunk.index_node_id)
             for delete_child_chunk in delete_child_chunks:
+                assert delete_child_chunk.index_node_id
                 delete_node_ids.append(delete_child_chunk.index_node_id)
             if dataset.indexing_technique == IndexTechniqueType.HIGH_QUALITY:
                 # update vector index
@@ -246,6 +249,7 @@ class VectorService:
     @classmethod
     def delete_child_chunk_vector(cls, child_chunk: ChildChunk, dataset: Dataset):
         vector = Vector(dataset=dataset)
+        assert child_chunk.index_node_id
         vector.delete_by_ids([child_chunk.index_node_id])
 
     @classmethod
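In the update path, each changed child chunk is rebuilt as a Document while its previous index node id is queued for deletion, and the vector index is only touched for high-quality datasets. A rough sketch of that collect-then-swap step with simplified stand-in types (Doc below is hypothetical, not the real core.rag Document):

from dataclasses import dataclass, field


@dataclass
class Doc:
    """Simplified stand-in for the Document objects built in this hunk."""

    page_content: str
    metadata: dict = field(default_factory=dict)


def build_update_batch(update_chunks: list[dict]) -> tuple[list[Doc], list[str]]:
    documents: list[Doc] = []
    delete_node_ids: list[str] = []
    for chunk in update_chunks:
        # an updated chunk must already carry the node id it is replacing
        assert chunk["index_node_id"]
        documents.append(Doc(page_content=chunk["content"], metadata={"doc_id": chunk["index_node_id"]}))
        delete_node_ids.append(chunk["index_node_id"])
    return documents, delete_node_ids


docs, to_delete = build_update_batch([{"content": "updated text", "index_node_id": "node-1"}])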
@@ -365,7 +365,6 @@ def _make_segment(
 
 def _make_child_chunk() -> ChildChunk:
     return ChildChunk(
-        id="child-a",
         tenant_id="tenant-1",
         dataset_id="dataset-1",
         document_id="doc-1",
@@ -89,7 +89,6 @@ class TestSegmentServiceChildChunks:
         document = _make_document()
         segment = _make_segment()
         existing_a = ChildChunk(
-            id="child-a",
             tenant_id="tenant-1",
             dataset_id="dataset-1",
             document_id="doc-1",
@@ -100,7 +99,6 @@ class TestSegmentServiceChildChunks:
             created_by="user-1",
         )
         existing_b = ChildChunk(
-            id="child-b",
             tenant_id="tenant-1",
             dataset_id="dataset-1",
             document_id="doc-1",
@@ -110,7 +108,8 @@ class TestSegmentServiceChildChunks:
             word_count=9,
             created_by="user-1",
         )
-
+        existing_a.id = "child-a"
+        existing_b.id = "child-b"
         with (
             patch("services.dataset_service.db") as mock_db,
             patch("services.dataset_service.uuid.uuid4", return_value="node-new"),
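The fixtures stop passing id= to ChildChunk and pin the id after construction, which matches a model whose id is declared init=False with a default_factory: the generated __init__ no longer accepts it. A toy illustration of that behaviour, assuming TypeBase follows SQLAlchemy's dataclass mapping (ChunkFixture is a plain dataclass, not the real model):

from dataclasses import dataclass, field
from uuid import uuid4


@dataclass
class ChunkFixture:
    """Toy model mirroring an init=False generated id; not the real ChildChunk."""

    content: str
    id: str = field(default_factory=lambda: str(uuid4()), init=False)


chunk = ChunkFixture(content="hello")  # passing id="child-a" here would raise a TypeError
chunk.id = "child-a"                   # so the tests pin the id after construction
assert chunk.id == "child-a"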