test: migrate clean notion task tests to SQLAlchemy 2.0 APIs (#35159)

This commit is contained in:
jimcody1995
2026-04-16 00:52:11 -07:00
committed by GitHub
parent a1f990584b
commit 7f4fe4d064

View File

@@ -11,7 +11,8 @@ from unittest.mock import Mock, patch
import pytest
from faker import Faker
from sqlalchemy import func, select
from sqlalchemy import ColumnElement, func, select
from sqlalchemy.orm import Session
from core.rag.index_processor.constant.index_type import IndexStructureType
from models.dataset import Dataset, Document, DocumentSegment
@@ -21,6 +22,14 @@ from tasks.clean_notion_document_task import clean_notion_document_task
from tests.test_containers_integration_tests.helpers import generate_valid_password
def _count_documents(session: Session, condition: ColumnElement[bool]) -> int:
    """Count the ``Document`` rows that satisfy ``condition``.

    Args:
        session: Session used to execute the COUNT query.
        condition: Boolean SQL expression applied as the WHERE clause.

    Returns:
        The scalar result of the COUNT query, or ``0`` when that result is falsy.
    """
    count_stmt = select(func.count()).select_from(Document).where(condition)
    matched = session.scalar(count_stmt)
    return matched or 0
def _count_segments(session: Session, condition: ColumnElement[bool]) -> int:
    """Count the ``DocumentSegment`` rows that satisfy ``condition``.

    Args:
        session: Session used to execute the COUNT query.
        condition: Boolean SQL expression applied as the WHERE clause.

    Returns:
        The scalar result of the COUNT query, or ``0`` when that result is falsy.
    """
    count_stmt = select(func.count()).select_from(DocumentSegment).where(condition)
    matched = session.scalar(count_stmt)
    return matched or 0
class TestCleanNotionDocumentTask:
"""Integration tests for clean_notion_document_task using testcontainers."""
@@ -146,29 +155,14 @@ class TestCleanNotionDocumentTask:
db_session_with_containers.commit()
# Verify data exists before cleanup
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(Document).where(Document.id.in_(document_ids))
)
== 3
)
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id.in_(document_ids))
)
== 6
)
assert _count_documents(db_session_with_containers, Document.id.in_(document_ids)) == 3
assert _count_segments(db_session_with_containers, DocumentSegment.document_id.in_(document_ids)) == 6
# Execute cleanup task
clean_notion_document_task(document_ids, dataset.id)
# Verify segments are deleted
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id.in_(document_ids))
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.document_id.in_(document_ids)) == 0
# Verify index processor was called
mock_processor = mock_index_processor_factory.return_value.init_index_processor.return_value
@@ -328,12 +322,7 @@ class TestCleanNotionDocumentTask:
# The task properly handles various index types and document configurations.
# Verify segments are deleted
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id == document.id)
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.document_id == document.id) == 0
# Reset mock for next iteration
mock_index_processor_factory.reset_mock()
@@ -416,12 +405,7 @@ class TestCleanNotionDocumentTask:
clean_notion_document_task([document.id], dataset.id)
# Verify segments are deleted
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id == document.id)
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.document_id == document.id) == 0
# Note: This test successfully verifies that segments without index_node_ids
# are properly deleted from the database.
@@ -507,18 +491,8 @@ class TestCleanNotionDocumentTask:
db_session_with_containers.commit()
# Verify all data exists before cleanup
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(Document).where(Document.dataset_id == dataset.id)
)
== 5
)
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.dataset_id == dataset.id)
)
== 10
)
assert _count_documents(db_session_with_containers, Document.dataset_id == dataset.id) == 5
assert _count_segments(db_session_with_containers, DocumentSegment.dataset_id == dataset.id) == 10
# Clean up only first 3 documents
documents_to_clean = [doc.id for doc in documents[:3]]
@@ -528,29 +502,12 @@ class TestCleanNotionDocumentTask:
clean_notion_document_task(documents_to_clean, dataset.id)
# Verify only specified documents' segments are deleted
assert (
db_session_with_containers.scalar(
select(func.count())
.select_from(DocumentSegment)
.where(DocumentSegment.document_id.in_(documents_to_clean))
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.document_id.in_(documents_to_clean)) == 0
# Verify remaining documents and segments are intact
remaining_docs = [doc.id for doc in documents[3:]]
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(Document).where(Document.id.in_(remaining_docs))
)
== 2
)
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id.in_(remaining_docs))
)
== 4
)
assert _count_documents(db_session_with_containers, Document.id.in_(remaining_docs)) == 2
assert _count_segments(db_session_with_containers, DocumentSegment.document_id.in_(remaining_docs)) == 4
# Note: This test successfully verifies partial document cleanup operations.
# The database operations work correctly, isolating only the specified documents.
@@ -634,23 +591,13 @@ class TestCleanNotionDocumentTask:
db_session_with_containers.commit()
# Verify all segments exist before cleanup
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id == document.id)
)
== 4
)
assert _count_segments(db_session_with_containers, DocumentSegment.document_id == document.id) == 4
# Execute cleanup task
clean_notion_document_task([document.id], dataset.id)
# Verify all segments are deleted regardless of status
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id == document.id)
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.document_id == document.id) == 0
# Note: This test successfully verifies database operations.
# IndexProcessor verification would require more sophisticated mocking.
@@ -820,16 +767,9 @@ class TestCleanNotionDocumentTask:
db_session_with_containers.commit()
# Verify all data exists before cleanup
assert _count_documents(db_session_with_containers, Document.dataset_id == dataset.id) == num_documents
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(Document).where(Document.dataset_id == dataset.id)
)
== num_documents
)
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.dataset_id == dataset.id)
)
_count_segments(db_session_with_containers, DocumentSegment.dataset_id == dataset.id)
== num_documents * num_segments_per_doc
)
@@ -838,12 +778,7 @@ class TestCleanNotionDocumentTask:
clean_notion_document_task(all_document_ids, dataset.id)
# Verify all segments are deleted
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.dataset_id == dataset.id)
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.dataset_id == dataset.id) == 0
# Note: This test successfully verifies bulk document cleanup operations.
# The database efficiently handles large-scale deletions.
@@ -950,29 +885,12 @@ class TestCleanNotionDocumentTask:
clean_notion_document_task([target_document.id], target_dataset.id)
# Verify only documents' segments from target dataset are deleted
assert (
db_session_with_containers.scalar(
select(func.count())
.select_from(DocumentSegment)
.where(DocumentSegment.document_id == target_document.id)
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.document_id == target_document.id) == 0
# Verify documents from other datasets remain intact
remaining_docs = [doc.id for doc in all_documents[1:]]
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(Document).where(Document.id.in_(remaining_docs))
)
== 2
)
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id.in_(remaining_docs))
)
== 6
)
assert _count_documents(db_session_with_containers, Document.id.in_(remaining_docs)) == 2
assert _count_segments(db_session_with_containers, DocumentSegment.document_id.in_(remaining_docs)) == 6
# Note: This test successfully verifies multi-tenant isolation.
# Only documents from the target dataset are affected, maintaining tenant separation.
@@ -1067,13 +985,9 @@ class TestCleanNotionDocumentTask:
db_session_with_containers.commit()
# Verify all data exists before cleanup
assert db_session_with_containers.scalar(
select(func.count()).select_from(Document).where(Document.dataset_id == dataset.id)
) == len(document_statuses)
assert _count_documents(db_session_with_containers, Document.dataset_id == dataset.id) == len(document_statuses)
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.dataset_id == dataset.id)
)
_count_segments(db_session_with_containers, DocumentSegment.dataset_id == dataset.id)
== len(document_statuses) * 2
)
@@ -1082,12 +996,7 @@ class TestCleanNotionDocumentTask:
clean_notion_document_task(all_document_ids, dataset.id)
# Verify all segments are deleted regardless of status
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.dataset_id == dataset.id)
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.dataset_id == dataset.id) == 0
# Note: This test successfully verifies cleanup of documents in various states.
# All documents are deleted regardless of their indexing status.
@@ -1185,29 +1094,14 @@ class TestCleanNotionDocumentTask:
db_session_with_containers.commit()
# Verify data exists before cleanup
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(Document).where(Document.id == document.id)
)
== 1
)
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id == document.id)
)
== 3
)
assert _count_documents(db_session_with_containers, Document.id == document.id) == 1
assert _count_segments(db_session_with_containers, DocumentSegment.document_id == document.id) == 3
# Execute cleanup task
clean_notion_document_task([document.id], dataset.id)
# Verify segments are deleted
assert (
db_session_with_containers.scalar(
select(func.count()).select_from(DocumentSegment).where(DocumentSegment.document_id == document.id)
)
== 0
)
assert _count_segments(db_session_with_containers, DocumentSegment.document_id == document.id) == 0
# Note: This test successfully verifies cleanup of documents with rich metadata.
# The task properly handles complex document structures and metadata fields.