diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index bc28ecb6b7..0b09facf58 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -468,15 +468,98 @@ class DocumentAddByFileApi(DatasetApiResource): return documents_and_batch_fields, 200 +def _update_document_by_file(tenant_id: str, dataset_id: UUID, document_id: UUID) -> tuple[Mapping[str, object], int]: + """Update a document from an uploaded file for canonical and deprecated routes.""" + dataset_id_str = str(dataset_id) + tenant_id_str = str(tenant_id) + dataset = db.session.scalar( + select(Dataset).where(Dataset.tenant_id == tenant_id_str, Dataset.id == dataset_id_str).limit(1) + ) + + if not dataset: + raise ValueError("Dataset does not exist.") + + if dataset.provider == "external": + raise ValueError("External datasets are not supported.") + + args: dict[str, object] = {} + if "data" in request.form: + args = json.loads(request.form["data"]) + if "doc_form" not in args: + args["doc_form"] = dataset.chunk_structure or "text_model" + if "doc_language" not in args: + args["doc_language"] = "English" + + # indexing_technique is already set in dataset since this is an update + args["indexing_technique"] = dataset.indexing_technique + + if "file" in request.files: + # save file info + file = request.files["file"] + + if len(request.files) > 1: + raise TooManyFilesError() + + if not file.filename: + raise FilenameNotExistsError + + if not current_user: + raise ValueError("current_user is required") + + try: + upload_file = FileService(db.engine).upload_file( + filename=file.filename, + content=file.read(), + mimetype=file.mimetype, + user=current_user, + source="datasets", + ) + except services.errors.file.FileTooLargeError as file_too_large_error: + raise FileTooLargeError(file_too_large_error.description) + except services.errors.file.UnsupportedFileTypeError: + raise UnsupportedFileTypeError() + data_source = { + "type": "upload_file", + "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, + } + args["data_source"] = data_source + + # validate args + args["original_document_id"] = str(document_id) + + knowledge_config = KnowledgeConfig.model_validate(args) + DocumentService.document_create_args_validate(knowledge_config) + + try: + documents, _ = DocumentService.save_document_with_dataset_id( + dataset=dataset, + knowledge_config=knowledge_config, + account=dataset.created_by_account, + dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None, + created_from="api", + ) + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + document = documents[0] + documents_and_batch_fields = {"document": marshal(document, document_fields), "batch": document.batch} + return documents_and_batch_fields, 200 + + @service_api_ns.route( "/datasets//documents//update_by_file", "/datasets//documents//update-by-file", ) -class DocumentUpdateByFileApi(DatasetApiResource): - """Resource for update documents.""" +class DeprecatedDocumentUpdateByFileApi(DatasetApiResource): + """Deprecated resource aliases for file document updates.""" - @service_api_ns.doc("update_document_by_file") - @service_api_ns.doc(description="Update an existing document by uploading a file") + @service_api_ns.doc("update_document_by_file_deprecated") + @service_api_ns.doc(deprecated=True) + @service_api_ns.doc( + description=( + "Deprecated legacy alias for updating an existing document by uploading a file. " + "Use PATCH /datasets/{dataset_id}/documents/{document_id} instead." + ) + ) @service_api_ns.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"}) @service_api_ns.doc( responses={ @@ -487,82 +570,9 @@ class DocumentUpdateByFileApi(DatasetApiResource): ) @cloud_edition_billing_resource_check("vector_space", "dataset") @cloud_edition_billing_rate_limit_check("knowledge", "dataset") - def post(self, tenant_id, dataset_id, document_id): - """Update document by upload file.""" - dataset = db.session.scalar( - select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).limit(1) - ) - - if not dataset: - raise ValueError("Dataset does not exist.") - - if dataset.provider == "external": - raise ValueError("External datasets are not supported.") - - args = {} - if "data" in request.form: - args = json.loads(request.form["data"]) - if "doc_form" not in args: - args["doc_form"] = dataset.chunk_structure or "text_model" - if "doc_language" not in args: - args["doc_language"] = "English" - - # get dataset info - dataset_id = str(dataset_id) - tenant_id = str(tenant_id) - - # indexing_technique is already set in dataset since this is an update - args["indexing_technique"] = dataset.indexing_technique - - if "file" in request.files: - # save file info - file = request.files["file"] - - if len(request.files) > 1: - raise TooManyFilesError() - - if not file.filename: - raise FilenameNotExistsError - - if not current_user: - raise ValueError("current_user is required") - - try: - upload_file = FileService(db.engine).upload_file( - filename=file.filename, - content=file.read(), - mimetype=file.mimetype, - user=current_user, - source="datasets", - ) - except services.errors.file.FileTooLargeError as file_too_large_error: - raise FileTooLargeError(file_too_large_error.description) - except services.errors.file.UnsupportedFileTypeError: - raise UnsupportedFileTypeError() - data_source = { - "type": "upload_file", - "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, - } - args["data_source"] = data_source - # validate args - args["original_document_id"] = str(document_id) - - knowledge_config = KnowledgeConfig.model_validate(args) - DocumentService.document_create_args_validate(knowledge_config) - - try: - documents, _ = DocumentService.save_document_with_dataset_id( - dataset=dataset, - knowledge_config=knowledge_config, - account=dataset.created_by_account, - dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None, - created_from="api", - ) - except ProviderTokenNotInitError as ex: - raise ProviderNotInitializeError(ex.description) - document = documents[0] - documents_and_batch_fields = {"document": marshal(document, document_fields), "batch": document.batch} - return documents_and_batch_fields, 200 + def post(self, tenant_id: str, dataset_id: UUID, document_id: UUID): + """Update document by file through the deprecated file-update aliases.""" + return _update_document_by_file(tenant_id=tenant_id, dataset_id=dataset_id, document_id=document_id) @service_api_ns.route("/datasets//documents") @@ -876,6 +886,22 @@ class DocumentApi(DatasetApiResource): return response + @service_api_ns.doc("update_document_by_file") + @service_api_ns.doc(description="Update an existing document by uploading a file") + @service_api_ns.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"}) + @service_api_ns.doc( + responses={ + 200: "Document updated successfully", + 401: "Unauthorized - invalid API token", + 404: "Document not found", + } + ) + @cloud_edition_billing_resource_check("vector_space", "dataset") + @cloud_edition_billing_rate_limit_check("knowledge", "dataset") + def patch(self, tenant_id: str, dataset_id: UUID, document_id: UUID): + """Update document by file on the canonical document resource.""" + return _update_document_by_file(tenant_id=tenant_id, dataset_id=dataset_id, document_id=document_id) + @service_api_ns.doc("delete_document") @service_api_ns.doc(description="Delete a document") @service_api_ns.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"}) diff --git a/api/tests/unit_tests/controllers/service_api/dataset/test_document.py b/api/tests/unit_tests/controllers/service_api/dataset/test_document.py index 1b391e67ec..230c51161f 100644 --- a/api/tests/unit_tests/controllers/service_api/dataset/test_document.py +++ b/api/tests/unit_tests/controllers/service_api/dataset/test_document.py @@ -23,6 +23,7 @@ from werkzeug.exceptions import Forbidden, NotFound from controllers.service_api.dataset.document import ( DeprecatedDocumentAddByTextApi, + DeprecatedDocumentUpdateByFileApi, DeprecatedDocumentUpdateByTextApi, DocumentAddByFileApi, DocumentAddByTextApi, @@ -32,7 +33,6 @@ from controllers.service_api.dataset.document import ( DocumentListQuery, DocumentTextCreatePayload, DocumentTextUpdate, - DocumentUpdateByFileApi, DocumentUpdateByTextApi, InvalidMetadataError, ) @@ -1095,8 +1095,8 @@ class TestArchivedDocumentImmutableError: assert error.code == 403 -class TestDocumentTextRouteDeprecation: - """Test that legacy underscore text routes stay marked deprecated.""" +class TestDocumentRouteDeprecation: + """Test that legacy document routes stay marked deprecated.""" def test_create_by_text_legacy_alias_is_deprecated(self): """Ensure only the legacy create-by-text alias is marked deprecated.""" @@ -1108,10 +1108,15 @@ class TestDocumentTextRouteDeprecation: assert DeprecatedDocumentUpdateByTextApi.post.__apidoc__["deprecated"] is True assert DocumentUpdateByTextApi.post.__apidoc__.get("deprecated") is not True + def test_update_by_file_legacy_aliases_are_deprecated(self): + """Ensure only the legacy file-update aliases are marked deprecated.""" + assert DeprecatedDocumentUpdateByFileApi.post.__apidoc__["deprecated"] is True + assert DocumentApi.patch.__apidoc__.get("deprecated") is not True + # ============================================================================= # Endpoint tests for DocumentUpdateByTextApi, DocumentAddByFileApi, -# DocumentUpdateByFileApi. +# and the canonical/deprecated document file update routes. # # These controllers use ``@cloud_edition_billing_resource_check`` (does NOT # preserve ``__wrapped__``) and ``@cloud_edition_billing_rate_limit_check`` @@ -1359,13 +1364,52 @@ class TestDocumentAddByFileApiPost: api.post(tenant_id=mock_tenant.id, dataset_id=mock_dataset.id) -class TestDocumentUpdateByFileApiPost: - """Test suite for DocumentUpdateByFileApi.post() endpoint. +class TestDocumentUpdateByFileApiPatch: + """Test suite for the canonical document file update endpoint. - ``post`` is wrapped by ``@cloud_edition_billing_resource_check`` and + ``patch`` is wrapped by ``@cloud_edition_billing_resource_check`` and ``@cloud_edition_billing_rate_limit_check``. """ + @pytest.mark.parametrize("route_name", ["update_by_file", "update-by-file"]) + @patch("controllers.service_api.dataset.document._update_document_by_file") + @patch("controllers.service_api.wraps.FeatureService") + @patch("controllers.service_api.wraps.validate_and_get_api_token") + def test_update_by_file_deprecated_aliases_delegate_to_shared_handler( + self, + mock_validate_token, + mock_feature_svc, + mock_update_document_by_file, + route_name, + app, + mock_tenant, + mock_dataset, + ): + """Test legacy POST aliases still dispatch while marked deprecated.""" + _setup_billing_mocks(mock_validate_token, mock_feature_svc, mock_tenant.id) + mock_update_document_by_file.return_value = ({"document": {"id": "doc-1"}, "batch": "batch-1"}, 200) + + doc_id = str(uuid.uuid4()) + with app.test_request_context( + f"/datasets/{mock_dataset.id}/documents/{doc_id}/{route_name}", + method="POST", + headers={"Authorization": "Bearer test_token"}, + ): + api = DeprecatedDocumentUpdateByFileApi() + response, status = api.post( + tenant_id=mock_tenant.id, + dataset_id=mock_dataset.id, + document_id=doc_id, + ) + + assert status == 200 + assert response["batch"] == "batch-1" + mock_update_document_by_file.assert_called_once_with( + tenant_id=mock_tenant.id, + dataset_id=mock_dataset.id, + document_id=doc_id, + ) + @patch("controllers.service_api.dataset.document.db") @patch("controllers.service_api.wraps.FeatureService") @patch("controllers.service_api.wraps.validate_and_get_api_token") @@ -1387,15 +1431,15 @@ class TestDocumentUpdateByFileApiPost: doc_id = str(uuid.uuid4()) data = {"file": (BytesIO(b"content"), "test.pdf", "application/pdf")} with app.test_request_context( - f"/datasets/{mock_dataset.id}/documents/{doc_id}/update_by_file", - method="POST", + f"/datasets/{mock_dataset.id}/documents/{doc_id}", + method="PATCH", content_type="multipart/form-data", data=data, headers={"Authorization": "Bearer test_token"}, ): - api = DocumentUpdateByFileApi() + api = DocumentApi() with pytest.raises(ValueError, match="Dataset does not exist"): - api.post( + api.patch( tenant_id=mock_tenant.id, dataset_id=mock_dataset.id, document_id=doc_id, @@ -1423,15 +1467,15 @@ class TestDocumentUpdateByFileApiPost: doc_id = str(uuid.uuid4()) data = {"file": (BytesIO(b"content"), "test.pdf", "application/pdf")} with app.test_request_context( - f"/datasets/{mock_dataset.id}/documents/{doc_id}/update_by_file", - method="POST", + f"/datasets/{mock_dataset.id}/documents/{doc_id}", + method="PATCH", content_type="multipart/form-data", data=data, headers={"Authorization": "Bearer test_token"}, ): - api = DocumentUpdateByFileApi() + api = DocumentApi() with pytest.raises(ValueError, match="External datasets"): - api.post( + api.patch( tenant_id=mock_tenant.id, dataset_id=mock_dataset.id, document_id=doc_id, @@ -1482,14 +1526,14 @@ class TestDocumentUpdateByFileApiPost: doc_id = str(uuid.uuid4()) data = {"file": (BytesIO(b"file content"), "test.pdf", "application/pdf")} with app.test_request_context( - f"/datasets/{mock_dataset.id}/documents/{doc_id}/update_by_file", - method="POST", + f"/datasets/{mock_dataset.id}/documents/{doc_id}", + method="PATCH", content_type="multipart/form-data", data=data, headers={"Authorization": "Bearer test_token"}, ): - api = DocumentUpdateByFileApi() - response, status = api.post( + api = DocumentApi() + response, status = api.patch( tenant_id=mock_tenant.id, dataset_id=mock_dataset.id, document_id=doc_id,