From 86c856f56f43bba2d3016a21c3d619f38fba196a Mon Sep 17 00:00:00 2001 From: Luis Pater Date: Sun, 19 Apr 2026 03:21:59 +0800 Subject: [PATCH] feat(translator): add partial and full image generation support in Codex-GPT and Codex-Gemini flows - Introduced `LastImageHashByItemID` in Codex-GPT and `LastImageHashByID` in Codex-Gemini for deduplication of generated images. - Added support for handling `partial_image` and `image_generation_call` types, with inline data embedding for Gemini and URL payload conversion for GPT. - Extended unit tests to verify image handling in both streaming and non-streaming modes. --- .../codex/gemini/codex_gemini_response.go | 92 +++++++++++++ .../gemini/codex_gemini_response_test.go | 76 +++++++++++ .../chat-completions/codex_openai_response.go | 125 +++++++++++++++++- .../codex_openai_response_test.go | 59 +++++++++ 4 files changed, 351 insertions(+), 1 deletion(-) diff --git a/internal/translator/codex/gemini/codex_gemini_response.go b/internal/translator/codex/gemini/codex_gemini_response.go index f6ef87710..a2e4e20ea 100644 --- a/internal/translator/codex/gemini/codex_gemini_response.go +++ b/internal/translator/codex/gemini/codex_gemini_response.go @@ -7,6 +7,8 @@ package gemini import ( "bytes" "context" + "crypto/sha256" + "strings" "time" translatorcommon "github.com/router-for-me/CLIProxyAPI/v6/internal/translator/common" @@ -25,6 +27,7 @@ type ConvertCodexResponseToGeminiParams struct { ResponseID string LastStorageOutput []byte HasOutputTextDelta bool + LastImageHashByID map[string][32]byte } // ConvertCodexResponseToGemini converts Codex streaming response format to Gemini format. @@ -48,6 +51,7 @@ func ConvertCodexResponseToGemini(_ context.Context, modelName string, originalR ResponseID: "", LastStorageOutput: nil, HasOutputTextDelta: false, + LastImageHashByID: make(map[string][32]byte), } } @@ -74,10 +78,63 @@ func ConvertCodexResponseToGemini(_ context.Context, modelName string, originalR template, _ = sjson.SetBytes(template, "responseId", params.ResponseID) } + if typeStr == "response.image_generation_call.partial_image" { + itemID := rootResult.Get("item_id").String() + b64 := rootResult.Get("partial_image_b64").String() + if b64 == "" { + return [][]byte{} + } + if itemID != "" { + if params.LastImageHashByID == nil { + params.LastImageHashByID = make(map[string][32]byte) + } + hash := sha256.Sum256([]byte(b64)) + if last, ok := params.LastImageHashByID[itemID]; ok && last == hash { + return [][]byte{} + } + params.LastImageHashByID[itemID] = hash + } + + outputFormat := rootResult.Get("output_format").String() + mimeType := mimeTypeFromCodexOutputFormat(outputFormat) + + part := []byte(`{"inlineData":{"data":"","mimeType":""}}`) + part, _ = sjson.SetBytes(part, "inlineData.data", b64) + part, _ = sjson.SetBytes(part, "inlineData.mimeType", mimeType) + template, _ = sjson.SetRawBytes(template, "candidates.0.content.parts.-1", part) + return [][]byte{template} + } + // Handle function call completion if typeStr == "response.output_item.done" { itemResult := rootResult.Get("item") itemType := itemResult.Get("type").String() + if itemType == "image_generation_call" { + itemID := itemResult.Get("id").String() + b64 := itemResult.Get("result").String() + if b64 == "" { + return [][]byte{} + } + if itemID != "" { + if params.LastImageHashByID == nil { + params.LastImageHashByID = make(map[string][32]byte) + } + hash := sha256.Sum256([]byte(b64)) + if last, ok := params.LastImageHashByID[itemID]; ok && last == hash { + return [][]byte{} + } + params.LastImageHashByID[itemID] = hash + } + + outputFormat := itemResult.Get("output_format").String() + mimeType := mimeTypeFromCodexOutputFormat(outputFormat) + + part := []byte(`{"inlineData":{"data":"","mimeType":""}}`) + part, _ = sjson.SetBytes(part, "inlineData.data", b64) + part, _ = sjson.SetBytes(part, "inlineData.mimeType", mimeType) + template, _ = sjson.SetRawBytes(template, "candidates.0.content.parts.-1", part) + return [][]byte{template} + } if itemType == "function_call" { // Create function call part functionCall := []byte(`{"functionCall":{"name":"","args":{}}}`) @@ -270,6 +327,20 @@ func ConvertCodexResponseToGeminiNonStream(_ context.Context, modelName string, }) } + case "image_generation_call": + flushPendingFunctionCalls() + b64 := value.Get("result").String() + if b64 == "" { + break + } + outputFormat := value.Get("output_format").String() + mimeType := mimeTypeFromCodexOutputFormat(outputFormat) + + part := []byte(`{"inlineData":{"data":"","mimeType":""}}`) + part, _ = sjson.SetBytes(part, "inlineData.data", b64) + part, _ = sjson.SetBytes(part, "inlineData.mimeType", mimeType) + template, _ = sjson.SetRawBytes(template, "candidates.0.content.parts.-1", part) + case "function_call": // Collect function call for potential merging with consecutive ones hasToolCall = true @@ -342,3 +413,24 @@ func buildReverseMapFromGeminiOriginal(original []byte) map[string]string { func GeminiTokenCount(ctx context.Context, count int64) []byte { return translatorcommon.GeminiTokenCountJSON(count) } + +func mimeTypeFromCodexOutputFormat(outputFormat string) string { + if outputFormat == "" { + return "image/png" + } + if strings.Contains(outputFormat, "/") { + return outputFormat + } + switch strings.ToLower(outputFormat) { + case "png": + return "image/png" + case "jpg", "jpeg": + return "image/jpeg" + case "webp": + return "image/webp" + case "gif": + return "image/gif" + default: + return "image/png" + } +} diff --git a/internal/translator/codex/gemini/codex_gemini_response_test.go b/internal/translator/codex/gemini/codex_gemini_response_test.go index b8f227beb..547ee8471 100644 --- a/internal/translator/codex/gemini/codex_gemini_response_test.go +++ b/internal/translator/codex/gemini/codex_gemini_response_test.go @@ -33,3 +33,79 @@ func TestConvertCodexResponseToGemini_StreamEmptyOutputUsesOutputItemDoneMessage t.Fatalf("expected fallback content from response.output_item.done message; outputs=%q", outputs) } } + +func TestConvertCodexResponseToGemini_StreamPartialImageEmitsInlineData(t *testing.T) { + ctx := context.Background() + originalRequest := []byte(`{"tools":[]}`) + var param any + + chunk := []byte(`data: {"type":"response.image_generation_call.partial_image","item_id":"ig_123","output_format":"png","partial_image_b64":"aGVsbG8=","partial_image_index":0}`) + out := ConvertCodexResponseToGemini(ctx, "gemini-2.5-pro", originalRequest, nil, chunk, ¶m) + if len(out) != 1 { + t.Fatalf("expected 1 chunk, got %d", len(out)) + } + + got := gjson.GetBytes(out[0], "candidates.0.content.parts.0.inlineData.data").String() + if got != "aGVsbG8=" { + t.Fatalf("expected inlineData.data %q, got %q; chunk=%s", "aGVsbG8=", got, string(out[0])) + } + + gotMime := gjson.GetBytes(out[0], "candidates.0.content.parts.0.inlineData.mimeType").String() + if gotMime != "image/png" { + t.Fatalf("expected inlineData.mimeType %q, got %q; chunk=%s", "image/png", gotMime, string(out[0])) + } + + out = ConvertCodexResponseToGemini(ctx, "gemini-2.5-pro", originalRequest, nil, chunk, ¶m) + if len(out) != 0 { + t.Fatalf("expected duplicate image chunk to be suppressed, got %d", len(out)) + } +} + +func TestConvertCodexResponseToGemini_StreamImageGenerationCallDoneEmitsInlineData(t *testing.T) { + ctx := context.Background() + originalRequest := []byte(`{"tools":[]}`) + var param any + + out := ConvertCodexResponseToGemini(ctx, "gemini-2.5-pro", originalRequest, nil, []byte(`data: {"type":"response.image_generation_call.partial_image","item_id":"ig_123","output_format":"png","partial_image_b64":"aGVsbG8=","partial_image_index":0}`), ¶m) + if len(out) != 1 { + t.Fatalf("expected 1 chunk, got %d", len(out)) + } + + out = ConvertCodexResponseToGemini(ctx, "gemini-2.5-pro", originalRequest, nil, []byte(`data: {"type":"response.output_item.done","item":{"id":"ig_123","type":"image_generation_call","output_format":"png","result":"aGVsbG8="}}`), ¶m) + if len(out) != 0 { + t.Fatalf("expected output_item.done to be suppressed when identical to last partial image, got %d", len(out)) + } + + out = ConvertCodexResponseToGemini(ctx, "gemini-2.5-pro", originalRequest, nil, []byte(`data: {"type":"response.output_item.done","item":{"id":"ig_123","type":"image_generation_call","output_format":"jpeg","result":"Ymll"}}`), ¶m) + if len(out) != 1 { + t.Fatalf("expected 1 chunk, got %d", len(out)) + } + + got := gjson.GetBytes(out[0], "candidates.0.content.parts.0.inlineData.data").String() + if got != "Ymll" { + t.Fatalf("expected inlineData.data %q, got %q; chunk=%s", "Ymll", got, string(out[0])) + } + + gotMime := gjson.GetBytes(out[0], "candidates.0.content.parts.0.inlineData.mimeType").String() + if gotMime != "image/jpeg" { + t.Fatalf("expected inlineData.mimeType %q, got %q; chunk=%s", "image/jpeg", gotMime, string(out[0])) + } +} + +func TestConvertCodexResponseToGemini_NonStreamImageGenerationCallAddsInlineDataPart(t *testing.T) { + ctx := context.Background() + originalRequest := []byte(`{"tools":[]}`) + + raw := []byte(`{"type":"response.completed","response":{"id":"resp_123","created_at":1700000000,"usage":{"input_tokens":1,"output_tokens":1},"output":[{"type":"message","content":[{"type":"output_text","text":"ok"}]},{"type":"image_generation_call","output_format":"png","result":"aGVsbG8="}]}}`) + out := ConvertCodexResponseToGeminiNonStream(ctx, "gemini-2.5-pro", originalRequest, nil, raw, nil) + + got := gjson.GetBytes(out, "candidates.0.content.parts.1.inlineData.data").String() + if got != "aGVsbG8=" { + t.Fatalf("expected inlineData.data %q, got %q; chunk=%s", "aGVsbG8=", got, string(out)) + } + + gotMime := gjson.GetBytes(out, "candidates.0.content.parts.1.inlineData.mimeType").String() + if gotMime != "image/png" { + t.Fatalf("expected inlineData.mimeType %q, got %q; chunk=%s", "image/png", gotMime, string(out)) + } +} diff --git a/internal/translator/codex/openai/chat-completions/codex_openai_response.go b/internal/translator/codex/openai/chat-completions/codex_openai_response.go index afae35d48..75b5b848b 100644 --- a/internal/translator/codex/openai/chat-completions/codex_openai_response.go +++ b/internal/translator/codex/openai/chat-completions/codex_openai_response.go @@ -8,6 +8,8 @@ package chat_completions import ( "bytes" "context" + "crypto/sha256" + "strings" "time" "github.com/tidwall/gjson" @@ -26,6 +28,7 @@ type ConvertCliToOpenAIParams struct { FunctionCallIndex int HasReceivedArgumentsDelta bool HasToolCallAnnounced bool + LastImageHashByItemID map[string][32]byte } // ConvertCodexResponseToOpenAI translates a single chunk of a streaming response from the @@ -51,6 +54,7 @@ func ConvertCodexResponseToOpenAI(_ context.Context, modelName string, originalR FunctionCallIndex: -1, HasReceivedArgumentsDelta: false, HasToolCallAnnounced: false, + LastImageHashByItemID: make(map[string][32]byte), } } @@ -70,6 +74,9 @@ func ConvertCodexResponseToOpenAI(_ context.Context, modelName string, originalR (*param).(*ConvertCliToOpenAIParams).ResponseID = rootResult.Get("response.id").String() (*param).(*ConvertCliToOpenAIParams).CreatedAt = rootResult.Get("response.created_at").Int() (*param).(*ConvertCliToOpenAIParams).Model = rootResult.Get("response.model").String() + if (*param).(*ConvertCliToOpenAIParams).LastImageHashByItemID == nil { + (*param).(*ConvertCliToOpenAIParams).LastImageHashByItemID = make(map[string][32]byte) + } return [][]byte{} } @@ -120,6 +127,39 @@ func ConvertCodexResponseToOpenAI(_ context.Context, modelName string, originalR template, _ = sjson.SetBytes(template, "choices.0.delta.role", "assistant") template, _ = sjson.SetBytes(template, "choices.0.delta.content", deltaResult.String()) } + } else if dataType == "response.image_generation_call.partial_image" { + itemID := rootResult.Get("item_id").String() + b64 := rootResult.Get("partial_image_b64").String() + if b64 == "" { + return [][]byte{} + } + if itemID != "" { + p := (*param).(*ConvertCliToOpenAIParams) + if p.LastImageHashByItemID == nil { + p.LastImageHashByItemID = make(map[string][32]byte) + } + hash := sha256.Sum256([]byte(b64)) + if last, ok := p.LastImageHashByItemID[itemID]; ok && last == hash { + return [][]byte{} + } + p.LastImageHashByItemID[itemID] = hash + } + + outputFormat := rootResult.Get("output_format").String() + mimeType := mimeTypeFromCodexOutputFormat(outputFormat) + imageURL := "data:" + mimeType + ";base64," + b64 + + imagesResult := gjson.GetBytes(template, "choices.0.delta.images") + if !imagesResult.Exists() || !imagesResult.IsArray() { + template, _ = sjson.SetRawBytes(template, "choices.0.delta.images", []byte(`[]`)) + } + imageIndex := len(gjson.GetBytes(template, "choices.0.delta.images").Array()) + imagePayload := []byte(`{"type":"image_url","image_url":{"url":""}}`) + imagePayload, _ = sjson.SetBytes(imagePayload, "index", imageIndex) + imagePayload, _ = sjson.SetBytes(imagePayload, "image_url.url", imageURL) + + template, _ = sjson.SetBytes(template, "choices.0.delta.role", "assistant") + template, _ = sjson.SetRawBytes(template, "choices.0.delta.images.-1", imagePayload) } else if dataType == "response.completed" { finishReason := "stop" if (*param).(*ConvertCliToOpenAIParams).FunctionCallIndex != -1 { @@ -183,7 +223,46 @@ func ConvertCodexResponseToOpenAI(_ context.Context, modelName string, originalR } else if dataType == "response.output_item.done" { itemResult := rootResult.Get("item") - if !itemResult.Exists() || itemResult.Get("type").String() != "function_call" { + if !itemResult.Exists() { + return [][]byte{} + } + itemType := itemResult.Get("type").String() + if itemType == "image_generation_call" { + itemID := itemResult.Get("id").String() + b64 := itemResult.Get("result").String() + if b64 == "" { + return [][]byte{} + } + if itemID != "" { + p := (*param).(*ConvertCliToOpenAIParams) + if p.LastImageHashByItemID == nil { + p.LastImageHashByItemID = make(map[string][32]byte) + } + hash := sha256.Sum256([]byte(b64)) + if last, ok := p.LastImageHashByItemID[itemID]; ok && last == hash { + return [][]byte{} + } + p.LastImageHashByItemID[itemID] = hash + } + + outputFormat := itemResult.Get("output_format").String() + mimeType := mimeTypeFromCodexOutputFormat(outputFormat) + imageURL := "data:" + mimeType + ";base64," + b64 + + imagesResult := gjson.GetBytes(template, "choices.0.delta.images") + if !imagesResult.Exists() || !imagesResult.IsArray() { + template, _ = sjson.SetRawBytes(template, "choices.0.delta.images", []byte(`[]`)) + } + imageIndex := len(gjson.GetBytes(template, "choices.0.delta.images").Array()) + imagePayload := []byte(`{"type":"image_url","image_url":{"url":""}}`) + imagePayload, _ = sjson.SetBytes(imagePayload, "index", imageIndex) + imagePayload, _ = sjson.SetBytes(imagePayload, "image_url.url", imageURL) + + template, _ = sjson.SetBytes(template, "choices.0.delta.role", "assistant") + template, _ = sjson.SetRawBytes(template, "choices.0.delta.images.-1", imagePayload) + return [][]byte{template} + } + if itemType != "function_call" { return [][]byte{} } @@ -285,6 +364,7 @@ func ConvertCodexResponseToOpenAINonStream(_ context.Context, _ string, original // Process the output array for content and function calls var toolCalls [][]byte + var images [][]byte outputResult := responseResult.Get("output") if outputResult.IsArray() { outputArray := outputResult.Array() @@ -339,6 +419,19 @@ func ConvertCodexResponseToOpenAINonStream(_ context.Context, _ string, original } toolCalls = append(toolCalls, functionCallTemplate) + case "image_generation_call": + b64 := outputItem.Get("result").String() + if b64 == "" { + break + } + outputFormat := outputItem.Get("output_format").String() + mimeType := mimeTypeFromCodexOutputFormat(outputFormat) + imageURL := "data:" + mimeType + ";base64," + b64 + + imagePayload := []byte(`{"type":"image_url","image_url":{"url":""}}`) + imagePayload, _ = sjson.SetBytes(imagePayload, "index", len(images)) + imagePayload, _ = sjson.SetBytes(imagePayload, "image_url.url", imageURL) + images = append(images, imagePayload) } } @@ -361,6 +454,15 @@ func ConvertCodexResponseToOpenAINonStream(_ context.Context, _ string, original } template, _ = sjson.SetBytes(template, "choices.0.message.role", "assistant") } + + // Add images if any + if len(images) > 0 { + template, _ = sjson.SetRawBytes(template, "choices.0.message.images", []byte(`[]`)) + for _, image := range images { + template, _ = sjson.SetRawBytes(template, "choices.0.message.images.-1", image) + } + template, _ = sjson.SetBytes(template, "choices.0.message.role", "assistant") + } } // Extract and set the finish reason based on status @@ -409,3 +511,24 @@ func buildReverseMapFromOriginalOpenAI(original []byte) map[string]string { } return rev } + +func mimeTypeFromCodexOutputFormat(outputFormat string) string { + if outputFormat == "" { + return "image/png" + } + if strings.Contains(outputFormat, "/") { + return outputFormat + } + switch strings.ToLower(outputFormat) { + case "png": + return "image/png" + case "jpg", "jpeg": + return "image/jpeg" + case "webp": + return "image/webp" + case "gif": + return "image/gif" + default: + return "image/png" + } +} diff --git a/internal/translator/codex/openai/chat-completions/codex_openai_response_test.go b/internal/translator/codex/openai/chat-completions/codex_openai_response_test.go index 534884c22..a6bb486fd 100644 --- a/internal/translator/codex/openai/chat-completions/codex_openai_response_test.go +++ b/internal/translator/codex/openai/chat-completions/codex_openai_response_test.go @@ -90,3 +90,62 @@ func TestConvertCodexResponseToOpenAI_ToolCallArgumentsDeltaOmitsNullContentFiel t.Fatalf("expected tool call arguments delta to exist, got %s", string(out[0])) } } + +func TestConvertCodexResponseToOpenAI_StreamPartialImageEmitsDeltaImages(t *testing.T) { + ctx := context.Background() + var param any + + chunk := []byte(`data: {"type":"response.image_generation_call.partial_image","item_id":"ig_123","output_format":"png","partial_image_b64":"aGVsbG8=","partial_image_index":0}`) + + out := ConvertCodexResponseToOpenAI(ctx, "gpt-5.4", nil, nil, chunk, ¶m) + if len(out) != 1 { + t.Fatalf("expected 1 chunk, got %d", len(out)) + } + + gotURL := gjson.GetBytes(out[0], "choices.0.delta.images.0.image_url.url").String() + if gotURL != "data:image/png;base64,aGVsbG8=" { + t.Fatalf("expected image url %q, got %q; chunk=%s", "data:image/png;base64,aGVsbG8=", gotURL, string(out[0])) + } + + out = ConvertCodexResponseToOpenAI(ctx, "gpt-5.4", nil, nil, chunk, ¶m) + if len(out) != 0 { + t.Fatalf("expected duplicate image chunk to be suppressed, got %d", len(out)) + } +} + +func TestConvertCodexResponseToOpenAI_StreamImageGenerationCallDoneEmitsDeltaImages(t *testing.T) { + ctx := context.Background() + var param any + + out := ConvertCodexResponseToOpenAI(ctx, "gpt-5.4", nil, nil, []byte(`data: {"type":"response.image_generation_call.partial_image","item_id":"ig_123","output_format":"png","partial_image_b64":"aGVsbG8=","partial_image_index":0}`), ¶m) + if len(out) != 1 { + t.Fatalf("expected 1 chunk, got %d", len(out)) + } + + out = ConvertCodexResponseToOpenAI(ctx, "gpt-5.4", nil, nil, []byte(`data: {"type":"response.output_item.done","item":{"id":"ig_123","type":"image_generation_call","output_format":"png","result":"aGVsbG8="}}`), ¶m) + if len(out) != 0 { + t.Fatalf("expected output_item.done to be suppressed when identical to last partial image, got %d", len(out)) + } + + out = ConvertCodexResponseToOpenAI(ctx, "gpt-5.4", nil, nil, []byte(`data: {"type":"response.output_item.done","item":{"id":"ig_123","type":"image_generation_call","output_format":"jpeg","result":"Ymll"}}`), ¶m) + if len(out) != 1 { + t.Fatalf("expected 1 chunk, got %d", len(out)) + } + + gotURL := gjson.GetBytes(out[0], "choices.0.delta.images.0.image_url.url").String() + if gotURL != "data:image/jpeg;base64,Ymll" { + t.Fatalf("expected image url %q, got %q; chunk=%s", "data:image/jpeg;base64,Ymll", gotURL, string(out[0])) + } +} + +func TestConvertCodexResponseToOpenAI_NonStreamImageGenerationCallAddsMessageImages(t *testing.T) { + ctx := context.Background() + + raw := []byte(`{"type":"response.completed","response":{"id":"resp_123","created_at":1700000000,"model":"gpt-5.4","status":"completed","usage":{"input_tokens":1,"output_tokens":1,"total_tokens":2},"output":[{"type":"message","content":[{"type":"output_text","text":"ok"}]},{"type":"image_generation_call","output_format":"png","result":"aGVsbG8="}]}}`) + out := ConvertCodexResponseToOpenAINonStream(ctx, "gpt-5.4", nil, nil, raw, nil) + + gotURL := gjson.GetBytes(out, "choices.0.message.images.0.image_url.url").String() + if gotURL != "data:image/png;base64,aGVsbG8=" { + t.Fatalf("expected image url %q, got %q; chunk=%s", "data:image/png;base64,aGVsbG8=", gotURL, string(out)) + } +}