diff --git a/backend/internal/pkg/apicompat/chatcompletions_responses_test.go b/backend/internal/pkg/apicompat/chatcompletions_responses_test.go index 8b819033..f54a4a02 100644 --- a/backend/internal/pkg/apicompat/chatcompletions_responses_test.go +++ b/backend/internal/pkg/apicompat/chatcompletions_responses_test.go @@ -181,6 +181,35 @@ func TestChatCompletionsToResponses_ImageURL(t *testing.T) { assert.Equal(t, "data:image/png;base64,abc123", parts[1].ImageURL) } +func TestChatCompletionsToResponses_SystemArrayContent(t *testing.T) { + req := &ChatCompletionsRequest{ + Model: "gpt-4o", + Messages: []ChatMessage{ + {Role: "system", Content: json.RawMessage(`[{"type":"text","text":"You are a careful visual assistant."}]`)}, + {Role: "user", Content: json.RawMessage(`[{"type":"text","text":"Describe this image"},{"type":"image_url","image_url":{"url":"data:image/png;base64,abc123"}}]`)}, + }, + } + + resp, err := ChatCompletionsToResponses(req) + require.NoError(t, err) + + var items []ResponsesInputItem + require.NoError(t, json.Unmarshal(resp.Input, &items)) + require.Len(t, items, 2) + + var systemParts []ResponsesContentPart + require.NoError(t, json.Unmarshal(items[0].Content, &systemParts)) + require.Len(t, systemParts, 1) + assert.Equal(t, "input_text", systemParts[0].Type) + assert.Equal(t, "You are a careful visual assistant.", systemParts[0].Text) + + var userParts []ResponsesContentPart + require.NoError(t, json.Unmarshal(items[1].Content, &userParts)) + require.Len(t, userParts, 2) + assert.Equal(t, "input_image", userParts[1].Type) + assert.Equal(t, "data:image/png;base64,abc123", userParts[1].ImageURL) +} + func TestChatCompletionsToResponses_LegacyFunctions(t *testing.T) { req := &ChatCompletionsRequest{ Model: "gpt-4o", @@ -398,6 +427,45 @@ func TestResponsesToChatCompletions_Reasoning(t *testing.T) { assert.Equal(t, "I thought about it.", chat.Choices[0].Message.ReasoningContent) } +func TestChatCompletionsToResponses_ToolArrayContent(t *testing.T) { + req := &ChatCompletionsRequest{ + Model: "gpt-4o", + Messages: []ChatMessage{ + {Role: "user", Content: json.RawMessage(`"Use the tool"`)}, + { + Role: "assistant", + ToolCalls: []ChatToolCall{ + { + ID: "call_1", + Type: "function", + Function: ChatFunctionCall{ + Name: "inspect_image", + Arguments: `{}`, + }, + }, + }, + }, + { + Role: "tool", + ToolCallID: "call_1", + Content: json.RawMessage( + `[{"type":"text","text":"image width: 100"},{"type":"image_url","image_url":{"url":"data:image/png;base64,ignored"}},{"type":"text","text":"; image height: 200"}]`, + ), + }, + }, + } + + resp, err := ChatCompletionsToResponses(req) + require.NoError(t, err) + + var items []ResponsesInputItem + require.NoError(t, json.Unmarshal(resp.Input, &items)) + require.Len(t, items, 3) + assert.Equal(t, "function_call_output", items[2].Type) + assert.Equal(t, "call_1", items[2].CallID) + assert.Equal(t, "image width: 100; image height: 200", items[2].Output) +} + func TestResponsesToChatCompletions_Incomplete(t *testing.T) { resp := &ResponsesResponse{ ID: "resp_inc", diff --git a/backend/internal/pkg/apicompat/chatcompletions_to_responses.go b/backend/internal/pkg/apicompat/chatcompletions_to_responses.go index c4a9e773..6cdd012a 100644 --- a/backend/internal/pkg/apicompat/chatcompletions_to_responses.go +++ b/backend/internal/pkg/apicompat/chatcompletions_to_responses.go @@ -6,6 +6,11 @@ import ( "strings" ) +type chatMessageContent struct { + Text *string + Parts []ChatContentPart +} + // ChatCompletionsToResponses converts a Chat Completions request into a // Responses API request. The upstream always streams, so Stream is forced to // true. store is always false and reasoning.encrypted_content is always @@ -113,11 +118,11 @@ func chatMessageToResponsesItems(m ChatMessage) ([]ResponsesInputItem, error) { // chatSystemToResponses converts a system message. func chatSystemToResponses(m ChatMessage) ([]ResponsesInputItem, error) { - text, err := parseChatContent(m.Content) + parsed, err := parseChatMessageContent(m.Content) if err != nil { return nil, err } - content, err := json.Marshal(text) + content, err := marshalChatInputContent(parsed) if err != nil { return nil, err } @@ -127,39 +132,11 @@ func chatSystemToResponses(m ChatMessage) ([]ResponsesInputItem, error) { // chatUserToResponses converts a user message, handling both plain strings and // multi-modal content arrays. func chatUserToResponses(m ChatMessage) ([]ResponsesInputItem, error) { - // Try plain string first. - var s string - if err := json.Unmarshal(m.Content, &s); err == nil { - content, _ := json.Marshal(s) - return []ResponsesInputItem{{Role: "user", Content: content}}, nil - } - - var parts []ChatContentPart - if err := json.Unmarshal(m.Content, &parts); err != nil { + parsed, err := parseChatMessageContent(m.Content) + if err != nil { return nil, fmt.Errorf("parse user content: %w", err) } - - var responseParts []ResponsesContentPart - for _, p := range parts { - switch p.Type { - case "text": - if p.Text != "" { - responseParts = append(responseParts, ResponsesContentPart{ - Type: "input_text", - Text: p.Text, - }) - } - case "image_url": - if p.ImageURL != nil && p.ImageURL.URL != "" { - responseParts = append(responseParts, ResponsesContentPart{ - Type: "input_image", - ImageURL: p.ImageURL.URL, - }) - } - } - } - - content, err := json.Marshal(responseParts) + content, err := marshalChatInputContent(parsed) if err != nil { return nil, err } @@ -312,16 +289,79 @@ func chatFunctionToResponses(m ChatMessage) ([]ResponsesInputItem, error) { } // parseChatContent returns the string value of a ChatMessage Content field. -// Content must be a JSON string. Returns "" if content is null or empty. +// Content can be a JSON string or an array of typed parts. Array content is +// flattened to text by concatenating text parts and ignoring non-text parts. func parseChatContent(raw json.RawMessage) (string, error) { + parsed, err := parseChatMessageContent(raw) + if err != nil { + return "", err + } + if parsed.Text != nil { + return *parsed.Text, nil + } + return flattenChatContentParts(parsed.Parts), nil +} + +func parseChatMessageContent(raw json.RawMessage) (chatMessageContent, error) { if len(raw) == 0 { - return "", nil + return chatMessageContent{Text: stringPtr("")}, nil } + var s string - if err := json.Unmarshal(raw, &s); err != nil { - return "", fmt.Errorf("parse content as string: %w", err) + if err := json.Unmarshal(raw, &s); err == nil { + return chatMessageContent{Text: &s}, nil } - return s, nil + + var parts []ChatContentPart + if err := json.Unmarshal(raw, &parts); err == nil { + return chatMessageContent{Parts: parts}, nil + } + + return chatMessageContent{}, fmt.Errorf("parse content as string or parts array") +} + +func marshalChatInputContent(content chatMessageContent) (json.RawMessage, error) { + if content.Text != nil { + return json.Marshal(*content.Text) + } + return json.Marshal(convertChatContentPartsToResponses(content.Parts)) +} + +func convertChatContentPartsToResponses(parts []ChatContentPart) []ResponsesContentPart { + var responseParts []ResponsesContentPart + for _, p := range parts { + switch p.Type { + case "text": + if p.Text != "" { + responseParts = append(responseParts, ResponsesContentPart{ + Type: "input_text", + Text: p.Text, + }) + } + case "image_url": + if p.ImageURL != nil && p.ImageURL.URL != "" { + responseParts = append(responseParts, ResponsesContentPart{ + Type: "input_image", + ImageURL: p.ImageURL.URL, + }) + } + } + } + return responseParts +} + +func flattenChatContentParts(parts []ChatContentPart) string { + var textParts []string + for _, p := range parts { + if p.Type == "text" && p.Text != "" { + textParts = append(textParts, p.Text) + } + } + return strings.Join(textParts, "") +} + +func stringPtr(s string) *string { + return &s } // convertChatToolsToResponses maps Chat Completions tool definitions and legacy