feat: optimize MiniMax TTS request struct

This commit is contained in:
feitianbubu
2025-10-20 14:46:04 +08:00
parent a2d34b9e47
commit d52505cac1
2 changed files with 112 additions and 128 deletions

View File

@@ -34,17 +34,20 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf
return nil, errors.New("unsupported audio relay mode") return nil, errors.New("unsupported audio relay mode")
} }
voiceID := mapVoiceType(request.Voice) voiceID := request.Voice
speed := request.Speed speed := request.Speed
outputFormat := mapOutputFormat(request.ResponseFormat) outputFormat := request.ResponseFormat
c.Set("response_format", outputFormat)
minimaxRequest := MiniMaxTTSRequest{ minimaxRequest := MiniMaxTTSRequest{
Model: getTTSModel(info.OriginModelName), Model: info.OriginModelName,
Text: request.Input, Text: request.Input,
VoiceID: voiceID, VoiceSetting: VoiceSetting{
Speed: speed, VoiceID: voiceID,
Speed: speed,
},
AudioSetting: &AudioSetting{
Format: outputFormat,
},
OutputFormat: outputFormat, OutputFormat: outputFormat,
} }
@@ -59,6 +62,11 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf
if err != nil { if err != nil {
return nil, fmt.Errorf("error marshalling minimax request: %w", err) return nil, fmt.Errorf("error marshalling minimax request: %w", err)
} }
if outputFormat != "hex" {
outputFormat = "url"
}
c.Set("response_format", outputFormat)
// Debug: log the request structure // Debug: log the request structure
fmt.Printf("MiniMax TTS Request: %s\n", string(jsonData)) fmt.Printf("MiniMax TTS Request: %s\n", string(jsonData))
@@ -79,12 +87,6 @@ func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) {
func (a *Adaptor) SetupRequestHeader(c *gin.Context, req *http.Header, info *relaycommon.RelayInfo) error { func (a *Adaptor) SetupRequestHeader(c *gin.Context, req *http.Header, info *relaycommon.RelayInfo) error {
channel.SetupApiRequestHeader(info, c, req) channel.SetupApiRequestHeader(info, c, req)
if info.RelayMode == constant.RelayModeAudioSpeech {
req.Set("Content-Type", "application/json")
return nil
}
req.Set("Authorization", "Bearer "+info.ApiKey) req.Set("Authorization", "Bearer "+info.ApiKey)
return nil return nil
} }

View File

@@ -1,11 +1,13 @@
package minimax package minimax
import ( import (
"encoding/base64" "encoding/hex"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"io" "io"
"net/http" "net/http"
"strings"
"github.com/QuantumNous/new-api/dto" "github.com/QuantumNous/new-api/dto"
relaycommon "github.com/QuantumNous/new-api/relay/common" relaycommon "github.com/QuantumNous/new-api/relay/common"
@@ -14,96 +16,78 @@ import (
) )
type MiniMaxTTSRequest struct { type MiniMaxTTSRequest struct {
Model string `json:"model"` Model string `json:"model"`
Text string `json:"text"` Text string `json:"text"`
VoiceID string `json:"voice_id"` Stream bool `json:"stream,omitempty"`
Speed float64 `json:"speed,omitempty"` StreamOptions *StreamOptions `json:"stream_options,omitempty"`
Vol float64 `json:"vol,omitempty"` VoiceSetting VoiceSetting `json:"voice_setting"`
Pitch int `json:"pitch,omitempty"` PronunciationDict *PronunciationDict `json:"pronunciation_dict,omitempty"`
AudioSampleRate int `json:"audio_sample_rate,omitempty"` AudioSetting *AudioSetting `json:"audio_setting,omitempty"`
OutputFormat string `json:"output_format,omitempty"` TimbreWeights []TimbreWeight `json:"timbre_weights,omitempty"`
LanguageBoost string `json:"language_boost,omitempty"`
VoiceModify *VoiceModify `json:"voice_modify,omitempty"`
SubtitleEnable bool `json:"subtitle_enable,omitempty"`
OutputFormat string `json:"output_format,omitempty"`
AigcWatermark bool `json:"aigc_watermark,omitempty"`
}
// StreamOptions is the object that MiniMaxTTSRequest serializes under the
// "stream_options" JSON key. The request holds it as a pointer with
// omitempty, so a nil value leaves the key out of the request body.
type StreamOptions struct {
// Serialized as "exclude_aggregated_audio"; dropped from the JSON when false.
// NOTE(review): presumably tells MiniMax not to append the aggregated audio
// to a streamed response — confirm against the MiniMax API docs.
ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"`
}
// VoiceSetting is the object that MiniMaxTTSRequest serializes under the
// "voice_setting" JSON key. The request embeds it by value without omitempty,
// so the key is always present in the request body.
//
// ConvertAudioRequest fills only VoiceID and Speed, copying them unchanged
// from the incoming request's Voice and Speed fields.
type VoiceSetting struct {
// Always serialized (no omitempty), even when empty.
VoiceID string `json:"voice_id"`
// The remaining fields are omitted from the JSON when they hold their zero value.
Speed float64 `json:"speed,omitempty"`
Vol float64 `json:"vol,omitempty"`
Pitch int `json:"pitch,omitempty"`
Emotion string `json:"emotion,omitempty"`
TextNormalization bool `json:"text_normalization,omitempty"`
LatexRead bool `json:"latex_read,omitempty"`
}
// PronunciationDict is the object that MiniMaxTTSRequest serializes under the
// "pronunciation_dict" JSON key. The request holds it as a pointer with
// omitempty, so a nil value leaves the key out of the request body.
type PronunciationDict struct {
// Serialized as "tone"; omitted from the JSON when the slice is empty.
// NOTE(review): the expected format of each entry is not visible here —
// confirm against the MiniMax API docs.
Tone []string `json:"tone,omitempty"`
}
// AudioSetting is the object that MiniMaxTTSRequest serializes under the
// "audio_setting" JSON key. The request holds it as a pointer with omitempty,
// so a nil value leaves the key out of the request body.
//
// ConvertAudioRequest populates only Format, using the incoming request's
// ResponseFormat value as-is.
type AudioSetting struct {
// Every field is omitted from the JSON when it holds its zero value.
SampleRate int `json:"sample_rate,omitempty"`
Bitrate int `json:"bitrate,omitempty"`
Format string `json:"format,omitempty"`
Channel int `json:"channel,omitempty"`
ForceCbr bool `json:"force_cbr,omitempty"`
}
// TimbreWeight is the element type of MiniMaxTTSRequest.TimbreWeights, which
// is serialized under the "timbre_weights" JSON key.
//
// Neither field uses omitempty, so both keys are always written for every
// element, including an empty VoiceID or a zero Weight.
type TimbreWeight struct {
VoiceID string `json:"voice_id"`
// NOTE(review): the valid range and unit of Weight are not visible here —
// confirm against the MiniMax API docs.
Weight int `json:"weight"`
}
type VoiceModify struct {
Pitch int `json:"pitch,omitempty"`
Intensity int `json:"intensity,omitempty"`
Timbre int `json:"timbre,omitempty"`
SoundEffects string `json:"sound_effects,omitempty"`
} }
type MiniMaxTTSResponse struct { type MiniMaxTTSResponse struct {
Created int `json:"created"` Data MiniMaxTTSData `json:"data"`
Data []MiniMaxTTSData `json:"data"` ExtraInfo MiniMaxExtraInfo `json:"extra_info"`
ID string `json:"id"` TraceID string `json:"trace_id"`
Model string `json:"model"` BaseResp MiniMaxBaseResp `json:"base_resp"`
Object string `json:"object"`
Usage MiniMaxTTSUsage `json:"usage"`
} }
type MiniMaxTTSData struct { type MiniMaxTTSData struct {
Index int `json:"index"` Audio string `json:"audio"`
Audio string `json:"audio"` Status int `json:"status"`
Text string `json:"text"`
FinishReason string `json:"finish_reason"`
} }
type MiniMaxTTSUsage struct { type MiniMaxExtraInfo struct {
TotalTokens int `json:"total_tokens"` UsageCharacters int64 `json:"usage_characters"`
} }
type MiniMaxTTSErrorResponse struct { type MiniMaxBaseResp struct {
Error MiniMaxTTSError `json:"error"` StatusCode int64 `json:"status_code"`
} StatusMsg string `json:"status_msg"`
type MiniMaxTTSError struct {
Code string `json:"code"`
Message string `json:"message"`
Type string `json:"type"`
}
// OpenAI voice to MiniMax voice_id mapping
var openAIToMiniMaxVoiceMap = map[string]string{
"alloy": "male-qn-qingse",
"echo": "male-qn-jingying",
"fable": "female-shaonv",
"onyx": "male-qn-badao",
"nova": "female-shaonv-jingpin",
"shimmer": "female-yujie",
// Add some standard MiniMax voice IDs
"voice-1": "male-qn-qingse",
"voice-2": "female-shaonv",
}
// OpenAI response format to MiniMax output format mapping
var responseFormatToOutputFormatMap = map[string]string{
"mp3": "mp3",
"opus": "mp3",
"aac": "aac",
"flac": "flac",
"wav": "wav",
"pcm": "pcm",
}
// TTS model mapping - MiniMax uses speech-01 or speech-01-turbo
var modelToTTSModelMap = map[string]string{
"speech-01": "speech-01",
"speech-01-turbo": "speech-01-turbo",
"tts-1": "speech-01-turbo",
"tts-1-hd": "speech-01",
}
func mapVoiceType(openAIVoice string) string {
if voice, ok := openAIToMiniMaxVoiceMap[openAIVoice]; ok {
return voice
}
return "female-shaonv" // default voice
}
func mapOutputFormat(responseFormat string) string {
if format, ok := responseFormatToOutputFormatMap[responseFormat]; ok {
return format
}
return "mp3" // default format
}
func getTTSModel(modelName string) string {
if ttsModel, ok := modelToTTSModelMap[modelName]; ok {
return ttsModel
}
return "speech-01-turbo" // default model
} }
func getContentTypeByFormat(format string) string { func getContentTypeByFormat(format string) string {
@@ -124,66 +108,64 @@ func handleTTSResponse(c *gin.Context, resp *http.Response, info *relaycommon.Re
body, readErr := io.ReadAll(resp.Body) body, readErr := io.ReadAll(resp.Body)
if readErr != nil { if readErr != nil {
return nil, types.NewErrorWithStatusCode( return nil, types.NewErrorWithStatusCode(
errors.New("failed to read minimax response"), fmt.Errorf("failed to read minimax response: %w", readErr),
types.ErrorCodeReadResponseBodyFailed, types.ErrorCodeReadResponseBodyFailed,
http.StatusInternalServerError, http.StatusInternalServerError,
) )
} }
defer resp.Body.Close() defer resp.Body.Close()
// First try to parse as error response // Parse response
var errorResp MiniMaxTTSErrorResponse
if unmarshalErr := json.Unmarshal(body, &errorResp); unmarshalErr == nil && errorResp.Error.Code != "" {
return nil, types.NewErrorWithStatusCode(
errors.New(errorResp.Error.Message),
types.ErrorCodeBadResponse,
http.StatusBadRequest,
)
}
// Parse as successful response
var minimaxResp MiniMaxTTSResponse var minimaxResp MiniMaxTTSResponse
if unmarshalErr := json.Unmarshal(body, &minimaxResp); unmarshalErr != nil { if unmarshalErr := json.Unmarshal(body, &minimaxResp); unmarshalErr != nil {
return nil, types.NewErrorWithStatusCode( return nil, types.NewErrorWithStatusCode(
errors.New("failed to parse minimax response"), fmt.Errorf("failed to unmarshal minimax TTS response: %w", unmarshalErr),
types.ErrorCodeBadResponseBody, types.ErrorCodeBadResponseBody,
http.StatusInternalServerError, http.StatusInternalServerError,
) )
} }
// Check base_resp status code
if minimaxResp.BaseResp.StatusCode != 0 {
return nil, types.NewErrorWithStatusCode(
fmt.Errorf("minimax TTS error: %d - %s", minimaxResp.BaseResp.StatusCode, minimaxResp.BaseResp.StatusMsg),
types.ErrorCodeBadResponse,
http.StatusBadRequest,
)
}
// Check if we have audio data // Check if we have audio data
if len(minimaxResp.Data) == 0 || minimaxResp.Data[0].Audio == "" { if minimaxResp.Data.Audio == "" {
return nil, types.NewErrorWithStatusCode( return nil, types.NewErrorWithStatusCode(
errors.New("no audio data in response"), fmt.Errorf("no audio data in minimax TTS response"),
types.ErrorCodeBadResponse, types.ErrorCodeBadResponse,
http.StatusBadRequest, http.StatusBadRequest,
) )
} }
// Decode base64 audio data if strings.HasPrefix(minimaxResp.Data.Audio, "http") {
audioData, decodeErr := base64.StdEncoding.DecodeString(minimaxResp.Data[0].Audio) c.Redirect(http.StatusFound, minimaxResp.Data.Audio)
if decodeErr != nil { } else {
return nil, types.NewErrorWithStatusCode( // Handle hex-encoded audio data
errors.New("failed to decode audio data"), audioData, decodeErr := hex.DecodeString(minimaxResp.Data.Audio)
types.ErrorCodeBadResponseBody, if decodeErr != nil {
http.StatusInternalServerError, return nil, types.NewErrorWithStatusCode(
) fmt.Errorf("failed to decode hex audio data: %w", decodeErr),
} types.ErrorCodeBadResponse,
http.StatusInternalServerError,
)
}
// Get output format from context or default to mp3 // Determine content type - default to mp3
outputFormat := c.GetString("response_format") contentType := "audio/mpeg"
if outputFormat == "" {
outputFormat = "mp3"
}
contentType := getContentTypeByFormat(outputFormat) c.Data(http.StatusOK, contentType, audioData)
c.Header("Content-Type", contentType) }
c.Data(http.StatusOK, contentType, audioData)
usage = &dto.Usage{ usage = &dto.Usage{
PromptTokens: info.PromptTokens, PromptTokens: info.PromptTokens,
CompletionTokens: 0, CompletionTokens: 0,
TotalTokens: minimaxResp.Usage.TotalTokens, TotalTokens: int(minimaxResp.ExtraInfo.UsageCharacters),
} }
return usage, nil return usage, nil