feat(gemini): implement video generation configuration and billing estimation
- Added Gemini video generation configuration structures and payloads.
- Introduced functions for parsing and resolving video duration and resolution from metadata.
- Enhanced the Vertex adaptor to support Gemini video generation requests and billing estimation based on duration and resolution.
- Updated model pricing settings for new Gemini video models.
This commit is contained in:
@@ -43,6 +43,7 @@ func (m *OpenAIVideo) SetMetadata(k string, v any) {
|
|||||||
func NewOpenAIVideo() *OpenAIVideo {
|
func NewOpenAIVideo() *OpenAIVideo {
|
||||||
return &OpenAIVideo{
|
return &OpenAIVideo{
|
||||||
Object: "video",
|
Object: "video",
|
||||||
|
Status: VideoStatusQueued,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -22,64 +22,6 @@ import (
|
|||||||
"github.com/pkg/errors"
|
"github.com/pkg/errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ============================
|
|
||||||
// Request / Response structures
|
|
||||||
// ============================
|
|
||||||
|
|
||||||
// GeminiVideoGenerationConfig represents the video generation configuration
|
|
||||||
// Based on: https://ai.google.dev/gemini-api/docs/video
|
|
||||||
type GeminiVideoGenerationConfig struct {
|
|
||||||
AspectRatio string `json:"aspectRatio,omitempty"` // "16:9" or "9:16"
|
|
||||||
DurationSeconds float64 `json:"durationSeconds,omitempty"` // 4, 6, or 8 (as number)
|
|
||||||
NegativePrompt string `json:"negativePrompt,omitempty"` // unwanted elements
|
|
||||||
PersonGeneration string `json:"personGeneration,omitempty"` // "allow_all" for text-to-video, "allow_adult" for image-to-video
|
|
||||||
Resolution string `json:"resolution,omitempty"` // video resolution
|
|
||||||
}
|
|
||||||
|
|
||||||
// GeminiVideoRequest represents a single video generation instance
|
|
||||||
type GeminiVideoRequest struct {
|
|
||||||
Prompt string `json:"prompt"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// GeminiVideoPayload represents the complete video generation request payload
|
|
||||||
type GeminiVideoPayload struct {
|
|
||||||
Instances []GeminiVideoRequest `json:"instances"`
|
|
||||||
Parameters GeminiVideoGenerationConfig `json:"parameters,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type submitResponse struct {
|
|
||||||
Name string `json:"name"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type operationVideo struct {
|
|
||||||
MimeType string `json:"mimeType"`
|
|
||||||
BytesBase64Encoded string `json:"bytesBase64Encoded"`
|
|
||||||
Encoding string `json:"encoding"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type operationResponse struct {
|
|
||||||
Name string `json:"name"`
|
|
||||||
Done bool `json:"done"`
|
|
||||||
Response struct {
|
|
||||||
Type string `json:"@type"`
|
|
||||||
RaiMediaFilteredCount int `json:"raiMediaFilteredCount"`
|
|
||||||
Videos []operationVideo `json:"videos"`
|
|
||||||
BytesBase64Encoded string `json:"bytesBase64Encoded"`
|
|
||||||
Encoding string `json:"encoding"`
|
|
||||||
Video string `json:"video"`
|
|
||||||
GenerateVideoResponse struct {
|
|
||||||
GeneratedSamples []struct {
|
|
||||||
Video struct {
|
|
||||||
URI string `json:"uri"`
|
|
||||||
} `json:"video"`
|
|
||||||
} `json:"generatedSamples"`
|
|
||||||
} `json:"generateVideoResponse"`
|
|
||||||
} `json:"response"`
|
|
||||||
Error struct {
|
|
||||||
Message string `json:"message"`
|
|
||||||
} `json:"error"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// ============================
|
// ============================
|
||||||
// Adaptor implementation
|
// Adaptor implementation
|
||||||
// ============================
|
// ============================
|
||||||
@@ -99,17 +41,16 @@ func (a *TaskAdaptor) Init(info *relaycommon.RelayInfo) {
|
|||||||
|
|
||||||
// ValidateRequestAndSetAction parses body, validates fields and sets default action.
|
// ValidateRequestAndSetAction parses body, validates fields and sets default action.
|
||||||
func (a *TaskAdaptor) ValidateRequestAndSetAction(c *gin.Context, info *relaycommon.RelayInfo) (taskErr *dto.TaskError) {
|
func (a *TaskAdaptor) ValidateRequestAndSetAction(c *gin.Context, info *relaycommon.RelayInfo) (taskErr *dto.TaskError) {
|
||||||
// Use the standard validation method for TaskSubmitReq
|
|
||||||
return relaycommon.ValidateBasicTaskRequest(c, info, constant.TaskActionTextGenerate)
|
return relaycommon.ValidateBasicTaskRequest(c, info, constant.TaskActionTextGenerate)
|
||||||
}
|
}
|
||||||
|
|
||||||
// BuildRequestURL constructs the upstream URL.
|
// BuildRequestURL constructs the Gemini API generateVideos endpoint.
|
||||||
func (a *TaskAdaptor) BuildRequestURL(info *relaycommon.RelayInfo) (string, error) {
|
func (a *TaskAdaptor) BuildRequestURL(info *relaycommon.RelayInfo) (string, error) {
|
||||||
modelName := info.UpstreamModelName
|
modelName := info.UpstreamModelName
|
||||||
version := model_setting.GetGeminiVersionSetting(modelName)
|
version := model_setting.GetGeminiVersionSetting(modelName)
|
||||||
|
|
||||||
return fmt.Sprintf(
|
return fmt.Sprintf(
|
||||||
"%s/%s/models/%s:predictLongRunning",
|
"%s/%s/models/%s:generateVideos",
|
||||||
a.baseURL,
|
a.baseURL,
|
||||||
version,
|
version,
|
||||||
modelName,
|
modelName,
|
||||||
@@ -124,7 +65,7 @@ func (a *TaskAdaptor) BuildRequestHeader(c *gin.Context, req *http.Request, info
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// BuildRequestBody converts request into Gemini specific format.
|
// BuildRequestBody converts request into the Gemini API generateVideos format.
|
||||||
func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayInfo) (io.Reader, error) {
|
func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayInfo) (io.Reader, error) {
|
||||||
v, ok := c.Get("task_request")
|
v, ok := c.Get("task_request")
|
||||||
if !ok {
|
if !ok {
|
||||||
@@ -135,18 +76,34 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
|
|||||||
return nil, fmt.Errorf("unexpected task_request type")
|
return nil, fmt.Errorf("unexpected task_request type")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create structured video generation request
|
|
||||||
body := GeminiVideoPayload{
|
body := GeminiVideoPayload{
|
||||||
Instances: []GeminiVideoRequest{
|
Prompt: req.Prompt,
|
||||||
{Prompt: req.Prompt},
|
Config: &GeminiVideoGenerationConfig{},
|
||||||
},
|
|
||||||
Parameters: GeminiVideoGenerationConfig{},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
metadata := req.Metadata
|
if img := ExtractMultipartImage(c, info); img != nil {
|
||||||
if err := taskcommon.UnmarshalMetadata(metadata, &body.Parameters); err != nil {
|
body.Image = img
|
||||||
|
} else if len(req.Images) > 0 {
|
||||||
|
if parsed := ParseImageInput(req.Images[0]); parsed != nil {
|
||||||
|
body.Image = parsed
|
||||||
|
info.Action = constant.TaskActionGenerate
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := taskcommon.UnmarshalMetadata(req.Metadata, body.Config); err != nil {
|
||||||
return nil, errors.Wrap(err, "unmarshal metadata failed")
|
return nil, errors.Wrap(err, "unmarshal metadata failed")
|
||||||
}
|
}
|
||||||
|
if body.Config.DurationSeconds == 0 && req.Duration > 0 {
|
||||||
|
body.Config.DurationSeconds = req.Duration
|
||||||
|
}
|
||||||
|
if body.Config.Resolution == "" && req.Size != "" {
|
||||||
|
body.Config.Resolution = SizeToVeoResolution(req.Size)
|
||||||
|
}
|
||||||
|
if body.Config.AspectRatio == "" && req.Size != "" {
|
||||||
|
body.Config.AspectRatio = SizeToVeoAspectRatio(req.Size)
|
||||||
|
}
|
||||||
|
body.Config.Resolution = strings.ToLower(body.Config.Resolution)
|
||||||
|
body.Config.NumberOfVideos = 1
|
||||||
|
|
||||||
data, err := common.Marshal(body)
|
data, err := common.Marshal(body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -186,14 +143,40 @@ func (a *TaskAdaptor) DoResponse(c *gin.Context, resp *http.Response, info *rela
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *TaskAdaptor) GetModelList() []string {
|
func (a *TaskAdaptor) GetModelList() []string {
|
||||||
return []string{"veo-3.0-generate-001", "veo-3.1-generate-preview", "veo-3.1-fast-generate-preview"}
|
return []string{
|
||||||
|
"veo-3.0-generate-001",
|
||||||
|
"veo-3.0-fast-generate-001",
|
||||||
|
"veo-3.1-generate-preview",
|
||||||
|
"veo-3.1-fast-generate-preview",
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *TaskAdaptor) GetChannelName() string {
|
func (a *TaskAdaptor) GetChannelName() string {
|
||||||
return "gemini"
|
return "gemini"
|
||||||
}
|
}
|
||||||
|
|
||||||
// FetchTask fetch task status
|
// EstimateBilling returns OtherRatios based on durationSeconds and resolution.
|
||||||
|
func (a *TaskAdaptor) EstimateBilling(c *gin.Context, info *relaycommon.RelayInfo) map[string]float64 {
|
||||||
|
v, ok := c.Get("task_request")
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
req, ok := v.(relaycommon.TaskSubmitReq)
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
seconds := ResolveVeoDuration(req.Metadata, req.Duration, req.Seconds)
|
||||||
|
resolution := ResolveVeoResolution(req.Metadata, req.Size)
|
||||||
|
resRatio := VeoResolutionRatio(info.UpstreamModelName, resolution)
|
||||||
|
|
||||||
|
return map[string]float64{
|
||||||
|
"seconds": float64(seconds),
|
||||||
|
"resolution": resRatio,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FetchTask polls task status via the Gemini operations GET endpoint.
|
||||||
func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy string) (*http.Response, error) {
|
func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy string) (*http.Response, error) {
|
||||||
taskID, ok := body["task_id"].(string)
|
taskID, ok := body["task_id"].(string)
|
||||||
if !ok {
|
if !ok {
|
||||||
@@ -205,7 +188,6 @@ func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy
|
|||||||
return nil, fmt.Errorf("decode task_id failed: %w", err)
|
return nil, fmt.Errorf("decode task_id failed: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// For Gemini API, we use GET request to the operations endpoint
|
|
||||||
version := model_setting.GetGeminiVersionSetting("default")
|
version := model_setting.GetGeminiVersionSetting("default")
|
||||||
url := fmt.Sprintf("%s/%s/%s", baseUrl, version, upstreamName)
|
url := fmt.Sprintf("%s/%s/%s", baseUrl, version, upstreamName)
|
||||||
|
|
||||||
@@ -249,11 +231,9 @@ func (a *TaskAdaptor) ParseTaskResult(respBody []byte) (*relaycommon.TaskInfo, e
|
|||||||
ti.Progress = "100%"
|
ti.Progress = "100%"
|
||||||
|
|
||||||
ti.TaskID = taskcommon.EncodeLocalTaskID(op.Name)
|
ti.TaskID = taskcommon.EncodeLocalTaskID(op.Name)
|
||||||
// Url intentionally left empty — the caller constructs the proxy URL using the public task ID
|
|
||||||
|
|
||||||
// Extract URL from generateVideoResponse if available
|
if len(op.Response.GenerateVideoResponse.GeneratedVideos) > 0 {
|
||||||
if len(op.Response.GenerateVideoResponse.GeneratedSamples) > 0 {
|
if uri := op.Response.GenerateVideoResponse.GeneratedVideos[0].Video.URI; uri != "" {
|
||||||
if uri := op.Response.GenerateVideoResponse.GeneratedSamples[0].Video.URI; uri != "" {
|
|
||||||
ti.RemoteUrl = uri
|
ti.RemoteUrl = uri
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -262,8 +242,6 @@ func (a *TaskAdaptor) ParseTaskResult(respBody []byte) (*relaycommon.TaskInfo, e
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *TaskAdaptor) ConvertToOpenAIVideo(task *model.Task) ([]byte, error) {
|
func (a *TaskAdaptor) ConvertToOpenAIVideo(task *model.Task) ([]byte, error) {
|
||||||
// Use GetUpstreamTaskID() to get the real upstream operation name for model extraction.
|
|
||||||
// task.TaskID is now a public task_xxxx ID, no longer a base64-encoded upstream name.
|
|
||||||
upstreamTaskID := task.GetUpstreamTaskID()
|
upstreamTaskID := task.GetUpstreamTaskID()
|
||||||
upstreamName, err := taskcommon.DecodeLocalTaskID(upstreamTaskID)
|
upstreamName, err := taskcommon.DecodeLocalTaskID(upstreamTaskID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
138
relay/channel/task/gemini/billing.go
Normal file
138
relay/channel/task/gemini/billing.go
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
package gemini
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ParseVeoDurationSeconds extracts durationSeconds from metadata.
//
// It returns 8 (the Veo default) when metadata is nil, the key is absent, or
// the value cannot be interpreted as a positive whole number of seconds.
// Besides float64 and int it also accepts float32, int32, int64 and base-10
// numeric strings, so metadata that was not produced by a JSON decoder is not
// silently replaced by the default.
func ParseVeoDurationSeconds(metadata map[string]any) int {
	const defaultSeconds = 8

	// Indexing a nil map is safe and simply reports ok == false.
	v, ok := metadata["durationSeconds"]
	if !ok {
		return defaultSeconds
	}

	seconds := 0
	switch n := v.(type) {
	case float64:
		// Compare before converting: values below 1 would truncate to 0 and
		// NaN fails the comparison, so neither reaches the conversion.
		if n >= 1 {
			seconds = int(n)
		}
	case float32:
		if n >= 1 {
			seconds = int(n)
		}
	case int:
		seconds = n
	case int32:
		seconds = int(n)
	case int64:
		seconds = int(n)
	case string:
		if parsed, err := strconv.Atoi(strings.TrimSpace(n)); err == nil {
			seconds = parsed
		}
	}

	if seconds > 0 {
		return seconds
	}
	return defaultSeconds
}
|
||||||
|
|
||||||
|
// ParseVeoResolution reads the "resolution" entry from metadata and returns it
// lowercased. It falls back to "720p" when metadata is nil, the key is
// missing, or the value is not a non-empty string.
func ParseVeoResolution(metadata map[string]any) string {
	const fallback = "720p"

	// A lookup on a nil map yields the zero value and a failed string
	// assertion leaves label empty, so one emptiness check covers every
	// fallback case.
	label, _ := metadata["resolution"].(string)
	if label == "" {
		return fallback
	}
	return strings.ToLower(label)
}
|
||||||
|
|
||||||
|
// ResolveVeoDuration returns the effective duration in seconds.
//
// Priority: metadata["durationSeconds"] > stdDuration > stdSeconds > default (8).
// A source is used only when it yields a positive number of seconds. In
// particular, a metadata entry that is present but unusable (zero, negative,
// or of an unsupported type) no longer short-circuits to the default; the
// standard fields are consulted instead.
func ResolveVeoDuration(metadata map[string]any, stdDuration int, stdSeconds string) int {
	// Indexing a nil map is safe, so no separate nil check is needed.
	if v, exists := metadata["durationSeconds"]; exists {
		if d := veoSecondsFromAny(v); d > 0 {
			return d
		}
	}
	if stdDuration > 0 {
		return stdDuration
	}
	if s, err := strconv.Atoi(strings.TrimSpace(stdSeconds)); err == nil && s > 0 {
		return s
	}
	return 8
}

// veoSecondsFromAny converts a loosely typed metadata value into whole
// seconds, returning 0 when the value is not a usable positive number.
func veoSecondsFromAny(v any) int {
	seconds := 0
	switch n := v.(type) {
	case float64:
		// Compare before converting so that sub-second values and NaN are
		// rejected rather than converted.
		if n >= 1 {
			seconds = int(n)
		}
	case float32:
		if n >= 1 {
			seconds = int(n)
		}
	case int:
		seconds = n
	case int32:
		seconds = int(n)
	case int64:
		seconds = int(n)
	case string:
		if parsed, err := strconv.Atoi(strings.TrimSpace(n)); err == nil {
			seconds = parsed
		}
	}
	if seconds < 0 {
		return 0
	}
	return seconds
}
|
||||||
|
|
||||||
|
// ResolveVeoResolution returns the effective resolution string (lowercase).
//
// Priority: metadata["resolution"] > SizeToVeoResolution(stdSize) > default ("720p").
// The metadata entry is used only when it is a non-empty string. An empty or
// non-string entry falls through to stdSize rather than short-circuiting to
// the default, so the resolution used for billing follows the same fallback
// the request builder applies.
func ResolveVeoResolution(metadata map[string]any, stdSize string) string {
	// Indexing a nil map is safe; a failed assertion leaves ok == false.
	if r, ok := metadata["resolution"].(string); ok && r != "" {
		return strings.ToLower(r)
	}
	if stdSize != "" {
		return SizeToVeoResolution(stdSize)
	}
	return "720p"
}

// SizeToVeoResolution converts a "WxH" size string to a Veo resolution label.
//
// The larger of the two dimensions decides the label: >= 3840 is "4k",
// >= 1920 is "1080p", anything else is "720p". Strings without an "x"
// separator also map to "720p". The separator is matched case-insensitively
// and surrounding whitespace on either dimension is ignored.
func SizeToVeoResolution(size string) string {
	widthText, heightText, found := strings.Cut(strings.ToLower(size), "x")
	if !found {
		return "720p"
	}

	// Parse errors are deliberately ignored: an unparsable dimension counts
	// as 0 and therefore cannot raise the label above the "720p" floor.
	w, _ := strconv.Atoi(strings.TrimSpace(widthText))
	h, _ := strconv.Atoi(strings.TrimSpace(heightText))

	maxDim := w
	if h > maxDim {
		maxDim = h
	}

	switch {
	case maxDim >= 3840:
		return "4k"
	case maxDim >= 1920:
		return "1080p"
	default:
		return "720p"
	}
}
|
||||||
|
|
||||||
|
// SizeToVeoAspectRatio maps a "WxH" size string onto one of the two aspect
// ratios used for Veo requests. Portrait ("9:16") is chosen only when both
// dimensions parse as positive integers and the height exceeds the width;
// every other input, including malformed or square sizes, yields "16:9".
func SizeToVeoAspectRatio(size string) string {
	const (
		landscape = "16:9"
		portrait  = "9:16"
	)

	widthText, heightText, hasSeparator := strings.Cut(strings.ToLower(size), "x")
	if !hasSeparator {
		return landscape
	}

	// Conversion errors are intentionally dropped: a failed parse leaves the
	// dimension at 0, which the positivity check below treats as invalid.
	width, _ := strconv.Atoi(widthText)
	height, _ := strconv.Atoi(heightText)

	if width > 0 && height > 0 && height > width {
		return portrait
	}
	return landscape
}
|
||||||
|
|
||||||
|
// VeoResolutionRatio reports the price multiplier to apply for resolution on
// the given model. Any resolution other than the exact label "4k" costs the
// base rate (1.0). For "4k" the multiplier depends on the model family, which
// is detected by substring match on modelName.
func VeoResolutionRatio(modelName, resolution string) float64 {
	const baseRatio = 1.0

	if resolution != "4k" {
		return baseRatio
	}

	// 4K multipliers derived from Vertex AI official pricing (video+audio base):
	//   veo-3.1-generate:      $0.60 / $0.40 = 1.5
	//   veo-3.1-fast-generate: $0.35 / $0.15 ≈ 2.333
	// Veo 3.0 models do not support 4K, so they keep the base ratio.
	switch {
	case strings.Contains(modelName, "3.1-fast-generate"):
		// Checked first because the generic "3.1" match below would also hit.
		return 2.333333
	case strings.Contains(modelName, "3.1"):
		return 1.5
	default:
		return baseRatio
	}
}
|
||||||
63
relay/channel/task/gemini/dto.go
Normal file
63
relay/channel/task/gemini/dto.go
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
package gemini
|
||||||
|
|
||||||
|
// GeminiVideoGenerationConfig holds the optional generation settings of a
// Gemini API video request (the API's GenerateVideosConfig object). Every
// field is omitted from the JSON when it holds its zero value.
// Reference: https://ai.google.dev/gemini-api/docs/video
type GeminiVideoGenerationConfig struct {
	// AspectRatio is the output aspect ratio, e.g. "16:9" or "9:16".
	AspectRatio string `json:"aspectRatio,omitempty"`
	// DurationSeconds is the clip length in whole seconds.
	DurationSeconds int `json:"durationSeconds,omitempty"`
	// NegativePrompt describes elements that should not appear in the video.
	NegativePrompt string `json:"negativePrompt,omitempty"`
	// PersonGeneration is the person-generation policy passed to the API.
	PersonGeneration string `json:"personGeneration,omitempty"`
	// Resolution is the output resolution label, e.g. "720p".
	Resolution string `json:"resolution,omitempty"`
	// NumberOfVideos is the number of videos requested.
	NumberOfVideos int `json:"numberOfVideos,omitempty"`
}
|
||||||
|
|
||||||
|
// VeoImageInput is the inline image attached to a Veo image-to-video request.
// It is shared by the Gemini and Vertex adaptors.
type VeoImageInput struct {
	// BytesBase64Encoded is the image content as a base64 string.
	BytesBase64Encoded string `json:"bytesBase64Encoded"`
	// MimeType is the media type of the image, e.g. "image/png".
	MimeType string `json:"mimeType"`
}
|
||||||
|
|
||||||
|
// GeminiVideoPayload is the top-level request body for the Gemini API
|
||||||
|
// models/{model}:generateVideos endpoint.
|
||||||
|
type GeminiVideoPayload struct {
|
||||||
|
Model string `json:"model,omitempty"`
|
||||||
|
Prompt string `json:"prompt"`
|
||||||
|
Image *VeoImageInput `json:"image,omitempty"`
|
||||||
|
Config *GeminiVideoGenerationConfig `json:"config,omitempty"`
|
||||||
|
// TODO: support referenceImages (style/asset references, up to 3 images)
|
||||||
|
// TODO: support lastFrame (first+last frame interpolation, Veo 3.1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// submitResponse is the subset of the task-submission response that is
// decoded: only the "name" field.
type submitResponse struct {
	// Name is read from the "name" field of the response body.
	Name string `json:"name"`
}
|
||||||
|
|
||||||
|
// operationVideo is one entry of the "videos" array in an operation response.
type operationVideo struct {
	// MimeType is the media type reported for the video.
	MimeType string `json:"mimeType"`
	// BytesBase64Encoded is the inline video payload as a base64 string.
	BytesBase64Encoded string `json:"bytesBase64Encoded"`
	// Encoding is the value of the "encoding" field.
	Encoding string `json:"encoding"`
}
|
||||||
|
|
||||||
|
type operationResponse struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Done bool `json:"done"`
|
||||||
|
Response struct {
|
||||||
|
Type string `json:"@type"`
|
||||||
|
RaiMediaFilteredCount int `json:"raiMediaFilteredCount"`
|
||||||
|
Videos []operationVideo `json:"videos"`
|
||||||
|
BytesBase64Encoded string `json:"bytesBase64Encoded"`
|
||||||
|
Encoding string `json:"encoding"`
|
||||||
|
Video string `json:"video"`
|
||||||
|
GenerateVideoResponse struct {
|
||||||
|
GeneratedVideos []struct {
|
||||||
|
Video struct {
|
||||||
|
URI string `json:"uri"`
|
||||||
|
} `json:"video"`
|
||||||
|
} `json:"generatedVideos"`
|
||||||
|
} `json:"generateVideoResponse"`
|
||||||
|
} `json:"response"`
|
||||||
|
Error struct {
|
||||||
|
Message string `json:"message"`
|
||||||
|
} `json:"error"`
|
||||||
|
}
|
||||||
100
relay/channel/task/gemini/image.go
Normal file
100
relay/channel/task/gemini/image.go
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
package gemini
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/base64"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/QuantumNous/new-api/constant"
|
||||||
|
relaycommon "github.com/QuantumNous/new-api/relay/common"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// maxVeoImageSize is the largest multipart image upload, in bytes, that
// ExtractMultipartImage accepts; larger files are ignored.
const maxVeoImageSize = 20 * 1024 * 1024 // 20 MB
|
||||||
|
|
||||||
|
// ExtractMultipartImage reads the first `input_reference` file from a multipart
|
||||||
|
// form upload and returns a VeoImageInput. Returns nil if no file is present.
|
||||||
|
func ExtractMultipartImage(c *gin.Context, info *relaycommon.RelayInfo) *VeoImageInput {
|
||||||
|
mf, err := c.MultipartForm()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
files, exists := mf.File["input_reference"]
|
||||||
|
if !exists || len(files) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
fh := files[0]
|
||||||
|
if fh.Size > maxVeoImageSize {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
file, err := fh.Open()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
fileBytes, err := io.ReadAll(file)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
mimeType := fh.Header.Get("Content-Type")
|
||||||
|
if mimeType == "" || mimeType == "application/octet-stream" {
|
||||||
|
mimeType = http.DetectContentType(fileBytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
info.Action = constant.TaskActionGenerate
|
||||||
|
return &VeoImageInput{
|
||||||
|
BytesBase64Encoded: base64.StdEncoding.EncodeToString(fileBytes),
|
||||||
|
MimeType: mimeType,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ParseImageInput parses an image string (data URI or raw base64) into a
|
||||||
|
// VeoImageInput. Returns nil if the input is empty or invalid.
|
||||||
|
// TODO: support downloading HTTP URL images and converting to base64
|
||||||
|
func ParseImageInput(imageStr string) *VeoImageInput {
|
||||||
|
imageStr = strings.TrimSpace(imageStr)
|
||||||
|
if imageStr == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if strings.HasPrefix(imageStr, "data:") {
|
||||||
|
return parseDataURI(imageStr)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := base64.StdEncoding.DecodeString(imageStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return &VeoImageInput{
|
||||||
|
BytesBase64Encoded: imageStr,
|
||||||
|
MimeType: http.DetectContentType(raw),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseDataURI(uri string) *VeoImageInput {
|
||||||
|
// data:image/png;base64,iVBOR...
|
||||||
|
rest := uri[len("data:"):]
|
||||||
|
idx := strings.Index(rest, ",")
|
||||||
|
if idx < 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
meta := rest[:idx]
|
||||||
|
b64 := rest[idx+1:]
|
||||||
|
if b64 == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
mimeType := "application/octet-stream"
|
||||||
|
parts := strings.SplitN(meta, ";", 2)
|
||||||
|
if len(parts) >= 1 && parts[0] != "" {
|
||||||
|
mimeType = parts[0]
|
||||||
|
}
|
||||||
|
|
||||||
|
return &VeoImageInput{
|
||||||
|
BytesBase64Encoded: b64,
|
||||||
|
MimeType: mimeType,
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -16,6 +16,7 @@ import (
|
|||||||
"github.com/QuantumNous/new-api/constant"
|
"github.com/QuantumNous/new-api/constant"
|
||||||
"github.com/QuantumNous/new-api/dto"
|
"github.com/QuantumNous/new-api/dto"
|
||||||
"github.com/QuantumNous/new-api/relay/channel"
|
"github.com/QuantumNous/new-api/relay/channel"
|
||||||
|
geminitask "github.com/QuantumNous/new-api/relay/channel/task/gemini"
|
||||||
taskcommon "github.com/QuantumNous/new-api/relay/channel/task/taskcommon"
|
taskcommon "github.com/QuantumNous/new-api/relay/channel/task/taskcommon"
|
||||||
vertexcore "github.com/QuantumNous/new-api/relay/channel/vertex"
|
vertexcore "github.com/QuantumNous/new-api/relay/channel/vertex"
|
||||||
relaycommon "github.com/QuantumNous/new-api/relay/common"
|
relaycommon "github.com/QuantumNous/new-api/relay/common"
|
||||||
@@ -26,9 +27,34 @@ import (
|
|||||||
// Request / Response structures
|
// Request / Response structures
|
||||||
// ============================
|
// ============================
|
||||||
|
|
||||||
|
type veoInstance struct {
|
||||||
|
Prompt string `json:"prompt"`
|
||||||
|
Image *geminitask.VeoImageInput `json:"image,omitempty"`
|
||||||
|
// TODO: support referenceImages (style/asset references, up to 3 images)
|
||||||
|
// TODO: support lastFrame (first+last frame interpolation, Veo 3.1)
|
||||||
|
}
|
||||||
|
|
||||||
|
type veoParameters struct {
|
||||||
|
SampleCount int `json:"sampleCount"`
|
||||||
|
DurationSeconds int `json:"durationSeconds,omitempty"`
|
||||||
|
AspectRatio string `json:"aspectRatio,omitempty"`
|
||||||
|
Resolution string `json:"resolution,omitempty"`
|
||||||
|
NegativePrompt string `json:"negativePrompt,omitempty"`
|
||||||
|
PersonGeneration string `json:"personGeneration,omitempty"`
|
||||||
|
StorageUri string `json:"storageUri,omitempty"`
|
||||||
|
CompressionQuality string `json:"compressionQuality,omitempty"`
|
||||||
|
ResizeMode string `json:"resizeMode,omitempty"`
|
||||||
|
Seed *int `json:"seed,omitempty"`
|
||||||
|
GenerateAudio *bool `json:"generateAudio,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type requestPayload struct {
|
type requestPayload struct {
|
||||||
Instances []map[string]any `json:"instances"`
|
Instances []veoInstance `json:"instances"`
|
||||||
Parameters map[string]any `json:"parameters,omitempty"`
|
Parameters *veoParameters `json:"parameters,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type fetchOperationPayload struct {
|
||||||
|
OperationName string `json:"operationName"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type submitResponse struct {
|
type submitResponse struct {
|
||||||
@@ -134,25 +160,21 @@ func (a *TaskAdaptor) BuildRequestHeader(c *gin.Context, req *http.Request, info
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// EstimateBilling 根据用户请求中的 sampleCount 计算 OtherRatios。
|
// EstimateBilling returns OtherRatios based on durationSeconds and resolution.
|
||||||
func (a *TaskAdaptor) EstimateBilling(c *gin.Context, _ *relaycommon.RelayInfo) map[string]float64 {
|
func (a *TaskAdaptor) EstimateBilling(c *gin.Context, info *relaycommon.RelayInfo) map[string]float64 {
|
||||||
sampleCount := 1
|
|
||||||
v, ok := c.Get("task_request")
|
v, ok := c.Get("task_request")
|
||||||
if ok {
|
if !ok {
|
||||||
req := v.(relaycommon.TaskSubmitReq)
|
return nil
|
||||||
if req.Metadata != nil {
|
|
||||||
if sc, exists := req.Metadata["sampleCount"]; exists {
|
|
||||||
if i, ok := sc.(int); ok && i > 0 {
|
|
||||||
sampleCount = i
|
|
||||||
}
|
|
||||||
if f, ok := sc.(float64); ok && int(f) > 0 {
|
|
||||||
sampleCount = int(f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
req := v.(relaycommon.TaskSubmitReq)
|
||||||
|
|
||||||
|
seconds := geminitask.ResolveVeoDuration(req.Metadata, req.Duration, req.Seconds)
|
||||||
|
resolution := geminitask.ResolveVeoResolution(req.Metadata, req.Size)
|
||||||
|
resRatio := geminitask.VeoResolutionRatio(info.UpstreamModelName, resolution)
|
||||||
|
|
||||||
return map[string]float64{
|
return map[string]float64{
|
||||||
"sampleCount": float64(sampleCount),
|
"seconds": float64(seconds),
|
||||||
|
"resolution": resRatio,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -164,29 +186,35 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
|
|||||||
}
|
}
|
||||||
req := v.(relaycommon.TaskSubmitReq)
|
req := v.(relaycommon.TaskSubmitReq)
|
||||||
|
|
||||||
body := requestPayload{
|
instance := veoInstance{Prompt: req.Prompt}
|
||||||
Instances: []map[string]any{{"prompt": req.Prompt}},
|
if img := geminitask.ExtractMultipartImage(c, info); img != nil {
|
||||||
Parameters: map[string]any{},
|
instance.Image = img
|
||||||
}
|
} else if len(req.Images) > 0 {
|
||||||
if req.Metadata != nil {
|
if parsed := geminitask.ParseImageInput(req.Images[0]); parsed != nil {
|
||||||
if v, ok := req.Metadata["storageUri"]; ok {
|
instance.Image = parsed
|
||||||
body.Parameters["storageUri"] = v
|
info.Action = constant.TaskActionGenerate
|
||||||
}
|
}
|
||||||
if v, ok := req.Metadata["sampleCount"]; ok {
|
|
||||||
if i, ok := v.(int); ok {
|
|
||||||
body.Parameters["sampleCount"] = i
|
|
||||||
}
|
|
||||||
if f, ok := v.(float64); ok {
|
|
||||||
body.Parameters["sampleCount"] = int(f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if _, ok := body.Parameters["sampleCount"]; !ok {
|
|
||||||
body.Parameters["sampleCount"] = 1
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if body.Parameters["sampleCount"].(int) <= 0 {
|
params := &veoParameters{}
|
||||||
return nil, fmt.Errorf("sampleCount must be greater than 0")
|
if err := taskcommon.UnmarshalMetadata(req.Metadata, params); err != nil {
|
||||||
|
return nil, fmt.Errorf("unmarshal metadata failed: %w", err)
|
||||||
|
}
|
||||||
|
if params.DurationSeconds == 0 && req.Duration > 0 {
|
||||||
|
params.DurationSeconds = req.Duration
|
||||||
|
}
|
||||||
|
if params.Resolution == "" && req.Size != "" {
|
||||||
|
params.Resolution = geminitask.SizeToVeoResolution(req.Size)
|
||||||
|
}
|
||||||
|
if params.AspectRatio == "" && req.Size != "" {
|
||||||
|
params.AspectRatio = geminitask.SizeToVeoAspectRatio(req.Size)
|
||||||
|
}
|
||||||
|
params.Resolution = strings.ToLower(params.Resolution)
|
||||||
|
params.SampleCount = 1
|
||||||
|
|
||||||
|
body := requestPayload{
|
||||||
|
Instances: []veoInstance{instance},
|
||||||
|
Parameters: params,
|
||||||
}
|
}
|
||||||
|
|
||||||
data, err := common.Marshal(body)
|
data, err := common.Marshal(body)
|
||||||
@@ -226,7 +254,14 @@ func (a *TaskAdaptor) DoResponse(c *gin.Context, resp *http.Response, info *rela
|
|||||||
return localID, responseBody, nil
|
return localID, responseBody, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *TaskAdaptor) GetModelList() []string { return []string{"veo-3.0-generate-001"} }
|
func (a *TaskAdaptor) GetModelList() []string {
|
||||||
|
return []string{
|
||||||
|
"veo-3.0-generate-001",
|
||||||
|
"veo-3.0-fast-generate-001",
|
||||||
|
"veo-3.1-generate-preview",
|
||||||
|
"veo-3.1-fast-generate-preview",
|
||||||
|
}
|
||||||
|
}
|
||||||
func (a *TaskAdaptor) GetChannelName() string { return "vertex" }
|
func (a *TaskAdaptor) GetChannelName() string { return "vertex" }
|
||||||
|
|
||||||
// FetchTask fetch task status
|
// FetchTask fetch task status
|
||||||
@@ -254,7 +289,7 @@ func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy
|
|||||||
} else {
|
} else {
|
||||||
url = fmt.Sprintf("https://%s-aiplatform.googleapis.com/v1/projects/%s/locations/%s/publishers/google/models/%s:fetchPredictOperation", region, project, region, modelName)
|
url = fmt.Sprintf("https://%s-aiplatform.googleapis.com/v1/projects/%s/locations/%s/publishers/google/models/%s:fetchPredictOperation", region, project, region, modelName)
|
||||||
}
|
}
|
||||||
payload := map[string]string{"operationName": upstreamName}
|
payload := fetchOperationPayload{OperationName: upstreamName}
|
||||||
data, err := common.Marshal(payload)
|
data, err := common.Marshal(payload)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|||||||
@@ -298,6 +298,10 @@ var defaultModelPrice = map[string]float64{
|
|||||||
"sora-2": 0.3,
|
"sora-2": 0.3,
|
||||||
"sora-2-pro": 0.5,
|
"sora-2-pro": 0.5,
|
||||||
"gpt-4o-mini-tts": 0.3,
|
"gpt-4o-mini-tts": 0.3,
|
||||||
|
"veo-3.0-generate-001": 0.4,
|
||||||
|
"veo-3.0-fast-generate-001": 0.15,
|
||||||
|
"veo-3.1-generate-preview": 0.4,
|
||||||
|
"veo-3.1-fast-generate-preview": 0.15,
|
||||||
}
|
}
|
||||||
|
|
||||||
var defaultAudioRatio = map[string]float64{
|
var defaultAudioRatio = map[string]float64{
|
||||||
|
|||||||
Reference in New Issue
Block a user