Merge pull request #705 from maranello-o/main

fix: incorrect whisper audio usage
This commit is contained in:
Calcium-Ion
2025-01-21 11:21:04 +07:00
committed by GitHub
3 changed files with 78 additions and 54 deletions

View File

@@ -24,7 +24,7 @@ FROM alpine
RUN apk update \
&& apk upgrade \
&& apk add --no-cache ca-certificates tzdata \
&& apk add --no-cache ca-certificates tzdata ffmpeg\
&& update-ca-certificates 2>/dev/null || true
COPY --from=builder2 /build/one-api /

View File

@@ -1,14 +1,19 @@
package common
import (
"bytes"
"context"
crand "crypto/rand"
"encoding/base64"
"fmt"
"github.com/pkg/errors"
"html/template"
"io"
"log"
"math/big"
"math/rand"
"net"
"os"
"os/exec"
"runtime"
"strconv"
@@ -207,3 +212,31 @@ func RandomSleep() {
// Sleep for 0-3000 ms
time.Sleep(time.Duration(rand.Intn(3000)) * time.Millisecond)
}
// SaveTmpFile saves data to a temporary file. The filename would be apppended with a random string.
func SaveTmpFile(filename string, data io.Reader) (string, error) {
f, err := os.CreateTemp(os.TempDir(), filename)
if err != nil {
return "", errors.Wrapf(err, "failed to create temporary file %s", filename)
}
defer f.Close()
_, err = io.Copy(f, data)
if err != nil {
return "", errors.Wrapf(err, "failed to copy data to temporary file %s", filename)
}
return f.Name(), nil
}
// GetAudioDuration returns the duration of an audio file in seconds.
func GetAudioDuration(ctx context.Context, filename string) (float64, error) {
// ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 {{input}}
c := exec.CommandContext(ctx, "ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", filename)
output, err := c.Output()
if err != nil {
return 0, errors.Wrap(err, "failed to get audio duration")
}
return strconv.ParseFloat(string(bytes.TrimSpace(output)), 64)
}

View File

@@ -5,7 +5,10 @@ import (
"bytes"
"encoding/json"
"fmt"
"github.com/pkg/errors"
"io"
"math"
"mime/multipart"
"net/http"
"one-api/common"
"one-api/constant"
@@ -13,6 +16,7 @@ import (
relaycommon "one-api/relay/common"
relayconstant "one-api/relay/constant"
"one-api/service"
"os"
"strings"
"sync"
"time"
@@ -316,6 +320,11 @@ func OpenaiTTSHandler(c *gin.Context, resp *http.Response, info *relaycommon.Rel
}
func OpenaiSTTHandler(c *gin.Context, resp *http.Response, info *relaycommon.RelayInfo, responseFormat string) (*dto.OpenAIErrorWithStatusCode, *dto.Usage) {
// count tokens by audio file duration
audioTokens, err := countAudioTokens(c)
if err != nil {
return service.OpenAIErrorWrapper(err, "count_audio_tokens_failed", http.StatusInternalServerError), nil
}
responseBody, err := io.ReadAll(resp.Body)
if err != nil {
return service.OpenAIErrorWrapper(err, "read_response_body_failed", http.StatusInternalServerError), nil
@@ -340,70 +349,52 @@ func OpenaiSTTHandler(c *gin.Context, resp *http.Response, info *relaycommon.Rel
}
resp.Body.Close()
var text string
switch responseFormat {
case "json":
text, err = getTextFromJSON(responseBody)
case "text":
text, err = getTextFromText(responseBody)
case "srt":
text, err = getTextFromSRT(responseBody)
case "verbose_json":
text, err = getTextFromVerboseJSON(responseBody)
case "vtt":
text, err = getTextFromVTT(responseBody)
}
usage := &dto.Usage{}
usage.PromptTokens = info.PromptTokens
usage.CompletionTokens, _ = service.CountTextToken(text, info.UpstreamModelName)
usage.PromptTokens = audioTokens
usage.CompletionTokens = 0
usage.TotalTokens = usage.PromptTokens + usage.CompletionTokens
return nil, usage
}
func getTextFromVTT(body []byte) (string, error) {
return getTextFromSRT(body)
}
func getTextFromVerboseJSON(body []byte) (string, error) {
var whisperResponse dto.WhisperVerboseJSONResponse
if err := json.Unmarshal(body, &whisperResponse); err != nil {
return "", fmt.Errorf("unmarshal_response_body_failed err :%w", err)
func countAudioTokens(c *gin.Context) (int, error) {
body, err := common.GetRequestBody(c)
if err != nil {
return 0, errors.WithStack(err)
}
return whisperResponse.Text, nil
}
func getTextFromSRT(body []byte) (string, error) {
scanner := bufio.NewScanner(strings.NewReader(string(body)))
var builder strings.Builder
var textLine bool
for scanner.Scan() {
line := scanner.Text()
if textLine {
builder.WriteString(line)
textLine = false
continue
} else if strings.Contains(line, "-->") {
textLine = true
continue
}
var reqBody struct {
File *multipart.FileHeader `form:"file" binding:"required"`
}
if err := scanner.Err(); err != nil {
return "", err
c.Request.Body = io.NopCloser(bytes.NewReader(body))
if err = c.ShouldBind(&reqBody); err != nil {
return 0, errors.WithStack(err)
}
return builder.String(), nil
}
func getTextFromText(body []byte) (string, error) {
return strings.TrimSuffix(string(body), "\n"), nil
}
func getTextFromJSON(body []byte) (string, error) {
var whisperResponse dto.AudioResponse
if err := json.Unmarshal(body, &whisperResponse); err != nil {
return "", fmt.Errorf("unmarshal_response_body_failed err :%w", err)
reqFp, err := reqBody.File.Open()
if err != nil {
return 0, errors.WithStack(err)
}
return whisperResponse.Text, nil
tmpFp, err := os.CreateTemp("", "audio-*")
if err != nil {
return 0, errors.WithStack(err)
}
defer os.Remove(tmpFp.Name())
_, err = io.Copy(tmpFp, reqFp)
if err != nil {
return 0, errors.WithStack(err)
}
if err = tmpFp.Close(); err != nil {
return 0, errors.WithStack(err)
}
duration, err := common.GetAudioDuration(c.Request.Context(), tmpFp.Name())
if err != nil {
return 0, errors.WithStack(err)
}
return int(math.Round(math.Ceil(duration) / 60.0 * 1000)), nil // 1 minute 相当于 1k tokens
}
func OpenaiRealtimeHandler(c *gin.Context, info *relaycommon.RelayInfo) (*dto.OpenAIErrorWithStatusCode, *dto.RealtimeUsage) {