Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support gpt-4o-audio-preview #895

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# Test binary, built with `go test -c`
*.test
test.mp3

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
Expand Down
86 changes: 86 additions & 0 deletions api_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,38 @@ func TestAPI(t *testing.T) {
},
)
checks.NoError(t, err, "CreateChatCompletion (with functions) returned error")

response, err := c.CreateChatCompletion(
ctx,
openai.ChatCompletionRequest{
Model: openai.GPT4oAudioPreview,
Messages: []openai.ChatCompletionMessage{
{
Role: openai.ChatMessageRoleUser,
Content: "hi",
},
},
Audio: &openai.AudioOutput{
Voice: openai.AudioVoiceAlloy,
Format: openai.AudioFormatPCM16,
},
Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
},
)
checks.NoError(t, err, "CreateChatCompletion (with audio) returned error")
if response.Choices[0].Message.Audio == nil {
t.Fatal("Audio response is nil")
}
if len(response.Choices[0].Message.Audio.Data) == 0 {
t.Fatal("Audio response data is empty")
}
if response.Choices[0].Message.Audio.Transcript == "" {
t.Fatal("Audio response transcript is empty")
}
if response.Usage.PromptTokens == 0 || response.Usage.CompletionTokens == 0 || response.Usage.TotalTokens == 0 {
t.Fatal("Usage is zero")
}
t.Logf("Usage: %+v", response.Usage)
}

func TestCompletionStream(t *testing.T) {
Expand Down Expand Up @@ -145,6 +177,60 @@ func TestCompletionStream(t *testing.T) {
}
}

// TestChatCompletionStream streams a chat completion that requests both text
// and audio output and verifies that at least one chunk arrives and that usage
// is reported (StreamOptions.IncludeUsage is set).
//
// It talks to the production OpenAI API and is skipped unless the OPENAI_TOKEN
// environment variable is set.
func TestChatCompletionStream(t *testing.T) {
	apiToken := os.Getenv("OPENAI_TOKEN")
	if apiToken == "" {
		t.Skip("Skipping testing against production OpenAI API. Set OPENAI_TOKEN environment variable to enable it.")
	}

	c := openai.NewClient(apiToken)
	ctx := context.Background()

	stream, err := c.CreateChatCompletionStream(ctx, openai.ChatCompletionRequest{
		Model: openai.GPT4oAudioPreview,
		Messages: []openai.ChatCompletionMessage{
			{
				Role:    openai.ChatMessageRoleUser,
				Content: "hi",
			},
		},
		Audio: &openai.AudioOutput{
			Voice:  openai.AudioVoiceAlloy,
			Format: openai.AudioFormatPCM16,
		},
		Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
		StreamOptions: &openai.StreamOptions{
			IncludeUsage: true,
		},
	})
	checks.NoError(t, err, "CreateChatCompletionStream returned error")
	if err != nil {
		// Stop here regardless of whether checks.NoError is fatal: without a
		// successfully opened stream there is nothing to close or read.
		t.FailNow()
	}
	defer stream.Close()

	var usage *openai.Usage
	counter := 0
	for {
		response, err := stream.Recv()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			// A non-EOF error must end the test; merely recording it and
			// looping again could spin forever on a persistent failure.
			t.Fatalf("Stream error: %v", err)
		}
		counter++
		if response.Usage != nil {
			usage = response.Usage
			t.Logf("Usage: %+v", usage)
		}
	}
	if counter == 0 {
		t.Error("Stream did not return any responses")
	}
	if usage == nil {
		t.Error("Usage is nil")
	}
}

func TestAPIError(t *testing.T) {
apiToken := os.Getenv("OPENAI_TOKEN")
if apiToken == "" {
Expand Down
146 changes: 101 additions & 45 deletions chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,63 @@ type ChatMessageImageURL struct {
Detail ImageURLDetail `json:"detail,omitempty"`
}

// AudioVoice names a voice the model can use when responding with audio.
// It is the type of AudioOutput.Voice.
type AudioVoice string

// Voice identifiers that can be placed in AudioOutput.Voice.
const (
AudioVoiceAlloy AudioVoice = "alloy"
AudioVoiceAsh AudioVoice = "ash"
AudioVoiceBallad AudioVoice = "ballad"
AudioVoiceCoral AudioVoice = "coral"
AudioVoiceEcho AudioVoice = "echo"
AudioVoiceSage AudioVoice = "sage"
AudioVoiceShimmer AudioVoice = "shimmer"
AudioVoiceVerse AudioVoice = "verse"
)

// AudioFormat names an audio encoding. It is used both for audio supplied in a
// message part (ChatMessageAudio.Format) and for the requested output format
// (AudioOutput.Format).
type AudioFormat string

// Audio encodings. Per the field comments on the structs that use them, input
// audio currently accepts only wav and mp3, while output accepts all five.
const (
AudioFormatWAV AudioFormat = "wav"
AudioFormatMP3 AudioFormat = "mp3"
AudioFormatFLAC AudioFormat = "flac"
AudioFormatOPUS AudioFormat = "opus"
AudioFormatPCM16 AudioFormat = "pcm16"
)

// ChatMessageAudio is the audio payload of a message part; it is carried in
// ChatMessagePart.InputAudio and serialized under the "input_audio" key.
// Both fields are omitted from the JSON when empty.
type ChatMessageAudio struct {
// Base64 encoded audio data.
Data string `json:"data,omitempty"`
// The format of the encoded audio data. Currently supports "wav" and "mp3".
Format AudioFormat `json:"format,omitempty"`
}

// Modality is an output type that can be requested for a completion through
// ChatCompletionRequest.Modalities.
type Modality string

// Output types that can be listed in ChatCompletionRequest.Modalities.
const (
ModalityAudio Modality = "audio"
ModalityText Modality = "text"
)

// AudioOutput holds the audio output parameters of a request; it is set on
// ChatCompletionRequest.Audio. Neither field uses omitempty, so both keys are
// always present in the serialized object.
type AudioOutput struct {
// The voice the model uses to respond. Supported voices are alloy, ash, ballad, coral, echo, sage, shimmer, and verse.
Voice AudioVoice `json:"voice"`
// Specifies the output audio format. Must be one of wav, mp3, flac, opus, or pcm16.
Format AudioFormat `json:"format"`
}

// ChatMessagePartType is the value of ChatMessagePart.Type; the constants
// below correspond to the Text, ImageURL and InputAudio fields of that struct.
type ChatMessagePartType string

const (
ChatMessagePartTypeText ChatMessagePartType = "text"
ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
ChatMessagePartTypeText ChatMessagePartType = "text"
ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
ChatMessagePartTypeInputAudio ChatMessagePartType = "input_audio"
)

// ChatMessagePart is one element of a multi-part message content list
// (the element type of the MultiContent field, serialized under "content").
// Every field is omitted from the JSON when empty.
type ChatMessagePart struct {
Type ChatMessagePartType `json:"type,omitempty"`
Text string `json:"text,omitempty"`
ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
Type ChatMessagePartType `json:"type,omitempty"`
Text string `json:"text,omitempty"`
ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
InputAudio *ChatMessageAudio `json:"input_audio,omitempty"`
}

type ChatCompletionMessage struct {
Expand All @@ -110,72 +156,75 @@ type ChatCompletionMessage struct {

// For Role=tool prompts this should be set to the ID given in the assistant's prior request to call a tool.
ToolCallID string `json:"tool_call_id,omitempty"`

// If the audio output modality is requested, this object contains data about the audio response from the model.
Audio *ChatCompletionAudio `json:"audio,omitempty"`
}

// chatCompletionMessageMultiContent is the JSON shape of a ChatCompletionMessage
// whose content is a list of parts: MultiContent is serialized under the
// "content" key and the plain-string Content field is excluded ("-").
// ChatCompletionMessage is converted to and from this type in its
// MarshalJSON / UnmarshalJSON methods, so the field set must stay identical.
type chatCompletionMessageMultiContent struct {
Role string `json:"role"`
Content string `json:"-"`
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart `json:"content,omitempty"`
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
Audio *ChatCompletionAudio `json:"audio,omitempty"`
}

// chatCompletionMessageSingleContent is the JSON shape of a ChatCompletionMessage
// whose content is a plain string: Content is serialized under the "content"
// key (always present, no omitempty) and MultiContent is excluded ("-").
// ChatCompletionMessage is converted to and from this type in its
// MarshalJSON / UnmarshalJSON methods, so the field set must stay identical.
type chatCompletionMessageSingleContent struct {
Role string `json:"role"`
Content string `json:"content"`
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart `json:"-"`
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
Audio *ChatCompletionAudio `json:"audio,omitempty"`
WqyJh marked this conversation as resolved.
Show resolved Hide resolved
}

// MarshalJSON encodes the message with "content" as either a list of parts or
// a plain string.
//
// It returns ErrContentFieldsMisused when Content is non-empty and
// MultiContent is non-nil at the same time. When MultiContent has at least one
// element the multi-content shape is used; otherwise the single-content shape
// is used, which always emits a "content" string (possibly empty).
func (m ChatCompletionMessage) MarshalJSON() ([]byte, error) {
if m.Content != "" && m.MultiContent != nil {
return nil, ErrContentFieldsMisused
}
if len(m.MultiContent) > 0 {
msg := struct {
Role string `json:"role"`
Content string `json:"-"`
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart `json:"content,omitempty"`
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}(m)
msg := chatCompletionMessageMultiContent(m)
return json.Marshal(msg)
}

msg := struct {
Role string `json:"role"`
Content string `json:"content"`
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart `json:"-"`
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}(m)
msg := chatCompletionMessageSingleContent(m)
return json.Marshal(msg)
}

// UnmarshalJSON decodes a message whose "content" may be either a plain string
// or a list of parts.
//
// It first tries the single-content shape ("content" as a string). Only if
// that attempt fails does it retry with the multi-content shape ("content" as
// a list of ChatMessagePart); the error returned, if any, is the one from this
// second attempt. On success *m is overwritten with the decoded value.
func (m *ChatCompletionMessage) UnmarshalJSON(bs []byte) error {
msg := struct {
Role string `json:"role"`
Content string `json:"content"`
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}{}
msg := chatCompletionMessageSingleContent{}

if err := json.Unmarshal(bs, &msg); err == nil {
*m = ChatCompletionMessage(msg)
return nil
}
multiMsg := struct {
Role string `json:"role"`
Content string
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart `json:"content"`
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}{}
multiMsg := chatCompletionMessageMultiContent{}
if err := json.Unmarshal(bs, &multiMsg); err != nil {
return err
}
*m = ChatCompletionMessage(multiMsg)
return nil
}

// ChatCompletionAudio describes an audio response generated by the model.
//
// The struct is attached to ChatCompletionMessage.Audio, so besides being
// decoded from responses it is also encoded whenever such a message is sent in
// a request (for example to reference an earlier audio response by ID in a
// multi-turn conversation). For that reason every field other than ID is
// omitted from the JSON when it holds its zero value, so a value carrying only
// an ID serializes as {"id": "..."} rather than with empty extra fields.
// Decoding is unaffected.
type ChatCompletionAudio struct {
	// Unique identifier for this audio response.
	ID string `json:"id"`
	// The Unix timestamp (in seconds) for when this audio response will no longer
	// be accessible on the server for use in multi-turn conversations.
	ExpiresAt int64 `json:"expires_at,omitempty"`
	// Base64 encoded audio bytes generated by the model, in the format specified in the request.
	Data string `json:"data,omitempty"`
	// Transcript of the audio generated by the model.
	Transcript string `json:"transcript,omitempty"`
}

type ToolCall struct {
// Index is not nil only in chat completion chunk object
Index *int `json:"index,omitempty"`
Expand Down Expand Up @@ -260,6 +309,13 @@ type ChatCompletionRequest struct {
Store bool `json:"store,omitempty"`
// Metadata to store with the completion.
Metadata map[string]string `json:"metadata,omitempty"`
// Output types that you would like the model to generate for this request.
// Most models are capable of generating text, which is the default: ["text"]
// The gpt-4o-audio-preview model can also be used to generate audio.
// To request that this model generate both text and audio responses, you can use: ["text", "audio"]
Modalities []Modality `json:"modalities,omitempty"`
// Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
Audio *AudioOutput `json:"audio,omitempty"`
}

type StreamOptions struct {
Expand Down
17 changes: 12 additions & 5 deletions chat_stream.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,19 @@ import (
"net/http"
)

// ChatCompletionStreamChoiceDeltaAudio is the audio portion of a streamed
// choice delta (the type of ChatCompletionStreamChoiceDelta.Audio). Every field
// is omitted from the JSON when empty.
// NOTE(review): presumably each chunk carries only a fragment of the data or
// transcript that the caller must concatenate — confirm against the API docs.
type ChatCompletionStreamChoiceDeltaAudio struct {
ID string `json:"id,omitempty"`
Transcript string `json:"transcript,omitempty"`
Data string `json:"data,omitempty"`
}

// ChatCompletionStreamChoiceDelta groups the fields of a streamed choice delta;
// every field is omitted from the JSON when empty.
// NOTE(review): presumably the incremental payload of one choice in a stream
// chunk — the enclosing stream response type is outside this view; confirm.
type ChatCompletionStreamChoiceDelta struct {
Content string `json:"content,omitempty"`
Role string `json:"role,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
Refusal string `json:"refusal,omitempty"`
Content string `json:"content,omitempty"`
Role string `json:"role,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
Refusal string `json:"refusal,omitempty"`
Audio *ChatCompletionStreamChoiceDeltaAudio `json:"audio,omitempty"`
WqyJh marked this conversation as resolved.
Show resolved Hide resolved
}

type ChatCompletionStreamChoiceLogprobs struct {
Expand Down
Loading
Loading