Use audio inputs and outputs with OpenAI's audio-capable models through TheRouter's unified API.
Audio-capable models at TheRouter:
Supported input formats:
Note: Maximum audio file size: 25MB for Chat Completions API, 25MB for Whisper transcription.
from openai import OpenAI
import base64

client = OpenAI(
    api_key="your_therouter_api_key",
    base_url="https://api.therouter.ai/v1"
)


def encode_audio(audio_path):
    """Read an audio file and return its contents as a base64 string."""
    with open(audio_path, "rb") as audio_file:
        return base64.b64encode(audio_file.read()).decode("utf-8")


base64_audio = encode_audio("path/to/audio.mp3")

# Request both a text and an audio reply; the audio reply comes back
# base64-encoded in the format requested below ("wav", "alloy" voice).
response = client.chat.completions.create(
    model="openai/gpt-audio-1.5",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "wav"},
    messages=[
        {
            "role": "user",
            "content": [
                # Text part added for parity with the TypeScript example:
                # it tells the model what to do with the attached audio.
                {
                    "type": "text",
                    "text": "Please respond to my audio message"
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_audio,
                        "format": "mp3"  # format of the *input* file
                    }
                }
            ]
        }
    ]
)

# Access audio output if available (absent when the model replied text-only)
if response.choices[0].message.audio:
    audio_data = response.choices[0].message.audio.data
    # Decode base64 audio and save it as WAV, matching the requested format
    audio_bytes = base64.b64decode(audio_data)
    with open("response.wav", "wb") as f:
        f.write(audio_bytes)
print(response.choices[0].message.content)

import OpenAI from "openai";
import * as fs from "fs";

const client = new OpenAI({
  apiKey: process.env.THEROUTER_API_KEY,
  baseURL: "https://api.therouter.ai/v1",
});

// Read an audio file from disk and return it base64-encoded.
const encodeAudio = (audioPath: string): string =>
  fs.readFileSync(audioPath).toString("base64");

const base64Audio = encodeAudio("path/to/audio.mp3");

// Ask for text + audio back; the reply audio arrives as base64 PCM16.
const response = await client.chat.completions.create({
  model: "openai/gpt-audio-1.5",
  modalities: ["text", "audio"],
  audio: { voice: "alloy", format: "pcm16" },
  messages: [
    {
      role: "user",
      content: [
        { type: "text", text: "Please respond to my audio message" },
        {
          type: "input_audio",
          input_audio: { data: base64Audio, format: "mp3" },
        },
      ],
    },
  ],
});

// Persist the audio reply, if the model produced one.
const reply = response.choices[0].message;
if (reply.audio) {
  fs.writeFileSync("response.pcm16", Buffer.from(reply.audio.data, "base64"));
}
console.log(response.choices[0].message.content);

curl https://api.therouter.ai/v1/audio/transcriptions \
  -H "Authorization: Bearer ${THEROUTER_API_KEY}" \
  -F file="@audio.mp3" \
  -F model="openai/whisper-1" \
  -F language="en" \
  -F response_format="json"

from openai import OpenAI
from pathlib import Path
# Create a client that talks to TheRouter's OpenAI-compatible endpoint.
client = OpenAI(
api_key="your_therouter_api_key",
base_url="https://api.therouter.ai/v1"
)
# Write the generated speech file next to this script.
speech_file_path = Path(__file__).parent / "speech.mp3"
# Generate spoken audio from text with the HD TTS model and the "nova" voice.
response = client.audio.speech.create(
model="openai/tts-1-hd",
voice="nova",
input="Hello! This is a test of TheRouter's text-to-speech API."
)
response.stream_to_file(speech_file_path)

# Note: Realtime API uses WebSocket connection
# Example connection (requires WebSocket client)
wscat -c "wss://api.therouter.ai/v1/realtime?model=openai/gpt-realtime-1.5" \
-H "Authorization: Bearer ${THEROUTER_API_KEY}" \
-H "OpenAI-Beta: realtime=v1"
# Send session update
{
"type": "session.update",
"session": {
"modalities": ["text", "audio"],
"voice": "alloy"
}
}

Supported output formats for audio generation:
- mp3 - Default, good compression
- opus - Low latency streaming
- aac - Higher quality
- flac - Lossless quality
- wav - Uncompressed
- pcm16 - Raw 16-bit PCM (realtime API)

Available voices for audio output:
- alloy - Neutral
- echo - Male
- fable - British male
- onyx - Deep male
- nova - Female
- shimmer - Soft female

{
"error": {
"message": "Model anthropic/claude-sonnet-4.6 does not support audio content. Use models like openai/gpt-audio-1.5 for audio.",
"type": "invalid_request_error",
"code": "multimodal_not_supported"
}
}

Solution: Use an audio-capable model from the list above.
{
"error": {
"message": "Invalid audio format. Supported formats: wav, mp3, flac, m4a, mpeg, mpga, ogg, webm",
"type": "invalid_request_error",
"code": "invalid_audio_format"
}
}

Solution: Convert audio to a supported format.
Solution: Compress audio or split into smaller chunks. Maximum: 25MB.
- Use opus for streaming, mp3 for storage
- Use the modalities array to control input/output types
- Audio pricing differs from text: