gpt-4o-audio-preview offers three core capabilities. First, it can generate natural, fluent speech from text, a strong foundation for voice assistants and virtual customer-service agents. Second, it can analyze the emotion, tone, and pitch of audio input, which has broad applications in affective computing and user-experience analysis. Finally, it supports speech-to-speech interaction, with audio usable as both input and output, laying the groundwork for fully voice-driven systems.
This article uses pyaudio to capture microphone input and play back the model's audio, implementing voice chat, barge-in (interruption), and function calling.
To support interruption, gpt-4o-realtime-preview communicates over the WebSocket protocol rather than HTTP.
Import the required packages
import asyncio
import base64
import json
import requests
import pyaudio
import websockets
from loguru import logger as log  # loguru makes the log output a bit prettier
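Since the Realtime connection is just a WebSocket, a quick connectivity check with these packages can confirm the endpoint and your API key before running the full demo. A minimal sketch, assuming a recent websockets version (which takes the additional_headers argument) and the same model string used later in this article:
async def ping_realtime(api_key):
    # Open the Realtime WebSocket and wait for the first server event,
    # which should be "session.created" if authentication succeeded.
    ws = await websockets.connect(
        uri="wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17",
        additional_headers={
            "Authorization": f"Bearer {api_key}",
            "OpenAI-Beta": "realtime=v1",
        },
    )
    first_event = json.loads(await ws.recv())
    print(first_event["type"])  # expect "session.created"
    await ws.close()

# asyncio.run(ping_realtime("your OpenAI API key"))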
Define some global variables
OPENAI_API_KEY = "your OpenAI API key"
# Define a book-search function for the model to call
function = {
    "name": "library_book_search",
    "description": "Search for a book in the library database using the book's title",
    "parameters": {
        "type": "object",
        "properties": {
            "book_title": {
                "type": "string",
                "description": "The title of the book to search for in the library."
            }
        },
        "required": [
            "book_title"
        ]
    }
}
# The tools format differs from other models
# old (Chat Completions style): the schema is nested under a "function" key
tools = [
    {
        "type": "function",
        "function": function,
    }
]
# new (Realtime API style): the schema is flattened into the tool entry itself
tools = [
    {
        "type": "function",
        **function
    }
]
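To make the difference concrete: the dict spread above produces a flat tool entry, with the schema fields sitting next to "type" rather than nested under a "function" key.
assert tools[0]["type"] == "function"
assert tools[0]["name"] == "library_book_search"  # flat entry; no nested "function" key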
# Session parameters to send once the session is created,
# e.g. the audio encoding the model should return
update_event_session = {
    "modalities": ["text", "audio"],
    "instructions": "You are a library-search agent. Your job is to help users look up book information by calling the function tool.",
    "voice": "sage",
    "input_audio_format": "pcm16",
    "output_audio_format": "pcm16",
    "input_audio_transcription": {
        "model": "whisper-1"
    },
    "turn_detection": {
        "type": "server_vad",
        "threshold": 0.5,
        "prefix_padding_ms": 300,
        "silence_duration_ms": 500,
        "create_response": True
    },
    "tools": tools,
    "tool_choice": "auto",
    "temperature": 0.8,
    "max_response_output_tokens": "inf"
}
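Note that this dict is only the session payload. Before it reaches the server it has to be wrapped in a session.update client event, which is exactly what the receive worker later in this article does:
session_update_event = json.dumps({
    "type": "session.update",
    "session": update_event_session,
})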
Implement library_book_search
def library_book_search(book_title):
    # Placeholder endpoint: replace with your real book-query service
    url = "https://xxx.com/book_query/"
    data = {
        "book_title": book_title,
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    response = requests.post(url, headers=headers, json=data)
    if response.status_code == 200:
        data = response.json().get("data")
        return data
    else:
        log.error(f"Error: {response.status_code}")
        return None
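If you don't have a backend to point at, a stub with the same signature can stand in during local testing; the payload below is made up, so shape it however your prompt expects:
def library_book_search_stub(book_title):
    # Hypothetical canned result mimicking the real endpoint's "data" field
    return {"title": book_title, "available": True, "location": "Floor 2, Shelf B-14"}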
Define a class that accumulates the model's response state
class BotBackend:
    def __init__(self):
        self.function_name = ""
        self.args = ""
        self.function_id = ""
        self.item_id = ""
        self.finish_reason = ""
        self.content = ""

    def update_function_name(self, function_name):
        self.function_name = function_name

    def update_item_id(self, item_id):
        self.item_id = item_id

    def update_args(self, args):
        self.args = args

    def update_function_id(self, function_id):
        self.function_id = function_id

    def update_finish_reason(self, finish_reason):
        self.finish_reason = finish_reason

    def update_content(self, content):
        self.content += content

    def reset_gpt_response_log_values(self, exclude=None):
        if exclude is None:
            exclude = []
        attributes = {
            "function_name": "",
            "args": "",
            "function_id": "",
            "item_id": "",
            "content": "",
        }
        for attr_name in exclude:
            del attributes[attr_name]
        for attr_name, value in attributes.items():
            setattr(self, attr_name, value)

    def run_function(self):
        args = json.loads(self.args)
        result = library_book_search(**args)
        return result
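A quick sanity check of the accumulation and dispatch logic, using hypothetical values in place of real streamed events:
bot = BotBackend()
bot.update_function_name("library_book_search")
bot.update_args('{"book_title": "Dune"}')  # in practice streamed by the model
# run_function() posts to the endpoint above, so swap in the stub (or a real
# backend) before uncommenting:
# print(bot.run_function())
bot.reset_gpt_response_log_values(exclude=["content"])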
The core code below implements audio input/output and collection of the function-call arguments
class AudioStreamer:
    def __init__(self):
        self.ws = None
        self.loop = None  # main event loop, captured in run()
        self.speaker_audio_out = None
        self.mic_audio_in = None
        self.p = pyaudio.PyAudio()
        self.bot_backend = BotBackend()

    def mic_audio_in_callback(self, in_data, frame_count, time_info, status):
        payload = base64.b64encode(in_data).decode("utf-8")
        # PyAudio runs this callback on its own thread, so hand the send
        # coroutine to the main event loop instead of calling asyncio.run(),
        # which would try to drive self.ws from a second loop and fail.
        asyncio.run_coroutine_threadsafe(
            self.ws.send(
                json.dumps(
                    {
                        "type": "input_audio_buffer.append",
                        "audio": payload,
                    },
                )
            ),
            self.loop,
        )
        return None, pyaudio.paContinue
    def function_call_callback(self, output):
        item_id = output["id"]
        self.bot_backend.update_item_id(item_id)
        function_name = output["name"]
        self.bot_backend.update_function_name(function_name)
        call_id = output["call_id"]
        self.bot_backend.update_function_id(call_id)
        arguments = output["arguments"]
        self.bot_backend.update_args(arguments)
        self.bot_backend.update_finish_reason("function_call")
        result = self.bot_backend.run_function()
        # Feed the function result back as a function_call_output item; the
        # server assigns the new item's id, so only the call_id is passed.
        return {
            "type": "conversation.item.create",
            "item": {
                "type": "function_call_output",
                "call_id": self.bot_backend.function_id,
                "output": json.dumps(result),
            },
        }
    async def ws_receive_worker(self):
        async for m in self.ws:
            evt = json.loads(m)
            print(json.dumps(evt, indent=4))  # dump every server event for debugging
            if evt["type"] == "session.created":
                log.info("Connected: say something to GPT-4o")
                # Configure the session, then start streaming the microphone
                await self.ws.send(json.dumps({
                    "type": "session.update",
                    "session": update_event_session,
                }))
                self.mic_audio_in.start_stream()
            elif evt["type"] == "response.audio.delta":
                audio = base64.b64decode(evt["delta"])
                self.speaker_audio_out.write(audio)
            elif evt["type"] == "response.audio_transcript.delta":
                log.info(evt["delta"])
                self.bot_backend.update_content(evt["delta"])
            elif evt["type"] == "response.output_item.done":
                output = evt["item"]
                if output["type"] == "function_call":
                    event = self.function_call_callback(output)
                    await self.ws.send(json.dumps(event))
                    # Ask the model to continue now that the function
                    # output has been added to the conversation
                    await self.ws.send(json.dumps({"type": "response.create"}))
            # Alternative places to pick up the function call:
            # elif evt["type"] == "response.done":
            #     output = evt["response"]["output"][0]
            #     self.function_call_callback(output)
            # elif evt["type"] == "response.function_call_arguments.delta":
            #     args = evt["delta"]
            #     self.bot_backend.update_args(args)
    async def run(self):
        # Capture the running loop so the PyAudio callback thread can
        # schedule websocket sends onto it.
        self.loop = asyncio.get_running_loop()
        self.mic_audio_in = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            input=True,
            stream_callback=self.mic_audio_in_callback,
            frames_per_buffer=int(24000 / 100) * 2,  # 20 ms of audio at 24 kHz
            start=False,
        )
        self.speaker_audio_out = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            output=True,
        )
        self.ws = await websockets.connect(
            uri="wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17",
            additional_headers={
                "Authorization": f"Bearer {OPENAI_API_KEY}",
                "OpenAI-Beta": "realtime=v1",
            }
        )
        asyncio.create_task(self.ws_receive_worker())
        # asyncio.create_task(self.ws_send_worker())
        await asyncio.sleep(15 * 60)  # keep the demo alive for 15 minutes
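With server_vad, the server cancels an in-progress response as soon as it detects you talking over the bot, which is what makes barge-in possible; audio deltas already written to the speaker, however, keep playing for a moment. A minimal sketch of cutting local playback short, assuming you add an input_audio_buffer.speech_started branch to ws_receive_worker that calls this hypothetical helper:
def stop_local_playback(streamer):
    # Hypothetical barge-in helper: discard locally buffered output by
    # recreating the speaker stream so playback stops almost immediately.
    streamer.speaker_audio_out.stop_stream()
    streamer.speaker_audio_out.close()
    streamer.speaker_audio_out = streamer.p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=24000,
        output=True,
    )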
Run the code
if __name__ == "__main__":
    log.info("Starting demo")
    asyncio.run(AudioStreamer().run())