Implementing Function Calling with the ChatGPT Realtime Model gpt-4o-realtime-preview-2024-12-17

The core capabilities of gpt-4o-audio-preview fall into three areas. First, it can generate natural, fluent spoken responses from text, providing strong support for applications such as voice assistants and virtual customer service. Second, it can analyze the emotion, tone, and pitch of audio input, a capability with broad prospects in affective computing and user-experience analysis. Finally, it supports speech-to-speech interaction, with audio serving as both input and output, laying the groundwork for fully voice-driven systems.

This article uses pyaudio for microphone input and speech playback to implement voice chat, barge-in interruption, and function calling.

To support interruption, gpt-4o-realtime-preview communicates over the WebSocket protocol rather than HTTP: the server can push events to the client at any moment, so a response can be cut off mid-stream.
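Every message in both directions over this socket is a JSON event with a "type" field, which is what lets the server cut into playback at any time. Roughly, the two core events look like this (both appear in the full code below):

# Client -> server: stream microphone audio as base64-encoded PCM16 chunks
append_event = {"type": "input_audio_buffer.append", "audio": "<base64 pcm16>"}

# Server -> client: the reply arrives as incremental audio deltas,
# so playback can simply stop when the user interrupts
audio_delta_event = {"type": "response.audio.delta", "delta": "<base64 pcm16>"}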

Import the required packages

import asyncio
import base64
import json
import requests
import pyaudio
import websockets
from loguru import logger as log  # loguru makes the log output a bit prettier

Set up some global variables

OPENAI_API_KEY = "your OpenAI API key"
# Define a book-search function schema
function = {
    "name": "library_book_search",
    "description": "Search for a book in the library database using the book's title",
    "parameters": {
        "type": "object",
        "properties": {
            "book_title": {
                "type": "string",
                "description": "The title of the book to search for in the library."
            }
        },
        "required": [
            "book_title"
        ]
    }
}
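# For this schema, the model emits its arguments as a JSON string, e.g.
# '{"book_title": "The Three-Body Problem"}' (illustrative example only)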
# The tools format differs from the Chat Completions API
# old (Chat Completions): the schema is nested under a "function" key
tools = [
    {
        "type": "function",
        "function": function,
    }
]
# new (Realtime API): the schema fields are flattened into the tool object itself
tools = [
    {
        "type": "function",
        **function
    }
]
# Session parameters to send once the session is created, e.g. the audio encoding formats for input and output
update_event_session = {
    "modalities": ["text", "audio"],
    "instructions": "你是一个图书查询智能体,你的任务是帮助用户调用function插件查询图书信息.",
    "voice": "sage",
    "input_audio_format": "pcm16",
    "output_audio_format": "pcm16",
    "input_audio_transcription": {
        "model": "whisper-1"
    },
    "turn_detection": {
        "type": "server_vad",
        "threshold": 0.5,
        "prefix_padding_ms": 300,
        "silence_duration_ms": 500,
        "create_response": True
    },
    "tools": tools,
    "tool_choice": "auto",
    "temperature": 0.8,
    "max_response_output_tokens": "inf"
}
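This configuration is not sent on its own: once the server confirms the session, it is wrapped in a session.update event, as the receive loop below does. A minimal sketch of that envelope:

session_update_event = {
    "type": "session.update",
    "session": update_event_session,
}
# sent over the socket with: await ws.send(json.dumps(session_update_event))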

Implement library_book_search

def library_book_search(book_title):
    url = "https://xxx.com/book_query/"
    data = {
        "book_title": book_title,
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    response = requests.post(url, headers=headers, json=data)
    if response.status_code == 200:
        data = response.json().get("data")
        return data
    else:
        log.error(f"Error: {response.status_code}")
        return None
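The URL above is a placeholder for whatever book-lookup service you run. To try the demo without a backend, a hypothetical in-memory stub (an assumption, not part of the original article) can be swapped in for library_book_search:

# Hypothetical stand-in for library_book_search, for local testing only
def library_book_search(book_title):
    catalog = {
        "The Three-Body Problem": {"available": True, "shelf": "SF-12"},
    }
    # Mimic the real endpoint's "data" payload: a record, or a not-found marker
    return catalog.get(book_title, {"available": False, "shelf": None})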

Define a class to hold the model's response state

class BotBackend:
    def __init__(self):
        self.function_name = ""
        self.args = ""
        self.function_id = ""
        self.item_id = ""
        self.finish_reason = ""
        self.content = ""

    def update_function_name(self, function_name):
        self.function_name = function_name

    def update_item_id(self, item_id):
        self.item_id = item_id

    def update_args(self, args):
        self.args = args

    def update_function_id(self, function_id):
        self.function_id = function_id

    def update_finish_reason(self, finish_reason):
        self.finish_reason = finish_reason

    def update_content(self, content):
        self.content += content

    def reset_gpt_response_log_values(self, exclude=None):
        if exclude is None:
            exclude = []

        attributes = {
            "function_name": "",
            "args": "",
            "function_id": "",
            "item_id": "",
            "content": "",
        }

        for attr_name in exclude:
            del attributes[attr_name]
        for attr_name, value in attributes.items():
            setattr(self, attr_name, value)

    def run_function(self):
        args = json.loads(self.args)
        result = library_book_search(**args)
        return result
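A quick illustration of how run_function consumes what the model streams back: the arguments arrive as a JSON string, not a dict (the title below is just an example).

backend = BotBackend()
backend.update_args('{"book_title": "The Three-Body Problem"}')
result = backend.run_function()  # parses the JSON string and calls library_book_search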

The core code below implements audio input and output and retrieves the function call arguments

class AudioStreamer:
    def __init__(self):
        self.ws = None
        self.speaker_audio_out = None
        self.mic_audio_in = None
        self.loop = None  # main event loop, captured in run()
        self.p = pyaudio.PyAudio()
        self.bot_backend = BotBackend()

    def mic_audio_in_callback(self, in_data, frame_count, time_info, status):
        payload = base64.b64encode(in_data).decode("utf-8")

        async def send():
            await self.ws.send(
                json.dumps(
                    {
                        "type": "input_audio_buffer.append",
                        "audio": payload,
                    },
                )
            )

        # PyAudio runs this callback on a separate thread; schedule the send on
        # the existing event loop rather than spinning up a new one per chunk
        asyncio.run_coroutine_threadsafe(send(), self.loop)
        return None, pyaudio.paContinue

    def function_call_callback(self, output):
        item_id = output["id"]
        self.bot_backend.update_item_id(item_id)
        function_name = output["name"]
        self.bot_backend.update_function_name(function_name)
        call_id = output["call_id"]
        self.bot_backend.update_function_id(call_id)
        arguments = output["arguments"]
        self.bot_backend.update_args(arguments)
        self.bot_backend.update_finish_reason("function_call")
        result = self.bot_backend.run_function()

        return {
            "type": "conversation.item.create",
            "item": {
                "id": self.bot_backend.item_id,
                "type": "function_call_output",
                "call_id": self.bot_backend.function_id,
                "output": json.dumps(result),
            },
        }

    async def ws_receive_worker(self):
        async for m in self.ws:
            evt = json.loads(m)
            print(json.dumps(evt, indent=4))  # verbose: dumps every event, including base64 audio deltas
            if evt["type"] == "session.created":
                log.info("Connected: say something to GPT-4o")
                evt["type"] = "session.update"
                evt["session"] = update_event_session
                await self.ws.send(json.dumps(evt))
                self.mic_audio_in.start_stream()
            elif evt["type"] == "response.audio.delta":
                audio = base64.b64decode(evt["delta"])
                self.speaker_audio_out.write(audio)
            elif evt["type"] == "response.audio_transcript.delta":
                log.info(evt["delta"])
                self.bot_backend.update_content(evt["delta"])
            elif evt["type"] == "response.output_item.done":
                output = evt["item"]
                out_put_type = output["type"]
                if out_put_type == "function_call":
                    event = self.function_call_callback(output)
                    await self.ws.send(json.dumps(event))
            # elif evt["type"] == "response.done":
            #     output = evt["response"]["output"][0]
            #     self.function_call_callback(output)

            # elif evt["type"] == "response.function_call_arguments.delta":
            #    args = evt["delta"]
            #    self.bot_backend.update_args(args)

    async def run(self):
        # Capture the running loop so the PyAudio callback thread can schedule sends
        self.loop = asyncio.get_running_loop()

        self.mic_audio_in = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            input=True,
            stream_callback=self.mic_audio_in_callback,
            frames_per_buffer=int(24000 / 100) * 2,  # 20ms of audio
            start=False,
        )

        self.speaker_audio_out = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            output=True,
        )

        self.ws = await websockets.connect(
            uri="wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17",
            additional_headers={
                "Authorization": f"Bearer {OPENAI_API_KEY}",
                "OpenAI-Beta": "realtime=v1",
            }
        )

        receive_task = asyncio.create_task(self.ws_receive_worker())  # keep a reference so the task is not garbage-collected
        # asyncio.create_task(self.ws_send_worker())

        await asyncio.sleep(15 * 60)  # keep the demo alive for 15 minutes
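One gap worth noting for barge-in: the receive loop plays audio but never reacts when the user starts talking over the model. With server VAD enabled, the server emits input_audio_buffer.speech_started at that moment, and the in-flight response can be cancelled with the response.cancel client event. A minimal sketch of the extra branch, assuming the same ws object (written here as a standalone helper):

# Sketch: an additional event branch for ws_receive_worker to make barge-in
# take effect; response.cancel stops further audio deltas for this response
async def handle_barge_in(ws, evt):
    if evt["type"] == "input_audio_buffer.speech_started":
        await ws.send(json.dumps({"type": "response.cancel"}))
        # Any locally queued playback would also need to be dropped here; the
        # blocking speaker write above keeps that buffer small, so it is omitted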

Run the code

if __name__ == "__main__":
    log.info("Starting demo")
    asyncio.run(AudioStreamer().run())
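The demo depends on the pyaudio, websockets, loguru, and requests packages, all installable with pip.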


