|
|
|
|
|
|
|
|
|
|
|
|
|
import aiohttp
|
|
import time
|
|
|
|
from typing import AsyncGenerator
|
|
|
|
from pipecat.frames.frames import (
|
|
AudioRawFrame,
|
|
CancelFrame,
|
|
EndFrame,
|
|
ErrorFrame,
|
|
Frame,
|
|
InterimTranscriptionFrame,
|
|
StartFrame,
|
|
SystemFrame,
|
|
TranscriptionFrame)
|
|
from pipecat.processors.frame_processor import FrameDirection
|
|
from pipecat.services.ai_services import AsyncAIService, TTSService
|
|
|
|
from loguru import logger
|
|
|
|
|
|
# The Deepgram SDK is an optional dependency: if it is missing, log an
# actionable installation hint and abort the import of this module.
try:
    from deepgram import (
        DeepgramClient,
        DeepgramClientOptions,
        LiveTranscriptionEvents,
        LiveOptions,
    )
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error(
        "In order to use Deepgram, you need to `pip install pipecat-ai[deepgram]`. Also, set `DEEPGRAM_API_KEY` environment variable.")
    # Chain explicitly so the traceback names the original import failure as
    # the direct cause of this error.
    raise Exception(f"Missing module: {e}") from e
|
|
|
|
|
|
class DeepgramTTSService(TTSService):
    """Text-to-speech service that calls an HTTP ``speak`` endpoint.

    Each call to :meth:`run_tts` POSTs the text to ``base_url`` and streams the
    response body back as ``AudioRawFrame`` objects. The request asks for
    ``linear16`` encoding with no container at a 16000 Hz sample rate, and the
    frames are labelled as 16000 Hz with one channel.
    """

    # Upper bound, in bytes, on the audio payload carried by a single frame.
    _AUDIO_CHUNK_SIZE = 8192

    def __init__(
            self,
            *,
            aiohttp_session: aiohttp.ClientSession,
            api_key: str,
            voice: str = "aura-helios-en",
            base_url: str = "https://api.deepgram.com/v1/speak",
            **kwargs):
        """Store the connection settings.

        Args:
            aiohttp_session: Session used to issue the HTTP requests. It is
                only stored and used here; this class never closes it.
            api_key: API key sent in the ``authorization`` request header.
            voice: Value sent as the ``model`` query parameter.
            base_url: URL that receives the POST request.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)

        self._voice = voice
        self._api_key = api_key
        self._aiohttp_session = aiohttp_session
        self._base_url = base_url

    def can_generate_metrics(self) -> bool:
        """Report that this service can generate metrics (always ``True``)."""
        return True

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Synthesize ``text`` and yield the resulting audio frames.

        Args:
            text: The text to convert to speech.

        Yields:
            ``AudioRawFrame`` objects as audio data arrives. If the server
            answers with a non-200 status, a single ``ErrorFrame`` is yielded
            instead, unless the response text contains ``"unutterable"``, in
            which case nothing is yielded.

        Any exception raised while performing the request or reading the
        response is logged and swallowed, which simply ends the generator.
        """
        logger.debug(f"Generating TTS: [{text}]")

        base_url = self._base_url
        request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
        headers = {"authorization": f"token {self._api_key}"}
        body = {"text": text}

        try:
            await self.start_ttfb_metrics()
            async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
                if r.status != 200:
                    response_text = await r.text()

                    # Text that cannot be spoken is not treated as an error:
                    # finish quietly without emitting any frame.
                    if "unutterable" in response_text:
                        logger.debug(f"Unutterable text: [{text}]")
                        return

                    logger.error(
                        f"{self} error getting audio (status: {r.status}, error: {response_text})")
                    yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {response_text})")
                    return

                # Read fixed-size binary chunks. Iterating `r.content` directly
                # reads the stream line by line, which is meaningless for raw
                # PCM: frames would be cut at arbitrary 0x0A bytes and a long
                # stretch without that byte can make the line reader fail.
                async for data in r.content.iter_chunked(self._AUDIO_CHUNK_SIZE):
                    await self.stop_ttfb_metrics()
                    yield AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
        except Exception as e:
            logger.exception(f"{self} exception: {e}")
|
|
|
|
|
|
class DeepgramSTTService(AsyncAIService):
    """Speech-to-text service that streams audio over a Deepgram live connection.

    Incoming ``AudioRawFrame`` audio is sent to the connection; transcript
    events received from it are queued as ``TranscriptionFrame`` (final) or
    ``InterimTranscriptionFrame`` (non-final) frames.
    """

    def __init__(self,
                 *,
                 api_key: str,
                 url: str = "",
                 live_options: "LiveOptions | None" = None,
                 **kwargs):
        """Create the Deepgram client and register the transcript handler.

        Args:
            api_key: API key handed to ``DeepgramClient``.
            url: Value passed as ``url`` to ``DeepgramClientOptions``.
            live_options: Options passed to the connection when it is started.
                When omitted, a fresh default is built for this instance
                (linear16, en-US, ``nova-2-conversationalai``, 16000 Hz, one
                channel, interim results and smart formatting enabled).
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)

        # Build the default per instance. A `LiveOptions(...)` call in the
        # signature would be evaluated once and shared by every service.
        if live_options is None:
            live_options = LiveOptions(
                encoding="linear16",
                language="en-US",
                model="nova-2-conversationalai",
                sample_rate=16000,
                channels=1,
                interim_results=True,
                smart_format=True,
            )
        self._live_options = live_options

        self._client = DeepgramClient(
            api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"}))
        self._connection = self._client.listen.asynclive.v("1")
        self._connection.on(LiveTranscriptionEvents.Transcript, self._on_message)

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Route a frame according to its type.

        System frames are pushed on directly, the audio of ``AudioRawFrame``
        frames is sent to the Deepgram connection, and every other frame is
        queued.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, SystemFrame):
            await self.push_frame(frame, direction)
        elif isinstance(frame, AudioRawFrame):
            await self._connection.send(frame.audio)
        else:
            await self.queue_frame(frame, direction)

    async def start(self, frame: StartFrame):
        """Start the live connection with the configured options.

        A failed start is only logged; no exception is raised here.
        """
        if await self._connection.start(self._live_options):
            logger.debug(f"{self}: Connected to Deepgram")
        else:
            logger.error(f"{self}: Unable to connect to Deepgram")

    async def stop(self, frame: EndFrame):
        """Finish the live connection when an ``EndFrame`` is handled."""
        await self._connection.finish()

    async def cancel(self, frame: CancelFrame):
        """Finish the live connection when a ``CancelFrame`` is handled."""
        await self._connection.finish()

    async def _on_message(self, *args, **kwargs):
        """Handle a transcript event and queue the matching frame.

        The event payload is read from ``kwargs["result"]``. Events with no
        alternatives or an empty transcript are ignored.
        """
        result = kwargs["result"]
        is_final = result.is_final

        # Guard against an event that carries no alternatives at all, so the
        # handler does not fail with an IndexError.
        alternatives = result.channel.alternatives
        if not alternatives:
            return

        transcript = alternatives[0].transcript
        if not transcript:
            return

        # Milliseconds since the epoch; floor division keeps the value an
        # exact integer instead of going through a float.
        timestamp = time.time_ns() // 1_000_000

        if is_final:
            await self.queue_frame(TranscriptionFrame(transcript, "", timestamp))
        else:
            await self.queue_frame(InterimTranscriptionFrame(transcript, "", timestamp))
|
|
|