#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import io
import wave
from abc import abstractmethod
from typing import AsyncGenerator
from pipecat.frames.frames import (
AudioRawFrame,
CancelFrame,
EndFrame,
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
StartFrame,
StartInterruptionFrame,
TTSStartedFrame,
TTSStoppedFrame,
TextFrame,
VisionImageRawFrame,
)
from pipecat.processors.async_frame_processor import AsyncFrameProcessor
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.audio import calculate_audio_volume
from pipecat.utils.utils import exp_smoothing
class AIService(FrameProcessor):
def __init__(self, **kwargs):
super().__init__(**kwargs)
async def start(self, frame: StartFrame):
pass
async def stop(self, frame: EndFrame):
pass
async def cancel(self, frame: CancelFrame):
pass
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, StartFrame):
await self.start(frame)
elif isinstance(frame, CancelFrame):
await self.cancel(frame)
elif isinstance(frame, EndFrame):
await self.stop(frame)
async def process_generator(self, generator: AsyncGenerator[Frame, None]):
async for f in generator:
if isinstance(f, ErrorFrame):
await self.push_error(f)
else:
await self.push_frame(f)
class AsyncAIService(AsyncFrameProcessor):
    """Asynchronous counterpart of AIService.

    Same lifecycle-frame dispatch (start/cancel/stop hooks), built on top of
    AsyncFrameProcessor.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    async def start(self, frame: StartFrame):
        """Hook invoked when a StartFrame arrives; override in subclasses."""
        pass

    async def stop(self, frame: EndFrame):
        """Hook invoked when an EndFrame arrives; override in subclasses."""
        pass

    async def cancel(self, frame: CancelFrame):
        """Hook invoked when a CancelFrame arrives; override in subclasses."""
        pass

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        # Guard-clause dispatch of lifecycle frames to their hooks.
        if isinstance(frame, StartFrame):
            await self.start(frame)
            return
        if isinstance(frame, CancelFrame):
            await self.cancel(frame)
            return
        if isinstance(frame, EndFrame):
            await self.stop(frame)
class LLMService(AIService):
    """Base class for LLM services.

    This class is a no-op as far as frame processing goes; it exists to hold
    the registry of callable functions (tools) that an LLM implementation may
    invoke by name.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # function name -> callback awaited with (self, args) when invoked.
        self._callbacks = {}
        # function name -> optional callback awaited before the main one.
        self._start_callbacks = {}

    # TODO-CB: callback function type
    def register_function(self, function_name: str, callback, start_callback=None):
        """Register *callback* (and an optional *start_callback*) under *function_name*."""
        self._callbacks[function_name] = callback
        if start_callback:
            self._start_callbacks[function_name] = start_callback

    def unregister_function(self, function_name: str):
        """Remove a previously registered function.

        Raises KeyError if *function_name* was never registered.
        """
        del self._callbacks[function_name]
        # Fix: use a membership test. The original indexed the dict directly
        # (`if self._start_callbacks[function_name]:`), which raised KeyError
        # whenever the function had been registered without a start callback.
        if function_name in self._start_callbacks:
            del self._start_callbacks[function_name]

    def has_function(self, function_name: str):
        """Return True if a callback is registered under *function_name*."""
        return function_name in self._callbacks

    async def call_function(self, function_name: str, args):
        """Await the registered callback with *args*; return None if unknown."""
        if function_name in self._callbacks:
            return await self._callbacks[function_name](self, args)
        return None

    async def call_start_function(self, function_name: str):
        """Await the registered start callback, if one exists."""
        if function_name in self._start_callbacks:
            await self._start_callbacks[function_name](self)
class TTSService(AIService):
    """Base class for text-to-speech services.

    Optionally aggregates incoming text into whole sentences before
    synthesizing, and brackets each synthesized chunk with
    TTSStartedFrame / TTSStoppedFrame.
    """

    def __init__(self, *, aggregate_sentences: bool = True, **kwargs):
        super().__init__(**kwargs)
        # When True, buffer text until a sentence boundary before running TTS.
        self._aggregate_sentences: bool = aggregate_sentences
        self._current_sentence: str = ""

    # Converts the text to audio.
    @abstractmethod
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        pass

    async def say(self, text: str):
        """Convenience helper: synthesize an arbitrary piece of text."""
        await self.process_frame(TextFrame(text=text), FrameDirection.DOWNSTREAM)

    async def _process_text_frame(self, frame: TextFrame):
        text: str | None = None
        if not self._aggregate_sentences:
            text = frame.text
        else:
            self._current_sentence += frame.text
            stripped = self._current_sentence.strip()
            # Treat ".", "?" or "!" as a sentence boundary, but do not split
            # on common abbreviations. Fix: the original listed "Mr," (with a
            # comma), which can never match a string that ends in ".", so the
            # "Mr." abbreviation was effectively unguarded.
            if stripped.endswith((".", "?", "!")) and not stripped.endswith(
                    ("Mr.", "Mrs.", "Ms.", "Dr.")):
                text = self._current_sentence
                self._current_sentence = ""
        if text:
            await self._push_tts_frames(text)

    async def _push_tts_frames(self, text: str):
        text = text.strip()
        if not text:
            return
        await self.push_frame(TTSStartedFrame())
        await self.start_processing_metrics()
        await self.process_generator(self.run_tts(text))
        await self.stop_processing_metrics()
        await self.push_frame(TTSStoppedFrame())
        # We send the original text after the audio. This way, if we are
        # interrupted, the text is not added to the assistant context.
        await self.push_frame(TextFrame(text))

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame):
            await self._process_text_frame(frame)
        elif isinstance(frame, StartInterruptionFrame):
            # On interruption, discard any partially aggregated sentence.
            self._current_sentence = ""
            await self.push_frame(frame, direction)
        elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
            # Fix: flush the pending sentence *before* clearing it. The
            # original cleared the buffer first and then pushed the (now
            # empty) string, silently dropping any trailing text that never
            # reached a sentence terminator.
            sentence = self._current_sentence
            self._current_sentence = ""
            await self._push_tts_frames(sentence)
            await self.push_frame(frame)
        else:
            await self.push_frame(frame, direction)
class STTService(AIService):
    """STTService is a base class for speech-to-text services.

    Audio above a volume threshold is accumulated into an in-memory WAV
    buffer; once enough audio has been collected (or enough trailing silence
    has elapsed) the buffer is handed to the transcriber.
    """

    def __init__(self,
                 *,
                 min_volume: float = 0.6,
                 max_silence_secs: float = 0.3,
                 max_buffer_secs: float = 1.5,
                 sample_rate: int = 16000,
                 num_channels: int = 1,
                 **kwargs):
        super().__init__(**kwargs)
        self._min_volume = min_volume
        self._max_silence_secs = max_silence_secs
        self._max_buffer_secs = max_buffer_secs
        self._sample_rate = sample_rate
        self._num_channels = num_channels
        self._content, self._wave = self._new_wave()
        self._silence_num_frames = 0
        # Volume exponential smoothing
        self._smoothing_factor = 0.2
        self._prev_volume = 0

    @abstractmethod
    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
        """Returns transcript as a string"""
        pass

    def _new_wave(self):
        """Create a fresh in-memory WAV buffer and its writer (16-bit PCM)."""
        buffer = io.BytesIO()
        writer = wave.open(buffer, "wb")
        writer.setsampwidth(2)
        writer.setnchannels(self._num_channels)
        writer.setframerate(self._sample_rate)
        return (buffer, writer)

    def _get_smoothed_volume(self, frame: AudioRawFrame) -> float:
        """Exponentially smoothed volume of *frame* against the previous value."""
        raw_volume = calculate_audio_volume(frame.audio, frame.sample_rate)
        return exp_smoothing(raw_volume, self._prev_volume, self._smoothing_factor)

    async def _append_audio(self, frame: AudioRawFrame):
        # Try to filter out empty background noise: only frames loud enough
        # are written to the buffer; quieter frames count towards silence.
        volume = self._get_smoothed_volume(frame)
        if volume < self._min_volume:
            self._silence_num_frames += frame.num_frames
        else:
            self._wave.writeframes(frame.audio)
            self._silence_num_frames = 0
        self._prev_volume = volume

        # If the buffer is not empty and we have enough data, or there's been
        # a long silence, transcribe the audio gathered so far.
        silence_secs = self._silence_num_frames / self._sample_rate
        buffer_secs = self._wave.getnframes() / self._sample_rate
        have_audio = self._content.tell() > 0
        should_flush = buffer_secs > self._max_buffer_secs or silence_secs > self._max_silence_secs
        if have_audio and should_flush:
            self._silence_num_frames = 0
            self._wave.close()
            self._content.seek(0)
            await self.start_processing_metrics()
            await self.process_generator(self.run_stt(self._content.read()))
            await self.stop_processing_metrics()
            self._content, self._wave = self._new_wave()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Processes a frame of audio data, either buffering or transcribing it."""
        await super().process_frame(frame, direction)

        if isinstance(frame, AudioRawFrame):
            # Audio is accumulated internally and eventually surfaces as a
            # TextFrame; raw audio frames are not forwarded downstream.
            await self._append_audio(frame)
        elif isinstance(frame, (CancelFrame, EndFrame)):
            self._wave.close()
            await self.push_frame(frame, direction)
        else:
            await self.push_frame(frame, direction)
class ImageGenService(AIService):
    """Base class for image generation services."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    # Renders the image. Returns an Image object.
    @abstractmethod
    async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
        pass

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        # Anything that isn't a text prompt just passes through.
        if not isinstance(frame, TextFrame):
            await self.push_frame(frame, direction)
            return

        # Forward the prompt itself, then generate and push the image frames.
        await self.push_frame(frame, direction)
        await self.start_processing_metrics()
        await self.process_generator(self.run_image_gen(frame.text))
        await self.stop_processing_metrics()
class VisionService(AIService):
    """VisionService is a base class for vision services."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Not read by this base class; presumably for subclasses — TODO confirm.
        self._describe_text = None

    @abstractmethod
    async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
        pass

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        # Only vision image frames are analyzed; everything else passes through.
        if not isinstance(frame, VisionImageRawFrame):
            await self.push_frame(frame, direction)
            return

        await self.start_processing_metrics()
        await self.process_generator(self.run_vision(frame))
        await self.stop_processing_metrics()