File size: 4,426 Bytes
964514c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
026b176
964514c
026b176
 
 
964514c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dc4889
964514c
 
 
 
 
 
 
1dc4889
964514c
 
 
 
 
 
1dc4889
964514c
 
 
 
 
 
 
 
 
 
1dc4889
964514c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dc4889
 
 
964514c
 
 
1dc4889
964514c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
026b176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
964514c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#coding: utf-8

import os
import tempfile


#from typing import Any
#from typing import Dict
#from typing import IO
#from typing import List
from typing import Optional
from typing import Tuple
#from typing import Union
from base64 import b64encode

from openai import OpenAI
from pydub import AudioSegment
import streamlit as st

#from dotenv import load_dotenv
# Load environment variables from the .env file
#load_dotenv()

class openai_tts(object):
    """Small wrapper around the OpenAI text-to-speech endpoint.

    An instance holds an ``OpenAI`` client (built from the ``OPENAI_API_KEY``
    environment variable) together with the voice, model, output format and
    speed used for every call to :meth:`text_to_speech`.

    All ``set_*`` methods validate their argument, raise ``ValueError`` on an
    unsupported value, and return ``self`` so that calls can be chained.
    """

    # Values accepted by the corresponding setters.
    VALID_VOICES = ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
    VALID_MODELS = ("tts-1", "tts-1-hd")
    MIN_SPEED = 0.25
    MAX_SPEED = 4.0

    # Fallbacks used by the constructor when an argument is None/empty.
    DEFAULT_VOICE = "nova"
    DEFAULT_MODEL = "tts-1"
    DEFAULT_FORMAT = "mp3"
    DEFAULT_SPEED = 1.0

    # Layout of header-less "pcm" responses: 24 kHz, 16-bit samples per the
    # OpenAI text-to-speech guide. NOTE(review): mono is assumed - confirm.
    PCM_FRAME_RATE = 24000
    PCM_SAMPLE_WIDTH = 2
    PCM_CHANNELS = 1

    def __init__(self,
                 tts_voice: Optional[str] = "nova",
                 tts_model: Optional[str] = "tts-1",
                 response_format: Optional[str] = "mp3",
                 speed: Optional[float] = 1.0
                 ):
        """Create the API client and store the synthesis settings.

        Args:
            tts_voice: Voice name; must be one of ``VALID_VOICES``.
            tts_model: Model name; must be one of ``VALID_MODELS``.
            response_format: Audio format; must be in ``supported_formats``.
            speed: Playback speed between ``MIN_SPEED`` and ``MAX_SPEED``.

        A falsy argument (``None``, ``""``, ``0``) selects the default for
        that setting, so every attribute read by :meth:`text_to_speech` is
        always defined after construction.

        Raises:
            ValueError: If a provided (truthy) value is not supported.
        """
        self.client = None
        self.init_supported_formats__()
        self.init_api_client()

        self.set_response_format(response_format or self.DEFAULT_FORMAT)
        self.set_tts_voice(tts_voice or self.DEFAULT_VOICE)
        self.set_tts_model(tts_model or self.DEFAULT_MODEL)
        self.set_tts_speed(speed or self.DEFAULT_SPEED)

    def set_tts_speed(self, speed):
        """Set the speech speed; raise ``ValueError`` outside [0.25, 4.0]."""
        if not (self.MIN_SPEED <= speed <= self.MAX_SPEED):
            raise ValueError(f"[TTS] - Speed must be between 0.25 and 4.0. Provided value: {speed}")
        self.speed = speed
        return self

    def set_tts_voice(self, voice):
        """Set the voice; raise ``ValueError`` if it is not a known voice."""
        if voice not in self.VALID_VOICES:
            raise ValueError(f"[TTS] - Invalid TTS voice: {voice}. Valid voices are: {', '.join(self.VALID_VOICES)}.")
        self.tts_voice = voice
        return self

    def set_tts_model(self, model):
        """Set the model; raise ``ValueError`` if it is not a known model."""
        if model not in self.VALID_MODELS:
            raise ValueError(f"[TTS] - Invalid TTS model: {model}. Valid models are 'tts-1' and 'tts-1-hd'.")
        self.tts_model = model
        return self

    def init_supported_formats__(self):
        """(Re)initialise the list of audio formats accepted by the setter."""
        self.supported_formats = ['mp3', 'opus', 'aac', 'flac', 'wav', 'pcm']
        return self

    def set_response_format(self, format: str):
        """Set the audio format; raise ``ValueError`` if it is unsupported."""
        if format not in self.supported_formats:
            raise ValueError(f"[TTS] - Unsupported format: {format}. Supported formats are: {', '.join(self.supported_formats)}")
        self.response_format = format
        return self

    def init_api_client(self):
        """Create the OpenAI client once; later calls keep the existing one."""
        if not self.client:
            # OpenAI client configuration with API key
            self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        return self

    def text_to_speech(self,
                       input_text: str) -> dict:
        """Convert text to speech using the OpenAI API.

        Args:
            input_text: The text to synthesise.

        Returns:
            A dictionary with two keys:
                - ``'audio_duration'`` (float): audio length in seconds.
                - ``'data_bytes'`` (str): the audio payload, base64-encoded.
        """
        response = self.client.audio.speech.create(
            model=self.tts_model,
            voice=self.tts_voice,
            input=input_text,
            response_format=self.response_format,
            speed=self.speed
        )
        data_output = response.read()

        return {
            "audio_duration": self._audio_duration_seconds(data_output),
            "data_bytes": b64encode(data_output).decode()
        }

    def _audio_duration_seconds(self, audio_bytes: bytes) -> float:
        """Return the duration, in seconds, of audio in the current format."""
        if self.response_format == "pcm":
            # Raw PCM carries no header, so a generic decoder cannot infer
            # its layout; the duration follows directly from the byte count.
            bytes_per_second = (
                self.PCM_FRAME_RATE * self.PCM_SAMPLE_WIDTH * self.PCM_CHANNELS
            )
            return len(audio_bytes) / bytes_per_second

        # The context manager closes the temp file even if decoding fails.
        with tempfile.TemporaryFile() as tmp_file:
            tmp_file.write(audio_bytes)
            tmp_file.seek(0)
            audio = AudioSegment.from_file(tmp_file, format=self.response_format)
            # len() of an AudioSegment is expressed in milliseconds.
            return len(audio) / 1000


def process_tts_message(text_response: str) -> Tuple[Optional[str], Optional[float]]:
    """Synthesise ``text_response`` with the voice stored in the session state.

    Uses the voice found in ``st.session_state.tts_voice`` with the
    ``tts-1`` model, ``mp3`` output and normal speed.

    Args:
        text_response: The text to convert to speech.

    Returns:
        A ``(data, duration)`` pair where ``data`` is the base64-encoded
        audio as a string and ``duration`` is its length in seconds, or
        ``(None, None)`` if anything went wrong (the error is reported to
        the user through ``st.error``).
    """
    try:
        tts_output_ = openai_tts(
            tts_voice=st.session_state.tts_voice,
            tts_model="tts-1",
            response_format="mp3",
            speed=1.0
            ).text_to_speech(text_response)

        return tts_output_["data_bytes"], tts_output_["audio_duration"]
    except Exception as e:
        # Deliberately broad: this is the UI boundary, so any failure is
        # shown to the user instead of crashing the app.
        st.error(f"Une erreur s'est produite lors de la conversion texte-parole : {e}")
        return None, None
    
# Disabled manual smoke test: it is kept inside a bare string literal, so it
# is never executed when this module is imported or run.
"""
if __name__ == "__main__":
    
    openai_tts().text_to_speech("Hello, I am an AI assistant. How can I help you?")
    
"""