openfree committed on
Commit
9e6e720
·
verified ·
1 Parent(s): 00efe1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -2074
app.py CHANGED
@@ -1,2088 +1,35 @@
1
- import spaces
2
- import gradio as gr
3
  import os
4
- import asyncio
5
- import torch
6
- import io
7
- import json
8
- import re
9
- import httpx
10
- import tempfile
11
- import wave
12
- import base64
13
- import numpy as np
14
- import soundfile as sf
15
- import subprocess
16
- import shutil
17
- import requests
18
- import logging
19
- from datetime import datetime, timedelta
20
- from typing import List, Tuple, Dict, Optional
21
- from pathlib import Path
22
- from threading import Thread
23
- from dotenv import load_dotenv
24
 
25
- # PDF processing imports
26
- from langchain_community.document_loaders import PyPDFLoader
27
-
28
- # Edge TTS imports
29
- import edge_tts
30
- from pydub import AudioSegment
31
-
32
- # OpenAI imports
33
- from openai import OpenAI
34
-
35
- # Transformers imports (for legacy local mode)
36
- from transformers import (
37
- AutoModelForCausalLM,
38
- AutoTokenizer,
39
- TextIteratorStreamer,
40
- BitsAndBytesConfig,
41
- )
42
-
43
- # Llama CPP imports (for new local mode)
44
- try:
45
- from llama_cpp import Llama
46
- from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
47
- from llama_cpp_agent.providers import LlamaCppPythonProvider
48
- from llama_cpp_agent.chat_history import BasicChatHistory
49
- from llama_cpp_agent.chat_history.messages import Roles
50
- from huggingface_hub import hf_hub_download
51
- LLAMA_CPP_AVAILABLE = True
52
- except ImportError:
53
- LLAMA_CPP_AVAILABLE = False
54
-
55
- # Spark TTS imports
56
- try:
57
- from huggingface_hub import snapshot_download
58
- SPARK_AVAILABLE = True
59
- except:
60
- SPARK_AVAILABLE = False
61
-
62
- # MeloTTS imports (for local mode)
63
- try:
64
- # unidic 다운로드를 조건부로 처리
65
- if not os.path.exists("/usr/local/lib/python3.10/site-packages/unidic"):
66
- try:
67
- os.system("python -m unidic download")
68
- except:
69
- pass
70
- from melo.api import TTS as MeloTTS
71
- MELO_AVAILABLE = True
72
- except:
73
- MELO_AVAILABLE = False
74
-
75
- # Import config and prompts
76
- from config_prompts import (
77
- ConversationConfig,
78
- PromptBuilder,
79
- DefaultConversations,
80
- EDGE_TTS_ONLY_LANGUAGES,
81
- EDGE_TTS_VOICES
82
- )
83
-
84
- load_dotenv()
85
-
86
- # Brave Search API 설정
87
- BRAVE_KEY = os.getenv("BSEARCH_API")
88
- BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
89
-
90
-
91
- def brave_search(query: str, count: int = 8, freshness_days: int | None = None):
92
- """Brave Search API를 사용하여 최신 정보 검색"""
93
- if not BRAVE_KEY:
94
- return []
95
- params = {"q": query, "count": str(count)}
96
- if freshness_days:
97
- dt_from = (datetime.utcnow() - timedelta(days=freshness_days)).strftime("%Y-%m-%d")
98
- params["freshness"] = dt_from
99
  try:
100
- r = requests.get(
101
- BRAVE_ENDPOINT,
102
- headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_KEY},
103
- params=params,
104
- timeout=15
105
- )
106
- raw = r.json().get("web", {}).get("results") or []
107
- return [{
108
- "title": r.get("title", ""),
109
- "url": r.get("url", r.get("link", "")),
110
- "snippet": r.get("description", r.get("text", "")),
111
- "host": re.sub(r"https?://(www\.)?", "", r.get("url", "")).split("/")[0]
112
- } for r in raw[:count]]
113
- except Exception as e:
114
- logging.error(f"Brave search error: {e}")
115
- return []
116
-
117
-
118
- def format_search_results(query: str, for_keyword: bool = False) -> str:
119
- """검색 결과를 포맷팅하여 반환"""
120
- # 키워드 검색의 경우 더 많은 결과 사용
121
- count = 5 if for_keyword else 3
122
- rows = brave_search(query, count, freshness_days=7 if not for_keyword else None)
123
- if not rows:
124
- return ""
125
-
126
- results = []
127
- # 키워드 검색의 경우 더 상세한 정보 포함
128
- max_results = 4 if for_keyword else 2
129
- for r in rows[:max_results]:
130
- if for_keyword:
131
- # 키워드 검색은 더 긴 스니펫 사용
132
- snippet = r['snippet'][:200] + "..." if len(r['snippet']) > 200 else r['snippet']
133
- results.append(f"**{r['title']}**\n{snippet}\nSource: {r['host']}")
134
- else:
135
- # 일반 검색은 짧은 스니펫
136
- snippet = r['snippet'][:100] + "..." if len(r['snippet']) > 100 else r['snippet']
137
- results.append(f"- {r['title']}: {snippet}")
138
-
139
- return "\n\n".join(results) + "\n"
140
-
141
-
142
- def extract_keywords_for_search(text: str, language: str = "English") -> List[str]:
143
- """텍스트에서 검색할 키워드 추출 (개선)"""
144
- # 텍스트 앞부분만 사용 (너무 많은 텍스트 처리 방지)
145
- text_sample = text[:500]
146
-
147
- if language == "Korean":
148
- import re
149
- # 한국어 명사 추출 (2글자 이상)
150
- keywords = re.findall(r'[가-힣]{2,}', text_sample)
151
- # 중복 제거하고 가장 긴 단어 1개만 선택
152
- unique_keywords = list(dict.fromkeys(keywords))
153
- # 길이 순으로 정렬하고 가장 의미있을 것 같은 단어 선택
154
- unique_keywords.sort(key=len, reverse=True)
155
- return unique_keywords[:1] # 1개만 반환
156
- else:
157
- # 영어는 대문자로 시작하는 단어 중 가장 긴 것 1개
158
- words = text_sample.split()
159
- keywords = [word.strip('.,!?;:') for word in words
160
- if len(word) > 4 and word[0].isupper()]
161
- if keywords:
162
- return [max(keywords, key=len)] # 가장 긴 단어 1개
163
- return []
164
-
165
-
166
- def search_and_compile_content(keyword: str, language: str = "English") -> str:
167
- """키워드로 검색하여 충분한 콘텐츠 컴파일"""
168
- if not BRAVE_KEY:
169
- # API 없을 때도 기본 콘텐츠 생성
170
- if language == "Korean":
171
- return f"""
172
- '{keyword}'에 대한 종합적인 정보:
173
-
174
- {keyword}는 현대 사회에서 매우 중요한 주제입니다.
175
- 이 주제는 다양한 측면에서 우리의 삶에 영향을 미치고 있으며,
176
- 최근 들어 더욱 주목받고 있습니다.
177
-
178
- 주요 특징:
179
- 1. 기술적 발전과 혁신
180
- 2. 사회적 영향과 변화
181
- 3. 미래 전망과 가능성
182
- 4. 실용적 활용 방안
183
- 5. 글로벌 트렌드와 동향
184
-
185
- 전문가들은 {keyword}가 앞으로 더욱 중요해질 것으로 예상하고 있으며,
186
- 이에 대한 깊이 있는 이해가 필요한 시점입니다.
187
- """
188
- else:
189
- return f"""
190
- Comprehensive information about '{keyword}':
191
-
192
- {keyword} is a significant topic in modern society.
193
- This subject impacts our lives in various ways and has been
194
- gaining increasing attention recently.
195
-
196
- Key aspects:
197
- 1. Technological advancement and innovation
198
- 2. Social impact and changes
199
- 3. Future prospects and possibilities
200
- 4. Practical applications
201
- 5. Global trends and developments
202
-
203
- Experts predict that {keyword} will become even more important,
204
- and it's crucial to develop a deep understanding of this topic.
205
- """
206
-
207
- # 언어에 따른 다양한 검색 쿼리
208
- if language == "Korean":
209
- queries = [
210
- f"{keyword} 최신 뉴스 2024",
211
- f"{keyword} 정보 설명",
212
- f"{keyword} 트렌드 전망",
213
- f"{keyword} 장점 단점",
214
- f"{keyword} 활용 방법",
215
- f"{keyword} 전문가 의견"
216
- ]
217
- else:
218
- queries = [
219
- f"{keyword} latest news 2024",
220
- f"{keyword} explained comprehensive",
221
- f"{keyword} trends forecast",
222
- f"{keyword} advantages disadvantages",
223
- f"{keyword} how to use",
224
- f"{keyword} expert opinions"
225
- ]
226
-
227
- all_content = []
228
- total_content_length = 0
229
-
230
- for query in queries:
231
- results = brave_search(query, count=5) # 더 많은 결과 가져오기
232
- for r in results[:3]: # 각 쿼리당 상위 3개
233
- content = f"**{r['title']}**\n{r['snippet']}\nSource: {r['host']}\n"
234
- all_content.append(content)
235
- total_content_length += len(r['snippet'])
236
-
237
- # 콘텐츠가 부족하면 추가 생성
238
- if total_content_length < 1000: # 최소 1000자 확보
239
- if language == "Korean":
240
- additional_content = f"""
241
- 추가 정보:
242
- {keyword}와 관련된 최근 동향을 살펴보면, 이 분야는 빠르게 발전하고 있습니다.
243
- 많은 전문가들이 이 주제에 대해 활발히 연구하고 있으며,
244
- 실생활에서의 응용 가능성도 계속 확대되고 있습니다.
245
-
246
- 특히 주목할 점은:
247
- - 기술 혁신의 가속화
248
- - 사용자 경험의 개선
249
- - 접근성의 향상
250
- - 비용 효율성 증대
251
- - 글로벌 시장의 성장
252
-
253
- 이러한 요소들이 {keyword}의 미래를 더욱 밝게 만들고 있습니다.
254
- """
255
- else:
256
- additional_content = f"""
257
- Additional insights:
258
- Recent developments in {keyword} show rapid advancement in this field.
259
- Many experts are actively researching this topic, and its practical
260
- applications continue to expand.
261
-
262
- Key points to note:
263
- - Accelerating technological innovation
264
- - Improving user experience
265
- - Enhanced accessibility
266
- - Increased cost efficiency
267
- - Growing global market
268
-
269
- These factors are making the future of {keyword} increasingly promising.
270
- """
271
- all_content.append(additional_content)
272
-
273
- # 컴파일된 콘텐츠 반환
274
- compiled = "\n\n".join(all_content)
275
-
276
- # 키워드 기반 소개
277
- if language == "Korean":
278
- intro = f"### '{keyword}'에 대한 종합적인 정보와 최신 동향:\n\n"
279
- else:
280
- intro = f"### Comprehensive information and latest trends about '{keyword}':\n\n"
281
-
282
- return intro + compiled
283
-
284
-
285
- class UnifiedAudioConverter:
286
- def __init__(self, config: ConversationConfig):
287
- self.config = config
288
- self.llm_client = None
289
- self.legacy_local_model = None
290
- self.legacy_tokenizer = None
291
- # 새로운 로컬 LLM 관련
292
- self.local_llm = None
293
- self.local_llm_model = None
294
- self.melo_models = None
295
- self.spark_model_dir = None
296
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
297
- # 프롬프트 빌더 추가
298
- self.prompt_builder = PromptBuilder()
299
-
300
- def initialize_api_mode(self, api_key: str):
301
- """Initialize API mode with Together API"""
302
- self.llm_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")
303
-
304
- @spaces.GPU(duration=120)
305
- def initialize_local_mode(self):
306
- """Initialize new local mode with Llama CPP"""
307
- if not LLAMA_CPP_AVAILABLE:
308
- raise RuntimeError("Llama CPP dependencies not available. Please install llama-cpp-python and llama-cpp-agent.")
309
-
310
- if self.local_llm is None or self.local_llm_model != self.config.local_model_name:
311
- try:
312
- # 모델 다운로드
313
- model_path = hf_hub_download(
314
- repo_id=self.config.local_model_repo,
315
- filename=self.config.local_model_name,
316
- local_dir="./models"
317
- )
318
-
319
- model_path_local = os.path.join("./models", self.config.local_model_name)
320
-
321
- if not os.path.exists(model_path_local):
322
- raise RuntimeError(f"Model file not found at {model_path_local}")
323
-
324
- # Llama 모델 초기화
325
- self.local_llm = Llama(
326
- model_path=model_path_local,
327
- flash_attn=True,
328
- n_gpu_layers=81 if torch.cuda.is_available() else 0,
329
- n_batch=1024,
330
- n_ctx=16384,
331
- )
332
- self.local_llm_model = self.config.local_model_name
333
- print(f"Local LLM initialized: {model_path_local}")
334
-
335
- except Exception as e:
336
- print(f"Failed to initialize local LLM: {e}")
337
- raise RuntimeError(f"Failed to initialize local LLM: {e}")
338
-
339
- @spaces.GPU(duration=60)
340
- def initialize_legacy_local_mode(self):
341
- """Initialize legacy local mode with Hugging Face model (fallback)"""
342
- if self.legacy_local_model is None:
343
- quantization_config = BitsAndBytesConfig(
344
- load_in_4bit=True,
345
- bnb_4bit_compute_dtype=torch.float16
346
- )
347
- self.legacy_local_model = AutoModelForCausalLM.from_pretrained(
348
- self.config.legacy_local_model_name,
349
- quantization_config=quantization_config
350
- )
351
- self.legacy_tokenizer = AutoTokenizer.from_pretrained(
352
- self.config.legacy_local_model_name,
353
- revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
354
- )
355
-
356
- def initialize_spark_tts(self):
357
- """Initialize Spark TTS model by downloading if needed"""
358
- if not SPARK_AVAILABLE:
359
- raise RuntimeError("Spark TTS dependencies not available")
360
 
361
- model_dir = "pretrained_models/Spark-TTS-0.5B"
 
 
362
 
363
- # Check if model exists, if not download it
364
- if not os.path.exists(model_dir):
365
- print("Downloading Spark-TTS model...")
366
- try:
367
- os.makedirs("pretrained_models", exist_ok=True)
368
- snapshot_download(
369
- "SparkAudio/Spark-TTS-0.5B",
370
- local_dir=model_dir
371
- )
372
- print("Spark-TTS model downloaded successfully")
373
- except Exception as e:
374
- raise RuntimeError(f"Failed to download Spark-TTS model: {e}")
375
 
376
- self.spark_model_dir = model_dir
 
377
 
378
- # Check if we have the CLI inference script
379
- if not os.path.exists("cli/inference.py"):
380
- print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")
381
-
382
- @spaces.GPU(duration=60)
383
- def initialize_melo_tts(self):
384
- """Initialize MeloTTS models"""
385
- if MELO_AVAILABLE and self.melo_models is None:
386
- self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
387
-
388
- def fetch_text(self, url: str) -> str:
389
- """Fetch text content from URL"""
390
- if not url:
391
- raise ValueError("URL cannot be empty")
392
-
393
- if not url.startswith("http://") and not url.startswith("https://"):
394
- raise ValueError("URL must start with 'http://' or 'https://'")
395
-
396
- full_url = f"{self.config.prefix_url}{url}"
397
- try:
398
- response = httpx.get(full_url, timeout=60.0)
399
- response.raise_for_status()
400
- return response.text
401
- except httpx.HTTPError as e:
402
- raise RuntimeError(f"Failed to fetch URL: {e}")
403
-
404
- def extract_text_from_pdf(self, pdf_file) -> str:
405
- """Extract text content from PDF file"""
406
- try:
407
- # Gradio returns file path, not file object
408
- if isinstance(pdf_file, str):
409
- pdf_path = pdf_file
410
- else:
411
- # If it's a file object (shouldn't happen with Gradio)
412
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
413
- tmp_file.write(pdf_file.read())
414
- pdf_path = tmp_file.name
415
-
416
- # PDF 로드 및 텍스트 추출
417
- loader = PyPDFLoader(pdf_path)
418
- pages = loader.load()
419
-
420
- # 모든 페이지의 텍스트를 결합
421
- text = "\n".join([page.page_content for page in pages])
422
-
423
- # 임시 파일인 경우 삭제
424
- if not isinstance(pdf_file, str) and os.path.exists(pdf_path):
425
- os.unlink(pdf_path)
426
-
427
- return text
428
- except Exception as e:
429
- raise RuntimeError(f"Failed to extract text from PDF: {e}")
430
-
431
- def _get_messages_formatter_type(self, model_name):
432
- """Get appropriate message formatter for the model"""
433
- if "Mistral" in model_name or "BitSix" in model_name:
434
- return MessagesFormatterType.CHATML
435
- else:
436
- return MessagesFormatterType.LLAMA_3
437
-
438
- @spaces.GPU(duration=120)
439
- def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
440
- """Extract conversation using new local LLM with enhanced professional style"""
441
- try:
442
- # 검색 컨텍스트 생성 (키워드 기반이 아닌 경우)
443
- search_context = ""
444
- if BRAVE_KEY and not text.startswith("Keyword-based content:"):
445
- try:
446
- keywords = extract_keywords_for_search(text, language)
447
- if keywords:
448
- search_query = keywords[0] if language == "Korean" else f"{keywords[0]} latest news"
449
- search_context = format_search_results(search_query)
450
- print(f"Search context added for: {search_query}")
451
- except Exception as e:
452
- print(f"Search failed, continuing without context: {e}")
453
-
454
- # 먼저 새로운 로컬 LLM 시도
455
- self.initialize_local_mode()
456
-
457
- chat_template = self._get_messages_formatter_type(self.config.local_model_name)
458
- provider = LlamaCppPythonProvider(self.local_llm)
459
-
460
- # 언어별 시스템 메시지
461
- system_messages = {
462
- "Korean": (
463
- "당신은 한국의 유명 팟캐스트 전문 작가입니다. "
464
- "청취자들이 깊이 있는 전문 지식을 얻을 수 있는 고품질 대담을 한국어로 만듭니다. "
465
- "반드시 서로 존댓말을 사용하며, 12회의 대화 교환으로 구성하세요. "
466
- "모든 대화는 반드시 한국어로 작성하고 JSON 형식으로만 응답하세요."
467
- ),
468
- "Japanese": (
469
- "あなたは日本の有名なポッドキャスト専門作家です。"
470
- "聴衆が深い専門知識を得られる高品質な対談を日本語で作成します。"
471
- "必ずお互いに丁寧語を使用し、12回の対話交換で構成してください。"
472
- "すべての対話は必ず日本語で作成し、JSON形式でのみ回答してください。"
473
- ),
474
- "French": (
475
- "Vous êtes un célèbre scénariste de podcast professionnel français. "
476
- "Créez des discussions de haute qualité en français qui donnent au public "
477
- "des connaissances professionnelles approfondies. "
478
- "Créez exactement 12 échanges de conversation et répondez uniquement en format JSON."
479
- ),
480
- "German": (
481
- "Sie sind ein berühmter professioneller Podcast-Drehbuchautor aus Deutschland. "
482
- "Erstellen Sie hochwertige Diskussionen auf Deutsch, die dem Publikum "
483
- "tiefgreifendes Fachwissen vermitteln. "
484
- "Erstellen Sie genau 12 Gesprächsaustausche und antworten Sie nur im JSON-Format."
485
- ),
486
- "Spanish": (
487
- "Eres un famoso guionista de podcast profesional español. "
488
- "Crea discusiones de alta calidad en español que brinden al público "
489
- "conocimientos profesionales profundos. "
490
- "Crea exactamente 12 intercambios de conversación y responde solo en formato JSON."
491
- ),
492
- "Chinese": (
493
- "您是中国著名的专业播客编剧。"
494
- "创建高质量的中文讨论,为观众提供深入的专业知识。"
495
- "创建恰好12次对话交换,仅以JSON格式回答。"
496
- ),
497
- "Russian": (
498
- "Вы известный профессиональный сценарист подкастов из России. "
499
- "Создавайте высококачественные дискуссии на русском языке, которые дают аудитории "
500
- "глубокие профессиональные знания. "
501
- "Создайте ровно 12 обменов разговором и отвечайте только в формате JSON."
502
- )
503
- }
504
-
505
- system_message = system_messages.get(language,
506
- f"You are a professional podcast scriptwriter creating high-quality, "
507
- f"insightful discussions in {language}. Create exactly 12 conversation exchanges "
508
- f"with professional expertise. All dialogue must be in {language}. "
509
- f"Respond only in JSON format."
510
- )
511
-
512
- agent = LlamaCppAgent(
513
- provider,
514
- system_prompt=system_message,
515
- predefined_messages_formatter_type=chat_template,
516
- debug_output=False
517
- )
518
-
519
- settings = provider.get_provider_default_settings()
520
- settings.temperature = 0.75
521
- settings.top_k = 40
522
- settings.top_p = 0.95
523
- settings.max_tokens = self.config.max_tokens
524
- settings.repeat_penalty = 1.1
525
- settings.stream = False
526
-
527
- messages = BasicChatHistory()
528
-
529
- prompt = self.prompt_builder.build_prompt(text, language, search_context)
530
- response = agent.get_chat_response(
531
- prompt,
532
- llm_sampling_settings=settings,
533
- chat_history=messages,
534
- returns_streaming_generator=False,
535
- print_output=False
536
- )
537
-
538
- # JSON 파싱
539
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
540
- json_match = re.search(pattern, response)
541
-
542
- if json_match:
543
- conversation_data = json.loads(json_match.group())
544
- return conversation_data
545
- else:
546
- raise ValueError("No valid JSON found in local LLM response")
547
-
548
- except Exception as e:
549
- print(f"Local LLM failed: {e}, falling back to legacy local method")
550
- return self.extract_conversation_legacy_local(text, language, progress, search_context)
551
-
552
- @spaces.GPU(duration=120)
553
- def extract_conversation_legacy_local(self, text: str, language: str = "English", progress=None, search_context: str = "") -> Dict:
554
- """Extract conversation using legacy local model"""
555
  try:
556
- self.initialize_legacy_local_mode()
557
-
558
- # 언어별 시스템 메시지는 config_prompts에서 가져옴
559
- messages = self.prompt_builder.build_messages_for_local(text, language, search_context)
560
-
561
- terminators = [
562
- self.legacy_tokenizer.eos_token_id,
563
- self.legacy_tokenizer.convert_tokens_to_ids("<|eot_id|>")
564
- ]
565
-
566
- chat_messages = self.legacy_tokenizer.apply_chat_template(
567
- messages, tokenize=False, add_generation_prompt=True
568
- )
569
- model_inputs = self.legacy_tokenizer([chat_messages], return_tensors="pt").to(self.device)
570
-
571
- streamer = TextIteratorStreamer(
572
- self.legacy_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
573
- )
574
-
575
- generate_kwargs = dict(
576
- model_inputs,
577
- streamer=streamer,
578
- max_new_tokens=self.config.max_new_tokens,
579
- do_sample=True,
580
- temperature=0.75,
581
- eos_token_id=terminators,
582
- )
583
-
584
- t = Thread(target=self.legacy_local_model.generate, kwargs=generate_kwargs)
585
- t.start()
586
-
587
- partial_text = ""
588
- for new_text in streamer:
589
- partial_text += new_text
590
-
591
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
592
- json_match = re.search(pattern, partial_text)
593
-
594
- if json_match:
595
- return json.loads(json_match.group())
596
- else:
597
- raise ValueError("No valid JSON found in legacy local response")
598
-
599
- except Exception as e:
600
- print(f"Legacy local model also failed: {e}")
601
- return DefaultConversations.get_conversation(language)
602
-
603
- def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
604
- """Extract conversation using API"""
605
- if not self.llm_client:
606
- raise RuntimeError("API mode not initialized")
607
-
608
- try:
609
- # 검색 컨텍스트 생성
610
- search_context = ""
611
- if BRAVE_KEY and not text.startswith("Keyword-based content:"):
612
- try:
613
- keywords = extract_keywords_for_search(text, language)
614
- if keywords:
615
- search_query = keywords[0] if language == "Korean" else f"{keywords[0]} latest news"
616
- search_context = format_search_results(search_query)
617
- print(f"Search context added for: {search_query}")
618
- except Exception as e:
619
- print(f"Search failed, continuing without context: {e}")
620
-
621
- # 메시지 빌드
622
- messages = self.prompt_builder.build_messages_for_local(text, language, search_context)
623
-
624
- chat_completion = self.llm_client.chat.completions.create(
625
- messages=messages,
626
- model=self.config.api_model_name,
627
- temperature=0.75,
628
- )
629
-
630
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
631
- json_match = re.search(pattern, chat_completion.choices[0].message.content)
632
-
633
- if not json_match:
634
- raise ValueError("No valid JSON found in response")
635
-
636
- return json.loads(json_match.group())
637
- except Exception as e:
638
- raise RuntimeError(f"Failed to extract conversation: {e}")
639
-
640
- def parse_conversation_text(self, conversation_text: str) -> Dict:
641
- """Parse conversation text back to JSON format"""
642
- lines = conversation_text.strip().split('\n')
643
- conversation_data = {"conversation": []}
644
-
645
- for line in lines:
646
- if ':' in line:
647
- speaker, text = line.split(':', 1)
648
- conversation_data["conversation"].append({
649
- "speaker": speaker.strip(),
650
- "text": text.strip()
651
- })
652
-
653
- return conversation_data
654
-
655
- async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
656
- """Convert text to speech using Edge TTS"""
657
- output_dir = Path(self._create_output_directory())
658
- filenames = []
659
-
660
- try:
661
- # 언어별 음성 설정
662
- voices = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["English"])
663
-
664
- for i, turn in enumerate(conversation_json["conversation"]):
665
- filename = output_dir / f"output_{i}.wav"
666
- voice = voices[i % len(voices)]
667
-
668
- tmp_path = await self._generate_audio_edge(turn["text"], voice)
669
- os.rename(tmp_path, filename)
670
- filenames.append(str(filename))
671
-
672
- # Combine audio files
673
- final_output = os.path.join(output_dir, "combined_output.wav")
674
- self._combine_audio_files(filenames, final_output)
675
-
676
- # Generate conversation text
677
- conversation_text = "\n".join(
678
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
679
- for i, turn in enumerate(conversation_json["conversation"])
680
- )
681
-
682
- return final_output, conversation_text
683
- except Exception as e:
684
- raise RuntimeError(f"Failed to convert text to speech: {e}")
685
-
686
- async def _generate_audio_edge(self, text: str, voice: str) -> str:
687
- """Generate audio using Edge TTS"""
688
- if not text.strip():
689
- raise ValueError("Text cannot be empty")
690
-
691
- voice_short_name = voice.split(" - ")[0] if " - " in voice else voice
692
- communicate = edge_tts.Communicate(text, voice_short_name)
693
-
694
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
695
- tmp_path = tmp_file.name
696
- await communicate.save(tmp_path)
697
-
698
- return tmp_path
699
-
700
- @spaces.GPU(duration=60)
701
- def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
702
- """Convert text to speech using Spark TTS CLI"""
703
- if not SPARK_AVAILABLE or not self.spark_model_dir:
704
- raise RuntimeError("Spark TTS not available")
705
-
706
- try:
707
- output_dir = self._create_output_directory()
708
- audio_files = []
709
-
710
- # Create different voice characteristics for different speakers
711
- speaker1, speaker2 = self.prompt_builder.get_speaker_names(language)
712
-
713
- if language == "Korean":
714
- voice_configs = [
715
- {"prompt_text": f"안녕하세요, 오늘 팟캐스트 진행을 맡은 {speaker1}입니다.", "gender": "male"},
716
- {"prompt_text": f"안녕하세요, 저는 오늘 이 주제에 대해 설명드릴 {speaker2}입니다.", "gender": "male"}
717
- ]
718
- else:
719
- voice_configs = [
720
- {"prompt_text": f"Hello everyone, I'm {speaker1}, your host for today's podcast.", "gender": "male"},
721
- {"prompt_text": f"Hi, I'm {speaker2}. I'm excited to share my insights with you.", "gender": "male"}
722
- ]
723
-
724
- for i, turn in enumerate(conversation_json["conversation"]):
725
- text = turn["text"]
726
- if not text.strip():
727
- continue
728
-
729
- voice_config = voice_configs[i % len(voice_configs)]
730
- output_file = os.path.join(output_dir, f"spark_output_{i}.wav")
731
-
732
- cmd = [
733
- "python", "-m", "cli.inference",
734
- "--text", text,
735
- "--device", "0" if torch.cuda.is_available() else "cpu",
736
- "--save_dir", output_dir,
737
- "--model_dir", self.spark_model_dir,
738
- "--prompt_text", voice_config["prompt_text"],
739
- "--output_name", f"spark_output_{i}.wav"
740
- ]
741
-
742
- try:
743
- result = subprocess.run(
744
- cmd,
745
- capture_output=True,
746
- text=True,
747
- timeout=60,
748
- cwd="."
749
- )
750
-
751
- if result.returncode == 0:
752
- audio_files.append(output_file)
753
- else:
754
- print(f"Spark TTS error for turn {i}: {result.stderr}")
755
- silence = np.zeros(int(22050 * 1.0))
756
- sf.write(output_file, silence, 22050)
757
- audio_files.append(output_file)
758
-
759
- except subprocess.TimeoutExpired:
760
- print(f"Spark TTS timeout for turn {i}")
761
- silence = np.zeros(int(22050 * 1.0))
762
- sf.write(output_file, silence, 22050)
763
- audio_files.append(output_file)
764
- except Exception as e:
765
- print(f"Error running Spark TTS for turn {i}: {e}")
766
- silence = np.zeros(int(22050 * 1.0))
767
- sf.write(output_file, silence, 22050)
768
- audio_files.append(output_file)
769
-
770
- # Combine all audio files
771
- if audio_files:
772
- final_output = os.path.join(output_dir, "spark_combined.wav")
773
- self._combine_audio_files(audio_files, final_output)
774
- else:
775
- raise RuntimeError("No audio files generated")
776
-
777
- conversation_text = "\n".join(
778
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
779
- for i, turn in enumerate(conversation_json["conversation"])
780
- )
781
-
782
- return final_output, conversation_text
783
-
784
- except Exception as e:
785
- raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
786
-
787
- @spaces.GPU(duration=60)
788
- def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
789
- """Convert text to speech using MeloTTS"""
790
- if not MELO_AVAILABLE or not self.melo_models:
791
- raise RuntimeError("MeloTTS not available")
792
-
793
- speakers = ["EN-Default", "EN-US"]
794
- combined_audio = AudioSegment.empty()
795
-
796
- for i, turn in enumerate(conversation_json["conversation"]):
797
- bio = io.BytesIO()
798
- text = turn["text"]
799
- speaker = speakers[i % 2]
800
- speaker_id = self.melo_models["EN"].hps.data.spk2id[speaker]
801
-
802
- self.melo_models["EN"].tts_to_file(
803
- text, speaker_id, bio, speed=1.0,
804
- pbar=progress.tqdm if progress else None,
805
- format="wav"
806
- )
807
-
808
- bio.seek(0)
809
- audio_segment = AudioSegment.from_file(bio, format="wav")
810
- combined_audio += audio_segment
811
-
812
- final_audio_path = "melo_podcast.mp3"
813
- combined_audio.export(final_audio_path, format="mp3")
814
-
815
- conversation_text = "\n".join(
816
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
817
- for i, turn in enumerate(import spaces
818
- import gradio as gr
819
- import os
820
- import asyncio
821
- import torch
822
- import io
823
- import json
824
- import re
825
- import httpx
826
- import tempfile
827
- import wave
828
- import base64
829
- import numpy as np
830
- import soundfile as sf
831
- import subprocess
832
- import shutil
833
- import requests
834
- import logging
835
- from datetime import datetime, timedelta
836
- from typing import List, Tuple, Dict, Optional
837
- from pathlib import Path
838
- from threading import Thread
839
- from dotenv import load_dotenv
840
-
841
- # PDF processing imports
842
- from langchain_community.document_loaders import PyPDFLoader
843
-
844
- # Edge TTS imports
845
- import edge_tts
846
- from pydub import AudioSegment
847
-
848
- # OpenAI imports
849
- from openai import OpenAI
850
-
851
- # Transformers imports (for legacy local mode)
852
- from transformers import (
853
- AutoModelForCausalLM,
854
- AutoTokenizer,
855
- TextIteratorStreamer,
856
- BitsAndBytesConfig,
857
- )
858
-
859
- # Llama CPP imports (for new local mode)
860
- try:
861
- from llama_cpp import Llama
862
- from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
863
- from llama_cpp_agent.providers import LlamaCppPythonProvider
864
- from llama_cpp_agent.chat_history import BasicChatHistory
865
- from llama_cpp_agent.chat_history.messages import Roles
866
- from huggingface_hub import hf_hub_download
867
- LLAMA_CPP_AVAILABLE = True
868
- except ImportError:
869
- LLAMA_CPP_AVAILABLE = False
870
-
871
- # Spark TTS imports
872
- try:
873
- from huggingface_hub import snapshot_download
874
- SPARK_AVAILABLE = True
875
- except:
876
- SPARK_AVAILABLE = False
877
-
878
- # MeloTTS imports (for local mode)
879
- try:
880
- # unidic 다운로드를 조건부로 처리
881
- if not os.path.exists("/usr/local/lib/python3.10/site-packages/unidic"):
882
- try:
883
- os.system("python -m unidic download")
884
  except:
885
  pass
886
- from melo.api import TTS as MeloTTS
887
- MELO_AVAILABLE = True
888
- except:
889
- MELO_AVAILABLE = False
890
-
891
- # Import config and prompts
892
- from config_prompts import (
893
- ConversationConfig,
894
- PromptBuilder,
895
- DefaultConversations,
896
- EDGE_TTS_ONLY_LANGUAGES,
897
- EDGE_TTS_VOICES
898
- )
899
-
900
- load_dotenv()
901
-
902
- # Brave Search API 설정
903
- BRAVE_KEY = os.getenv("BSEARCH_API")
904
- BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
905
-
906
-
907
- def brave_search(query: str, count: int = 8, freshness_days: int | None = None):
908
- """Brave Search API를 사용하여 최신 정보 검색"""
909
- if not BRAVE_KEY:
910
- return []
911
- params = {"q": query, "count": str(count)}
912
- if freshness_days:
913
- dt_from = (datetime.utcnow() - timedelta(days=freshness_days)).strftime("%Y-%m-%d")
914
- params["freshness"] = dt_from
915
- try:
916
- r = requests.get(
917
- BRAVE_ENDPOINT,
918
- headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_KEY},
919
- params=params,
920
- timeout=15
921
- )
922
- raw = r.json().get("web", {}).get("results") or []
923
- return [{
924
- "title": r.get("title", ""),
925
- "url": r.get("url", r.get("link", "")),
926
- "snippet": r.get("description", r.get("text", "")),
927
- "host": re.sub(r"https?://(www\.)?", "", r.get("url", "")).split("/")[0]
928
- } for r in raw[:count]]
929
- except Exception as e:
930
- logging.error(f"Brave search error: {e}")
931
- return []
932
-
933
-
934
- def format_search_results(query: str, for_keyword: bool = False) -> str:
935
- """검색 결과를 포맷팅하여 반환"""
936
- # 키워드 검색의 경우 더 많은 결과 사용
937
- count = 5 if for_keyword else 3
938
- rows = brave_search(query, count, freshness_days=7 if not for_keyword else None)
939
- if not rows:
940
- return ""
941
-
942
- results = []
943
- # 키워드 검색의 경우 더 상세한 정보 포함
944
- max_results = 4 if for_keyword else 2
945
- for r in rows[:max_results]:
946
- if for_keyword:
947
- # 키워드 검색은 더 긴 스니펫 사용
948
- snippet = r['snippet'][:200] + "..." if len(r['snippet']) > 200 else r['snippet']
949
- results.append(f"**{r['title']}**\n{snippet}\nSource: {r['host']}")
950
- else:
951
- # 일반 검색은 짧은 스니펫
952
- snippet = r['snippet'][:100] + "..." if len(r['snippet']) > 100 else r['snippet']
953
- results.append(f"- {r['title']}: {snippet}")
954
-
955
- return "\n\n".join(results) + "\n"
956
-
957
-
958
- def extract_keywords_for_search(text: str, language: str = "English") -> List[str]:
959
- """텍스트에서 검색할 키워드 추출 (개선)"""
960
- # 텍스트 앞부분만 사용 (너무 많은 텍스트 처리 방지)
961
- text_sample = text[:500]
962
-
963
- if language == "Korean":
964
- import re
965
- # 한국어 명사 추출 (2글자 이상)
966
- keywords = re.findall(r'[가-힣]{2,}', text_sample)
967
- # 중복 제거하고 가장 긴 단어 1개만 선택
968
- unique_keywords = list(dict.fromkeys(keywords))
969
- # 길이 순으로 정렬하고 가장 의미있을 것 같은 단어 선택
970
- unique_keywords.sort(key=len, reverse=True)
971
- return unique_keywords[:1] # 1개만 반환
972
- else:
973
- # 영어는 대문자로 시작하는 단어 중 가장 긴 것 1개
974
- words = text_sample.split()
975
- keywords = [word.strip('.,!?;:') for word in words
976
- if len(word) > 4 and word[0].isupper()]
977
- if keywords:
978
- return [max(keywords, key=len)] # 가장 긴 단어 1개
979
- return []
980
-
981
-
982
- def search_and_compile_content(keyword: str, language: str = "English") -> str:
983
- """키워드로 검색하여 충분한 콘텐츠 컴파일"""
984
- if not BRAVE_KEY:
985
- # API 없을 때도 기본 콘텐츠 생성
986
- if language == "Korean":
987
- return f"""
988
- '{keyword}'에 대한 종합적인 정보:
989
-
990
- {keyword}는 현대 사회에서 매우 중요한 주제입니다.
991
- 이 주제는 다양한 측면에서 우리의 삶에 영향을 미치고 있으며,
992
- 최근 들어 더��� 주목받고 있습니다.
993
-
994
- 주요 특징:
995
- 1. 기술적 발전과 혁신
996
- 2. 사회적 영향과 변화
997
- 3. 미래 전망과 가능성
998
- 4. 실용적 활용 방안
999
- 5. 글로벌 트렌드와 동향
1000
-
1001
- 전문가들은 {keyword}가 앞으로 더욱 중요해질 것으로 예상하고 있으며,
1002
- 이에 대한 깊이 있는 이해가 필요한 시점입니다.
1003
- """
1004
- else:
1005
- return f"""
1006
- Comprehensive information about '{keyword}':
1007
-
1008
- {keyword} is a significant topic in modern society.
1009
- This subject impacts our lives in various ways and has been
1010
- gaining increasing attention recently.
1011
-
1012
- Key aspects:
1013
- 1. Technological advancement and innovation
1014
- 2. Social impact and changes
1015
- 3. Future prospects and possibilities
1016
- 4. Practical applications
1017
- 5. Global trends and developments
1018
-
1019
- Experts predict that {keyword} will become even more important,
1020
- and it's crucial to develop a deep understanding of this topic.
1021
- """
1022
-
1023
- # 언어에 따른 다양한 검색 쿼리
1024
- if language == "Korean":
1025
- queries = [
1026
- f"{keyword} 최신 뉴스 2024",
1027
- f"{keyword} 정보 설명",
1028
- f"{keyword} 트렌드 전망",
1029
- f"{keyword} 장점 단점",
1030
- f"{keyword} 활용 방법",
1031
- f"{keyword} 전문가 의견"
1032
- ]
1033
- else:
1034
- queries = [
1035
- f"{keyword} latest news 2024",
1036
- f"{keyword} explained comprehensive",
1037
- f"{keyword} trends forecast",
1038
- f"{keyword} advantages disadvantages",
1039
- f"{keyword} how to use",
1040
- f"{keyword} expert opinions"
1041
- ]
1042
-
1043
- all_content = []
1044
- total_content_length = 0
1045
-
1046
- for query in queries:
1047
- results = brave_search(query, count=5) # 더 많은 결과 가져오기
1048
- for r in results[:3]: # 각 쿼리당 상위 3개
1049
- content = f"**{r['title']}**\n{r['snippet']}\nSource: {r['host']}\n"
1050
- all_content.append(content)
1051
- total_content_length += len(r['snippet'])
1052
-
1053
- # 콘텐츠가 부족하면 추가 생성
1054
- if total_content_length < 1000: # 최소 1000자 확보
1055
- if language == "Korean":
1056
- additional_content = f"""
1057
- 추가 정보:
1058
- {keyword}와 관련된 최근 동향을 살펴보면, 이 분야는 빠르게 발전하고 있습니다.
1059
- 많은 전문가들이 이 주제에 대해 활발히 연구하고 있으며,
1060
- 실생활에서의 응용 가능성도 계속 확대되고 있습니다.
1061
-
1062
- 특히 주목할 점은:
1063
- - 기술 혁신의 가속화
1064
- - 사용자 경험의 개선
1065
- - 접근성의 향상
1066
- - 비용 효율성 증대
1067
- - 글로벌 시장의 성장
1068
-
1069
- 이러한 요소들이 {keyword}의 미래를 더욱 밝게 만들고 있습니다.
1070
- """
1071
- else:
1072
- additional_content = f"""
1073
- Additional insights:
1074
- Recent developments in {keyword} show rapid advancement in this field.
1075
- Many experts are actively researching this topic, and its practical
1076
- applications continue to expand.
1077
-
1078
- Key points to note:
1079
- - Accelerating technological innovation
1080
- - Improving user experience
1081
- - Enhanced accessibility
1082
- - Increased cost efficiency
1083
- - Growing global market
1084
-
1085
- These factors are making the future of {keyword} increasingly promising.
1086
- """
1087
- all_content.append(additional_content)
1088
-
1089
- # 컴파일된 콘텐츠 반환
1090
- compiled = "\n\n".join(all_content)
1091
-
1092
- # 키워드 기반 소개
1093
- if language == "Korean":
1094
- intro = f"### '{keyword}'에 대한 종합적인 정보와 최신 동향:\n\n"
1095
- else:
1096
- intro = f"### Comprehensive information and latest trends about '{keyword}':\n\n"
1097
-
1098
- return intro + compiled
1099
-
1100
-
1101
- class UnifiedAudioConverter:
1102
- def __init__(self, config: ConversationConfig):
1103
- self.config = config
1104
- self.llm_client = None
1105
- self.legacy_local_model = None
1106
- self.legacy_tokenizer = None
1107
- # 새로운 로컬 LLM 관련
1108
- self.local_llm = None
1109
- self.local_llm_model = None
1110
- self.melo_models = None
1111
- self.spark_model_dir = None
1112
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
1113
- # 프롬프트 빌더 추가
1114
- self.prompt_builder = PromptBuilder()
1115
-
1116
- def initialize_api_mode(self, api_key: str):
1117
- """Initialize API mode with Together API"""
1118
- self.llm_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")
1119
-
1120
- @spaces.GPU(duration=120)
1121
- def initialize_local_mode(self):
1122
- """Initialize new local mode with Llama CPP"""
1123
- if not LLAMA_CPP_AVAILABLE:
1124
- raise RuntimeError("Llama CPP dependencies not available. Please install llama-cpp-python and llama-cpp-agent.")
1125
-
1126
- if self.local_llm is None or self.local_llm_model != self.config.local_model_name:
1127
- try:
1128
- # 모델 다운로드
1129
- model_path = hf_hub_download(
1130
- repo_id=self.config.local_model_repo,
1131
- filename=self.config.local_model_name,
1132
- local_dir="./models"
1133
- )
1134
-
1135
- model_path_local = os.path.join("./models", self.config.local_model_name)
1136
-
1137
- if not os.path.exists(model_path_local):
1138
- raise RuntimeError(f"Model file not found at {model_path_local}")
1139
-
1140
- # Llama 모델 초기화
1141
- self.local_llm = Llama(
1142
- model_path=model_path_local,
1143
- flash_attn=True,
1144
- n_gpu_layers=81 if torch.cuda.is_available() else 0,
1145
- n_batch=1024,
1146
- n_ctx=16384,
1147
- )
1148
- self.local_llm_model = self.config.local_model_name
1149
- print(f"Local LLM initialized: {model_path_local}")
1150
-
1151
- except Exception as e:
1152
- print(f"Failed to initialize local LLM: {e}")
1153
- raise RuntimeError(f"Failed to initialize local LLM: {e}")
1154
-
1155
- @spaces.GPU(duration=60)
1156
- def initialize_legacy_local_mode(self):
1157
- """Initialize legacy local mode with Hugging Face model (fallback)"""
1158
- if self.legacy_local_model is None:
1159
- quantization_config = BitsAndBytesConfig(
1160
- load_in_4bit=True,
1161
- bnb_4bit_compute_dtype=torch.float16
1162
- )
1163
- self.legacy_local_model = AutoModelForCausalLM.from_pretrained(
1164
- self.config.legacy_local_model_name,
1165
- quantization_config=quantization_config
1166
- )
1167
- self.legacy_tokenizer = AutoTokenizer.from_pretrained(
1168
- self.config.legacy_local_model_name,
1169
- revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
1170
- )
1171
-
1172
- def initialize_spark_tts(self):
1173
- """Initialize Spark TTS model by downloading if needed"""
1174
- if not SPARK_AVAILABLE:
1175
- raise RuntimeError("Spark TTS dependencies not available")
1176
-
1177
- model_dir = "pretrained_models/Spark-TTS-0.5B"
1178
-
1179
- # Check if model exists, if not download it
1180
- if not os.path.exists(model_dir):
1181
- print("Downloading Spark-TTS model...")
1182
- try:
1183
- os.makedirs("pretrained_models", exist_ok=True)
1184
- snapshot_download(
1185
- "SparkAudio/Spark-TTS-0.5B",
1186
- local_dir=model_dir
1187
- )
1188
- print("Spark-TTS model downloaded successfully")
1189
- except Exception as e:
1190
- raise RuntimeError(f"Failed to download Spark-TTS model: {e}")
1191
-
1192
- self.spark_model_dir = model_dir
1193
-
1194
- # Check if we have the CLI inference script
1195
- if not os.path.exists("cli/inference.py"):
1196
- print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")
1197
-
1198
- @spaces.GPU(duration=60)
1199
- def initialize_melo_tts(self):
1200
- """Initialize MeloTTS models"""
1201
- if MELO_AVAILABLE and self.melo_models is None:
1202
- self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
1203
-
1204
- def fetch_text(self, url: str) -> str:
1205
- """Fetch text content from URL"""
1206
- if not url:
1207
- raise ValueError("URL cannot be empty")
1208
-
1209
- if not url.startswith("http://") and not url.startswith("https://"):
1210
- raise ValueError("URL must start with 'http://' or 'https://'")
1211
-
1212
- full_url = f"{self.config.prefix_url}{url}"
1213
- try:
1214
- response = httpx.get(full_url, timeout=60.0)
1215
- response.raise_for_status()
1216
- return response.text
1217
- except httpx.HTTPError as e:
1218
- raise RuntimeError(f"Failed to fetch URL: {e}")
1219
-
1220
- def extract_text_from_pdf(self, pdf_file) -> str:
1221
- """Extract text content from PDF file"""
1222
- try:
1223
- # Gradio returns file path, not file object
1224
- if isinstance(pdf_file, str):
1225
- pdf_path = pdf_file
1226
- else:
1227
- # If it's a file object (shouldn't happen with Gradio)
1228
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
1229
- tmp_file.write(pdf_file.read())
1230
- pdf_path = tmp_file.name
1231
-
1232
- # PDF 로드 및 텍스트 추출
1233
- loader = PyPDFLoader(pdf_path)
1234
- pages = loader.load()
1235
-
1236
- # 모든 페이지의 텍스트를 결합
1237
- text = "\n".join([page.page_content for page in pages])
1238
-
1239
- # 임시 파일인 경우 삭제
1240
- if not isinstance(pdf_file, str) and os.path.exists(pdf_path):
1241
- os.unlink(pdf_path)
1242
-
1243
- return text
1244
- except Exception as e:
1245
- raise RuntimeError(f"Failed to extract text from PDF: {e}")
1246
-
1247
- def _get_messages_formatter_type(self, model_name):
1248
- """Get appropriate message formatter for the model"""
1249
- if "Mistral" in model_name or "BitSix" in model_name:
1250
- return MessagesFormatterType.CHATML
1251
- else:
1252
- return MessagesFormatterType.LLAMA_3
1253
-
1254
- @spaces.GPU(duration=120)
1255
- def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
1256
- """Extract conversation using new local LLM with enhanced professional style"""
1257
- try:
1258
- # 검색 컨텍스트 생성 (키워드 기반이 아닌 경우)
1259
- search_context = ""
1260
- if BRAVE_KEY and not text.startswith("Keyword-based content:"):
1261
- try:
1262
- keywords = extract_keywords_for_search(text, language)
1263
- if keywords:
1264
- search_query = keywords[0] if language == "Korean" else f"{keywords[0]} latest news"
1265
- search_context = format_search_results(search_query)
1266
- print(f"Search context added for: {search_query}")
1267
- except Exception as e:
1268
- print(f"Search failed, continuing without context: {e}")
1269
-
1270
- # 먼저 새로운 로컬 LLM 시도
1271
- self.initialize_local_mode()
1272
-
1273
- chat_template = self._get_messages_formatter_type(self.config.local_model_name)
1274
- provider = LlamaCppPythonProvider(self.local_llm)
1275
-
1276
- # 언어별 시스템 메시지
1277
- system_messages = {
1278
- "Korean": (
1279
- "당신은 한국의 유명 팟캐스트 전문 작가입니다. "
1280
- "청취자들이 깊이 있는 전문 지식을 얻을 수 있는 고품질 대담을 한국어로 만듭니다. "
1281
- "반드시 서로 존댓말을 사용하며, 12회의 대화 교환으로 구성하세요. "
1282
- "모든 대화는 반드시 한국어로 작성하고 JSON 형식으로만 응답하세요."
1283
- ),
1284
- "Japanese": (
1285
- "あなたは日本の有名なポッドキャスト専門作家です。"
1286
- "聴衆が深い専門知識を得られる高品質な対談を日本語で作成します。"
1287
- "必ずお互いに丁寧語を使用し、12回の対話交換で構成してください。"
1288
- "すべての対話は必ず日本語で作成し、JSON形式でのみ回答してください。"
1289
- ),
1290
- "French": (
1291
- "Vous êtes un célèbre scénariste de podcast professionnel français. "
1292
- "Créez des discussions de haute qualité en français qui donnent au public "
1293
- "des connaissances professionnelles approfondies. "
1294
- "Créez exactement 12 échanges de conversation et répondez uniquement en format JSON."
1295
- ),
1296
- "German": (
1297
- "Sie sind ein berühmter professioneller Podcast-Drehbuchautor aus Deutschland. "
1298
- "Erstellen Sie hochwertige Diskussionen auf Deutsch, die dem Publikum "
1299
- "tiefgreifendes Fachwissen vermitteln. "
1300
- "Erstellen Sie genau 12 Gesprächsaustausche und antworten Sie nur im JSON-Format."
1301
- ),
1302
- "Spanish": (
1303
- "Eres un famoso guionista de podcast profesional español. "
1304
- "Crea discusiones de alta calidad en español que brinden al público "
1305
- "conocimientos profesionales profundos. "
1306
- "Crea exactamente 12 intercambios de conversación y responde solo en formato JSON."
1307
- ),
1308
- "Chinese": (
1309
- "您是中国著名的专业播客编剧。"
1310
- "创建高质量的中文讨论,为观众提供深入的专业知识。"
1311
- "创建恰好12次对话交换,仅以JSON格式回答。"
1312
- ),
1313
- "Russian": (
1314
- "Вы известный профессиональный сценарист подкастов из России. "
1315
- "Создавайте высококачественные дискуссии на русском языке, которые дают аудитории "
1316
- "глубокие профессиональные знания. "
1317
- "Создайте ровно 12 обменов разговором и отвечайте только в формате JSON."
1318
- )
1319
- }
1320
-
1321
- system_message = system_messages.get(language,
1322
- f"You are a professional podcast scriptwriter creating high-quality, "
1323
- f"insightful discussions in {language}. Create exactly 12 conversation exchanges "
1324
- f"with professional expertise. All dialogue must be in {language}. "
1325
- f"Respond only in JSON format."
1326
- )
1327
-
1328
- agent = LlamaCppAgent(
1329
- provider,
1330
- system_prompt=system_message,
1331
- predefined_messages_formatter_type=chat_template,
1332
- debug_output=False
1333
- )
1334
-
1335
- settings = provider.get_provider_default_settings()
1336
- settings.temperature = 0.75
1337
- settings.top_k = 40
1338
- settings.top_p = 0.95
1339
- settings.max_tokens = self.config.max_tokens
1340
- settings.repeat_penalty = 1.1
1341
- settings.stream = False
1342
-
1343
- messages = BasicChatHistory()
1344
-
1345
- prompt = self.prompt_builder.build_prompt(text, language, search_context)
1346
- response = agent.get_chat_response(
1347
- prompt,
1348
- llm_sampling_settings=settings,
1349
- chat_history=messages,
1350
- returns_streaming_generator=False,
1351
- print_output=False
1352
- )
1353
-
1354
- # JSON 파싱
1355
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
1356
- json_match = re.search(pattern, response)
1357
-
1358
- if json_match:
1359
- conversation_data = json.loads(json_match.group())
1360
- return conversation_data
1361
- else:
1362
- raise ValueError("No valid JSON found in local LLM response")
1363
-
1364
- except Exception as e:
1365
- print(f"Local LLM failed: {e}, falling back to legacy local method")
1366
- return self.extract_conversation_legacy_local(text, language, progress, search_context)
1367
-
1368
- @spaces.GPU(duration=120)
1369
- def extract_conversation_legacy_local(self, text: str, language: str = "English", progress=None, search_context: str = "") -> Dict:
1370
- """Extract conversation using legacy local model"""
1371
- try:
1372
- self.initialize_legacy_local_mode()
1373
-
1374
- # 언어별 시스템 메시지는 config_prompts에서 가져옴
1375
- messages = self.prompt_builder.build_messages_for_local(text, language, search_context)
1376
-
1377
- terminators = [
1378
- self.legacy_tokenizer.eos_token_id,
1379
- self.legacy_tokenizer.convert_tokens_to_ids("<|eot_id|>")
1380
- ]
1381
-
1382
- chat_messages = self.legacy_tokenizer.apply_chat_template(
1383
- messages, tokenize=False, add_generation_prompt=True
1384
- )
1385
- model_inputs = self.legacy_tokenizer([chat_messages], return_tensors="pt").to(self.device)
1386
-
1387
- streamer = TextIteratorStreamer(
1388
- self.legacy_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
1389
- )
1390
-
1391
- generate_kwargs = dict(
1392
- model_inputs,
1393
- streamer=streamer,
1394
- max_new_tokens=self.config.max_new_tokens,
1395
- do_sample=True,
1396
- temperature=0.75,
1397
- eos_token_id=terminators,
1398
- )
1399
-
1400
- t = Thread(target=self.legacy_local_model.generate, kwargs=generate_kwargs)
1401
- t.start()
1402
-
1403
- partial_text = ""
1404
- for new_text in streamer:
1405
- partial_text += new_text
1406
-
1407
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
1408
- json_match = re.search(pattern, partial_text)
1409
-
1410
- if json_match:
1411
- return json.loads(json_match.group())
1412
- else:
1413
- raise ValueError("No valid JSON found in legacy local response")
1414
-
1415
- except Exception as e:
1416
- print(f"Legacy local model also failed: {e}")
1417
- return DefaultConversations.get_conversation(language)
1418
-
1419
- def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
1420
- """Extract conversation using API"""
1421
- if not self.llm_client:
1422
- raise RuntimeError("API mode not initialized")
1423
-
1424
- try:
1425
- # 검색 컨텍스트 생성
1426
- search_context = ""
1427
- if BRAVE_KEY and not text.startswith("Keyword-based content:"):
1428
- try:
1429
- keywords = extract_keywords_for_search(text, language)
1430
- if keywords:
1431
- search_query = keywords[0] if language == "Korean" else f"{keywords[0]} latest news"
1432
- search_context = format_search_results(search_query)
1433
- print(f"Search context added for: {search_query}")
1434
- except Exception as e:
1435
- print(f"Search failed, continuing without context: {e}")
1436
-
1437
- # 메시지 빌드
1438
- messages = self.prompt_builder.build_messages_for_local(text, language, search_context)
1439
-
1440
- chat_completion = self.llm_client.chat.completions.create(
1441
- messages=messages,
1442
- model=self.config.api_model_name,
1443
- temperature=0.75,
1444
- )
1445
-
1446
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
1447
- json_match = re.search(pattern, chat_completion.choices[0].message.content)
1448
-
1449
- if not json_match:
1450
- raise ValueError("No valid JSON found in response")
1451
-
1452
- return json.loads(json_match.group())
1453
- except Exception as e:
1454
- raise RuntimeError(f"Failed to extract conversation: {e}")
1455
-
1456
- def parse_conversation_text(self, conversation_text: str) -> Dict:
1457
- """Parse conversation text back to JSON format"""
1458
- lines = conversation_text.strip().split('\n')
1459
- conversation_data = {"conversation": []}
1460
-
1461
- for line in lines:
1462
- if ':' in line:
1463
- speaker, text = line.split(':', 1)
1464
- conversation_data["conversation"].append({
1465
- "speaker": speaker.strip(),
1466
- "text": text.strip()
1467
- })
1468
-
1469
- return conversation_data
1470
-
1471
- async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
1472
- """Convert text to speech using Edge TTS"""
1473
- output_dir = Path(self._create_output_directory())
1474
- filenames = []
1475
-
1476
- try:
1477
- # 언어별 음성 설정
1478
- voices = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["English"])
1479
-
1480
- for i, turn in enumerate(conversation_json["conversation"]):
1481
- filename = output_dir / f"output_{i}.wav"
1482
- voice = voices[i % len(voices)]
1483
-
1484
- tmp_path = await self._generate_audio_edge(turn["text"], voice)
1485
- os.rename(tmp_path, filename)
1486
- filenames.append(str(filename))
1487
-
1488
- # Combine audio files
1489
- final_output = os.path.join(output_dir, "combined_output.wav")
1490
- self._combine_audio_files(filenames, final_output)
1491
-
1492
- # Generate conversation text
1493
- conversation_text = "\n".join(
1494
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
1495
- for i, turn in enumerate(conversation_json["conversation"])
1496
- )
1497
 
1498
- return final_output, conversation_text
1499
- except Exception as e:
1500
- raise RuntimeError(f"Failed to convert text to speech: {e}")
1501
-
1502
- async def _generate_audio_edge(self, text: str, voice: str) -> str:
1503
- """Generate audio using Edge TTS"""
1504
- if not text.strip():
1505
- raise ValueError("Text cannot be empty")
1506
-
1507
- voice_short_name = voice.split(" - ")[0] if " - " in voice else voice
1508
- communicate = edge_tts.Communicate(text, voice_short_name)
1509
-
1510
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
1511
- tmp_path = tmp_file.name
1512
- await communicate.save(tmp_path)
1513
-
1514
- return tmp_path
1515
-
1516
- @spaces.GPU(duration=60)
1517
- def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
1518
- """Convert text to speech using Spark TTS CLI"""
1519
- if not SPARK_AVAILABLE or not self.spark_model_dir:
1520
- raise RuntimeError("Spark TTS not available")
1521
-
1522
- try:
1523
- output_dir = self._create_output_directory()
1524
- audio_files = []
1525
-
1526
- # Create different voice characteristics for different speakers
1527
- speaker1, speaker2 = self.prompt_builder.get_speaker_names(language)
1528
-
1529
- if language == "Korean":
1530
- voice_configs = [
1531
- {"prompt_text": f"안녕하세요, 오늘 팟캐스트 진행을 맡은 {speaker1}입니다.", "gender": "male"},
1532
- {"prompt_text": f"안녕하세요, 저는 오늘 이 주제에 대해 설명드릴 {speaker2}입니다.", "gender": "male"}
1533
- ]
1534
- else:
1535
- voice_configs = [
1536
- {"prompt_text": f"Hello everyone, I'm {speaker1}, your host for today's podcast.", "gender": "male"},
1537
- {"prompt_text": f"Hi, I'm {speaker2}. I'm excited to share my insights with you.", "gender": "male"}
1538
- ]
1539
-
1540
- for i, turn in enumerate(conversation_json["conversation"]):
1541
- text = turn["text"]
1542
- if not text.strip():
1543
- continue
1544
-
1545
- voice_config = voice_configs[i % len(voice_configs)]
1546
- output_file = os.path.join(output_dir, f"spark_output_{i}.wav")
1547
-
1548
- cmd = [
1549
- "python", "-m", "cli.inference",
1550
- "--text", text,
1551
- "--device", "0" if torch.cuda.is_available() else "cpu",
1552
- "--save_dir", output_dir,
1553
- "--model_dir", self.spark_model_dir,
1554
- "--prompt_text", voice_config["prompt_text"],
1555
- "--output_name", f"spark_output_{i}.wav"
1556
- ]
1557
-
1558
- try:
1559
- result = subprocess.run(
1560
- cmd,
1561
- capture_output=True,
1562
- text=True,
1563
- timeout=60,
1564
- cwd="."
1565
- )
1566
-
1567
- if result.returncode == 0:
1568
- audio_files.append(output_file)
1569
- else:
1570
- print(f"Spark TTS error for turn {i}: {result.stderr}")
1571
- silence = np.zeros(int(22050 * 1.0))
1572
- sf.write(output_file, silence, 22050)
1573
- audio_files.append(output_file)
1574
-
1575
- except subprocess.TimeoutExpired:
1576
- print(f"Spark TTS timeout for turn {i}")
1577
- silence = np.zeros(int(22050 * 1.0))
1578
- sf.write(output_file, silence, 22050)
1579
- audio_files.append(output_file)
1580
- except Exception as e:
1581
- print(f"Error running Spark TTS for turn {i}: {e}")
1582
- silence = np.zeros(int(22050 * 1.0))
1583
- sf.write(output_file, silence, 22050)
1584
- audio_files.append(output_file)
1585
-
1586
- # Combine all audio files
1587
- if audio_files:
1588
- final_output = os.path.join(output_dir, "spark_combined.wav")
1589
- self._combine_audio_files(audio_files, final_output)
1590
- else:
1591
- raise RuntimeError("No audio files generated")
1592
-
1593
- conversation_text = "\n".join(
1594
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
1595
- for i, turn in enumerate(conversation_json["conversation"])
1596
- )
1597
-
1598
- return final_output, conversation_text
1599
-
1600
- except Exception as e:
1601
- raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
1602
-
1603
- @spaces.GPU(duration=60)
1604
- def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
1605
- """Convert text to speech using MeloTTS"""
1606
- if not MELO_AVAILABLE or not self.melo_models:
1607
- raise RuntimeError("MeloTTS not available")
1608
-
1609
- speakers = ["EN-Default", "EN-US"]
1610
- combined_audio = AudioSegment.empty()
1611
-
1612
- for i, turn in enumerate(conversation_json["conversation"]):
1613
- bio = io.BytesIO()
1614
- text = turn["text"]
1615
- speaker = speakers[i % 2]
1616
- speaker_id = self.melo_models["EN"].hps.data.spk2id[speaker]
1617
-
1618
- self.melo_models["EN"].tts_to_file(
1619
- text, speaker_id, bio, speed=1.0,
1620
- pbar=progress.tqdm if progress else None,
1621
- format="wav"
1622
- )
1623
-
1624
- bio.seek(0)
1625
- audio_segment = AudioSegment.from_file(bio, format="wav")
1626
- combined_audio += audio_segment
1627
-
1628
- final_audio_path = "melo_podcast.mp3"
1629
- combined_audio.export(final_audio_path, format="mp3")
1630
-
1631
- conversation_text = "\n".join(
1632
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
1633
- for i, turn in enumerate(
1634
-
1635
-
1636
-
1637
-
1638
-
1639
-
1640
-
1641
- conversation_json["conversation"])
1642
- )
1643
-
1644
- return final_audio_path, conversation_text
1645
-
1646
- def _create_output_directory(self) -> str:
1647
- """Create a unique output directory"""
1648
- random_bytes = os.urandom(8)
1649
- folder_name = base64.urlsafe_b64encode(random_bytes).decode("utf-8")
1650
- os.makedirs(folder_name, exist_ok=True)
1651
- return folder_name
1652
-
1653
- def _combine_audio_files(self, filenames: List[str], output_file: str) -> None:
1654
- """Combine multiple audio files into one"""
1655
- if not filenames:
1656
- raise ValueError("No input files provided")
1657
-
1658
- try:
1659
- audio_segments = []
1660
- for filename in filenames:
1661
- if os.path.exists(filename):
1662
- audio_segment = AudioSegment.from_file(filename)
1663
- audio_segments.append(audio_segment)
1664
-
1665
- if audio_segments:
1666
- combined = sum(audio_segments)
1667
- combined.export(output_file, format="wav")
1668
-
1669
- # Clean up temporary files
1670
- for filename in filenames:
1671
- if os.path.exists(filename):
1672
- os.remove(filename)
1673
-
1674
- except Exception as e:
1675
- raise RuntimeError(f"Failed to combine audio files: {e}")
1676
-
1677
-
1678
- # Global converter instance
1679
- converter = UnifiedAudioConverter(ConversationConfig())
1680
-
1681
-
1682
- async def synthesize(article_input, input_type: str = "URL", mode: str = "Local", tts_engine: str = "Edge-TTS", language: str = "English"):
1683
- """Main synthesis function - handles URL, PDF, and Keyword inputs"""
1684
- try:
1685
- # Extract text based on input type
1686
- if input_type == "URL":
1687
- if not article_input or not isinstance(article_input, str):
1688
- return "Please provide a valid URL.", None
1689
- text = converter.fetch_text(article_input)
1690
- elif input_type == "PDF":
1691
- if not article_input:
1692
- return "Please upload a PDF file.", None
1693
- text = converter.extract_text_from_pdf(article_input)
1694
- else: # Keyword
1695
- if not article_input or not isinstance(article_input, str):
1696
- return "Please provide a keyword or topic.", None
1697
- text = search_and_compile_content(article_input, language)
1698
- text = f"Keyword-based content:\n{text}"
1699
-
1700
- # Limit text to max words
1701
- words = text.split()
1702
- if len(words) > converter.config.max_words:
1703
- text = " ".join(words[:converter.config.max_words])
1704
-
1705
- # Extract conversation based on mode
1706
- if mode == "Local":
1707
- try:
1708
- conversation_json = converter.extract_conversation_local(text, language)
1709
- except Exception as e:
1710
- print(f"Local mode failed: {e}, trying API fallback")
1711
- api_key = os.environ.get("TOGETHER_API_KEY")
1712
- if api_key:
1713
- converter.initialize_api_mode(api_key)
1714
- conversation_json = converter.extract_conversation_api(text, language)
1715
- else:
1716
- raise RuntimeError("Local mode failed and no API key available for fallback")
1717
- else: # API mode
1718
- api_key = os.environ.get("TOGETHER_API_KEY")
1719
- if not api_key:
1720
- print("API key not found, falling back to local mode")
1721
- conversation_json = converter.extract_conversation_local(text, language)
1722
- else:
1723
- try:
1724
- converter.initialize_api_mode(api_key)
1725
- conversation_json = converter.extract_conversation_api(text, language)
1726
- except Exception as e:
1727
- print(f"API mode failed: {e}, falling back to local mode")
1728
- conversation_json = converter.extract_conversation_local(text, language)
1729
-
1730
- # Generate conversation text
1731
- conversation_text = "\n".join(
1732
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
1733
- for i, turn in enumerate(conversation_json["conversation"])
1734
- )
1735
-
1736
- return conversation_text, None
1737
-
1738
- except Exception as e:
1739
- return f"Error: {str(e)}", None
1740
-
1741
-
1742
- async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
1743
- """Regenerate audio from edited conversation text"""
1744
- if not conversation_text.strip():
1745
- return "Please provide conversation text.", None
1746
-
1747
- try:
1748
- conversation_json = converter.parse_conversation_text(conversation_text)
1749
-
1750
- if not conversation_json["conversation"]:
1751
- return "No valid conversation found in the text.", None
1752
-
1753
- # Edge TTS 전용 언어는 자동으로 Edge-TTS 사용
1754
- if language in EDGE_TTS_ONLY_LANGUAGES and tts_engine != "Edge-TTS":
1755
- tts_engine = "Edge-TTS"
1756
-
1757
- # Generate audio based on TTS engine
1758
- if tts_engine == "Edge-TTS":
1759
- output_file, _ = await converter.text_to_speech_edge(conversation_json, language)
1760
- elif tts_engine == "Spark-TTS":
1761
- if not SPARK_AVAILABLE:
1762
- return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
1763
- converter.initialize_spark_tts()
1764
- output_file, _ = converter.text_to_speech_spark(conversation_json, language)
1765
- else: # MeloTTS
1766
- if not MELO_AVAILABLE:
1767
- return "MeloTTS not available. Please install required dependencies.", None
1768
- if language in EDGE_TTS_ONLY_LANGUAGES:
1769
- return f"MeloTTS does not support {language}. Please use Edge-TTS for this language.", None
1770
- converter.initialize_melo_tts()
1771
- output_file, _ = converter.text_to_speech_melo(conversation_json)
1772
-
1773
- return "Audio generated successfully!", output_file
1774
-
1775
  except Exception as e:
1776
- return f"Error generating audio: {str(e)}", None
1777
-
1778
-
1779
- def synthesize_sync(article_input, input_type: str = "URL", mode: str = "Local", tts_engine: str = "Edge-TTS", language: str = "English"):
1780
- """Synchronous wrapper for async synthesis"""
1781
- return asyncio.run(synthesize(article_input, input_type, mode, tts_engine, language))
1782
-
1783
-
1784
- def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
1785
- """Synchronous wrapper for async audio regeneration"""
1786
- return asyncio.run(regenerate_audio(conversation_text, tts_engine, language))
1787
-
1788
-
1789
def update_tts_engine_for_language(language):
    """Return the TTS-engine radio control that fits the selected language.

    Languages listed in ``EDGE_TTS_ONLY_LANGUAGES`` get a locked radio that
    offers only Edge-TTS, with an explanatory note (localized when one is
    available for that language). Every other language gets the full,
    interactive three-engine choice.
    """
    # Unrestricted languages: offer every engine and let the user pick.
    if language not in EDGE_TTS_ONLY_LANGUAGES:
        return gr.Radio(
            choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
            value="Edge-TTS",
            label="TTS Engine",
            info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU",
            interactive=True
        )

    # Restricted languages: explain the limitation in the language itself.
    localized_notice = {
        "Korean": "한국어는 Edge-TTS만 지원됩니다",
        "Japanese": "日本語はEdge-TTSのみサポートされています",
        "French": "Le français n'est pris en charge que par Edge-TTS",
        "German": "Deutsch wird nur von Edge-TTS unterstützt",
        "Spanish": "El español solo es compatible con Edge-TTS",
        "Italian": "L'italiano è supportato solo da Edge-TTS",
        "Portuguese": "O português é suportado apenas pelo Edge-TTS",
        "Dutch": "Nederlands wordt alleen ondersteund door Edge-TTS",
        "Thai": "ภาษาไทยรองรับเฉพาะ Edge-TTS เท่านั้น",
        "Vietnamese": "Tiếng Việt chỉ được hỗ trợ bởi Edge-TTS",
        "Arabic": "العربية مدعومة فقط من Edge-TTS",
        "Hebrew": "עברית נתמכת רק על ידי Edge-TTS",
        "Indonesian": "Bahasa Indonesia hanya didukung oleh Edge-TTS",
        "Hindi": "हिंदी केवल Edge-TTS द्वारा समर्थित है",
        "Russian": "Русский поддерживается только Edge-TTS",
        "Chinese": "中文仅支持Edge-TTS"
    }
    fallback_notice = f"{language} is only supported by Edge-TTS"
    notice = localized_notice.get(language, fallback_notice)

    return gr.Radio(
        choices=["Edge-TTS"],
        value="Edge-TTS",
        label="TTS Engine",
        info=notice,
        interactive=False
    )
1827
-
1828
-
1829
def toggle_input_visibility(input_type):
    """Show exactly one of the three input widgets for the chosen input type.

    Returns three ``gr.update`` objects, in the order URL textbox, PDF
    uploader, keyword textbox. "URL" shows the first, "PDF" the second, and
    any other value (i.e. "Keyword") the third.
    """
    show_url = input_type == "URL"
    show_pdf = input_type == "PDF"
    # Anything that is neither URL nor PDF is treated as the keyword mode.
    show_keyword = not (show_url or show_pdf)
    return (
        gr.update(visible=show_url),
        gr.update(visible=show_pdf),
        gr.update(visible=show_keyword),
    )
1837
-
1838
-
1839
# Model initialization (at app startup).
# When the llama-cpp stack imported successfully, fetch the configured local
# model file into ./models before the UI is built.
if LLAMA_CPP_AVAILABLE:
    try:
        model_path = hf_hub_download(
            repo_id=converter.config.local_model_repo,
            filename=converter.config.local_model_name,
            local_dir="./models"
        )
        print(f"Model downloaded to: {model_path}")
    except Exception as e:
        # A failed download is only reported; startup continues without it.
        print(f"Failed to download model at startup: {e}")
1850
-
1851
-
1852
# Gradio interface - improved multi-language layout.
# Builds the whole UI (status panel, inputs, outputs, examples) and wires the
# event handlers. Everything below lives inside the `demo` Blocks context.
with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
    .container {max-width: 1200px; margin: auto; padding: 20px;}
    .header-text {text-align: center; margin-bottom: 30px;}
    .input-group {background: #f7f7f7; padding: 20px; border-radius: 10px; margin-bottom: 20px;}
    .output-group {background: #f0f0f0; padding: 20px; border-radius: 10px;}
    .status-box {background: #e8f4f8; padding: 15px; border-radius: 8px; margin-top: 10px;}
""") as demo:
    with gr.Column(elem_classes="container"):
        # Header
        with gr.Row(elem_classes="header-text"):
            gr.Markdown("""
            # 🎙️ AI Podcast Generator - Professional Multi-Language Edition
            ### Convert any article, blog, PDF document, or topic into an engaging professional podcast conversation in 24+ languages!
            """)

        with gr.Row(elem_classes="discord-badge"):
            gr.HTML("""
            <p style="text-align: center;">
            <a href="https://discord.gg/openfreeai" target="_blank">
            <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="badge">
            </a>
            </p>
            """)

        # Status display section (values are computed once, when the UI is built)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(f"""
                #### 🤖 System Status
                - **LLM**: {converter.config.local_model_name.split('.')[0]}
                - **Fallback**: {converter.config.api_model_name.split('/')[-1]}
                - **Llama CPP**: {"✅ Ready" if LLAMA_CPP_AVAILABLE else "❌ Not Available"}
                - **Search**: {"✅ Brave API" if BRAVE_KEY else "❌ No API"}
                """)
            with gr.Column(scale=1):
                gr.Markdown("""
                #### 🌍 Multi-Language Support
                - **24+ Languages**: Korean, Japanese, French, German, Spanish, Italian, etc.
                - **Native Voices**: Optimized for each language
                - **Professional Style**: Expert discussions with data & insights
                - **Auto-TTS Selection**: Best engine per language
                """)

        # Main input section
        with gr.Group(elem_classes="input-group"):
            with gr.Row():
                # Left: input options
                with gr.Column(scale=2):
                    # Input type selection
                    input_type_selector = gr.Radio(
                        choices=["URL", "PDF", "Keyword"],
                        value="URL",
                        label="📥 Input Type",
                        info="Choose your content source"
                    )

                    # URL input (the only input widget visible by default)
                    url_input = gr.Textbox(
                        label="🔗 Article URL",
                        placeholder="Enter the article URL here...",
                        value="",
                        visible=True,
                        lines=2
                    )

                    # PDF upload
                    pdf_input = gr.File(
                        label="📄 Upload PDF",
                        file_types=[".pdf"],
                        visible=False
                    )

                    # Keyword input
                    keyword_input = gr.Textbox(
                        label="🔍 Topic/Keyword",
                        placeholder="Enter a topic (e.g., 'AI trends 2024', '인공지능', 'IA tendances', 'KI Trends')",
                        value="",
                        visible=False,
                        info="System will search and compile latest information",
                        lines=2
                    )

                # Right: settings
                with gr.Column(scale=1):
                    # Language selection
                    language_selector = gr.Radio(
                        choices=[
                            "English", "Korean", "Japanese", "French", "German",
                            "Spanish", "Italian", "Portuguese", "Dutch", "Thai",
                            "Vietnamese", "Arabic", "Hebrew", "Indonesian", "Hindi",
                            "Russian", "Chinese", "Norwegian", "Swedish", "Finnish",
                            "Danish", "Polish", "Turkish", "Greek", "Czech"
                        ],
                        value="English",
                        label="🌐 Language / 언어 / 语言",
                        info="Select podcast language"
                    )

                    # Processing mode
                    mode_selector = gr.Radio(
                        choices=["Local", "API"],
                        value="Local",
                        label="⚙️ Processing Mode",
                        info="Local: On-device | API: Cloud"
                    )

                    # TTS engine (replaced by update_tts_engine_for_language on language change)
                    tts_selector = gr.Radio(
                        choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
                        value="Edge-TTS",
                        label="🔊 TTS Engine",
                        info="Voice synthesis engine"
                    )

            # Generate button
            with gr.Row():
                convert_btn = gr.Button(
                    "🎯 Generate Professional Conversation",
                    variant="primary",
                    size="lg",
                    scale=1
                )

        # Output section
        with gr.Group(elem_classes="output-group"):
            with gr.Row():
                # Left: conversation text
                with gr.Column(scale=3):
                    conversation_output = gr.Textbox(
                        label="💬 Generated Professional Conversation (Editable)",
                        lines=25,
                        max_lines=50,
                        interactive=True,
                        placeholder="Professional podcast conversation will appear here...\n전문 팟캐스트 대화가 여기에 표시됩니다...\nLa conversation professionnelle du podcast apparaîtra ici...",
                        info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
                    )

                    # Audio generation button
                    with gr.Row():
                        generate_audio_btn = gr.Button(
                            "🎙️ Generate Audio from Text",
                            variant="secondary",
                            size="lg"
                        )

                # Right: audio output and status
                with gr.Column(scale=2):
                    audio_output = gr.Audio(
                        label="🎧 Professional Podcast Audio",
                        type="filepath",
                        interactive=False
                    )

                    status_output = gr.Textbox(
                        label="📊 Status",
                        interactive=False,
                        lines=3,
                        elem_classes="status-box"
                    )

                    # Help
                    gr.Markdown("""
                    #### 💡 Quick Tips:
                    - **URL**: Paste any article link
                    - **PDF**: Upload documents directly
                    - **Keyword**: Enter topics for AI research
                    - **24+ Languages** fully supported
                    - Edit conversation before audio generation
                    - Auto TTS engine selection per language
                    """)

        # Examples section
        # NOTE(review): the "Keyword" examples are loaded into url_input, while
        # the Generate button reads keyword_input for the Keyword input type
        # (see get_article_input below) - confirm this is intended.
        with gr.Accordion("📚 Multi-Language Examples", open=False):
            gr.Examples(
                examples=[
                    ["https://huggingface.co/blog/openfreeai/cycle-navigator", "URL", "Local", "Edge-TTS", "English"],
                    ["quantum computing breakthroughs", "Keyword", "Local", "Edge-TTS", "English"],
                    ["인공지능 윤리와 규제", "Keyword", "Local", "Edge-TTS", "Korean"],
                    ["https://huggingface.co/papers/2505.14810", "URL", "Local", "Edge-TTS", "Japanese"],
                    ["intelligence artificielle tendances", "Keyword", "Local", "Edge-TTS", "French"],
                    ["künstliche intelligenz entwicklung", "Keyword", "Local", "Edge-TTS", "German"],
                    ["inteligencia artificial avances", "Keyword", "Local", "Edge-TTS", "Spanish"],
                ],
                inputs=[url_input, input_type_selector, mode_selector, tts_selector, language_selector],
                outputs=[conversation_output, status_output],
                fn=synthesize_sync,
                cache_examples=False,
            )

    # Input type change handler: show only the widget matching the input type
    input_type_selector.change(
        fn=toggle_input_visibility,
        inputs=[input_type_selector],
        outputs=[url_input, pdf_input, keyword_input]
    )

    # Update the TTS engine options when the language changes
    language_selector.change(
        fn=update_tts_engine_for_language,
        inputs=[language_selector],
        outputs=[tts_selector]
    )

    # Event wiring
    def get_article_input(input_type, url_input, pdf_input, keyword_input):
        """Get the appropriate input based on input type"""
        if input_type == "URL":
            return url_input
        elif input_type == "PDF":
            return pdf_input
        else:  # Keyword
            return keyword_input

    # Generate the conversation: pick the active input value, then synthesize.
    convert_btn.click(
        fn=lambda input_type, url_input, pdf_input, keyword_input, mode, tts, lang: synthesize_sync(
            get_article_input(input_type, url_input, pdf_input, keyword_input), input_type, mode, tts, lang
        ),
        inputs=[input_type_selector, url_input, pdf_input, keyword_input, mode_selector, tts_selector, language_selector],
        outputs=[conversation_output, status_output]
    )

    # Turn the (possibly user-edited) conversation text into audio.
    generate_audio_btn.click(
        fn=regenerate_audio_sync,
        inputs=[conversation_output, tts_selector, language_selector],
        outputs=[status_output, audio_output]
    )
2079
-
2080
 
2081
# Launch the app (only when this file is run as a script, not on import).
if __name__ == "__main__":
    # Enable the request queue, then start the server on all network
    # interfaces at port 7860 without creating a public share link.
    demo.queue(api_open=True, default_concurrency_limit=10).launch(
        show_api=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )
 
 
 
1
  import os
2
+ import sys
3
+ import streamlit as st
4
+ from tempfile import NamedTemporaryFile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
def main():
    """Load the application source from the ``MAIN_CODE`` secret and run it.

    The source text is read from the ``MAIN_CODE`` environment variable,
    written to a temporary ``.py`` file so the compiled code carries a real
    filename, and then executed in this module's global namespace.

    Nothing is raised to the caller: a missing secret or any exception during
    loading/execution is reported through Streamlit (``st.error`` plus the
    formatted traceback in ``st.code``). The temporary file is removed on
    every exit path, including when the executed code fails.
    """
    tmp_path = None
    try:
        # Get the code from secrets
        code = os.environ.get("MAIN_CODE")

        if not code:
            st.error("⚠️ The application code wasn't found in secrets. Please add the MAIN_CODE secret.")
            return

        # Create a temporary Python file. UTF-8 is forced so that source
        # containing non-ASCII text does not depend on the locale encoding.
        with NamedTemporaryFile(suffix='.py', delete=False, mode='w', encoding='utf-8') as tmp:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_path = tmp.name
            tmp.write(code)

        # Execute the code.
        # SECURITY NOTE(review): this runs arbitrary code taken from the
        # environment with this process's full privileges; it is only safe
        # while MAIN_CODE can be set exclusively by trusted maintainers.
        exec(compile(code, tmp_path, 'exec'), globals())

    except Exception as e:
        st.error(f"⚠️ Error loading or executing the application: {str(e)}")
        import traceback
        # The temp file still exists here (cleanup runs afterwards in
        # ``finally``), so the traceback can refer to the real source file.
        st.code(traceback.format_exc())

    finally:
        # Clean up the temporary file on success and on failure alike.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best effort: a file that is already gone or cannot be
                # removed must not mask the real outcome.
                pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
 
34
# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()