Spaces:

VIDraft
/

voice-trans

Running on Zero

App Files Community

openfree committed 6 days ago

Commit

aba9e95

verified ·

1 Parent(s): e1fe24a

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -573

app.py CHANGED Viewed

@@ -1,582 +1,35 @@
-# SMARTok Demo - 실시간 다국어 번역 시스템
-#
-# 필수 패키지:
-# pip install gradio openai python-dotenv pdfplumber numpy websockets
-#
-# 선택 패키지 (비디오 처리):
-# - ffmpeg 설치: sudo apt-get install ffmpeg (Linux) / brew install ffmpeg (Mac)
-# - 또는 pip install moviepy
-#
-# 환경 변수:
-# .env 파일에 OPENAI_API_KEY 설정 필요
-import os, asyncio, json, tempfile, websockets, pdfplumber
-import gradio as gr
-import openai
-from dotenv import load_dotenv
-import numpy as np
-import wave
-import subprocess
-import mimetypes
-# ─── 0. 초기화 ───────────────────────────────────────────────
-load_dotenv()
-openai.api_key = os.getenv("OPENAI_API_KEY")
-if not openai.api_key:
-    raise RuntimeError("OPENAI_API_KEY 가 .env 에 없습니다!")
-# ffmpeg 설치 확인
-def check_ffmpeg():
-    try:
-        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
-        return True
-    except:
-        return False
-HAS_FFMPEG = check_ffmpeg()
-if not HAS_FFMPEG:
-    print("⚠️ ffmpeg가 설치되어 있지 않습니다. 비디오 처리가 제한될 수 있습니다.")
-    print("설치 방법: sudo apt-get install ffmpeg (Linux) / brew install ffmpeg (Mac)")
-LANG = ["Korean","English","Japanese","Chinese",
-        "Thai","Russian","Vietnamese","Spanish","French"]
-VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
-         for l in LANG}
-FOUR = ["English","Chinese","Thai","Russian"]
-WS_URL = "wss://api.openai.com/v1/realtime"  # 올바른 엔드포인트로 수정
-# ─── 1. 공통 GPT 번역 / TTS ─────────────────────────────────
-# 전역 클라이언트 관리
-client = None
-def get_client():
-    global client
-    if client is None:
-        client = openai.AsyncClient()
-    return client
-async def gpt_translate(text, src, tgt):
-    try:
-        client = get_client()
-        rsp = await client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[{"role":"system",
-                       "content":f"Translate {src} → {tgt}. Return only the text."},
-                      {"role":"user","content":text}],
-            temperature=0.3,max_tokens=2048)
-        return rsp.choices[0].message.content.strip()
-    except Exception as e:
-        print(f"번역 오류: {e}")
-        return ""
-async def gpt_tts(text, lang):
-    try:
-        client = get_client()
-        rsp = await client.audio.speech.create(
-            model="tts-1", voice=VOICE[lang], input=text[:4096])
-        tmp = tempfile.NamedTemporaryFile(delete=False,suffix=".mp3")
-        tmp.write(rsp.content); tmp.close(); return tmp.name
-    except Exception as e:
-        print(f"TTS 오류: {e}")
-        return None
-# ─── 2. PDF 번역 ────────────────────────────────────────────
-def translate_pdf(file, src, tgt):
-    if not file: return "⚠️ PDF 업로드 필요", ""
-    with pdfplumber.open(file.name) as pdf:
-        text = "\n".join(p.extract_text() or "" for p in pdf.pages[:5]).strip()
-    if not text:
-        return "⚠️ 텍스트 추출 실패", ""
-    return text, asyncio.run(gpt_translate(text, src, tgt))
-# ─── 2-1. 오디오 번역 (탭1용) ────────────────────────────────
-def extract_audio_from_video(video_path):
-    """MP4 등 비디오 파일에서 오디오 추출"""
-    audio_output = None
-    try:
-        # 임시 오디오 파일 생성
-        audio_output = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        audio_output.close()
-        # 방법 1: ffmpeg 사용 시도
-        if HAS_FFMPEG:
-            cmd = [
-                'ffmpeg',
-                '-i', video_path,
-                '-vn',  # 비디오 스트림 제거
-                '-acodec', 'pcm_s16le',  # WAV 포맷
-                '-ar', '16000',  # 16kHz 샘플링
-                '-ac', '1',  # 모노
-                '-y',  # 덮어쓰기
-                audio_output.name
-            ]
-            result = subprocess.run(cmd, capture_output=True, text=True)
-            if result.returncode == 0:
-                return audio_output.name
-            else:
-                print(f"ffmpeg 오류: {result.stderr}")
-        # 방법 2: moviepy 사용 시도
-        try:
-            from moviepy.editor import VideoFileClip
-            print("moviepy를 사용하여 오디오 추출 중...")
-            video = VideoFileClip(video_path)
-            video.audio.write_audiofile(
-                audio_output.name,
-                fps=16000,
-                nbytes=2,
-                codec='pcm_s16le',
-                verbose=False,
-                logger=None
-            )
-            video.close()
-            return audio_output.name
-        except ImportError:
-            raise Exception(
-                "비디오 처리를 위해 ffmpeg 또는 moviepy가 필요합니다.\n"
-                "설치: pip install moviepy 또는 ffmpeg 설치"
-            )
-        except Exception as e:
-            raise Exception(f"moviepy 오류: {str(e)}")
-    except Exception as e:
-        # 오류 시 임시 파일 정리
-        if audio_output and os.path.exists(audio_output.name):
-            os.unlink(audio_output.name)
-        raise e
-async def translate_audio_async(file, src, tgt):
-    if not file: return "⚠️ 오디오/비디오 업로드 필요", "", None
     try:
-        # 파일 타입 확인
-        mime_type, _ = mimetypes.guess_type(file)
-        audio_file_path = file
-        temp_audio_path = None
-        # 비디오 파일인 경우 오디오 추출
-        if mime_type and mime_type.startswith('video/'):
-            print(f"비디오 파일 감지: {mime_type}")
-            print(f"파일 크기: {os.path.getsize(file) / 1024 / 1024:.1f} MB")
-            print("비디오에서 오디오 추출 중... (시간이 걸릴 수 있습니다)")
-            temp_audio_path = extract_audio_from_video(file)
-            audio_file_path = temp_audio_path
-            print("오디오 추출 완료!")
-        # STT: Whisper API 사용
-        print("음성 인식 중...")
-        client = get_client()
-        with open(audio_file_path, 'rb') as audio_file:
-            transcript = await client.audio.transcriptions.create(
-                model="whisper-1",
-                file=audio_file,
-                language=src[:2].lower()  # 언어 코드 간소화
-            )
-        # 임시 파일 정리
-        if temp_audio_path and os.path.exists(temp_audio_path):
-            os.unlink(temp_audio_path)
-        orig_text = transcript.text
-        if not orig_text.strip():
-            return "⚠️ 음성이 감지되지 않았습니다", "", None
-        print(f"인식된 텍스트: {orig_text[:50]}...")
-        # 번역
-        print(f"{src} → {tgt} 번역 중...")
-        trans_text = await gpt_translate(orig_text, src, tgt)
-        # TTS
-        print("음성 합성 중...")
-        audio_path = await gpt_tts(trans_text, tgt)
-        return orig_text, trans_text, audio_path
-    except Exception as e:
-        print(f"오디오 번역 오류: {e}")
-        # 임시 파일 정리
-        if 'temp_audio_path' in locals() and temp_audio_path and os.path.exists(temp_audio_path):
-            os.unlink(temp_audio_path)
-        error_msg = str(e)
-        if "ffmpeg" in error_msg.lower():
-            error_msg += "\n\n💡 해결 방법:\n1. ffmpeg 설치: sudo apt-get install ffmpeg\n2. 또는 pip install moviepy"
-        return "⚠️ 번역 중 오류 발생", error_msg, None
-def translate_audio(file, src, tgt):
-    return asyncio.run(translate_audio_async(file, src, tgt))
-# ─── 3. 실시간 STT (Whisper API 사용) ──────────────────────────
-async def process_audio_chunk(audio_data, src_lang):
-    """오디오 청크를 처리하여 텍스트로 변환"""
-    if audio_data is None:
-        return ""
-    try:
-        # Gradio는 (sample_rate, audio_array) 튜플을 반환
-        if isinstance(audio_data, tuple):
-            sample_rate, audio_array = audio_data
-            # 오디오가 너무 짧으면 무시 (0.5초 미만)
-            if len(audio_array) < sample_rate * 0.5:
-                return ""
-            # 오디오 정규화 및 노이즈 필터링
-            audio_array = audio_array.astype(np.float32)
-            # 무음 감지 - RMS가 너무 낮으면 무시
-            rms = np.sqrt(np.mean(audio_array**2))
-            if rms < 0.01:  # 무음 임계값
-                return ""
-            # 정규화
-            max_val = np.max(np.abs(audio_array))
-            if max_val > 0:
-                audio_array = audio_array / max_val * 0.95
-            # numpy array를 WAV 파일로 변환
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-                with wave.open(tmp.name, 'wb') as wav_file:
-                    wav_file.setnchannels(1)  # mono
-                    wav_file.setsampwidth(2)  # 16-bit
-                    wav_file.setframerate(sample_rate)
-                    # float32를 16-bit PCM으로 변환
-                    audio_int16 = (audio_array * 32767).astype(np.int16)
-                    wav_file.writeframes(audio_int16.tobytes())
-                tmp_path = tmp.name
-        else:
-            # bytes 데이터인 경우
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-                tmp.write(audio_data)
-                tmp_path = tmp.name
-        # Whisper API로 변환 - 언어 힌트와 프롬프트 추가
-        with open(tmp_path, 'rb') as audio_file:
-            # 언어별 프롬프트 설정으로 hallucination 방지
-            language_prompts = {
-                "Korean": "이것은 한국어 대화입니다.",
-                "English": "This is an English conversation.",
-                "Japanese": "これは日本語の会話です。",
-                "Chinese": "这是中文对话。",
-            }
-            prompt = language_prompts.get(src_lang, "")
-            client = get_client()
-            transcript = await client.audio.transcriptions.create(
-                model="whisper-1",
-                file=audio_file,
-                language=src_lang[:2].lower(),
-                prompt=prompt,
-                temperature=0.0  # 더 보수적인 추론
-            )
-        os.unlink(tmp_path)  # 임시 파일 삭제
-        # 결과 후처리 - 반복되는 패턴 제거
-        text = transcript.text.strip()
-        # 같은 문장이 반복되는 경우 처리
-        sentences = text.split('.')
-        if len(sentences) > 1:
-            unique_sentences = []
-            for sent in sentences:
-                sent = sent.strip()
-                if sent and (not unique_sentences or sent != unique_sentences[-1]):
-                    unique_sentences.append(sent)
-            text = '. '.join(unique_sentences)
-            if text and not text.endswith('.'):
-                text += '.'
-        # 뉴스 관련 hallucination 패턴 감지 및 제거
-        hallucination_patterns = [
-            "MBC 뉴스", "KBS 뉴스", "SBS 뉴스", "JTBC 뉴스",
-            "뉴스룸", "뉴스데스크", "앵커", "기자입니다"
-        ]
-        # 짧은 텍스트에서 뉴스 패턴이 감지되면 무시
-        if len(text) < 50 and any(pattern in text for pattern in hallucination_patterns):
-            return ""
-        return text
     except Exception as e:
-        print(f"STT 오류: {e}")
-        return ""
-# ─── 4. Gradio 스트림 핸들러 (동기 버전) ─────────────────────
-def realtime_single_sync(audio, src, tgt, state):
-    """동기 버전의 실시간 단일 언어 번역"""
-    if state is None:
-        state = {"orig": "", "trans": "", "audio_buffer": [], "sample_rate": None}
-    if audio is None:
-        # 스트림 종료 시 남은 버퍼 처리
-        if state["audio_buffer"] and state["sample_rate"]:
-            try:
-                # 버퍼의 오디오 합치기
-                combined_audio = np.concatenate(state["audio_buffer"])
-                audio_data = (state["sample_rate"], combined_audio)
-                # 비동기 작업 실행
-                text = asyncio.run(process_audio_chunk(audio_data, src))
-                if text:
-                    state["orig"] = state["orig"] + " " + text if state["orig"] else text
-                    trans = asyncio.run(gpt_translate(text, src, tgt))
-                    state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
-            except Exception as e:
-                print(f"처리 오류: {e}")
-            state["audio_buffer"] = []
-        return state["orig"], state["trans"], state
-    # 오디오 데이터 버퍼링
-    if isinstance(audio, tuple):
-        sample_rate, audio_array = audio
-        state["sample_rate"] = sample_rate
-        state["audio_buffer"].append(audio_array)
-        # 버퍼가 충분히 쌓였을 때만 처리 (약 2-3초 분량)
-        if state["audio_buffer"]:  # 버퍼가 비어있지 않은지 확인
-            buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
-            if buffer_duration >= 2.0:  # 2초마다 처리
-            try:
-                # 버퍼의 오디오 합치기
-                combined_audio = np.concatenate(state["audio_buffer"])
-                audio_data = (sample_rate, combined_audio)
-                # STT
-                text = asyncio.run(process_audio_chunk(audio_data, src))
-                if text:
-                    state["orig"] = state["orig"] + " " + text if state["orig"] else text
-                    # 번역
-                    trans = asyncio.run(gpt_translate(text, src, tgt))
-                    state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
-                # 버퍼 초기화
-                state["audio_buffer"] = []
-            except Exception as e:
-                print(f"처리 오류: {e}")
-                state["audio_buffer"] = []  # 오류 시에도 버퍼 초기화
-    return state["orig"], state["trans"], state
-def realtime_four_sync(audio, src, state):
-    """동기 버전의 실시간 4언어 번역"""
-    if state is None:
-        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": "",
-                 "audio_buffer": [], "sample_rate": None}
-    if audio is None:
-        # 스트림 종료 시 남은 버퍼 처리
-        if state["audio_buffer"] and state["sample_rate"]:
-            try:
-                combined_audio = np.concatenate(state["audio_buffer"])
-                audio_data = (state["sample_rate"], combined_audio)
-                text = asyncio.run(process_audio_chunk(audio_data, src))
-                if text:
-                    state["orig"] = state["orig"] + " " + text if state["orig"] else text
-                    # 순차적으로 번역 (병렬 처리 시 문제 발생 가능)
-                    for lang in FOUR:
-                        trans = asyncio.run(gpt_translate(text, src, lang))
-                        state[lang] = state[lang] + " " + trans if state[lang] else trans
-            except Exception as e:
-                print(f"처리 오류: {e}")
-            state["audio_buffer"] = []
-        return (state["orig"], state["English"], state["Chinese"],
-                state["Thai"], state["Russian"], state)
-    # 오디오 데이터 버퍼링
-    if isinstance(audio, tuple):
-        sample_rate, audio_array = audio
-        state["sample_rate"] = sample_rate
-        state["audio_buffer"].append(audio_array)
-        # 버퍼가 충분히 쌓였을 때만 처리
-        if state["audio_buffer"]:  # 버퍼가 비어있지 않은지 확인
-            buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
-            if buffer_duration >= 2.0:  # 2초마다 처리
-            try:
-                combined_audio = np.concatenate(state["audio_buffer"])
-                audio_data = (sample_rate, combined_audio)
-                # STT
-                text = asyncio.run(process_audio_chunk(audio_data, src))
-                if text:
-                    state["orig"] = state["orig"] + " " + text if state["orig"] else text
-                    # 4개 언어로 순차 번역
-                    for lang in FOUR:
-                        trans = asyncio.run(gpt_translate(text, src, lang))
-                        state[lang] = state[lang] + " " + trans if state[lang] else trans
-                state["audio_buffer"] = []
-            except Exception as e:
-                print(f"처리 오류: {e}")
-                state["audio_buffer"] = []
-    return (state["orig"], state["English"], state["Chinese"],
-            state["Thai"], state["Russian"], state)
-# ─── 5. UI ──────────────────────────────────────────────────
-with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(
-        """
-        # 🌍 SMARTok 실시간 번역 시스템
-        다국어 실시간 번역을 지원하는 통합 번역 플랫폼
-        """
-    )
-    with gr.Tabs():
-        # 탭 1 – 오디오 번역
-        with gr.TabItem("🎙️ 오디오/비디오"):
-            gr.Markdown("### 🌐 오디오/비디오 파일 번역")
-            with gr.Row():
-                src1 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
-                tgt1 = gr.Dropdown(LANG, value="English", label="출력 언어")
-            with gr.Tabs():
-                with gr.TabItem("📁 파일 업로드"):
-                    # 파일 업로드 - 오디오와 비디오 모두 지원
-                    aud1_file = gr.File(
-                        label="오디오/비디오 파일 업로드",
-                        file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus",
-                                   ".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv"],
-                        type="filepath"
-                    )
-                    gr.Markdown(
-                        "📌 **지원 형식**\n"
-                        "- 오디오: MP3, WAV, M4A, FLAC, OGG, OPUS\n"
-                        "- 비디오: MP4, AVI, MOV, MKV, WebM, FLV\n\n"
-                        "⚠️ **주의사항**\n"
-                        "- 비디오 파일은 오디오 추출 시간이 필요합니다\n"
-                        "- 대용량 파일은 처리 시간이 오래 걸릴 수 있습니다"
-                    )
-                with gr.TabItem("🎤 마이크 녹음"):
-                    aud1_mic = gr.Audio(
-                        sources=["microphone"],
-                        type="filepath",
-                        label="마이크 녹음"
-                    )
-                    gr.Markdown("💡 **팁**: 녹음 후 '정지' 버튼을 눌러주세요")
-            btn1 = gr.Button("🔄 번역 시작", variant="primary", size="lg")
-            # 진행 상태 표시
-            status1 = gr.Textbox(label="진행 상태", value="대기 중...", interactive=False)
-            with gr.Row():
-                with gr.Column():
-                    o1 = gr.Textbox(label="📝 원문", lines=6)
-                with gr.Column():
-                    t1 = gr.Textbox(label="📝 번역", lines=6)
-            a1 = gr.Audio(label="🔊 번역된 음성 (TTS)", type="filepath", autoplay=True)
-            # 파일이나 마이크 중 활성화된 입력 사용
-            def translate_with_status(file_input, mic_input, src, tgt):
-                active_input = file_input if file_input else mic_input
-                if not active_input:
-                    return "⚠️ 파일을 업로드하거나 녹음을 해주세요", "", None
-                # 상태 업데이트는 동기 함수에서 처리
-                return translate_audio(active_input, src, tgt)
-            btn1.click(
-                lambda: "처리 중... 잠시만 기다려주세요 ⏳",
-                outputs=status1
-            ).then(
-                translate_with_status,
-                [aud1_file, aud1_mic, src1, tgt1],
-                [o1, t1, a1]
-            ).then(
-                lambda: "✅ 완료!",
-                outputs=status1
-            )
-        # 탭 2 – PDF 번역
-        with gr.TabItem("📄 PDF"):
-            src2 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
-            tgt2 = gr.Dropdown(LANG, value="English", label="출력 언어")
-            pdf = gr.File(file_types=[".pdf"])
-            btn2 = gr.Button("번역")
-            o2 = gr.Textbox(label="추출 원문", lines=15)
-            t2 = gr.Textbox(label="번역 결과", lines=15)
-            btn2.click(translate_pdf, [pdf, src2, tgt2], [o2, t2])
-        # 탭 3 – 실시간 1언어
-        with gr.TabItem("⏱️ 실시간 1"):
-            src3 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
-            tgt3 = gr.Dropdown(LANG, value="English", label="출력 언어")
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("🎤 **마이크 입력**")
-                    mic3 = gr.Audio(
-                        sources=["microphone"],
-                        streaming=True,
-                        type="numpy",  # numpy 형식 명시
-                        label="마이크"
-                    )
-                    gr.Markdown("💡 **사용 방법**\n- 2-3초 정도 문장을 말씀해주세요\n- 너무 짧거나 긴 문장은 인식이 어려울 수 있습니다")
-                with gr.Column():
-                    o3 = gr.Textbox(label="원문(실시간)", lines=8, interactive=False)
-                    t3 = gr.Textbox(label="번역(실시간)", lines=8, interactive=False)
-            st3 = gr.State()
-            # stream 메서드 수정
-            mic3.stream(
-                realtime_single_sync,
-                inputs=[mic3, src3, tgt3, st3],
-                outputs=[o3, t3, st3],
-                stream_every=0.5  # 0.5초마다 스트림 (time_limit 제거)
-            )
-        # 탭 4 – 실시간 4언어
-        with gr.TabItem("🌏 실시간 4"):
-            src4 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    gr.Markdown("🎤 **마이크 입력**")
-                    mic4 = gr.Audio(
-                        sources=["microphone"],
-                        streaming=True,
-                        type="numpy",
-                        label="마이크"
-                    )
-                    o4 = gr.Textbox(label="원문", lines=8, interactive=False)
-                with gr.Column(scale=2):
-                    with gr.Row():
-                        e4 = gr.Textbox(label="English", lines=8, interactive=False)
-                        c4 = gr.Textbox(label="Chinese(简体)", lines=8, interactive=False)
-                    with gr.Row():
-                        th4 = gr.Textbox(label="Thai", lines=8, interactive=False)
-                        r4 = gr.Textbox(label="Russian", lines=8, interactive=False)
-            st4 = gr.State()
-            # stream 메서드 수정
-            mic4.stream(
-                realtime_four_sync,
-                inputs=[mic4, src4, st4],
-                outputs=[o4, e4, c4, th4, r4, st4],
-                stream_every=0.5
-            )
-demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)

+import os
+import sys
+import streamlit as st
+from tempfile import NamedTemporaryFile
+def main():
     try:
+        # Get the code from secrets
+        code = os.environ.get("MAIN_CODE")
+        if not code:
+            st.error("⚠️ The application code wasn't found in secrets. Please add the MAIN_CODE secret.")
+            return
+        # Create a temporary Python file
+        with NamedTemporaryFile(suffix='.py', delete=False, mode='w') as tmp:
+            tmp.write(code)
+            tmp_path = tmp.name
+        # Execute the code
+        exec(compile(code, tmp_path, 'exec'), globals())
+        # Clean up the temporary file
+        try:
+            os.unlink(tmp_path)
+        except:
+            pass
     except Exception as e:
+        st.error(f"⚠️ Error loading or executing the application: {str(e)}")
+        import traceback
+        st.code(traceback.format_exc())
+if __name__ == "__main__":
+    main()