Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	array input audio
Browse files
- app.py +12 -7
- packages.txt +1 -0
    	
        app.py
    CHANGED
    
    | @@ -6,6 +6,7 @@ import google.generativeai as genai | |
| 6 | 
             
            import re
         | 
| 7 | 
             
            import torch
         | 
| 8 | 
             
            from transformers import pipeline
         | 
|  | |
| 9 | 
             
            import time
         | 
| 10 | 
             
            import spaces
         | 
| 11 |  | 
| @@ -54,14 +55,14 @@ def summarize_transcription(transcription, model, gemini_prompt): | |
| 54 | 
             
                    return f"Error summarizing transcription: {str(e)}"
         | 
| 55 |  | 
| 56 | 
             
            @spaces.GPU(duration=120)
         | 
| 57 | 
            -
            def process_audio( | 
| 58 | 
             
                print("Starting transcription...")
         | 
| 59 | 
             
                if language:
         | 
| 60 | 
             
                    print(f"Using language: {language}")
         | 
| 61 | 
            -
                    transcription = pipe( | 
| 62 | 
             
                else:
         | 
| 63 | 
             
                    print("No language defined, using default language")
         | 
| 64 | 
            -
                    transcription = pipe( | 
| 65 | 
             
                return transcription
         | 
| 66 |  | 
| 67 | 
             
            def transcribe(youtube_url, audio_file, whisper_model, gemini_api_key, gemini_prompt, gemini_model_variant, language, progress=gr.Progress()):
         | 
| @@ -72,15 +73,13 @@ def transcribe(youtube_url, audio_file, whisper_model, gemini_api_key, gemini_pr | |
| 72 | 
             
                        gemini_api_key = default_gemini_api_key
         | 
| 73 | 
             
                    model = configure_genai(gemini_api_key, gemini_model_variant)
         | 
| 74 |  | 
| 75 | 
            -
                     | 
| 76 | 
            -
                    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         | 
| 77 | 
             
                    pipe = pipeline(
         | 
| 78 | 
             
                        task="automatic-speech-recognition",
         | 
| 79 | 
             
                        model=whisper_model,
         | 
| 80 | 
             
                        chunk_length_s=30,
         | 
| 81 | 
             
                        device=device,
         | 
| 82 | 
             
                    )
         | 
| 83 | 
            -
                    pipe.model = pipe.model.to(device)
         | 
| 84 |  | 
| 85 | 
             
                    if youtube_url:
         | 
| 86 | 
             
                        progress(0.1, desc="Extracting YouTube ID")
         | 
| @@ -97,9 +96,15 @@ def transcribe(youtube_url, audio_file, whisper_model, gemini_api_key, gemini_pr | |
| 97 | 
             
                        progress(0.2, desc="Reading audio file")
         | 
| 98 | 
             
                        audio_file = f"{audio_file.name}"
         | 
| 99 | 
             
                        print(f"Audio file read: {audio_file}")
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 100 |  | 
| 101 | 
             
                    progress(0.4, desc="Starting transcription")
         | 
| 102 | 
            -
                    transcription = process_audio( | 
| 103 |  | 
| 104 | 
             
                    progress(0.6, desc="Cleaning up")
         | 
| 105 | 
             
                    # Delete the audio file after transcription
         | 
|  | |
| 6 | 
             
            import re
         | 
| 7 | 
             
            import torch
         | 
| 8 | 
             
            from transformers import pipeline
         | 
| 9 | 
            +
            from transformers.pipelines.audio_utils import ffmpeg_read
         | 
| 10 | 
             
            import time
         | 
| 11 | 
             
            import spaces
         | 
| 12 |  | 
|  | |
| 55 | 
             
                    return f"Error summarizing transcription: {str(e)}"
         | 
| 56 |  | 
| 57 | 
             
            @spaces.GPU(duration=120)
         | 
| 58 | 
            +
            def process_audio(inputs, pipe, language):
         | 
| 59 | 
             
                print("Starting transcription...")
         | 
| 60 | 
             
                if language:
         | 
| 61 | 
             
                    print(f"Using language: {language}")
         | 
| 62 | 
            +
                    transcription = pipe(inputs, batch_size=8, generate_kwargs={"task": "transcribe", "language": language}, return_timestamps=True)["text"]
         | 
| 63 | 
             
                else:
         | 
| 64 | 
             
                    print("No language defined, using default language")
         | 
| 65 | 
            +
                    transcription = pipe(inputs, batch_size=8, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
         | 
| 66 | 
             
                return transcription
         | 
| 67 |  | 
| 68 | 
             
            def transcribe(youtube_url, audio_file, whisper_model, gemini_api_key, gemini_prompt, gemini_model_variant, language, progress=gr.Progress()):
         | 
|  | |
| 73 | 
             
                        gemini_api_key = default_gemini_api_key
         | 
| 74 | 
             
                    model = configure_genai(gemini_api_key, gemini_model_variant)
         | 
| 75 |  | 
| 76 | 
            +
                    device = 0 if torch.cuda.is_available() else "cpu"
         | 
|  | |
| 77 | 
             
                    pipe = pipeline(
         | 
| 78 | 
             
                        task="automatic-speech-recognition",
         | 
| 79 | 
             
                        model=whisper_model,
         | 
| 80 | 
             
                        chunk_length_s=30,
         | 
| 81 | 
             
                        device=device,
         | 
| 82 | 
             
                    )
         | 
|  | |
| 83 |  | 
| 84 | 
             
                    if youtube_url:
         | 
| 85 | 
             
                        progress(0.1, desc="Extracting YouTube ID")
         | 
|  | |
| 96 | 
             
                        progress(0.2, desc="Reading audio file")
         | 
| 97 | 
             
                        audio_file = f"{audio_file.name}"
         | 
| 98 | 
             
                        print(f"Audio file read: {audio_file}")
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                    with open(audio_file, "rb") as f:
         | 
| 101 | 
            +
                        inputs = f.read()
         | 
| 102 | 
            +
             | 
| 103 | 
            +
                    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
         | 
| 104 | 
            +
                    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
         | 
| 105 |  | 
| 106 | 
             
                    progress(0.4, desc="Starting transcription")
         | 
| 107 | 
            +
                    transcription = process_audio(inputs, pipe, language)
         | 
| 108 |  | 
| 109 | 
             
                    progress(0.6, desc="Cleaning up")
         | 
| 110 | 
             
                    # Delete the audio file after transcription
         | 
    	
        packages.txt
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            ffmpeg
         | 
