"""Gradio demo that transcribes recorded speech with a Chinese Wav2Vec2 CTC model."""

import math

import gradio as gr
import numpy as np
import torch
from scipy.signal import resample_poly
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"

# Load the model and processor once at import time so every request reuses them.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)


def _prepare_waveform(sample_rate, samples, target_rate):
    """Return *samples* as a mono float32 waveform sampled at *target_rate* Hz.

    Args:
        sample_rate: Sampling rate of *samples* in Hz.
        samples: Array-like audio, either 1-D or 2-D with channels on axis 1
            (assumed to be the layout Gradio's numpy audio uses; confirm for
            other callers).
        target_rate: Sampling rate the feature extractor expects, in Hz.
    """
    waveform = np.asarray(samples)

    if np.issubdtype(waveform.dtype, np.integer):
        # Integer PCM (e.g. int16) must be scaled into [-1.0, 1.0] floats.
        scale = float(np.iinfo(waveform.dtype).max)
        waveform = waveform.astype(np.float32) / scale
    else:
        waveform = waveform.astype(np.float32)

    if waveform.ndim > 1:
        # Down-mix multi-channel recordings by averaging the channels.
        waveform = waveform.mean(axis=1)

    sample_rate = int(sample_rate)
    target_rate = int(target_rate)
    if waveform.size and sample_rate != target_rate:
        # Polyphase resampling low-pass filters the signal, avoiding aliasing
        # when converting from rates such as 44.1 kHz or 48 kHz.
        divisor = math.gcd(sample_rate, target_rate)
        waveform = resample_poly(
            waveform, target_rate // divisor, sample_rate // divisor
        )

    return np.asarray(waveform, dtype=np.float32)


def transcribe_audio(audio):
    """Transcribe one audio clip and return the decoded text.

    Args:
        audio: Either the ``(sample_rate, samples)`` tuple produced by
            ``gr.Audio(type="numpy")``, a mapping with ``"array"`` and
            ``"sampling_rate"`` keys, or ``None`` when nothing was recorded.

    Returns:
        The transcription string, or ``""`` when there is no audio to decode.
    """
    if audio is None:
        # Gradio passes None when the user submits without recording anything.
        return ""

    if isinstance(audio, dict):
        sample_rate, samples = audio["sampling_rate"], audio["array"]
    else:
        sample_rate, samples = audio

    # The feature extractor rejects audio whose rate differs from the rate it
    # was configured with, so resample to that rate up front.
    target_rate = processor.feature_extractor.sampling_rate
    waveform = _prepare_waveform(sample_rate, samples, target_rate)
    if waveform.size == 0:
        return ""

    input_values = processor(
        waveform, sampling_rate=target_rate, return_tensors="pt"
    ).input_values

    # Inference only; no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: pick the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]  # Only one clip is handled per call.


# Set up the Gradio interface.
interface = gr.Interface(
    fn=transcribe_audio,
    # type="numpy" hands the callback a (sample_rate, samples) tuple;
    # type="filepath" would pass a path string the callback cannot index.
    inputs=gr.Audio(source="microphone", type="numpy"),
    outputs="text",
    title="Chinese Audio Transcription",
    description="Upload or record an audio file to transcribe it into Chinese.",
)

# Launch the interface.
interface.launch()