Spaces:

akhil2808
/

ASR-Opensource

Sleeping

File size: 2,255 Bytes

1f9ba8e
 
 
 
74be4e1
 
1f9ba8e
74be4e1
 
 
 
 
 
 
1f9ba8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e116aa
1f9ba8e
 
 
 
 
 
 
 
 
 
 
 
26b053d
 
 
 
 
60fd95b
1f9ba8e
 
 
d4eb915
1e116aa
d4eb915
 
1f9ba8e
 
1e116aa
1f9ba8e
fcad81d
1f9ba8e
26b053d
 
1f9ba8e
 
26b053d
1f9ba8e
 
fcad81d

import gradio as gr
import torch
from wenet.cli.model import load_model

import os
from huggingface_hub import login

# Read the Hugging Face access token from the environment and fail fast when
# it is missing or empty, since the Hub login below cannot proceed without it.
api_token = os.getenv('HUGGINGFACE_API_TOKEN')
if not api_token:
    # The variable named here must match the one read above, otherwise the
    # message sends the user off to set a variable the app never looks at.
    raise ValueError("No Hugging Face API token found. Please set the HUGGINGFACE_API_TOKEN environment variable.")

# Authenticate against the Hugging Face Hub; the token is also stored as a
# git credential (add_to_git_credential=True).
login(token=api_token, add_to_git_credential=True)

def process_cat_embs(cat_embs):
    """Convert a comma-separated string of numbers into a 1-D CPU tensor.

    Args:
        cat_embs: Text such as ``"0,1"``; every comma-separated field is
            parsed with ``float``.

    Returns:
        A ``torch.Tensor`` holding the parsed values, placed on the CPU.

    Raises:
        ValueError: If any field cannot be parsed as a float.
    """
    target_device = "cpu"
    values = list(map(float, cat_embs.split(',')))
    return torch.tensor(values).to(target_device)


def download_rev_models():
    """Fetch the Reverb ASR files from the Hugging Face Hub and load the model.

    Downloads ``reverb_asr_v1.jit.zip`` and ``tk.units.txt`` from the
    ``Revai/reverb-asr`` repository and passes the two local paths, in that
    order, to ``load_model``.

    Returns:
        Whatever ``wenet.cli.model.load_model`` returns for the downloaded
        files.
    """
    # Imported lazily so the dependency is only touched when a download is
    # actually requested.
    from huggingface_hub import hf_hub_download

    REPO_ID = "Revai/reverb-asr"

    files = ['reverb_asr_v1.jit.zip', 'tk.units.txt']
    # hf_hub_download returns the local path of each fetched file.
    downloaded_files = [hf_hub_download(repo_id=REPO_ID, filename=f) for f in files]
    model = load_model(downloaded_files[0], downloaded_files[1])
    return model

# Load the model once at import time; recognition() reads this module-level
# instance on every request instead of reloading it.
model = download_rev_models()
    

def recognition(audio, style=0):
    """Transcribe an audio input with the module-level ASR model.

    Args:
        audio: The audio input handed to ``model.transcribe``; ``None`` means
            nothing was provided.
        style: Number used to build the two-element ``(style, 1 - style)``
            vector that is passed to the model as ``cat_embs``.

    Returns:
        The transcribed text with every ``'▁'`` replaced by a space, or an
        error message when no audio was given or the model returned ``None``.
    """
    # Guard clause: nothing to transcribe.
    if audio is None:
        return "Input Error! Please enter one audio!"

    # Serialise the pair to "a,b" text because process_cat_embs parses a
    # comma-separated string.
    style_pair = (style, 1 - style)
    embedding = process_cat_embs(','.join(map(str, style_pair)))

    result = model.transcribe(audio, cat_embs=embedding)
    if result is None:
        return "ERROR! No text output! Please try again!"

    return result['text'].replace('▁', ' ')


# Audio is handed to the callback as a path on disk (type="filepath").
audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
# Slider value is forwarded to recognition() as `style`. The hint uses the same
# terms as the page description (style 1 = verbatim, style 0 = non-verbatim)
# so the two texts do not contradict each other.
style_slider = gr.Slider(0, 1, value=0, step=0.1, label="Transcription Style",
                         info="Adjust the transcription style: 0 (non-verbatim) to 1 (verbatim).")
# Single text box that displays the string returned by recognition().
output_textbox = gr.Textbox(label="Transcription Output")
    
# Title shown at the top of the demo page.
text = "ASR Transcription Opensource Demo-CPU"

# Description shown under the title. Python joins adjacent string literals
# with nothing in between, so every sentence ends with its own punctuation and
# trailing space to keep the rendered text from running together.
description = (
    "Opensource Automatic Speech Recognition in English. "
    "Verbatim Transcript style(1) refers to word-to-word transcription of an audio. "
    "Non Verbatim Transcript style(0) refers to just conserving the message of the original audio."
)



# Wire the callback to its widgets: the audio input and style slider feed
# recognition(), whose return value is shown in the output text box.
iface = gr.Interface(
    recognition,
    [audio_input, style_slider],
    output_textbox,
    theme='default',
    title=text,
    description=description,
)

# Start serving the interface.
iface.launch()