File size: 1,185 Bytes
add165b
fe8d4db
 
c891b07
add165b
 
 
fe8d4db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c891b07
fe8d4db
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from huggingface_hub import InferenceClient
from pathlib import Path
import gradio as gr
import os

MODEL_NAME = "meta-llama/Meta-Llama-3-70b-Instruct"

def split_text_into_chunks(text, chunk_size=600):
    """Break *text* into pieces of at most *chunk_size* words each.

    Words are obtained with ``str.split()``, so any run of whitespace
    (including newlines) acts as a single separator, and the words of each
    piece are re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per piece.

    Returns:
        A list of strings in original word order; empty when *text*
        contains no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

def clean_transcript(audio_file, options, prompt, transcript: str):
    """Stream a cleaned-up transcript and save it next to the audio file.

    This is a generator. While the model is producing output it yields
    ``(text_so_far, None)`` after every received token; the last item it
    yields is ``(final_text, download_button)``, where the button points
    at the Markdown file that was written.

    Args:
        audio_file: Path of the source audio file. Its name without the
            suffix becomes the Markdown heading, and the output file is
            the same path with a ``.md`` suffix.
        options: Selected options. When empty, the transcript is used
            verbatim and the model is not called.
        prompt: Instruction text placed before each chunk sent to the
            model.
        transcript: The raw transcript text.

    Yields:
        Tuples of ``(text, download_button_or_None)`` as described above.

    Returns:
        The same tuple as the final yielded item (available as the
        generator's ``StopIteration`` value).
    """
    text = f"### {Path(audio_file).with_suffix('').name}\n\n"
    if not options:
        text += transcript
    else:
        # One client is enough for all chunks; no need to rebuild it each time.
        client = InferenceClient(model=MODEL_NAME, token=os.getenv("HF_TOKEN"))
        for chunk in split_text_into_chunks(transcript):
            messages = [
                {"role": "user", "content": prompt + "\n" + chunk}
            ]
            for event in client.chat_completion(messages, max_tokens=1000, stream=True):
                token = event.choices[0].delta.content
                # Streamed deltas may carry no content (e.g. None); appending
                # None to a str would raise TypeError.
                if not token:
                    continue
                text += token
                yield text, None

    # Write the text to the .md file; explicit UTF-8 so non-ASCII transcripts
    # do not depend on the platform's default encoding.
    md_file = Path(audio_file).with_suffix('.md')
    md_file.write_text(text, encoding="utf-8")

    # In a generator, a bare `return value` is never seen by a consumer that
    # simply iterates, so the final result must be yielded. It is also
    # returned so code driving the generator by hand still receives it.
    result = text, gr.DownloadButton(interactive=True, value=md_file)
    yield result
    return result