import re
import tempfile
from multiprocessing import Pool, cpu_count

import fitz
import gradio as gr
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration

# Load summarization pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

# Load translation model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")

# Define max chunk length
max_chunk_length = 1024

# Display name -> mBART-50 language code. The tokenizer's ``lang_code_to_id``
# table is keyed by codes such as "ar_AR", not by human-readable names, so a
# display name has to be translated to its code before the lookup.
LANGUAGE_CODES = {
    "Arabic": "ar_AR",
    "Czech": "cs_CZ",
    "German": "de_DE",
    "English": "en_XX",
    "Spanish": "es_XX",
    "Estonian": "et_EE",
    "Finnish": "fi_FI",
    "French": "fr_XX",
    "Gujarati": "gu_IN",
    "Hindi": "hi_IN",
    "Italian": "it_IT",
    "Japanese": "ja_XX",
    "Kazakh": "kk_KZ",
    "Korean": "ko_KR",
    "Lithuanian": "lt_LT",
    "Latvian": "lv_LV",
    "Burmese": "my_MM",
    "Nepali": "ne_NP",
    "Dutch": "nl_XX",
    "Romanian": "ro_RO",
    "Russian": "ru_RU",
    "Sinhala": "si_LK",
    "Turkish": "tr_TR",
    "Vietnamese": "vi_VN",
    "Chinese": "zh_CN",
    "Afrikaans": "af_ZA",
    "Azerbaijani": "az_AZ",
    "Bengali": "bn_IN",
    "Persian": "fa_IR",
    "Hebrew": "he_IL",
    "Croatian": "hr_HR",
    "Indonesian": "id_ID",
    "Georgian": "ka_GE",
    "Khmer": "km_KH",
    "Macedonian": "mk_MK",
    "Malayalam": "ml_IN",
    "Mongolian": "mn_MN",
    "Marathi": "mr_IN",
    "Polish": "pl_PL",
    "Pashto": "ps_AF",
    "Portuguese": "pt_XX",
    "Swedish": "sv_SE",
    "Swahili": "sw_KE",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Thai": "th_TH",
    "Tagalog": "tl_XX",
    "Ukrainian": "uk_UA",
    "Urdu": "ur_PK",
    "Xhosa": "xh_ZA",
    "Galician": "gl_ES",
    "Slovene": "sl_SI",
}

# A sentence boundary is the whitespace that follows a period. Splitting here
# (instead of on every ".") keeps the period attached to its sentence and
# leaves tokens such as "3.14" intact.
_SENTENCE_BOUNDARY = re.compile(r"(?<=\.)\s+")


# Function to chunk text
def chunk_text(text, max_chunk_length):
    """Split *text* into chunks of at most *max_chunk_length* characters.

    Sentences are packed greedily, joined by a single space, so a chunk only
    ends at a sentence boundary unless a single sentence is longer than the
    limit, in which case that sentence is cut into fixed-size pieces.

    Args:
        text: The text to split.
        max_chunk_length: Maximum number of characters per chunk; must be
            positive.

    Returns:
        A list of non-empty chunk strings (empty when *text* is blank).

    Raises:
        ValueError: If *max_chunk_length* is not positive.
    """
    if max_chunk_length <= 0:
        raise ValueError("max_chunk_length must be a positive integer")

    chunks = []
    current_chunk = ""
    for sentence in _SENTENCE_BOUNDARY.split(text):
        sentence = sentence.strip()
        if not sentence:
            continue

        # A sentence that cannot fit in any chunk is cut into fixed-size
        # pieces; whatever was accumulated so far is flushed first so the
        # original order is preserved.
        while len(sentence) > max_chunk_length:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.append(sentence[:max_chunk_length])
            sentence = sentence[max_chunk_length:]

        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= max_chunk_length:
            current_chunk = candidate
        else:
            # ``sentence`` fits on its own, so overflow implies a non-empty
            # ``current_chunk``; no empty chunk can be emitted here.
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks


# Function to summarize and translate a chunk
def summarize_and_translate_chunk(chunk, lang):
    """Summarize one chunk of English text and translate the summary.

    Args:
        chunk: The text to summarize.
        lang: Target language, as a display name from ``LANGUAGE_CODES`` or
            an mBART-50 language code.

    Returns:
        The translated summary, or an empty string for a blank chunk.
    """
    # Nothing to summarize; skip both model calls.
    if not chunk.strip():
        return ""

    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summary_text = summary[0]['summary_text']

    # Translate summary
    return translate_summary(summary_text, lang)


# Function to translate the summary
def translate_summary(summary, lang):
    """Translate *summary* into *lang* with the mBART-50 model.

    Args:
        summary: The text to translate.
        lang: Target language, as a display name from ``LANGUAGE_CODES``
            (e.g. "French") or an mBART-50 language code (e.g. "fr_XX").

    Returns:
        The translated text; chunks translated separately are joined with a
        single space. A blank *summary* yields an empty string.

    Raises:
        KeyError: If *lang* is neither a known display name nor a language
            code known to the tokenizer.
    """
    # Accept both display names and raw codes; an unknown value still raises
    # KeyError from the tokenizer lookup, as before.
    target_token_id = tokenizer.lang_code_to_id[LANGUAGE_CODES.get(lang, lang)]

    if not summary.strip():
        return ""

    # Chunk text if it exceeds maximum length
    if len(summary) > max_chunk_length:
        chunks = chunk_text(summary, max_chunk_length)
    else:
        chunks = [summary]

    # Translate each chunk
    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_token_id,
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        translated_chunks.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

    return " ".join(translated_chunks)


# Function to read PDF and
# summarize and translate chunk by chunk
def summarize_and_translate_pdf(pdf_content, lang):
    """Summarize and translate the text of a PDF, chunk by chunk.

    Args:
        pdf_content: The PDF to process. Raw bytes, a filesystem path, or a
            file-like object exposing a ``name`` path are all accepted.
        lang: Target language, passed through to
            ``summarize_and_translate_chunk``.

    Returns:
        A list with one translated summary per non-blank text chunk, or an
        error message string when no file was given or it cannot be found.
    """
    not_found_message = "File not found. Please make sure the file path is correct."
    if pdf_content is None:
        return not_found_message

    # PyMuPDF raises its own FileNotFoundError class, which the builtin does
    # not catch; fall back to the builtin when the installed version lacks it.
    not_found_errors = (FileNotFoundError, getattr(fitz, "FileNotFoundError", FileNotFoundError))

    try:
        if isinstance(pdf_content, (bytes, bytearray)):
            # Open straight from memory; no temporary file to flush or clean up.
            doc = fitz.open(stream=bytes(pdf_content), filetype="pdf")
        else:
            # NOTE(review): the upload presumably arrives as a temp-file
            # wrapper with a ``name`` path (or a plain path string, depending
            # on the Gradio version) - confirm against the installed Gradio.
            doc = fitz.open(getattr(pdf_content, "name", pdf_content))
    except not_found_errors:
        return not_found_message

    try:
        chunks = []
        for page in doc:
            text = page.get_text()
            for start in range(0, len(text), max_chunk_length):
                piece = text[start:start + max_chunk_length]
                # Pages without extractable text would otherwise feed blank
                # input to the summarizer.
                if piece.strip():
                    chunks.append(piece)
    finally:
        doc.close()

    # Chunks are processed sequentially in this process. A multiprocessing
    # pool would need each worker to hold its own copy of the models, and
    # under the spawn start method each worker re-imports this module.
    return [summarize_and_translate_chunk(chunk, lang) for chunk in chunks]


# Gradio Interface
def summarize_and_translate_interface(pdf_content, lang):
    """Gradio callback: return the translated summary of a PDF as one string.

    Args:
        pdf_content: The uploaded PDF as delivered by the file input.
        lang: The language selected in the dropdown.

    Returns:
        The translated chunk summaries joined by newlines, or the error
        message when the PDF could not be opened.
    """
    result = summarize_and_translate_pdf(pdf_content, lang)
    # An error comes back as a plain string; joining it would interleave a
    # newline between every character.
    if isinstance(result, str):
        return result
    return "\n".join(result)


# Gradio UI
# NOTE(review): ``gr.inputs`` / ``gr.outputs`` are the legacy Gradio component
# namespaces; confirm they exist in the installed Gradio version.
input_pdf = gr.inputs.File(label="Upload a PDF file", type="file")
language = gr.inputs.Dropdown(
    choices=[
        "Arabic", "Czech", "German", "English", "Spanish", "Estonian",
        "Finnish", "French", "Gujarati", "Hindi", "Italian", "Japanese",
        "Kazakh", "Korean", "Lithuanian", "Latvian", "Burmese", "Nepali",
        "Dutch", "Romanian", "Russian", "Sinhala", "Turkish", "Vietnamese",
        "Chinese", "Afrikaans", "Azerbaijani", "Bengali", "Persian", "Hebrew",
        "Croatian", "Indonesian", "Georgian", "Khmer", "Macedonian",
        "Malayalam", "Mongolian", "Marathi", "Polish", "Pashto", "Portuguese",
        "Swedish", "Swahili", "Tamil", "Telugu", "Thai", "Tagalog",
        "Ukrainian", "Urdu", "Xhosa", "Galician", "Slovene",
    ],
    label="Select language for translation",
)
output_text = gr.outputs.Textbox(label="Translated Summary")

demo = gr.Interface(summarize_and_translate_interface, inputs=[input_pdf, language], outputs=output_text)

# Launch only when run as a script, so importing this module does not start
# a server.
if __name__ == "__main__":
    demo.launch()