Sanmayi committed on
Commit 6683f91 · verified · 1 Parent(s): 118d988

Update app.py

Files changed (1)
  1. app.py +76 -137
app.py CHANGED
@@ -1,137 +1,76 @@
- 
- import gradio as gr
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- from PyPDF2 import PdfReader
- import docx
- 
- # Load multilingual model
- model_name = "csebuetnlp/mT5_multilingual_XLSum"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
- 
- # Languages supported by mT5_multilingual_XLSum
- supported_languages = [
-     "english", "hindi", "telugu", "french", "german", "spanish", "bengali", "tamil", "marathi", "urdu"
-     # (Add more as needed from the model's card)
- ]
- 
- def read_pdf(file):
-     reader = PdfReader(file)
-     return "".join([page.extract_text() for page in reader.pages])
- 
- def read_docx(file):
-     doc = docx.Document(file)
-     return "\n".join([para.text for para in doc.paragraphs])
- 
- def summarize_file(file, language):
-     if file.name.endswith(".pdf"):
-         text = read_pdf(file)
-     elif file.name.endswith(".docx"):
-         text = read_docx(file)
-     elif file.name.endswith(".txt"):
-         text = file.read().decode("utf-8")
-     else:
-         return "Unsupported file format."
- 
-     if not text.strip():
-         return "The file is empty or unreadable."
- 
-     text = text[:3000]
- 
-     # Prepare input as per mT5 format
-     prefix = f"summarize {language}: "
-     inputs = tokenizer(prefix + text, return_tensors="pt", max_length=1024, truncation=True)
-     summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=30, length_penalty=2.0, num_beams=4)
-     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
- 
-     return summary
- 
- iface = gr.Interface(
-     fn=summarize_file,
-     inputs=[
-         gr.File(file_types=[".pdf", ".docx", ".txt"]),
-         gr.Dropdown(supported_languages, label="Select Language", value="english")
-     ],
-     outputs="text",
-     title="Multilingual AI Document Summarizer",
-     description="Upload a document and get summaries in multiple languages using mT5."
- )
- 
- if __name__ == "__main__":
-     iface.launch(share=True)'''
- 
- 
- import gradio as gr
- from PyPDF2 import PdfReader
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- 
- # Load summarizer model (LaMini-Flan-T5)
- summarizer_tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
- summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
- 
- # Load translators
- translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
- translator_te = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul")
- 
- # Extract text from PDF
- def extract_text_from_pdf(file):
-     reader = PdfReader(file)
-     text = ""
-     for page in reader.pages:
-         text += page.extract_text()
-     return text
- 
- # Summarize based on doc type
- def summarize_text(text, doc_type):
-     prompt = f"Summarize this {doc_type} document clearly:\n{text}\nSummary:"
-     inputs = summarizer_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-     outputs = summarizer_model.generate(**inputs, max_length=300, num_beams=4, early_stopping=True)
-     return summarizer_tokenizer.decode(outputs[0], skip_special_tokens=True)
- 
- # Translate summary
- def translate_summary(summary, lang):
-     if lang == "hindi":
-         return translator_hi(summary)[0]["translation_text"]
-     elif lang == "telugu":
-         return translator_te(summary)[0]["translation_text"]
-     else:
-         return summary  # English or unsupported
- 
- # Main processing logic
- def process(file, lang, doc_type):
-     text = extract_text_from_pdf(file)
-     if not text.strip():
-         return "Error: PDF has no extractable text."
- 
-     summary = summarize_text(text, doc_type)
-     return translate_summary(summary, lang)
- 
- # Gradio UI
- with gr.Blocks() as app:
-     gr.Markdown("## Multilingual AI Document Summarizer")
-     gr.Markdown("Upload a document and get summaries in multiple languages using mT5.")
- 
-     file_input = gr.File(label="Upload PDF")
- 
-     with gr.Row():
-         language_input = gr.Dropdown(
-             label="Select Language",
-             choices=["english", "hindi", "telugu"],
-             value="english"
-         )
-         type_input = gr.Dropdown(
-             label="Select Document Type",
-             choices=["legal", "medical", "general"],
-             value="general"
-         )
- 
-     output = gr.Textbox(label="Summary Output", lines=10)
- 
-     with gr.Row():
-         clear = gr.Button("Clear")
-         submit = gr.Button("Submit")
- 
-     submit.click(fn=process, inputs=[file_input, language_input, type_input], outputs=output)
-     clear.click(lambda: "", inputs=[], outputs=output)
- 
- app.launch()
 
+ 
+ 
+ import gradio as gr
+ from PyPDF2 import PdfReader
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ 
+ # Load summarizer model (LaMini-Flan-T5)
+ summarizer_tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
+ summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
+ 
+ # Load translators
+ translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
+ translator_te = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul")
+ 
+ # Extract text from PDF
+ def extract_text_from_pdf(file):
+     reader = PdfReader(file)
+     text = ""
+     for page in reader.pages:
+         text += page.extract_text()
+     return text
+ 
+ # Summarize based on doc type
+ def summarize_text(text, doc_type):
+     prompt = f"Summarize this {doc_type} document clearly:\n{text}\nSummary:"
+     inputs = summarizer_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+     outputs = summarizer_model.generate(**inputs, max_length=300, num_beams=4, early_stopping=True)
+     return summarizer_tokenizer.decode(outputs[0], skip_special_tokens=True)
+ 
+ # Translate summary
+ def translate_summary(summary, lang):
+     if lang == "hindi":
+         return translator_hi(summary)[0]["translation_text"]
+     elif lang == "telugu":
+         return translator_te(summary)[0]["translation_text"]
+     else:
+         return summary  # English or unsupported
+ 
+ # Main processing logic
+ def process(file, lang, doc_type):
+     text = extract_text_from_pdf(file)
+     if not text.strip():
+         return "Error: PDF has no extractable text."
+ 
+     summary = summarize_text(text, doc_type)
+     return translate_summary(summary, lang)
+ 
+ # Gradio UI
+ with gr.Blocks() as app:
+     gr.Markdown("## Multilingual AI Document Summarizer")
+     gr.Markdown("Upload a document and get summaries in multiple languages using mT5.")
+ 
+     file_input = gr.File(label="Upload PDF")
+ 
+     with gr.Row():
+         language_input = gr.Dropdown(
+             label="Select Language",
+             choices=["english", "hindi", "telugu"],
+             value="english"
+         )
+         type_input = gr.Dropdown(
+             label="Select Document Type",
+             choices=["legal", "medical", "general"],
+             value="general"
+         )
+ 
+     output = gr.Textbox(label="Summary Output", lines=10)
+ 
+     with gr.Row():
+         clear = gr.Button("Clear")
+         submit = gr.Button("Submit")
+ 
+     submit.click(fn=process, inputs=[file_input, language_input, type_input], outputs=output)
+     clear.click(lambda: "", inputs=[], outputs=output)
+ 
+ app.launch()
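
A quick way to sanity-check the committed pipeline locally (not part of this commit): the sketch below mirrors the extract → summarize → translate steps from the new app.py without starting the Gradio server (app.py calls app.launch() at import time, so the steps are repeated rather than imported). It assumes the same packages (transformers, PyPDF2, torch, and typically sentencepiece) are installed, and "sample.pdf" is a hypothetical placeholder path.

# Minimal local smoke test mirroring the committed app.py logic.
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")

# "sample.pdf" is a placeholder; page.extract_text() may return None, hence the `or ""`.
text = "".join(page.extract_text() or "" for page in PdfReader("sample.pdf").pages)

# Build the same prompt as summarize_text() with doc_type="general".
prompt = f"Summarize this general document clearly:\n{text}\nSummary:"
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
summary_ids = model.generate(**inputs, max_length=300, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Translate to Hindi, as translate_summary() does for lang == "hindi".
print(translator_hi(summary)[0]["translation_text"])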