Sanmayi committed on
Commit
118d988
·
verified ·
1 Parent(s): 244d3f6

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +8 -7
  2. app.py +137 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
- title: Summarizer
3
- emoji: 🌍
4
- colorFrom: gray
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.29.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: AI Doc Sum
3
+ emoji: 📄
4
+ colorFrom: indigo
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.14.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
+ # AI Document Summarizer
13
+
14
+ This project summarizes PDF and DOCX files using transformers.
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
+ from PyPDF2 import PdfReader
5
+ import docx
6
+
7
# Checkpoint for multilingual summarization; tokenizer and model are loaded
# once at import time and shared by every request.
model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Language names offered in the UI dropdown.
# (Add more as needed from the model's card.)
supported_languages = [
    "english",
    "hindi",
    "telugu",
    "french",
    "german",
    "spanish",
    "bengali",
    "tamil",
    "marathi",
    "urdu",
]
17
+
18
def read_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Path or binary file object accepted by ``PdfReader``.

    Returns:
        str: The page texts joined with no separator. Pages that yield no
        text contribute an empty string.
    """
    reader = PdfReader(file)
    # A page without a text layer (e.g. a scanned image) may yield nothing;
    # fall back to "" so the join never receives a non-string.
    return "".join(page.extract_text() or "" for page in reader.pages)
21
+
22
def read_docx(file):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        file: Path or file object accepted by ``docx.Document``.

    Returns:
        str: Paragraph texts joined with newline characters.
    """
    paragraphs = docx.Document(file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
25
+
26
def summarize_file(file, language):
    """Summarize an uploaded .pdf, .docx or .txt document.

    Args:
        file: The upload handed over by Gradio. Only its ``name`` attribute
            (the on-disk path) is used, so both file-wrapper objects and
            path-like strings carrying ``.name`` work. ``None`` when nothing
            was uploaded.
        language: Language name inserted into the prompt prefix.

    Returns:
        str: The generated summary, or a short message when the upload is
        missing, has an unsupported extension, or contains no text.
    """
    if file is None:
        return "No file uploaded."

    path = file.name
    # Compare case-insensitively so "REPORT.PDF" is accepted as well.
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        text = read_pdf(path)
    elif lowered.endswith(".docx"):
        text = read_docx(path)
    elif lowered.endswith(".txt"):
        # Read from the path rather than calling file.read(): the upload
        # object is not guaranteed to be an open binary handle.
        with open(path, encoding="utf-8", errors="replace") as handle:
            text = handle.read()
    else:
        return "Unsupported file format."

    if not text.strip():
        return "The file is empty or unreadable."

    # Cap the input size before tokenization to bound latency.
    text = text[:3000]

    # NOTE(review): confirm against the model card that this checkpoint
    # expects a "summarize <language>: " prefix.
    prefix = f"summarize {language}: "
    inputs = tokenizer(
        prefix + text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    # Unpack the whole encoding so the attention mask reaches generate().
    summary_ids = model.generate(
        **inputs,
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
48
+
49
# Input widgets: the document upload and the target-language selector.
upload_box = gr.File(file_types=[".pdf", ".docx", ".txt"])
language_picker = gr.Dropdown(
    supported_languages,
    label="Select Language",
    value="english",
)

# Single-function interface wiring the two inputs to summarize_file.
iface = gr.Interface(
    fn=summarize_file,
    inputs=[upload_box, language_picker],
    outputs="text",
    title="Multilingual AI Document Summarizer",
    description="Upload a document and get summaries in multiple languages using mT5.",
)
59
+
60
+ if __name__ == "__main__":
61
+ iface.launch(share=True)'''
62
+
63
+
64
+ import gradio as gr
65
+ from PyPDF2 import PdfReader
66
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
67
+
68
# Summarizer: LaMini-Flan-T5 tokenizer and seq2seq model, loaded once at
# import time from the same checkpoint.
_SUMMARIZER_CHECKPOINT = "MBZUAI/LaMini-Flan-T5-248M"
summarizer_tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT)

# Translators: English -> Hindi, and an English -> multilingual checkpoint
# used for Telugu.
# NOTE(review): a multi-target checkpoint presumably needs the target
# language marked in its input; verify against the model card.
translator_hi = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-hi")
translator_te = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-mul")
75
+
76
+ # Extract text from PDF
77
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Path or binary file object accepted by ``PdfReader``.

    Returns:
        str: The page texts joined with no separator. Pages that yield no
        text contribute an empty string, so a PDF with no extractable text
        produces "".
    """
    reader = PdfReader(file)
    # A page without a text layer (e.g. a scanned image) may yield nothing;
    # fall back to "" instead of concatenating a non-string. A single join
    # also avoids rebuilding the string once per page.
    return "".join(page.extract_text() or "" for page in reader.pages)
83
+
84
+ # Summarize based on doc type
85
def summarize_text(text, doc_type):
    """Generate a summary of ``text`` with the module-level summarizer.

    Args:
        text: Document text to summarize.
        doc_type: Document category interpolated into the prompt.

    Returns:
        str: The decoded first generated sequence, special tokens removed.
    """
    prompt = f"Summarize this {doc_type} document clearly:\n{text}\nSummary:"

    # The prompt is truncated to at most 1024 tokens before generation.
    encoded = summarizer_tokenizer(
        prompt,
        max_length=1024,
        truncation=True,
        return_tensors="pt",
    )

    generation_options = {
        "max_length": 300,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = summarizer_model.generate(**encoded, **generation_options)

    best_sequence = generated[0]
    return summarizer_tokenizer.decode(best_sequence, skip_special_tokens=True)
90
+
91
+ # Translate summary
92
def translate_summary(summary, lang):
    """Translate an English summary into the requested language.

    Args:
        summary: Summary text in English.
        lang: Target language name; "hindi" and "telugu" are translated.

    Returns:
        str: The translated text, or ``summary`` unchanged for English and
        any other language.
    """
    if lang == "hindi":
        return translator_hi(summary)[0]["translation_text"]
    if lang == "telugu":
        # translator_te is a multi-target model: the target language must be
        # selected with a leading ">>tel<<" token, otherwise the output
        # language is not pinned to Telugu.
        return translator_te(">>tel<< " + summary)[0]["translation_text"]
    return summary  # English or unsupported
99
+
100
+ # Main processing logic
101
def process(file, lang, doc_type):
    """Run the full pipeline: extract PDF text, summarize, then translate.

    Args:
        file: Uploaded PDF as passed by the Gradio file component, or
            ``None`` when nothing was uploaded.
        lang: Target language name for the final summary.
        doc_type: Document category forwarded to the summarization prompt.

    Returns:
        str: The (possibly translated) summary, or an "Error: ..." message
        when no file was provided or the PDF has no extractable text.
    """
    # Submit can be pressed before anything is uploaded; report that instead
    # of passing None to the PDF reader.
    if file is None:
        return "Error: Please upload a PDF file."

    text = extract_text_from_pdf(file)
    if not text.strip():
        return "Error: PDF has no extractable text."

    summary = summarize_text(text, doc_type)
    return translate_summary(summary, lang)
108
+
109
+ # Gradio UI
110
# Blocks layout: upload + option dropdowns on top, summary textbox below,
# then the Clear / Submit buttons.
with gr.Blocks() as app:
    gr.Markdown("## Multilingual AI Document Summarizer")
    # The description names what this UI actually offers (PDF input and the
    # three languages in the dropdown below).
    gr.Markdown("Upload a PDF and get a summary in English, Hindi, or Telugu.")

    # Only PDFs are parsed by process(), so restrict the picker accordingly.
    file_input = gr.File(label="Upload PDF", file_types=[".pdf"])

    with gr.Row():
        language_input = gr.Dropdown(
            label="Select Language",
            choices=["english", "hindi", "telugu"],
            value="english",
        )
        type_input = gr.Dropdown(
            label="Select Document Type",
            choices=["legal", "medical", "general"],
            value="general",
        )

    output = gr.Textbox(label="Summary Output", lines=10)

    with gr.Row():
        clear = gr.Button("Clear")
        submit = gr.Button("Submit")

    # Submit runs the pipeline; Clear only empties the output textbox.
    submit.click(
        fn=process,
        inputs=[file_input, language_input, type_input],
        outputs=output,
    )
    clear.click(lambda: "", inputs=[], outputs=output)

app.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ python-docx
5
+ PyPDF2
6
+ sentencepiece