Warco-B committed on
Commit
4e4f095
1 Parent(s): de1a412

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -0
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from io import BytesIO
3
+ import fitz
4
+
5
+ import openai
6
+
7
+
8
class TranslationAgent:
    """English-to-Italian translator backed by the OpenAI chat completion API.

    The agent keeps a short rolling message history in ``self.memory``: the
    system prompt first, followed by user/assistant messages added by
    ``translate_chunk`` and trimmed by ``fade_memory``.
    """

    def __init__(self, openai_key):
        """Seed the history with the system prompt and register the API key.

        Args:
            openai_key: Value assigned to the module-level ``openai.api_key``.
        """
        instructions = (
            "You are a translator from english to italian.\n"
            " The only thing you do is to translate.\n"
            " You don't write anything other then the translation of the text you get.\n"
            " The user will only provide the text without asking anything, but what he wants is the translation.\n"
            " Never return the translation of a previously translated part!\n "
            "The text you will need to translate will often include none sense stuff because it is coming from a text extraction of a pdf file including images and table.\n"
            " Do your best to translate also this messy parts."
        )
        self.memory = [{"role": "system", "content": instructions}]

        # NOTE: this sets the key on the openai module itself, not per instance.
        openai.api_key = openai_key

    def fade_memory(self):
        """Forget the oldest exchange once the history holds five or more messages.

        Index 0 (the system prompt) is always kept; the two entries right
        after it are removed.
        """
        if len(self.memory) < 5:
            return
        self.memory[1:3] = []

    def translate_chunk(self, chunk):
        """Send ``chunk`` to the model and return the reply text.

        The user message and the assistant reply are both appended to the
        history, after which the history is trimmed with ``fade_memory``.

        Args:
            chunk: Text to translate.

        Returns:
            The content of the first choice in the API response.
        """
        user_turn = {"role": "user", "content": chunk}
        self.memory.append(user_turn)

        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=self.memory,
        )
        translation = completion["choices"][0]["message"]["content"]

        self.memory.append({"role": "assistant", "content": translation})
        self.fade_memory()
        return translation
37
+
38
+
39
def extract_text_from_pdf(pdf, start, stop):
    """Return the concatenated text of pages ``start`` through ``stop``.

    Page positions are the zero-based indices produced by enumerating the
    document, and both bounds are inclusive.

    Args:
        pdf: Bytes-like PDF content; it is wrapped in ``BytesIO`` directly,
            so no ``.read()`` call is needed.
        start: Index of the first page to include.
        stop: Index of the last page to include.

    Returns:
        The text of the selected pages joined without any separator.
    """
    collected = []
    with fitz.open(stream=BytesIO(pdf), filetype='pdf') as document:
        for index, page in enumerate(document):
            if not (start <= index):
                # Still before the requested range.
                continue
            if not (index <= stop):
                # Past the last requested page; nothing more to read.
                break
            collected.append(page.get_text())
    return "".join(collected)
49
+
50
+
51
def split_text(text, chunk_size=100, max_chunk_size=None):
    """Split ``text`` into whitespace-normalised chunks of whole words.

    A chunk is closed at the first word ending in ``'.'`` once it holds at
    least ``chunk_size`` words. Because extracted PDF text (tables, captions,
    noise) may contain long stretches without any period, a chunk is also
    closed unconditionally when it reaches ``max_chunk_size`` words, so no
    chunk can grow without bound.

    Args:
        text: The text to split; words are separated on any whitespace.
        chunk_size: Minimum number of words before a sentence end closes the
            chunk.
        max_chunk_size: Hard upper bound on the words per chunk. ``None``
            (the default) uses five times ``chunk_size`` (at least 5).

    Returns:
        A list of chunks, each a single-space-joined run of words. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is given and is smaller than 1.
    """
    if max_chunk_size is None:
        max_chunk_size = max(chunk_size, 1) * 5
    elif max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")

    chunks = []
    current_chunk_words = []

    for word in text.split():
        current_chunk_words.append(word)
        word_count = len(current_chunk_words)
        at_sentence_end = word.endswith('.') and word_count >= chunk_size
        if at_sentence_end or word_count >= max_chunk_size:
            chunks.append(' '.join(current_chunk_words))
            current_chunk_words = []

    # Add the last chunk if any words remain.
    if current_chunk_words:
        chunks.append(' '.join(current_chunk_words))

    return chunks
67
+
68
+
69
def translate_pdf(openai_key, pdf, start, stop):
    """Translate a page range of a PDF and return the text plus a status.

    The text of the selected pages is extracted, split into chunks, and each
    chunk is translated in order by a single ``TranslationAgent``. The result
    is also written to ``translated.txt`` in the current working directory.

    Args:
        openai_key: API key passed to ``TranslationAgent``.
        pdf: Bytes-like PDF content, or ``None`` when no file was supplied.
        start: Index of the first page to translate, as interpreted by
            ``extract_text_from_pdf``. ``None`` falls back to ``0``.
        stop: Index of the last page to translate (inclusive). ``None``
            falls back to no upper bound.

    Returns:
        A ``(translated_text, status_message)`` tuple. When ``pdf`` is
        ``None`` the text is empty and the status explains that no file was
        provided; nothing is translated or written in that case.
    """
    # Without a file there is nothing to extract; report it instead of
    # failing later on an undefined chunk list.
    if pdf is None:
        return "", "No PDF file provided"

    # Missing bounds would otherwise raise TypeError in the page comparisons.
    if start is None:
        start = 0
    if stop is None:
        stop = float("inf")

    translator = TranslationAgent(openai_key)

    # extract text
    text = extract_text_from_pdf(pdf, start=start, stop=stop)
    chunks = split_text(text)

    translated_chunks = []
    for chunk in chunks:
        translated_chunk = translator.translate_chunk(chunk)
        # Trailing space plus the ' ' join below is kept so the returned
        # text matches the spacing of earlier output.
        translated_chunks.append(translated_chunk + " ")

    translated_text = ' '.join(translated_chunks)

    # Explicit UTF-8: Italian output contains accented characters that a
    # platform-default encoding may not be able to represent.
    with open('translated.txt', 'w', encoding='utf-8') as f:
        f.write(translated_text)

    return translated_text, "Translation Successful"
87
+
88
+
89
+
90
# Input widgets, in the order translate_pdf expects its arguments:
# API key, PDF content, first page, last page.
input_components = [
    gr.inputs.Textbox(
        lines=1,
        label="OpenAI API key",
        placeholder="Enter your OpenAI API key here",
    ),
    gr.inputs.File(type="binary", label="PDF file"),
    gr.inputs.Number(label="Starting Page"),
    gr.inputs.Number(label="Final Page"),
]

# Two text outputs: the translated text and the status message.
iface = gr.Interface(
    title="Pdf Translator English -> Italian",
    fn=translate_pdf,
    inputs=input_components,
    outputs=["text", "text"],
)

# Start the web UI, requesting a public share link.
iface.launch(share=True)