Spaces:

bhaskartripathi
/

pdfChatterSandbox

Runtime error

App Files Files Community

bhaskartripathi committed on Dec 5, 2023

Commit

1e8edc1

1 Parent(s): efb89c8

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -194

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import urllib.request
 import fitz
 import re
 import numpy as np
@@ -8,208 +8,34 @@ import gradio as gr
 import os
 from sklearn.neighbors import NearestNeighbors
-def download_pdf(url, output_path):
-    """Download the PDF at ``url`` and save it to ``output_path``.
-
-    Args:
-        url: Address of the PDF; only ``http`` and ``https`` are accepted.
-        output_path: Local file path the downloaded bytes are written to.
-
-    Raises:
-        ValueError: If ``url`` does not use the ``http`` or ``https`` scheme.
-    """
-    from urllib.parse import urlparse
-    # The URL is typed in by the end user, and urlretrieve also understands
-    # schemes such as file://, which would copy a local file into the corpus.
-    if urlparse(url).scheme.lower() not in ('http', 'https'):
-        raise ValueError(f'Unsupported URL scheme: {url!r}')
-    urllib.request.urlretrieve(url, output_path)
-def preprocess(text):
-    """Return ``text`` with every run of whitespace collapsed to one space.
-
-    Args:
-        text: The string to normalise.
-
-    Returns:
-        The normalised string; leading and trailing whitespace runs become a
-        single space each rather than being removed.
-    """
-    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
-    # Newlines count as whitespace, so no separate newline replacement is needed.
-    return re.sub(r'\s+', ' ', text)
-def pdf_to_text(path, start_page=1, end_page=None):
-    """Extract whitespace-normalised text from a range of PDF pages.
-
-    Args:
-        path: Path of the PDF file to open.
-        start_page: 1-based number of the first page to read.
-        end_page: 1-based number of the last page to read (inclusive).
-            ``None``, or a value past the end, reads through the last page.
-
-    Returns:
-        A list with one preprocessed text string per page read.
-    """
-    doc = fitz.open(path)
-    try:
-        total_pages = doc.page_count
-        # Clamp so an over-large end_page cannot index past the document.
-        if end_page is None or end_page > total_pages:
-            end_page = total_pages
-        return [
-            preprocess(doc.load_page(i).get_text("text"))
-            for i in range(start_page - 1, end_page)
-        ]
-    finally:
-        # Release the document even when a page fails to load.
-        doc.close()
-def text_to_chunks(texts, word_length=150, start_page=1):
-    """Split per-page texts into labelled chunks of ``word_length`` words.
-
-    Each page is split on single spaces. A trailing chunk shorter than
-    ``word_length`` is carried over to the front of the next page instead of
-    being emitted, except on the final page, where it is emitted as is.
-
-    Args:
-        texts: One text string per page, in page order.
-        word_length: Number of space-separated tokens per chunk.
-        start_page: Page number assigned to ``texts[0]`` in the labels.
-
-    Returns:
-        A list of strings shaped like ``[Page no. N] "chunk text"``.
-    """
-    pages = [page_text.split(' ') for page_text in texts]
-    last_page = len(pages) - 1
-    chunks = []
-    for page_idx, words in enumerate(pages):
-        for offset in range(0, len(words), word_length):
-            piece = words[offset:offset + word_length]
-            # Only the final slice of a page can be short; push it onto the
-            # next page so chunks stay close to word_length tokens.
-            if len(piece) < word_length and page_idx != last_page:
-                pages[page_idx + 1] = piece + pages[page_idx + 1]
-                continue
-            body = ' '.join(piece).strip()
-            chunks.append(f'[Page no. {page_idx + start_page}] "{body}"')
-    return chunks
-class SemanticSearch:
-    """Nearest-neighbour search over sentence-encoder embeddings of text chunks."""
-    def __init__(self):
-        # Loads the Universal Sentence Encoder v4 model from its TF Hub URL.
-        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
-        self.fitted = False
-    def fit(self, data, batch=1000, n_neighbors=5):
-        """Embed ``data`` and build the nearest-neighbour index over it.
-
-        Args:
-            data: Sequence of text chunks to index.
-            batch: Number of chunks embedded per encoder call.
-            n_neighbors: Neighbours returned per query, capped at ``len(data)``.
-
-        Raises:
-            ValueError: If ``data`` is empty.
-        """
-        # Fail with a clear message instead of the np.vstack error an empty
-        # corpus would otherwise trigger.
-        if len(data) == 0:
-            raise ValueError('Cannot fit SemanticSearch on an empty corpus.')
-        self.data = data
-        self.embeddings = self.get_text_embedding(data, batch=batch)
-        n_neighbors = min(n_neighbors, len(self.embeddings))
-        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
-        self.nn.fit(self.embeddings)
-        self.fitted = True
-    def __call__(self, text, return_data=True):
-        """Return the indexed chunks (or their indices) nearest to ``text``.
-
-        Args:
-            text: Query string to embed and look up.
-            return_data: When true return the chunks, otherwise their indices.
-
-        Raises:
-            RuntimeError: If ``fit`` has not completed yet.
-        """
-        if not self.fitted:
-            raise RuntimeError('SemanticSearch.fit() must be called before querying.')
-        inp_emb = self.use([text])
-        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
-        if return_data:
-            return [self.data[i] for i in neighbors]
-        return neighbors
-    def get_text_embedding(self, texts, batch=1000):
-        """Embed ``texts`` in slices of ``batch`` and stack the results row-wise."""
-        embeddings = [
-            self.use(texts[i:i + batch]) for i in range(0, len(texts), batch)
-        ]
-        return np.vstack(embeddings)
-def load_recommender(path, start_page=1):
-    """Index the PDF at ``path`` in the module-level ``recommender``.
-
-    Args:
-        path: Path of the PDF to read.
-        start_page: 1-based first page to read; also the first page label.
-
-    Returns:
-        The status string ``'Corpus Loaded.'``.
-    """
-    # ``recommender`` is only read here, so no ``global`` declaration is needed.
-    page_texts = pdf_to_text(path, start_page=start_page)
-    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
-    return 'Corpus Loaded.'
-def generate_text(openAI_key, prompt, engine="text-davinci-003"):
-    """Request one completion for ``prompt`` and return its text.
-
-    Args:
-        openAI_key: API key assigned to ``openai.api_key`` before the call.
-        prompt: Full prompt text sent to the completion endpoint.
-        engine: Name of the completion engine to use.
-
-    Returns:
-        The ``text`` of the first returned choice.
-    """
-    openai.api_key = openAI_key
-    request = dict(
-        engine=engine,
-        prompt=prompt,
-        max_tokens=512,
-        n=1,
-        stop=None,
-        temperature=0.7,
-    )
-    response = openai.Completion.create(**request)
-    return response.choices[0].text
-def generate_answer(question, openAI_key):
-    """Answer ``question`` from the chunks the recommender returns for it.
-
-    The prompt is the retrieved chunks, then fixed answering instructions,
-    then the query itself.
-
-    Args:
-        question: The user's question.
-        openAI_key: API key forwarded to ``generate_text``.
-
-    Returns:
-        The text returned by ``generate_text`` for the assembled prompt.
-    """
-    topn_chunks = recommender(question)
-    search_results = ''.join(c + '\n\n' for c in topn_chunks)
-    # The instructions used to end with a literal, never-interpolated
-    # "Query: {question}" line followed by a second real one; only the real
-    # query line is kept.
-    instructions = (
-        "Instructions: Compose a comprehensive reply to the query using the search results given. "
-        "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "
-        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
-        "with the same name, create separate answers for each. Only include information found in the results and "
-        "don't add any additional information. Make sure the answer is correct and don't output false content. "
-        "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "
-        "search results which has nothing to do with the question. Only answer what is asked. The "
-        "answer should be short and concise. \n\n"
-    )
-    prompt = 'search results:\n\n' + search_results + instructions + f"Query: {question}\nAnswer:"
-    return generate_text(openAI_key, prompt, "text-davinci-003")
-def question_answer(url, file, question, openAI_key):
-    """Validate the form inputs, index the chosen PDF and answer the question.
-
-    Args:
-        url: URL of a PDF to download, or an empty string.
-        file: Uploaded file object exposing ``.name``, or ``None``.
-        question: The user's question.
-        openAI_key: API key forwarded to ``generate_answer``.
-
-    Returns:
-        The generated answer, or an ``[ERROR]: ...`` message when an input is
-        missing or both a URL and a file are supplied.
-    """
-    if openAI_key.strip() == '':
-        return '[ERROR]: Please enter you Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
-    has_url = url.strip() != ''
-    has_file = file is not None
-    if not has_url and not has_file:
-        return '[ERROR]: Both URL and PDF is empty. Provide atleast one.'
-    if has_url and has_file:
-        return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
-    # Reject an empty question before the download and embedding work,
-    # which used to run first and was then thrown away.
-    if question.strip() == '':
-        return '[ERROR]: Question field is empty'
-    if has_url:
-        download_pdf(url, 'corpus.pdf')
-        load_recommender('corpus.pdf')
-    else:
-        old_file_name = file.name
-        # Drops the 8 characters before the 4-character extension;
-        # presumably a suffix added by the upload widget - verify.
-        file_name = old_file_name[:-12] + old_file_name[-4:]
-        os.rename(old_file_name, file_name)
-        load_recommender(file_name)
-    return generate_answer(question, openAI_key)
-# Module-level search index shared by load_recommender and generate_answer.
-recommender = SemanticSearch()
-title = 'PDF GPT'
-description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
-# Canned questions offered as one-click shortcuts in the UI.
-questions = [
-    "what did the study investigate?",
-    "what are the methods used in this study?",
-    "what are the data intervals used in this study? Give me the start dates and end dates?",
-    "what are the main limitations of this study?",
-    "what are the main shortcomings of this study?",
-    "what are the main findings of the study?",
-    "what are the main results of the study?",
-    "what are the input features used in this study?",
-    "what is the dependent variable in this study?"
-]
-with gr.Blocks() as demo:
-    gr.Markdown(f'<center><h1>{title}</h1></center>')
     gr.Markdown(description)
     with gr.Row():
         with gr.Group():
-            gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
-            openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
-            url = gr.Textbox(label='Enter PDF URL here')
-            gr.Markdown("<center><h4>OR<h4></center>")
-            file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
-            # Components must be created inside the Blocks context to be
-            # rendered; the buttons come first so they sit above the textbox.
-            question_buttons = [gr.Button(q) for q in questions]
-            question = gr.Textbox(label='Enter your question here')
-            # Each shortcut writes its own text into the question box; the
-            # default argument pins the text for that button.
-            for shortcut, q in zip(question_buttons, questions):
-                shortcut.click(lambda q=q: q, inputs=None, outputs=question)
-            btn = gr.Button(value='Submit')
-            btn.style(full_width=True)
         with gr.Group():
-            answer = gr.Textbox(label='The answer to your question is :')
-        btn.click(question_answer, inputs=[url, file, question, openAI_key], outputs=[answer])
 demo.launch()

+import urllib.request
 import fitz
 import re
 import numpy as np
 import os
 from sklearn.neighbors import NearestNeighbors
+title = 'MediDiagnostix AI'
+description = """MediDiagnostix AI allows you to upload medical reports for analysis. Just click a picture of your medical report or upload a pdf report, it will
+ extract, analyze and provide you the medical interpretations of the report, potential diagnoses, and recommended follow-up actions. Furthermore, you can save diagnosis for future reference"""
+def analyze_reports(files, num_reports):
+    """Placeholder handler for the 'Analyze Reports' button.
+
+    Lists the uploaded files so the app can start and respond; no actual
+    report analysis is performed here.
+
+    Args:
+        files: The upload widget's value: ``None``, one item, or a list of
+            items that are either paths or objects exposing ``.name``.
+        num_reports: Value of the 'Number of Reports' field.
+
+    Returns:
+        A multi-line status string for the results textbox.
+    """
+    if not files:
+        return 'No reports uploaded.'
+    if not isinstance(files, (list, tuple)):
+        files = [files]
+    names = [os.path.basename(getattr(f, 'name', str(f))) for f in files]
+    expected = int(num_reports) if num_reports else len(names)
+    lines = [f'Received {len(names)} report(s); {expected} expected.']
+    lines += [f'{i}. {name}' for i, name in enumerate(names, start=1)]
+    lines.append('Automated analysis is not implemented yet.')
+    return '\n'.join(lines)
+# CSS lengths need a unit, hence "1200px".
+with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200px; }""") as demo:
+    gr.Markdown(f'<center><h3>{title}</h3></center>')
     gr.Markdown(description)
     with gr.Row():
         with gr.Group():
+            gr.Markdown('<p style="text-align:center">Enter the number of reports to analyze</p>')
+            num_reports = gr.Number(label='Number of Reports', value=1)
+            with gr.Accordion("Upload Reports"):
+                # Multiple uploads are requested with file_count="multiple".
+                file_upload = gr.File(label='Upload Reports (PDF/Image)', file_types=['.pdf', '.jpg', '.png'], interactive=True, type="file", file_count="multiple")
+            analyze_button = gr.Button(value='Analyze Reports')
         with gr.Group():
+            analysis_results = gr.Textbox(label='Analysis Results', placeholder="Results will appear here after analysis", lines=20)
+    # The event listener's callback keyword is ``fn``.
+    analyze_button.click(
+        fn=analyze_reports,
+        inputs=[file_upload, num_reports],
+        outputs=[analysis_results],
+    )
 demo.launch()