# CauseWriterPDF / app.py
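# Dependencies (assumed from the imports below): pymupdf, tensorflow,
# tensorflow_hub, scikit-learn, and gradio.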
import re
import urllib.request

import fitz  # PyMuPDF
import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors
import gradio as gr
def download_pdf(url, output_path):
    """Download a PDF from `url` to `output_path`; return True on success."""
    try:
        urllib.request.urlretrieve(url, output_path)
        return True
    except Exception as e:
        print(f"Error downloading PDF: {e}")
        return False
def preprocess(text):
    """Collapse newlines and repeated whitespace into single spaces."""
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract and preprocess the text of each page in the PDF at `path`."""
    try:
        doc = fitz.open(path)
        total_pages = doc.page_count
        if end_page is None:
            end_page = total_pages
        text_list = []
        for i in range(start_page - 1, end_page):
            page = doc.load_page(i)
            text = page.get_text("text")
            text = preprocess(text)
            text_list.append(text)
        doc.close()
        return text_list
    except Exception as e:
        print(f"Error in PDF to text conversion: {e}")
        return None
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split each page's text into ~word_length-word chunks, each tagged
    with the page number it came from."""
    text_tokens = [t.split(' ') for t in texts]
    chunks = []
    for idx, words in enumerate(text_tokens):
        for i in range(0, len(words), word_length):
            chunk = words[i:i + word_length]
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx + start_page}] ' + chunk
            chunks.append(chunk)
    return chunks
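
# Illustrative example (hypothetical input) of the chunk format produced above:
#   text_to_chunks(["first page text ...", "second page text ..."])
#   -> ['[Page no. 1] first page text ...', '[Page no. 2] second page text ...']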
class SemanticSearch:
    """Embeds text chunks with the Universal Sentence Encoder and retrieves
    the chunks nearest to a query via k-nearest neighbors."""

    def __init__(self):
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data):
        self.data = data
        self.embeddings = self.use(data).numpy()
        # Cap n_neighbors at the corpus size so very short documents don't raise.
        self.nn = NearestNeighbors(n_neighbors=min(5, len(data)))
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text):
        if not self.fitted:
            return "Model not fitted yet."
        query_embedding = self.use([text]).numpy()
        neighbors = self.nn.kneighbors(query_embedding, return_distance=False)[0]
        return [self.data[i] for i in neighbors]
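
# A minimal sketch of using SemanticSearch directly, assuming `chunks` is a
# list of strings from text_to_chunks (kept as a comment so the app's flow
# below is unchanged):
#   searcher = SemanticSearch()
#   searcher.fit(chunks)
#   top_chunks = searcher("What is the paper's main claim?")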
recommender = SemanticSearch()

def gui(url, question):
    if not url.strip():
        return "Please provide a valid URL."
    if not download_pdf(url, "temp.pdf"):
        return "Failed to download PDF."
    texts = pdf_to_text("temp.pdf")
    if texts is None:
        return "Failed to extract text from PDF."
    chunks = text_to_chunks(texts)
    recommender.fit(chunks)
    if not question.strip():
        return "Please enter a question."
    results = recommender(question)
    # Join the retrieved chunks so the text output stays readable.
    return '\n\n'.join(results)
iface = gr.Interface(
    fn=gui,
    inputs=["text", "text"],
    outputs="text"
)
iface.launch()
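
# Note: on Hugging Face Spaces, launch() with no arguments is enough; when
# running locally, Gradio also supports iface.launch(share=True) to expose a
# temporary public link.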