CauseWriterPDF

Sleeping

App Files Files Community

CauseWriterPDF / app.py

wholewhale

fixing not found problem

d2fbc5e almost 2 years ago

raw

history blame contribute delete

2.84 kB

	import os
	import re
	import tempfile
	import urllib.parse
	import urllib.request

	import fitz  # PyMuPDF
	import gradio as gr
	import numpy as np
	import tensorflow_hub as hub
	from sklearn.neighbors import NearestNeighbors

	def download_pdf(url, output_path):
	    """Download the resource at *url* and save it to *output_path*.

	    Only ``http`` and ``https`` URLs are accepted. The URL comes from user
	    input, and ``urlretrieve`` would otherwise also honour schemes such as
	    ``file://``, letting a caller read files from the local machine.

	    Args:
	        url: Address of the PDF to fetch.
	        output_path: Local path the downloaded bytes are written to.

	    Returns:
	        True if the download succeeded, False otherwise. Failures are
	        reported on stdout and never raised to the caller.
	    """
	    try:
	        scheme = urllib.parse.urlparse(url).scheme.lower()
	        if scheme not in ("http", "https"):
	            raise ValueError(f"unsupported URL scheme: {scheme!r}")
	        urllib.request.urlretrieve(url, output_path)
	    except Exception as e:
	        # Boundary handler: the caller only needs a success flag, so any
	        # failure (bad URL, network error, disk error) is logged and mapped
	        # to False.
	        print(f"Error downloading PDF: {e}")
	        return False
	    return True

	def preprocess(text):
	    """Collapse every run of whitespace in *text* into a single space.

	    Newlines, tabs and repeated spaces all become one space, and leading
	    and trailing whitespace is removed.

	    Args:
	        text: Raw text to normalise.

	    Returns:
	        The normalised single-line string.
	    """
	    # Raw string: '\s' in a plain literal is an invalid escape sequence.
	    # '\s+' already matches newlines, so no separate replace is needed.
	    return re.sub(r'\s+', ' ', text).strip()

	def pdf_to_text(path, start_page=1, end_page=None):
	    """Extract the text of a range of pages from the PDF at *path*.

	    Args:
	        path: Location of the PDF file on disk.
	        start_page: 1-based number of the first page to read.
	        end_page: 1-based number of the last page to read (inclusive);
	            ``None`` means read through the last page of the document.

	    Returns:
	        A list with one whitespace-normalised string per page, or ``None``
	        if the document could not be opened or read. The error is printed
	        to stdout rather than raised.
	    """
	    try:
	        doc = fitz.open(path)
	        try:
	            if end_page is None:
	                end_page = doc.page_count

	            return [
	                preprocess(doc.load_page(page_index).get_text("text"))
	                for page_index in range(start_page - 1, end_page)
	            ]
	        finally:
	            # Close the document even when a page fails to load, so the
	            # file handle is not leaked on the error path.
	            doc.close()
	    except Exception as e:
	        print(f"Error in PDF to text conversion: {e}")
	        return None

	def text_to_chunks(texts, word_length=150, start_page=1):
	    """Split per-page texts into page-tagged chunks of at most *word_length* words.

	    Each chunk is prefixed with ``[Page no. N] `` where N is the page the
	    words came from. Pages without any words produce no chunk.

	    Args:
	        texts: Sequence of page texts, in page order.
	        word_length: Maximum number of words per chunk; must be positive.
	        start_page: Page number assigned to the first entry of *texts*.

	    Returns:
	        A list of chunk strings in page order.

	    Raises:
	        ValueError: If *word_length* is less than 1.
	    """
	    if word_length < 1:
	        raise ValueError("word_length must be a positive integer")

	    chunks = []
	    for page_no, page_text in enumerate(texts, start=start_page):
	        # split() with no argument drops empty tokens, so blank pages and
	        # repeated whitespace never produce empty "words" or empty chunks.
	        words = page_text.split()
	        for offset in range(0, len(words), word_length):
	            body = ' '.join(words[offset:offset + word_length])
	            chunks.append(f'[Page no. {page_no}] ' + body)

	    return chunks

	class SemanticSearch:
	    """Nearest-neighbour search over text chunks using sentence embeddings.

	    The encoder is loaded from TF Hub on construction. Call ``fit`` with
	    the chunks to index, then call the instance with a query string.
	    """

	    def __init__(self):
	        """Load the Universal Sentence Encoder; nothing is indexed yet."""
	        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
	        self.fitted = False

	    def fit(self, data, n_neighbors=5):
	        """Embed *data* and build the nearest-neighbour index over it.

	        Args:
	            data: Sequence of text chunks to index.
	            n_neighbors: Maximum number of chunks returned per query. It is
	                capped at the number of chunks, because a k-NN query
	                asking for more neighbours than indexed samples raises.
	        """
	        data = list(data)
	        self.data = data
	        if not data:
	            # Nothing to index: stay unfitted instead of failing inside
	            # the encoder or the neighbour search.
	            self.fitted = False
	            return

	        self.embeddings = self.use(data)
	        k = max(1, min(n_neighbors, len(data)))
	        self.nn = NearestNeighbors(n_neighbors=k)
	        self.nn.fit(self.embeddings)
	        self.fitted = True

	    def __call__(self, text):
	        """Return the indexed chunks closest to *text*.

	        Returns:
	            A list of chunk strings ordered by the neighbour search, or
	            the string "Model not fitted yet." when nothing is indexed.
	        """
	        if not self.fitted:
	            return "Model not fitted yet."

	        query_embedding = self.use([text])
	        neighbors = self.nn.kneighbors(query_embedding, return_distance=False)[0]
	        return [self.data[i] for i in neighbors]

	# Module-level search instance used by gui(). Constructing it loads the
	# sentence encoder from TF Hub (see SemanticSearch.__init__).
	recommender = SemanticSearch()

	def gui(url, question):
	    """Answer *question* with the most relevant passages of the PDF at *url*.

	    The PDF is downloaded into a private temporary directory, converted to
	    text, split into page-tagged chunks and indexed; the chunks closest to
	    the question are then returned.

	    Args:
	        url: Address of the PDF to search.
	        question: Free-text query.

	    Returns:
	        A single string: the matching chunks separated by blank lines, or
	        a short message describing why no answer could be produced.
	    """
	    url = url.strip()
	    if not url:
	        return "Please provide a valid URL."
	    # Validate the question before doing any expensive work.
	    if not question.strip():
	        return "Please enter a question."

	    # A per-call temporary directory avoids clashes between overlapping
	    # requests that would otherwise all write to the same fixed file name,
	    # and removes the download afterwards.
	    with tempfile.TemporaryDirectory() as workdir:
	        pdf_path = os.path.join(workdir, "temp.pdf")
	        if not download_pdf(url, pdf_path):
	            return "Failed to download PDF."
	        texts = pdf_to_text(pdf_path)

	    if texts is None:
	        return "Failed to extract text from PDF."

	    chunks = text_to_chunks(texts)
	    if not chunks:
	        # Nothing to index (e.g. a PDF with no extractable text).
	        return "Failed to extract text from PDF."

	    recommender.fit(chunks)
	    results = recommender(question)
	    if isinstance(results, str):
	        # The recommender returns a plain message when it is not fitted.
	        return results
	    # The interface output is text, so return one readable string rather
	    # than a list that would be displayed as its Python repr.
	    return "\n\n".join(results)

	# Web UI: two free-text inputs are passed to gui() as (url, question) and
	# its return value is shown in a text output.
	iface = gr.Interface(fn=gui, inputs=["text", "text"], outputs="text")
	iface.launch()