Spaces:

priyanshu23456
/

pdfassistant

Sleeping

App Files Files Community

priyanshu23456 committed on 15 days ago

Commit

d8a5f00

verified ·

1 Parent(s): 4ab0eb9

Update app.py

Browse files

Files changed (1) hide show

app.py +175 -326

app.py CHANGED Viewed

@@ -2,30 +2,32 @@ from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
 import os
-import torch
-import fitz  # PyMuPDF
-import pytesseract
-from pdf2image import convert_from_path
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
-from sentence_transformers import SentenceTransformer
-import faiss
-import numpy as np
 import tempfile
-from PIL import Image
-import logging
 # Set up logging
 # Root logger at INFO; every function in this file logs through `logger`.
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Fix caching issue on Hugging Face Spaces
 # All three cache-related variables are pointed at /tmp.
 # NOTE(review): these assignments come after the library imports at the top
 # of the file; if the libraries read these variables at import time the
 # values may not take effect - confirm.
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["HF_HOME"] = "/tmp"
 os.environ["XDG_CACHE_HOME"] = "/tmp"
 app = Flask(__name__)
-CORS(app)  # Enable CORS for all routes
 # Upload directory is created at import time; exist_ok avoids failing when
 # it already exists.
 UPLOAD_FOLDER = "/tmp/uploads"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@@ -34,334 +36,192 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 logger.info(f"Using device: {device}")
 # Global model variables
 # All four start as None; initialize_models() assigns them, and the
 # answering helpers create some of them lazily when they are still None.
-embedder = None
-qa_pipeline = None
-tokenizer = None
-model = None
-# Initialize models once on startup
 def initialize_models():
     # Loads the sentence embedder, the extractive QA pipeline and the causal
     # language model into the module globals. Any failure is logged and then
     # re-raised to the caller.
-    global embedder, qa_pipeline, tokenizer, model
     try:
-        logger.info("Loading SentenceTransformer model...")
-        embedder = SentenceTransformer("all-MiniLM-L6-v2")
-        logger.info("Loading QA pipeline...")
-        qa_pipeline = pipeline(
-            "question-answering",
-            model="distilbert-base-cased-distilled-squad",
-            tokenizer="distilbert-base-cased",
-            device=-1  # Force CPU
         )
-        logger.info("Loading language model...")
-        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.float16,  # Use float16 for lower memory on CPU
-            device_map="cpu",  # Explicitly set to CPU
-            low_cpu_mem_usage=True  # Optimize memory loading
         )
         # Fall back to the EOS token for padding when the tokenizer defines
         # no pad token.
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-            model.config.pad_token_id = model.config.eos_token_id
         logger.info("Models initialized successfully")
     except Exception as e:
         logger.error(f"Error initializing models: {str(e)}")
-        raise
-# Generation-based answering
-def answer_with_generation(index, embeddings, chunks, question):
     # Retrieves the 3 chunks nearest to the question from `index`, builds a
     # chat-style prompt and generates an answer with the global causal LM.
     # Returns the answer text, or a fixed apology string if anything raises.
     # The `embeddings` parameter is not read anywhere in this body.
     # NOTE(review): another `def answer_with_generation` appears further down
     # the removed side of this diff; if both lived in the same module, the
     # later definition is the one bound at call time.
     try:
-        logger.info(f"Answering with generation model: '{question}'")
-        global tokenizer, model
-        if tokenizer is None or model is None:
-            logger.info("Generation models not initialized, creating now...")
-            model_name = "Qwen/Qwen2.5-1.5B-Instruct"
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map="cpu",
-                low_cpu_mem_usage=True
-            )
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
-                model.config.pad_token_id = model.config.eos_token_id
         # `embedder` is used below without the None check applied to
         # tokenizer/model above.
-        # Get embeddings for question
-        q_embedding = embedder.encode([question])
-        # Find relevant chunks
-        _, top_k_indices = index.search(q_embedding, k=3)
-        relevant_chunks = [chunks[i] for i in top_k_indices[0]]
-        context = " ".join(relevant_chunks)
-        # Limit context size
-        if len(context) > 2000:
-            context = context[:2000]
-        # Create prompt
-        prompt = f"""<|im_start|>system
-You are a helpful assistant answering questions based on provided PDF content. Use the information below to give a clear, concise, and accurate answer. Avoid speculation and focus on the context.
-<|im_end|>
-<|im_start|>user
-**Context**: {context}
-**Question**: {question}
-**Instruction**: Provide a detailed and accurate answer based on the context. If the context doesn't contain enough information, say so clearly. <|im_end|>"""
-        # Handle inputs
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-        # Move inputs to CPU
-        inputs = {k: v.to('cpu') for k, v in inputs.items()}
-        # Generate answer
-        output = model.generate(
-            **inputs,
-            max_new_tokens=300,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            num_beams=2,
-            no_repeat_ngram_size=2
         )
         # NOTE(review): decode() runs with skip_special_tokens=True, so the
         # "<|im_end|>" marker may already be stripped before the check
         # below - confirm which branch actually fires.
-        # Decode and format answer
-        answer = tokenizer.decode(output[0], skip_special_tokens=True)
-        if "<|im_end|>" in answer:
-            answer = answer.split("<|im_end|>")[1].strip()
-        elif "Instruction" in answer:
-            answer = answer.split("Instruction")[1].strip()
-        logger.info(f"Generation answer: '{answer[:50]}...' (length: {len(answer)})")
-        return answer.strip()
-    except Exception as e:
-        logger.error(f"Generation error: {str(e)}")
-        return "I couldn't generate a good answer based on the PDF content."
-# Cleanup function for temporary files
-def cleanup_temp_files(filepath):
     # Best-effort removal of an uploaded file: a missing path is ignored and
     # any exception is logged as a warning instead of being propagated.
-    try:
-        if os.path.exists(filepath):
-            os.remove(filepath)
-            logger.info(f"Removed temporary file: {filepath}")
-    except Exception as e:
-        logger.warning(f"Failed to clean up file {filepath}: {str(e)}")
-# Improved OCR function
-def ocr_pdf(pdf_path):
     # Rasterizes every page of the PDF at 300 DPI and runs Tesseract on each
     # page image, concatenating the page texts in order.
     # Returns the combined text, or "" if any step raises (error is logged).
-    try:
-        logger.info(f"Starting OCR for {pdf_path}")
-        # Use a higher DPI for better quality
-        images = convert_from_path(
-            pdf_path,
-            dpi=300,  # Higher DPI for better quality
-            grayscale=False,  # Color might help with some PDFs
-            thread_count=2,  # Use multiple threads
-            use_pdftocairo=True  # pdftocairo often gives better results
         )
-        text = ""
-        for i, img in enumerate(images):
-            logger.info(f"Processing page {i+1} of {len(images)}")
-            # Preprocess the image for better OCR results
-            preprocessed = preprocess_image_for_ocr(img)
-            # Use tesseract with more options
-            page_text = pytesseract.image_to_string(
-                preprocessed,
-                config='--psm 1 --oem 3 -l eng'  # Page segmentation mode 1 (auto), OCR Engine mode 3 (default)
-            )
-            text += page_text
-        logger.info(f"OCR completed with {len(text)} characters extracted")
-        return text
     except Exception as e:
-        logger.error(f"OCR error: {str(e)}")
-        return ""
-# Image preprocessing function for better OCR
-def preprocess_image_for_ocr(img):
     # Returns `img` converted to mode 'L'; no other preprocessing is applied
     # (the steps listed below are ideas only, not implemented).
-    # Convert to grayscale
-    gray = img.convert('L')
-    # Optional: You could add more preprocessing here like:
-    # - Thresholding
-    # - Noise removal
-    # - Contrast enhancement
-    return gray
-# Improved extract_text function with better text detection
-def extract_text(pdf_path):
     # Extracts the embedded text of every page; when the result looks too
     # thin (fewer than 20 distinct words longer than 2 characters, or fewer
     # than 100 non-blank characters) it also runs OCR and keeps whichever
     # text is longer. Returns "" if anything raises (error is logged).
     # `doc` is not explicitly closed anywhere in this function.
     try:
-        logger.info(f"Extracting text from {pdf_path}")
-        doc = fitz.open(pdf_path)
-        text = ""
-        for page_num, page in enumerate(doc):
-            page_text = page.get_text()
-            text += page_text
-            logger.info(f"Extracted {len(page_text)} characters from page {page_num+1}")
-        # Check if the text is meaningful (more sophisticated check)
-        words = text.split()
-        unique_words = set(word.lower() for word in words if len(word) > 2)
-        logger.info(f"PDF text extraction: {len(text)} chars, {len(words)} words, {len(unique_words)} unique words")
-        # If we don't have enough meaningful text, try OCR
-        if len(unique_words) < 20 or len(text.strip()) < 100:
-            logger.info("Text extraction yielded insufficient results, trying OCR...")
-            ocr_text = ocr_pdf(pdf_path)
-            # If OCR gave us more text, use it
-            if len(ocr_text.strip()) > len(text.strip()):
-                logger.info(f"Using OCR result: {len(ocr_text)} chars (better than {len(text)} chars)")
-                text = ocr_text
-        return text
     except Exception as e:
-        logger.error(f"Text extraction error: {str(e)}")
-        return ""
-# Split into chunks
-def split_into_chunks(text, max_tokens=300, overlap=50):
     # Splits `text` on '.' and greedily packs the sentences into chunks.
     # Despite its name, max_tokens is compared against len() of strings,
     # i.e. a character count; `overlap` is the number of trailing
     # whitespace-separated words carried into the next chunk.
     # Returns the list of chunk strings.
-    logger.info(f"Splitting text into chunks (max_tokens={max_tokens}, overlap={overlap})")
-    sentences = text.split('.')
-    chunks, current = [], ''
-    for sentence in sentences:
-        sentence = sentence.strip() + '.'
-        if len(current) + len(sentence) < max_tokens:
-            current += sentence
-        else:
             # `current` can still be '' here (first sentence already at or
             # over the limit), in which case an empty chunk is appended.
-            chunks.append(current.strip())
-            words = current.split()
-            if len(words) > overlap:
-                current = ' '.join(words[-overlap:]) + ' ' + sentence
-            else:
-                current = sentence
-    if current:
-        chunks.append(current.strip())
-    logger.info(f"Split text into {len(chunks)} chunks")
-    return chunks
-# Setup FAISS
-def setup_faiss(chunks):
     # Encodes `chunks` with the global embedder (created here if still None)
     # and adds the vectors to a new IndexFlatL2.
     # Returns (index, embeddings, chunks); errors are logged and re-raised.
-    try:
-        logger.info("Setting up FAISS index")
-        global embedder
-        if embedder is None:
-            embedder = SentenceTransformer("all-MiniLM-L6-v2")
-        embeddings = embedder.encode(chunks)
-        dim = embeddings.shape[1]
-        index = faiss.IndexFlatL2(dim)
-        index.add(embeddings)
-        logger.info(f"FAISS index created with {len(chunks)} chunks and dimension {dim}")
-        return index, embeddings, chunks
-    except Exception as e:
-        logger.error(f"FAISS setup error: {str(e)}")
-        raise
-# QA pipeline
-def answer_with_qa_pipeline(chunks, question):
     # Runs the extractive QA pipeline (created here if still None) over the
     # first five chunks joined together; no similarity search is involved.
     # Returns the pipeline's "answer" string, or "" if anything raises.
-    try:
-        logger.info(f"Answering with QA pipeline: '{question}'")
-        global qa_pipeline
-        if qa_pipeline is None:
-            logger.info("QA pipeline not initialized, creating now...")
-            qa_pipeline = pipeline(
-                "question-answering",
-                model="distilbert-base-cased-distilled-squad",
-                tokenizer="distilbert-base-cased",
-                device=0 if device == "cuda" else -1
-            )
-        # Limit context size to avoid token length issues
-        context = " ".join(chunks[:5])
         # The cap below is applied to characters of the joined string.
-        if len(context) > 5000:  # Approx token limit
-            context = context[:5000]
-        result = qa_pipeline(question=question, context=context)
-        logger.info(f"QA pipeline answer: '{result['answer']}' (score: {result['score']})")
-        return result["answer"]
-    except Exception as e:
-        logger.error(f"QA pipeline error: {str(e)}")
-        return ""
-# Generation-based answering
-def answer_with_generation(index, embeddings, chunks, question):
     # Second definition of this name on the removed side of the diff.
     # Retrieves the 3 chunks nearest to the question, builds a plain-text
     # prompt and generates an answer; when tokenizer/model are still None it
     # loads distilgpt2. Returns the text after "Detailed answer:", or a fixed
     # apology string if anything raises.
     # The `embeddings` parameter is not read anywhere in this body.
     try:
-        logger.info(f"Answering with generation model: '{question}'")
-        global tokenizer, model
-        if tokenizer is None or model is None:
-            logger.info("Generation models not initialized, creating now...")
-            tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-            model = AutoModelForCausalLM.from_pretrained(
-                "distilgpt2",
-                device_map="auto",
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-            )
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
-                model.config.pad_token_id = model.config.eos_token_id
         # `embedder` is used below without the None check applied to
         # tokenizer/model above.
-        # Get embeddings for question
-        q_embedding = embedder.encode([question])
-        # Find relevant chunks
-        _, top_k_indices = index.search(q_embedding, k=3)
-        relevant_chunks = [chunks[i] for i in top_k_indices[0]]
-        context = " ".join(relevant_chunks)
-        # Limit context size to avoid token length issues
-        if len(context) > 4000:
-            context = context[:4000]
-        # Create prompt
-        prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
         # The prompt is truncated to 512 tokens by the tokenizer call below.
         # NOTE(review): if truncation drops the end of the prompt, the
         # "Detailed answer:" marker searched for after decoding may be
         # missing - confirm.
-        # Handle inputs
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-        # Move inputs to the right device if needed
-        if torch.cuda.is_available():
-            inputs = {k: v.to('cuda') for k, v in inputs.items()}
-        # Generate answer
-        output = model.generate(
-            **inputs,
-            max_new_tokens=300,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            num_beams=3,
-            no_repeat_ngram_size=2
-        )
-        # Decode and format answer
-        answer = tokenizer.decode(output[0], skip_special_tokens=True)
-        if "Detailed answer:" in answer:
-            answer = answer.split("Detailed answer:")[-1].strip()
-        logger.info(f"Generation answer: '{answer[:50]}...' (length: {len(answer)})")
-        return answer.strip()
     except Exception as e:
-        logger.error(f"Generation error: {str(e)}")
-        return "I couldn't generate a good answer based on the PDF content."
-# API route
 @app.route('/')
 def home():
     # Root endpoint: returns a fixed JSON status message.
-    return jsonify({"message": "PDF QA API is running!"})
 @app.route('/ask', methods=['POST'])
 def ask():
@@ -379,46 +239,35 @@ def ask():
         logger.info(f"Processing file: {filename}, Question: '{question}'")
-        # Process PDF and generate answer
-        text = extract_text(filepath)
-        if not text.strip():
             return jsonify({"error": "Could not extract text from the PDF"}), 400
-        chunks = split_into_chunks(text)
-        if not chunks:
-            return jsonify({"error": "PDF content couldn't be processed"}), 400
-        try:
-            answer = answer_with_qa_pipeline(chunks, question)
-        except Exception as e:
-            logger.warning(f"QA pipeline failed: {str(e)}")
-            answer = ""
-        # If QA pipeline didn't give a good answer, try generation
-        if not answer or len(answer.strip()) < 20:
-            try:
-                logger.info("QA pipeline answer insufficient, trying generation...")
-                index, embeddings, chunks = setup_faiss(chunks)
-                answer = answer_with_generation(index, embeddings, chunks, question)
-            except Exception as e:
-                logger.error(f"Generation fallback failed: {str(e)}")
-                return jsonify({"error": "Failed to generate answer from PDF content"}), 500
         return jsonify({"answer": answer})
     except Exception as e:
         logger.error(f"Error processing request: {str(e)}")
-        return jsonify({"error": f"An error occurred processing your request: {str(e)}"}), 500
     finally:
-        # Always clean up, even if errors occur
         if filepath:
             cleanup_temp_files(filepath)
 if __name__ == "__main__":
     # Script entry point: load the models, then serve on 0.0.0.0:7860.
     # initialize_models() re-raises on failure, so a load error ends up in
     # the except branch below and the server is never started.
     try:
-        # Initialize models at startup
-        initialize_models()
-        logger.info("Starting Flask application")
-        app.run(host="0.0.0.0", port=7860)
     except Exception as e:
         logger.critical(f"Failed to start application: {str(e)}")

 from werkzeug.utils import secure_filename
 from flask_cors import CORS
 import os
+import logging
+from typing import List
 import tempfile
+# LangChain imports
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.llms import HuggingFacePipeline
+from langchain.chains import RetrievalQA
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import torch
 # Set up logging
 # Root logger at INFO; every function in this file logs through `logger`.
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Optimize for low resource environments
 # All three cache-related variables are pointed at /tmp.
 # NOTE(review): these assignments come after the transformers / langchain
 # imports above; if those libraries read the variables at import time the
 # values may not take effect - confirm.
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["HF_HOME"] = "/tmp"
 os.environ["XDG_CACHE_HOME"] = "/tmp"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 app = Flask(__name__)
 # CORS is applied to the whole app with default settings.
+CORS(app)
 # Upload directory is created at import time; exist_ok avoids failing when
 # it already exists.
 UPLOAD_FOLDER = "/tmp/uploads"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 logger.info(f"Using device: {device}")
 # Global model variables
 # Both start as None and are assigned by initialize_models();
 # create_retriever() and answer_question() read them without a None check.
+embeddings_model = None
+llm_chain = None
 def initialize_models():
+    """Initialize lightweight models optimized for resource constraints"""
     # Populates the module globals `embeddings_model` and `llm_chain`.
     # Returns True on success and False if any step raises; the error is
     # logged here and not re-raised.
     # `device` is a module-level name defined outside the lines shown here.
+    global embeddings_model, llm_chain
     try:
+        logger.info("Initializing lightweight models...")
+        # Use lightweight embeddings model
+        logger.info("Loading embeddings model (all-MiniLM-L6-v2)...")
+        embeddings_model = HuggingFaceEmbeddings(
+            model_name="all-MiniLM-L6-v2",
+            model_kwargs={
+                "device": device,
+                "trust_remote_code": True
+            },
+            encode_kwargs={
+                "normalize_embeddings": True,
+                "batch_size": 8
+            }
+        )
+        # Use a very lightweight LLM (phi-2 or similar)
+        logger.info("Loading lightweight LLM (phi-2)...")
+        model_name = "microsoft/phi-2"
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            cache_dir="/tmp"
         )
         # float16 + device_map="auto" on CUDA, float32 on plain CPU otherwise.
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            device_map="auto" if device == "cuda" else "cpu",
+            trust_remote_code=True,
+            cache_dir="/tmp",
+            low_cpu_mem_usage=True,
+            load_in_8bit=False  # Disable if memory is very limited
+        )
         # NOTE(review): the model above is loaded with a device_map and the
         # pipeline below is also given an explicit `device`; some
         # transformers versions warn about or reject that combination -
         # confirm on the CUDA path.
+        # Create text generation pipeline
+        text_gen_pipeline = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=256,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            device=0 if device == "cuda" else -1,
+            return_full_text=False
+        )
         # NOTE(review): temperature is set both on the pipeline above and in
         # model_kwargs below, and "max_length" 512 is given alongside
         # max_new_tokens=256 - confirm which settings take effect.
+        # Create LLM wrapper
+        llm_chain = HuggingFacePipeline(
+            pipeline=text_gen_pipeline,
+            model_kwargs={
+                "temperature": 0.7,
+                "max_length": 512
+            }
         )
         logger.info("Models initialized successfully")
+        return True
     except Exception as e:
         logger.error(f"Error initializing models: {str(e)}")
+        return False
+def load_pdf(filepath: str) -> List[str]:
+    """Load PDF using LangChain with fallback"""
     # Returns one text string per page, skipping pages whose text is blank.
     # Returns [] when the loader yields no pages or when anything raises
     # (the error is logged, not propagated).
     try:
+        logger.info(f"Loading PDF: {filepath}")
+        loader = PyPDFLoader(filepath)
+        pages = loader.load()
+        if not pages:
+            logger.warning("No pages extracted from PDF")
+            return []
+        # Combine page content
+        docs = [page.page_content for page in pages if page.page_content.strip()]
         # The page count logged below includes blank pages; the character
         # count covers only the kept pages.
+        logger.info(f"Loaded {len(pages)} pages with {sum(len(d) for d in docs)} characters")
+        return docs
+    except Exception as e:
+        logger.error(f"Error loading PDF: {str(e)}")
+        return []
+def create_retriever(docs: List[str]):
+    """Create FAISS retriever for efficient similarity search"""
     # Splits every string in `docs` into chunks, embeds the chunks with the
     # module-level `embeddings_model` (assigned by initialize_models()) and
     # returns a retriever configured with k=3.
     # Any error is logged and then re-raised to the caller.
+    try:
+        logger.info("Creating retriever from documents")
+        # Split documents into smaller chunks
         # chunk_size / chunk_overlap are presumably measured in characters
         # (no length function is passed here) - confirm against the splitter.
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=400,
+            chunk_overlap=50,
+            separators=["\n\n", "\n", " ", ""]
         )
+        # Process all docs and split
+        all_chunks = []
+        for doc in docs:
+            chunks = text_splitter.split_text(doc)
+            all_chunks.extend(chunks)
+        logger.info(f"Created {len(all_chunks)} chunks")
         # A new in-memory vector store is built on every call.
+        # Create FAISS vectorstore
+        vectorstore = FAISS.from_texts(
+            all_chunks,
+            embeddings_model
+        )
+        # Return retriever with limited top_k for speed
+        retriever = vectorstore.as_retriever(
+            search_kwargs={"k": 3}  # Only get top 3 most relevant chunks
         )
+        logger.info("Retriever created successfully")
+        return retriever
     except Exception as e:
+        logger.error(f"Error creating retriever: {str(e)}")
+        raise
+def answer_question(retriever, question: str) -> str:
+    """Answer question using RAG chain"""
     # Builds a new RetrievalQA chain on every call from the module-level
     # `llm_chain` and the given retriever, then runs it on `question`.
     # Returns the stripped "result" text. If anything raises, a fixed
     # apology string is returned instead; the /ask handler only rejects
     # answers shorter than 5 characters, so that fallback text reaches the
     # client as a normal answer.
     try:
+        logger.info(f"Answering question: {question}")
+        # Create RAG chain
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=llm_chain,
+            chain_type="stuff",  # Use "stuff" for simplicity
+            retriever=retriever,
+            return_source_documents=False,
+            chain_type_kwargs={
+                "prompt": _get_qa_prompt()
+            }
+        )
+        result = qa_chain({"query": question})
         # Falls back to a placeholder when the chain output has no "result".
+        answer = result.get("result", "No answer generated")
+        logger.info(f"Answer generated: {answer[:100]}...")
+        return answer.strip()
     except Exception as e:
+        logger.error(f"Error answering question: {str(e)}")
+        return "Sorry, I couldn't generate an answer from the PDF content."
+def _get_qa_prompt():
+    """Create a simple QA prompt template"""
     # PromptTemplate is imported locally inside this helper. The template
     # has two placeholders, {context} and {question}, matching
     # input_variables below. A new PromptTemplate is built on each call.
+    from langchain.prompts import PromptTemplate
+    template = """Use the following pieces of context to answer the question at the end.
+If you don't know the answer, just say you don't know.
+Context:
+{context}
+Question: {question}
+Answer:"""
+    return PromptTemplate(
+        template=template,
+        input_variables=["context", "question"]
+    )
+def cleanup_temp_files(filepath):
+    """Clean up temporary files"""
     # Best-effort removal of a single path: a missing file is ignored and
     # any exception is logged as a warning instead of being propagated.
     try:
+        if os.path.exists(filepath):
+            os.remove(filepath)
+            logger.info(f"Removed temporary file: {filepath}")
     except Exception as e:
+        logger.warning(f"Failed to clean up file {filepath}: {str(e)}")
 @app.route('/')
 def home():
     # Root endpoint: returns a fixed JSON status message.
+    return jsonify({"message": "PDF QA API is running with LangChain!"})
 @app.route('/ask', methods=['POST'])
 def ask():
         logger.info(f"Processing file: {filename}, Question: '{question}'")
+        # Load PDF
+        docs = load_pdf(filepath)
+        if not docs:
             return jsonify({"error": "Could not extract text from the PDF"}), 400
+        # Create retriever
+        retriever = create_retriever(docs)
+        # Get answer
+        answer = answer_question(retriever, question)
+        if not answer or len(answer.strip()) < 5:
+            return jsonify({"error": "Failed to generate answer from PDF content"}), 500
         return jsonify({"answer": answer})
     except Exception as e:
         logger.error(f"Error processing request: {str(e)}")
+        return jsonify({"error": f"An error occurred: {str(e)}"}), 500
     finally:
         if filepath:
             cleanup_temp_files(filepath)
 if __name__ == "__main__":
     # Script entry point: the Flask server is started only when
     # initialize_models() returns a truthy value. On failure an error is
     # logged and the script simply ends; no exit status is set explicitly.
     try:
+        if initialize_models():
+            logger.info("Starting Flask application")
             # threaded=False is passed to app.run - presumably to serialize
             # requests against the single shared model; confirm.
+            app.run(host="0.0.0.0", port=7860, threaded=False)
+        else:
+            logger.error("Failed to initialize models")
     except Exception as e:
         logger.critical(f"Failed to start application: {str(e)}")