priyanshu23456 committed
Commit 0460eee · verified · 1 Parent(s): a2f8fca

Update app.py

Files changed (1):
  1. app.py +18 -29
app.py CHANGED
@@ -12,7 +12,7 @@ import faiss
 import numpy as np
 import tempfile
 from PIL import Image
-from transformers import BitsAndBytesConfig
+
 import logging
 
 # Set up logging
@@ -51,24 +51,17 @@ def initialize_models():
         "question-answering",
         model="distilbert-base-cased-distilled-squad",
         tokenizer="distilbert-base-cased",
-        device=-1  # Force CPU for free tier
+        device=-1  # Force CPU
     )
 
     logger.info("Loading language model...")
-    model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Replace distilgpt2
-    # Configure 4-bit quantization
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.float16,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=True
-    )
+    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        quantization_config=quantization_config,  # Use 4-bit
-        device_map="auto",
-        torch_dtype=torch.float16  # Optimize for CPU fallback
+        torch_dtype=torch.float16,  # Use float16 for lower memory on CPU
+        device_map="cpu",  # Explicitly set to CPU
+        low_cpu_mem_usage=True  # Optimize memory loading
     )
 
     if tokenizer.pad_token is None:
@@ -89,18 +82,12 @@ def answer_with_generation(index, embeddings, chunks, question):
     if tokenizer is None or model is None:
         logger.info("Generation models not initialized, creating now...")
         model_name = "Qwen/Qwen2.5-1.5B-Instruct"
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True
-        )
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            quantization_config=quantization_config,
-            device_map="auto",
-            torch_dtype=torch.float16
+            torch_dtype=torch.float16,
+            device_map="cpu",
+            low_cpu_mem_usage=True
         )
 
         if tokenizer.pad_token is None:
@@ -115,11 +102,11 @@ def answer_with_generation(index, embeddings, chunks, question):
     relevant_chunks = [chunks[i] for i in top_k_indices[0]]
     context = " ".join(relevant_chunks)
 
-    # Limit context size for efficiency
-    if len(context) > 2000:  # Reduced for Qwen's efficiency
+    # Limit context size
+    if len(context) > 2000:
         context = context[:2000]
 
-    # Create prompt (optimized for Qwen's instruction format)
+    # Create prompt
     prompt = f"""<|im_start|>system
 You are a helpful assistant answering questions based on provided PDF content. Use the information below to give a clear, concise, and accurate answer. Avoid speculation and focus on the context.
 <|im_end|>
@@ -131,9 +118,9 @@ You are a helpful assistant answering questions based on provided PDF content. U
 **Instruction**: Provide a detailed and accurate answer based on the context. If the context doesn't contain enough information, say so clearly. <|im_end|>"""
 
     # Handle inputs
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)  # Increased for Qwen
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
 
-    # Move inputs to CPU (free tier)
+    # Move inputs to CPU
    inputs = {k: v.to('cpu') for k, v in inputs.items()}
 
     # Generate answer
@@ -143,13 +130,12 @@ You are a helpful assistant answering questions based on provided PDF content. U
         temperature=0.7,
         top_p=0.9,
         do_sample=True,
-        num_beams=2,  # Reduced for speed
+        num_beams=2,
         no_repeat_ngram_size=2
     )
 
     # Decode and format answer
     answer = tokenizer.decode(output[0], skip_special_tokens=True)
-    # Extract the answer after the instruction
     if "<|im_end|>" in answer:
         answer = answer.split("<|im_end|>")[1].strip()
     elif "Instruction" in answer:
@@ -161,6 +147,9 @@ You are a helpful assistant answering questions based on provided PDF content. U
         logger.error(f"Generation error: {str(e)}")
         return "I couldn't generate a good answer based on the PDF content."
 
+
+
+
 # Cleanup function for temporary files
 def cleanup_temp_files(filepath):
     try:
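
For reference, the net effect of this commit is to drop the bitsandbytes 4-bit quantization path and load the Qwen model directly on CPU in half precision. Below is a minimal standalone sketch of the new loading path; it mirrors the names used in app.py but is not a verbatim excerpt, and it assumes torch, transformers, and accelerate (required for device_map and low_cpu_mem_usage) are installed.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Tokenizer loading is unchanged by this commit
tokenizer = AutoTokenizer.from_pretrained(model_name)

# New loading path: no BitsAndBytesConfig, CPU-only, float16 weights
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,   # half precision instead of 4-bit quantization
    device_map="cpu",            # keep all weights on CPU (no GPU offload)
    low_cpu_mem_usage=True,      # reduce peak RAM while loading checkpoints
)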