priyanshu23456 committed
Commit 0460eee · verified · 1 Parent(s): a2f8fca

Update app.py

Files changed (1):
  1. app.py +18 -29
app.py CHANGED
@@ -12,7 +12,7 @@ import faiss
 import numpy as np
 import tempfile
 from PIL import Image
-from transformers import BitsAndBytesConfig
+
 import logging
 
 # Set up logging
@@ -51,24 +51,17 @@ def initialize_models():
         "question-answering",
         model="distilbert-base-cased-distilled-squad",
         tokenizer="distilbert-base-cased",
-        device=-1  # Force CPU for free tier
+        device=-1  # Force CPU
     )
 
     logger.info("Loading language model...")
-    model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Replace distilgpt2
-    # Configure 4-bit quantization
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.float16,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=True
-    )
+    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        quantization_config=quantization_config,  # Use 4-bit
-        device_map="auto",
-        torch_dtype=torch.float16  # Optimize for CPU fallback
+        torch_dtype=torch.float16,  # Use float16 for lower memory on CPU
+        device_map="cpu",  # Explicitly set to CPU
+        low_cpu_mem_usage=True  # Optimize memory loading
     )
 
     if tokenizer.pad_token is None:
@@ -89,18 +82,12 @@ def answer_with_generation(index, embeddings, chunks, question):
     if tokenizer is None or model is None:
         logger.info("Generation models not initialized, creating now...")
         model_name = "Qwen/Qwen2.5-1.5B-Instruct"
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True
-        )
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            quantization_config=quantization_config,
-            device_map="auto",
-            torch_dtype=torch.float16
+            torch_dtype=torch.float16,
+            device_map="cpu",
+            low_cpu_mem_usage=True
         )
 
         if tokenizer.pad_token is None:
@@ -115,11 +102,11 @@ def answer_with_generation(index, embeddings, chunks, question):
     relevant_chunks = [chunks[i] for i in top_k_indices[0]]
     context = " ".join(relevant_chunks)
 
-    # Limit context size for efficiency
-    if len(context) > 2000:  # Reduced for Qwen's efficiency
+    # Limit context size
+    if len(context) > 2000:
         context = context[:2000]
 
-    # Create prompt (optimized for Qwen's instruction format)
+    # Create prompt
     prompt = f"""<|im_start|>system
 You are a helpful assistant answering questions based on provided PDF content. Use the information below to give a clear, concise, and accurate answer. Avoid speculation and focus on the context.
 <|im_end|>
@@ -131,9 +118,9 @@ You are a helpful assistant answering questions based on provided PDF content. U
 **Instruction**: Provide a detailed and accurate answer based on the context. If the context doesn't contain enough information, say so clearly. <|im_end|>"""
 
     # Handle inputs
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)  # Increased for Qwen
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
 
-    # Move inputs to CPU (free tier)
+    # Move inputs to CPU
    inputs = {k: v.to('cpu') for k, v in inputs.items()}
 
     # Generate answer
@@ -143,13 +130,12 @@ You are a helpful assistant answering questions based on provided PDF content. U
         temperature=0.7,
         top_p=0.9,
         do_sample=True,
-        num_beams=2,  # Reduced for speed
+        num_beams=2,
         no_repeat_ngram_size=2
     )
 
     # Decode and format answer
     answer = tokenizer.decode(output[0], skip_special_tokens=True)
-    # Extract the answer after the instruction
     if "<|im_end|>" in answer:
         answer = answer.split("<|im_end|>")[1].strip()
     elif "Instruction" in answer:
@@ -161,6 +147,9 @@ You are a helpful assistant answering questions based on provided PDF content. U
         logger.error(f"Generation error: {str(e)}")
         return "I couldn't generate a good answer based on the PDF content."
 
+
+
+
 # Cleanup function for temporary files
 def cleanup_temp_files(filepath):
     try:
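
For reference, the net effect of this commit is to drop the bitsandbytes 4-bit quantization path and load the Qwen model directly on CPU in half precision. Below is a minimal standalone sketch of the new loading path; it mirrors the names used in app.py but is not a verbatim excerpt, and it assumes torch, transformers, and accelerate (required for device_map and low_cpu_mem_usage) are installed.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Tokenizer loading is unchanged by this commit
tokenizer = AutoTokenizer.from_pretrained(model_name)

# New loading path: no BitsAndBytesConfig, CPU-only, float16 weights
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,   # half precision instead of 4-bit quantization
    device_map="cpu",            # keep all weights on CPU (no GPU offload)
    low_cpu_mem_usage=True,      # reduce peak RAM while loading checkpoints
)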