Update app.py
app.py CHANGED
@@ -1,5 +1,6 @@
 # Web Content Q&A Tool for Hugging Face Spaces
-#
+# Optimized for memory constraints (2GB RAM) and 24-hour timeline
+# Features: Ingest up to 3 URLs, ask questions, get concise one-line answers using DistilBERT with PyTorch
 # Includes keyword search fallback for low-confidence QA answers
 
 import gradio as gr
@@ -57,6 +58,16 @@ model = torch.quantization.quantize_dynamic(
 # Create the QA pipeline with PyTorch
 qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, framework="pt", device=-1) # device=-1 for CPU
 
+# Utility function to truncate text to one line
+def truncate_to_one_line(text):
+    # Split by sentence-ending punctuation and take the first sentence
+    sentences = re.split(r'[.!?]+', text.strip())
+    first_sentence = sentences[0].strip() if sentences else text.strip()
+    # If the sentence is too long, truncate to 100 characters
+    if len(first_sentence) > 100:
+        first_sentence = first_sentence[:100].rsplit(' ', 1)[0] + "..."
+    return first_sentence if first_sentence else "No answer available."
+
 # Keyword search function for fallback
 def keyword_search(question, corpus, sources_list):
     stop_words = set(["what", "is", "the", "a", "an", "in", "on", "at", "for", "with", "and", "or", "but", "not", "this", "that", "these", "those", "to", "of", "it", "by", "as", "if", "when", "where", "who", "which", "how", "why"])
@@ -85,6 +96,8 @@ def keyword_search(question, corpus, sources_list):
     if best_paragraph is None:
         return "No relevant paragraph found.", None
 
+    # Truncate the paragraph to one line
+    best_paragraph = truncate_to_one_line(best_paragraph)
     return best_paragraph, best_source
 
 def ingest_urls(urls):
@@ -140,6 +153,7 @@ def answer_question(question):
     Retrieves top 3 paragraphs to improve answer accuracy.
     If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
     If QA confidence is below 0.4, falls back to keyword search.
+    Ensures answers are one line (max 100 chars).
     """
     global corpus, embeddings, sources_list
     if not corpus or embeddings is None:
@@ -153,7 +167,7 @@ def answer_question(question):
     top_k = min(2, len(corpus)) # Get top 3 paragraphs to improve accuracy
     top_indices = np.argsort(-cos_scores)[:top_k]
 
-    # Retrieve context (top
+    # Retrieve context (top 2 paragraphs)
     contexts = [corpus[i] for i in top_indices]
     context = " ".join(contexts) # Concatenate with space
     sources = [sources_list[i] for i in top_indices]
@@ -165,7 +179,11 @@ def answer_question(question):
     confidence = result['score']
 
     if confidence >= 0.4:
-        #
+        # Truncate QA answer to one line
+        answer = truncate_to_one_line(answer)
+        # Ensure at least one line
+        if not answer:
+            answer = "No answer available."
         sources_str = "\n".join(set(sources)) # Unique sources
         return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
     else: