import functools
import json
import time
from typing import Any, Callable, Dict, List, Optional

import psutil
import torch
from evaluate import load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def time_function(func: Callable) -> Callable:
"""
Decorator to time function execution
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
print(f"{func.__name__} took {end_time - start_time:.2f} seconds to execute")
return result
return wrapper
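
# Example usage (illustrative sketch):
#
#     @time_function
#     def build_index():
#         ...  # any expensive call
#
#     build_index()  # prints e.g. "build_index took 1.23 seconds to execute"
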
def evaluate_response(generated_response: str, ground_truth: Optional[str] = None) -> Dict[str, Any]:
"""
Evaluate generated response with BLEU, ROUGE, and word overlap
"""
results = {
"length": len(generated_response),
"word_count": len(generated_response.split())
}
if ground_truth:
bleu = load("bleu")
rouge = load("rouge")
bleu_score = bleu.compute(predictions=[generated_response], references=[[ground_truth]])
rouge_score = rouge.compute(predictions=[generated_response], references=[ground_truth])
generated_words = set(generated_response.lower().split())
ground_truth_words = set(ground_truth.lower().split())
overlap = len(generated_words.intersection(ground_truth_words))
results.update({
"bleu": bleu_score["bleu"],
"rouge": rouge_score["rougeL"],
"word_overlap": overlap / len(ground_truth_words) if ground_truth_words else 0
})
return results
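
# Example usage (illustrative; the strings below are made up):
#
#     metrics = evaluate_response(
#         "You can reset your password from the account settings page.",
#         ground_truth="Reset your password under account settings.",
#     )
#     # -> {'length': ..., 'word_count': ..., 'bleu': ..., 'rouge': ..., 'word_overlap': ...}
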
def evaluate_retrieval(embedder, test_set_path: str, k: int = 3) -> Dict[str, float]:
"""
Evaluate retrieval quality with Precision@k and Recall@k
"""
with open(test_set_path, 'r') as f:
test_set = json.load(f)
precision, recall = [], []
for item in test_set:
query = item['query']
true_ids = set(item['relevant_ids'])
        retrieved_faqs = embedder.retrieve_relevant_faqs(query, k)
        # Compare by FAQ id rather than list position; this assumes each
        # retrieved FAQ dict carries the same 'id' values used in the test
        # set's 'relevant_ids' (falls back to the retrieval rank otherwise).
        retrieved_ids = {faq.get('id', idx) for idx, faq in enumerate(retrieved_faqs)}
        true_positives = len(true_ids & retrieved_ids)
        precision.append(true_positives / k if k > 0 else 0)
        recall.append(true_positives / len(true_ids) if true_ids else 0)
return {
"Precision@k": sum(precision) / len(precision) if precision else 0,
"Recall@k": sum(recall) / len(recall) if recall else 0
}
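
# The test set is assumed to be a JSON list whose items carry the fields read
# above, for example:
#
#     [
#         {"query": "How do I reset my password?", "relevant_ids": [3, 17]},
#         {"query": "What payment methods do you accept?", "relevant_ids": [5]}
#     ]
#
# Example call (hypothetical embedder object and file path):
#
#     scores = evaluate_retrieval(embedder, "retrieval_test_set.json", k=3)
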
def baseline_keyword_search(query: str, faqs: List[Dict[str, Any]], k: int = 3) -> List[Dict[str, Any]]:
"""
Keyword-based search baseline using TF-IDF
"""
questions = [faq['question'] for faq in faqs]
vectorizer = TfidfVectorizer()
question_vectors = vectorizer.fit_transform(questions)
query_vector = vectorizer.transform([query])
similarities = cosine_similarity(query_vector, question_vectors).flatten()
top_k_indices = similarities.argsort()[-k:][::-1]
return [faqs[i] for i in top_k_indices]
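
# Example usage (illustrative FAQ entries):
#
#     faqs = [
#         {"question": "How do I reset my password?", "answer": "..."},
#         {"question": "How can I contact support?", "answer": "..."},
#     ]
#     top_faqs = baseline_keyword_search("password reset", faqs, k=1)
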
def format_memory_stats() -> Dict[str, str]:
    """
    Format memory usage statistics for system RAM and any available GPUs
    """
    vm = psutil.virtual_memory()
    system_stats = {
        "RAM": f"{vm.used / (1024 ** 3):.1f}GB / {vm.total / (1024 ** 3):.1f}GB",
        "RAM Usage": f"{vm.percent}%"
    }
if torch.cuda.is_available():
gpu_stats = {}
for i in range(torch.cuda.device_count()):
gpu_stats[f"GPU {i}"] = f"{torch.cuda.get_device_name(i)}"
gpu_stats[f"GPU {i} Memory"] = f"{torch.cuda.memory_allocated(i) / (1024 ** 3):.1f}GB / {torch.cuda.get_device_properties(i).total_memory / (1024 ** 3):.1f}GB"
system_stats.update(gpu_stats)
    return system_stats
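
# Example usage (illustrative):
#
#     for name, value in format_memory_stats().items():
#         print(f"{name}: {value}")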