Spaces:

Techbite
/

faq-rag-chatbot

Sleeping

App Files Files Community

faq-rag-chatbot / src /data_processing.py

Techbite

initial commit

26d1a81 7 months ago

raw

history blame

3.37 kB

	import pandas as pd
	import json
	from typing import List, Dict, Any
	from datasets import load_dataset

	def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
	    """
	    Fetch FAQ entries from a Hugging Face dataset.

	    Reads the "train" split of ``dataset_name`` and converts every row into
	    a dict with the keys "question", "answer", "category", "question_id"
	    and "faq_url". The last three default to "" when the row lacks them.

	    If anything goes wrong while loading or converting, the error is
	    printed and the result of ``load_faq_data("data/faq_data.csv")`` is
	    returned instead.
	    """
	    print(f"Loading dataset {dataset_name} from Hugging Face...")

	    try:
	        # Only the train split is used
	        rows = load_dataset(dataset_name)["train"]

	        # One plain dict per row: required fields first, then optional metadata
	        entries = [
	            {
	                "question": row["question"],
	                "answer": row["answer"],
	                "category": row.get("category", ""),
	                "question_id": row.get("question_id", ""),
	                "faq_url": row.get("faq_url", ""),
	            }
	            for row in rows
	        ]

	        print(f"Loaded {len(entries)} FAQ entries from Hugging Face")
	        return entries

	    except Exception as err:
	        # Best-effort: any failure above switches to the local file loader
	        print(f"Error loading dataset from Hugging Face: {err}")
	        print("Falling back to local data...")
	        return load_faq_data("data/faq_data.csv")

	def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
	    """
	    Load FAQ data from a local CSV or JSON file.

	    The format is chosen from the file extension, matched
	    case-insensitively. CSV files are read with pandas and returned as one
	    dict per row keyed by column name; empty cells are returned as None
	    rather than float NaN. JSON files are decoded as UTF-8 and returned
	    exactly as parsed.

	    Exceptions raised while loading (missing file, parse error, unsupported
	    extension) are caught: the error is printed and a built-in two-entry
	    sample dataset is returned instead.
	    """
	    print(f"Loading data from {file_path}")
	    try:
	        # Compare against a lowercased copy so ".CSV" / ".JSON" are accepted too
	        lowered_path = file_path.lower()
	        if lowered_path.endswith('.csv'):
	            df = pd.read_csv(file_path)
	            # Assume CSV has 'question' and 'answer' columns.
	            # pandas reports empty cells as NaN; hand them out as None so that
	            # "is not None" checks on the records behave as expected.
	            faqs = [
	                {column: (None if pd.isna(value) else value) for column, value in record.items()}
	                for record in df.to_dict('records')
	            ]
	        elif lowered_path.endswith('.json'):
	            # Explicit encoding: the platform default is not UTF-8 everywhere
	            with open(file_path, 'r', encoding='utf-8') as f:
	                faqs = json.load(f)
	        else:
	            # Caught by the handler below, which reports it and returns sample data
	            raise ValueError(f"Unsupported file format: {file_path}")

	        print(f"Loaded {len(faqs)} FAQ entries")
	        return faqs
	    except Exception as e:
	        print(f"Error loading data: {e}")
	        # Create a minimal sample dataset as fallback
	        print("Creating sample dataset as fallback")
	        sample_faqs = [
	            {"question": "How do I track my order?",
	             "answer": "You can track your order by logging into your account and visiting the Order History section."},
	            {"question": "How do I reset my password?",
	             "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
	        ]
	        return sample_faqs

	def _clean_faq_text(value: Any) -> str:
	    """Return ``value`` as whitespace-stripped text; None and NaN become ""."""
	    if value is None:
	        return ""
	    # NaN is the only float that is not equal to itself; it shows up when a
	    # CSV cell was empty, and must count as "missing" rather than text.
	    if isinstance(value, float) and value != value:
	        return ""
	    # Non-string values (e.g. a numeric cell) are converted instead of crashing on .strip()
	    return str(value).strip()


	def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
	    """
	    Preprocess FAQ data: clean text, handle formatting.

	    Each entry's 'question' and 'answer' are normalized to stripped strings
	    (missing, None or NaN values become ""; other non-string values are
	    converted with ``str``). The entries are updated in place, and only
	    those with a non-empty question and a non-empty answer are returned.
	    All other keys of an entry are left untouched.
	    """
	    processed_faqs = []
	    for faq in faqs:
	        # Basic cleaning - remove extra whitespace, tolerate missing/non-text values
	        faq['question'] = _clean_faq_text(faq.get('question'))
	        faq['answer'] = _clean_faq_text(faq.get('answer'))

	        # Only include FAQs with both question and answer
	        if faq['question'] and faq['answer']:
	            processed_faqs.append(faq)

	    print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
	    return processed_faqs