# faq-rag-chatbot/src/data_processing.py
import pandas as pd
import json
import os
import nltk
from typing import List, Dict, Any
from datasets import load_dataset
import nlpaug.augmenter.word as naw
from deep_translator import GoogleTranslator  # replaces the former googletrans dependency

# Configure NLTK data path and download required resources
NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)


def ensure_nltk_resources():
"""
Ensure NLTK resources are downloaded and available
"""
try:
        # SynonymAug in augment_faqs relies on WordNet, so fetch that corpus too.
        nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH)
        nltk.download('punkt', download_dir=NLTK_DATA_PATH)
        nltk.download('wordnet', download_dir=NLTK_DATA_PATH)
        print(f"NLTK resources downloaded to {NLTK_DATA_PATH}")
return True
except Exception as e:
print(f"Failed to download NLTK resources: {e}")
return False


def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
"""
Load FAQ data from Hugging Face datasets, cache locally
"""
    local_path = "data/ecommerce_faqs.json"
    os.makedirs(os.path.dirname(local_path), exist_ok=True)  # ensure the cache directory exists before writing
if os.path.exists(local_path):
print(f"Loading cached dataset from {local_path}")
with open(local_path, 'r') as f:
return json.load(f)
print(f"Loading dataset {dataset_name} from Hugging Face...")
try:
dataset = load_dataset(dataset_name)
faqs = [{
"question": item["question"],
"answer": item["answer"],
"category": item.get("category", ""),
"question_id": item.get("question_id", ""),
"faq_url": item.get("faq_url", "")
} for item in dataset["train"]]
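        # Cache the normalised records locally so later runs skip the download.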
with open(local_path, 'w') as f:
json.dump(faqs, f)
print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs")
return faqs
except Exception as e:
print(f"Error loading dataset: {e}")
print("Falling back to local data...")
return load_faq_data("data/faq_data.csv")


def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
"""
Load FAQ data from a local CSV or JSON file
"""
print(f"Loading data from {file_path}")
try:
if file_path.endswith('.csv'):
df = pd.read_csv(file_path)
faqs = df.to_dict('records')
elif file_path.endswith('.json'):
with open(file_path, 'r') as f:
faqs = json.load(f)
else:
raise ValueError(f"Unsupported file format: {file_path}")
print(f"Loaded {len(faqs)} FAQ entries")
return faqs
except Exception as e:
print(f"Error loading data: {e}")
print("Creating sample dataset as fallback")
sample_faqs = [
{"question": "How do I track my order?", "answer": "You can track your order by logging into your account and visiting the Order History section."},
{"question": "How do I reset my password?", "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
]
return sample_faqs


def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Preprocess FAQ data: clean text, handle formatting, and filter invalid entries
"""
processed_faqs = []
for faq in faqs:
# Safely handle question and answer fields
question = faq.get('question')
answer = faq.get('answer')
# Convert to string and strip, handling None values
question = str(question).strip() if question is not None else ""
answer = str(answer).strip() if answer is not None else ""
# Update FAQ dictionary
faq['question'] = question
faq['answer'] = answer
# Only include FAQs with both question and answer
if question and answer:
processed_faqs.append(faq)
else:
print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'")
print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
return processed_faqs


def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]:
"""
Augment FAQs with paraphrased questions if enabled
"""
if not enable_augmentation:
print("Augmentation disabled; returning original FAQs")
return faqs
if not ensure_nltk_resources():
print("NLTK resources unavailable; skipping augmentation")
return faqs
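    # SynonymAug paraphrases a question by substituting WordNet synonyms; the answer is reused as-is.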
aug = naw.SynonymAug()
augmented = []
for faq in faqs:
augmented.append(faq)
if len(augmented) < max_faqs:
            try:
                # nlpaug returns a list in newer releases and a plain string in older
                # ones; normalise so we always take a single paraphrased question.
                result = aug.augment(faq['question'])
                aug_question = result[0] if isinstance(result, list) else result
                augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")})
except Exception as e:
print(f"Augmentation error for question '{faq['question'][:50]}...': {e}")
print(f"Augmented to {len(augmented)} FAQs")
return augmented


def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]:
"""
Translate FAQ to a target language using deep-translator
"""
try:
translator = GoogleTranslator(source='en', target=target_lang)
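        # deep-translator's translate() takes a plain string and returns the translated string.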
translated = faq.copy()
translated["question"] = translator.translate(faq["question"])
translated["answer"] = translator.translate(faq["answer"])
translated["language"] = target_lang
return translated
except Exception as e:
print(f"Translation error: {e}")
return faq
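

# Minimal usage sketch: an illustration, not part of the original module. It
# chains the loaders, preprocessing, augmentation, and translation defined
# above; the Spanish target language and the 1000-FAQ cap are assumptions
# chosen for the demo.
if __name__ == "__main__":
    faqs = load_huggingface_faq_data()          # falls back to data/faq_data.csv on error
    faqs = preprocess_faq(faqs)                 # drop entries missing a question or answer
    faqs = augment_faqs(faqs, max_faqs=1000)    # paraphrase questions with SynonymAug
    if faqs:
        print(translate_faq(faqs[0], target_lang="es"))  # translate one FAQ as a smoke test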