import re
from fastapi import FastAPI
from fastapi import Header
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
import dateparser
from datetime import datetime
from langdetect import detect_langs
from textblob import TextBlob
from dateparser.search import search_dates
import uuid
import time
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import ORJSONResponse
from fastapi.requests import Request
from fastapi import status
import asyncio
import psycopg2
from psycopg2.extras import Json
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")

app = FastAPI(default_response_class=ORJSONResponse)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # or your domain(s)
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
CREATE_TABLE_QUERY = """
CREATE TABLE IF NOT EXISTS user_entries (
    uuid UUID PRIMARY KEY,
    user_id TEXT,
    user_name TEXT,
    user_email TEXT,
    raw_text TEXT,
    word_count INT,
    day_of_week TEXT,
    hour_of_day INT,
    month TEXT,
    year INT,
    type TEXT,
    expense_type TEXT,
    intent TEXT,
    confidence_scores JSONB,
    urgency_score FLOAT,  -- 0.0 to 1.0, as returned by get_urgency_score
    time_mentions TEXT[],
    parsed_dates TEXT[],
    tense TEXT[],
    summary TEXT,
    people TEXT[],
    mood TEXT,
    language JSONB,
    sentiment_score FLOAT,
    tags TEXT[],
    action_required BOOLEAN,
    entities JSONB,
    amounts JSONB,
    stores JSONB,
    processing_time_ms INT,
    raw_json JSONB,
    created_at TIMESTAMPTZ DEFAULT now()
);
"""
# Ensure the table exists before serving requests
@app.on_event("startup")  # registration as a startup hook is assumed here
def run_migrations():
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        cur.execute(CREATE_TABLE_QUERY)
        conn.commit()
        cur.close()
        conn.close()
        print("Table checked/created at startup.")
    except Exception as e:
        print("Migration failed:", e)
# Load classification and summarization models
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
# summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Load NER model (dslim/bert-base-NER is a general-purpose English NER)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Labels for classification
labels = [
    "task (something to be done or completed)",
    "event (an activity that is happening or has happened)",
    "reminder (a message to remember something in the future)",
    "meeting (a planned gathering between people to discuss something)",
    "relationship (message about personal or emotional connection with someone)",
    "note (general note or quick thought not related to any specific category)",
    "journal (personal reflection or emotional writing about one's day or thoughts)",
    "memory (recollection or recording of a past moment or experience)",
    "status_update (current condition, feeling, or situation being shared)",
    "sick_notice (informing about illness or not feeling well)",
    "out_of_office (message about being unavailable for work or responsibilities)",
    "travel_plan (planning or mentioning a trip or journey)",
    "celebration (message about a festive occasion, party or achievement)",
    "expense (money spent on something, either small or large)",
    "news (update about public events, announcements, or current affairs)",
    "information (factual content or informative message not tied to user activity)",
    "purchase (buying or ordering something, like a product or service)",
    "other (does not clearly fall into any specific category)"
]
POPULAR_STORES = {
    "amazon": "shopping",
    "flipkart": "shopping",
    "myntra": "fashion",
    "swiggy": "food",
    "zomato": "food",
    "uber": "transport",
    "ola": "transport",
    "bigbasket": "groceries",
    "blinkit": "groceries",
    "jiomart": "groceries",
    "netflix": "entertainment",
    "hotstar": "entertainment",
    "airbnb": "travel",
    "makemytrip": "travel",
    "bookmyshow": "entertainment",
    "dunzo": "delivery",
    "meesho": "shopping",
    "nykaa": "beauty",
    "instamart": "groceries",
    "apple": "electronics",
    "google": "services"
}
expense_keywords = [
    "paid", "bought", "purchased", "ordered", "spent", "payment",
    "recharged", "booked", "transaction", "debit", "renewed",
    "credit card", "cash", "amount", "transfer", "EMI", "wallet",
    "petrol", "bill", "invoice", "kharida", "kharcha", "kharch", "paisa", "khareed", "order",
    "le liya", "diya", "khud diya", "khud kharida",
    "expense", "expenses", "cost", "costs", "costing", "buy", "buying", "purchase",
    "paid for", "paid to", "paid via", "paid using", "paid at",
    "bills", "bought from", "ordered from",
    "paid online", "paid cash", "paid card", "paid wallet", "paid app", "paid through",
    "khariden", "kharidi"
]
class TextInput(BaseModel):
    text: str
    user_id: str
# Function to detect popular store categories in the text
def detect_store_category(text: str):
    found_stores = []
    lowered = text.lower()
    for store, category in POPULAR_STORES.items():
        if store in lowered:
            found_stores.append({
                "store": store,
                "category": category
            })
    return found_stores
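# Example (illustrative): detect_store_category("Ordered dinner from Swiggy and paid 300")
# returns [{"store": "swiggy", "category": "food"}], since matching is a simple
# case-insensitive substring check against POPULAR_STORES.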
# Function to extract dates and time mentions based on regex patterns
def extract_dates_with_accuracy(text: str, amounts: list = None):
    amounts = amounts or []
    amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
    original_text = text
    text_lower = text.lower()
    # Step 1: Replace Hinglish phrases with English equivalents (only for parsing)
    hinglish_map = {
        "aaj": "today",
        "kal": "tomorrow",  # assuming future
        "parso": "day after tomorrow",
        "abhi": "now",
        "subah": "morning",
        "shaam": "evening",
        "raat ko": "night",
        "agli baar": "next time",
        "agli hafte": "next week",
        "agli mahine": "next month",
        "iss hafte": "this week",
        "iss mahine": "this month",
        "pichhle hafte": "last week",
        "tareekh": "date",
        "do din baad": "in 2 days",
        "teen din baad": "in 3 days",
    }
    replaced_text = text_lower
    for h_word, en_word in hinglish_map.items():
        replaced_text = re.sub(rf"\b{re.escape(h_word)}\b", en_word, replaced_text)
    # Step 2: Parse using dateparser
    results = search_dates(replaced_text, settings={
        "PREFER_DATES_FROM": "future",
        "RELATIVE_BASE": datetime.now(),
        "RETURN_AS_TIMEZONE_AWARE": False,
        "STRICT_PARSING": True,
    })
    time_mentions = []
    parsed_dates = []
    if results:
        for phrase, date in results:
            clean_phrase = phrase.strip().lower()
            if clean_phrase in amount_values:
                continue
            if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
                continue
            if re.fullmatch(r"\d{3,4}", clean_phrase):  # skip bare numbers like 2025 or 1200
                continue
            time_mentions.append(clean_phrase)
            parsed_dates.append(date.isoformat())
    return time_mentions, parsed_dates
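# Example (illustrative): for "Meeting kal subah 10 baje", the Hinglish map rewrites the text
# to "meeting tomorrow morning 10 baje" before dateparser.search_dates runs, so the returned
# time_mentions/parsed_dates should reflect tomorrow's date; the exact output depends on
# dateparser's behavior under these settings (including STRICT_PARSING).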
def detect_tense(parsed_dates):
    now = datetime.now()
    tenses = set()
    for d in parsed_dates:
        dt = dateparser.parse(d)
        if not dt:
            continue
        if dt < now:
            tenses.add("past")
        elif dt > now:
            tenses.add("future")
        else:
            tenses.add("present")
    return list(tenses) if tenses else ["unknown"]
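# Example (illustrative): detect_tense(["2020-01-01T09:00:00"]) returns ["past"], while an ISO
# string a few days ahead of now yields ["future"]; strings dateparser cannot parse are skipped.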
def generate_summary(text):
    input_ids = summarizer_tokenizer("summarize: " + text, return_tensors="pt").input_ids
    output_ids = summarizer_model.generate(input_ids, max_length=60, num_beams=4, early_stopping=True)
    return summarizer_tokenizer.decode(output_ids[0], skip_special_tokens=True)
def estimate_mood(text):
    text_lower = text.lower()
    # Expanded mood map with Hindi/Hinglish words and phrases
    mood_map = {
        "happy": [
            "happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
            "maza aa gaya", "achha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar", "achha"
        ],
        "sad": [
            "sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
            "bura lag raha hai", "dukhi", "udaas", "rona", "rona aa gaya", "dil toot gaya", "nirash"
        ],
        "angry": [
            "angry", "annoyed", "frustrated", "irritated", "mad", "furious", "gussa", "gusse mein", "chidh", "naraz",
            "bhadak gaya", "chidh gaya", "irritate", "irritated"
        ],
        "nervous": [
            "nervous", "anxious", "scared", "worried", "fearful", "uneasy", "tensed", "tension", "ghabrahat", "chinta",
            "parishan", "dara hua", "ghabra gaya", "stress", "stressed"
        ],
        "unwell": [
            "sick", "unwell", "not feeling well", "fever", "cold", "headache", "flu", "ill", "nauseous", "dizzy",
            "thak gaya", "thaka hua", "bimaar", "bimar", "bukhar", "sardard", "beemar", "kamjor", "thakan"
        ],
        "neutral": [
            "ok", "fine", "theek", "normal", "usual", "routine", "nothing special", "kuch khaas nahi", "no stress"
        ]
    }
    detected_moods = []
    for mood, keywords in mood_map.items():
        for kw in keywords:
            if kw in text_lower:
                detected_moods.append(mood)
                break  # only need one match per mood
    # Use sentiment as a fallback if no mood keyword matched
    if not detected_moods:
        sentiment = get_sentiment_score(text)
        if sentiment > 0.2:
            return "happy"
        elif sentiment < -0.2:
            return "sad"
        else:
            return "neutral"
    # Priority: angry > sad > unwell > nervous > happy > neutral
    priority = ["angry", "sad", "unwell", "nervous", "happy", "neutral"]
    for mood in priority:
        if mood in detected_moods:
            return mood
    return "neutral"
def generate_tags(label, text):
    # Define stopwords manually (lightweight and fast)
    stopwords = set([
        "or", "to", "also", "the", "and", "a", "an", "in", "on", "of", "for",
        "with", "at", "by", "from", "as", "is", "was", "are", "be", "will",
        "has", "have", "it", "this", "that", "but", "if", "not", "so", "do",
        "does", "did", "am", "can", "i", "me", "my", "you", "we", "they", "he", "she"
    ])
    base_tags = [label]
    # Extract keywords (only alphabetic words with 4 or more letters)
    keywords = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
    # Filter out stopwords
    filtered_keywords = [word for word in keywords if word not in stopwords]
    # Add forced tags based on context
    force_tags = []
    lowered = text.lower()
    if any(w in lowered for w in ["sick", "unwell", "not feeling well", "fever"]):
        force_tags += ["sick", "leave"]
    if "work" in lowered:
        force_tags.append("work")
    # Merge and deduplicate tags
    return list(set(base_tags + force_tags + filtered_keywords))
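# Example (illustrative): generate_tags("sick_notice", "Not feeling well, taking leave from work today")
# returns a deduplicated list containing "sick_notice", the forced tags "sick", "leave" and "work",
# plus 4+ letter keywords such as "feeling", "taking" and "today".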
# Detect language using langdetect
def detect_language(text):
    try:
        langs = detect_langs(text)  # returns a list like [en:0.99, hi:0.01]
    except Exception:
        # langdetect raises on empty or featureless input (e.g. only digits/punctuation)
        return {"lang": "unknown", "prob": 0}
    if langs:
        top_lang = langs[0]
        return {"lang": top_lang.lang, "prob": round(top_lang.prob, 6)}
    return {"lang": "unknown", "prob": 0}

# Detect sentiment using TextBlob
def get_sentiment_score(text):
    try:
        blob = TextBlob(text)
        return round(blob.sentiment.polarity, 3)  # range: -1 to 1
    except Exception:
        return 0.0
# Infer intent based on label
def infer_intent(label, text):
    label_to_intent = {
        "out_of_office": "taking_leave",
        "sick_notice": "taking_leave",
        "reminder": "set_reminder",
        "event": "log_event",
        "meeting": "schedule_meeting",
        "note": "log_note",
        "journal": "log_memory",
        "memory": "log_memory",
        "status_update": "status_update",
        "task": "create_task",
        "celebration": "log_event"
    }
    return label_to_intent.get(label, "other")
# Extract entities using NER
def extract_entities(text):
    ner_results = ner_pipeline(text)
    entities = {"people": [], "places": [], "organizations": [], "dates": [], "misc": []}
    PLACE_KEYWORDS = [
        "garden", "hotel", "resort", "mall", "restaurant", "cafe", "market",
        "school", "college", "temple", "station", "airport", "hospital",
        "park", "store", "shop", "gym", "theater", "cinema", "bank", "office",
        "court", "salon", "studio", "museum", "library", "club", "university",
        "guest house", "hostel", "canteen", "clinic", "zoo", "residency", "apartment"
    ]
    RELATION_KEYWORDS = [
        # English
        "mom", "dad", "father", "mother", "sister", "brother", "sis", "bro",
        "uncle", "aunt", "aunty", "cousin", "grandfather", "grandmother",
        "grandpa", "grandma", "wife", "husband", "son", "daughter", "child",
        "kids", "baby", "partner", "fiancé", "fiancée", "in-laws", "relatives",
        "friend", "colleague", "buddy", "pal", "mate", "acquaintance", "companion",
        "girlfriend", "boyfriend", "lover", "spouse", "significant other",
        # Hindi & Hinglish
        "maa", "mummy", "papa", "pappa", "pitaji", "mataji", "didi", "behen", "bhai",
        "chacha", "chachi", "mama", "mami", "tau", "tai", "nana", "nani",
        "dada", "dadi", "sasur", "sasuma", "jija", "saali", "bhabhi", "devar",
        "nandoi", "patni", "pati", "bachcha", "baccha", "beta", "beti", "putra", "putri",
        "sambandhi", "rishtedaar", "saheli", "dost", "yara", "saathi"
    ]
    for ent in ner_results:
        word = ent["word"].replace("##", "")
        if len(word) <= 2 or not word.isalpha():
            continue  # skip very short tokens and non-alphabetic fragments
        group = ent["entity_group"]
        if group == "PER":
            entities["people"].append(word)
        elif group == "LOC":
            entities["places"].append(word)
        elif group == "ORG":
            entities["organizations"].append(word)
        elif group == "DATE":
            entities["dates"].append(word)
        else:
            entities["misc"].append(word)
    # Fallback: add known days/dates if not already captured
    day_keywords = re.findall(r'\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b', text, re.IGNORECASE)
    for day in day_keywords:
        if day not in entities["dates"]:
            entities["dates"].append(day)
    # Fallback: add phrases like "product launch", "project", etc. to misc
    lower_text = text.lower()
    if "product launch" in lower_text:
        entities["misc"].append("product launch")
    if "birthday" in lower_text:
        entities["misc"].append("birthday")
    if "project" in lower_text:
        entities["misc"].append("project")
    # Add keyword-based places
    for place in PLACE_KEYWORDS:
        if place in lower_text and place not in entities["places"]:
            entities["places"].append(place)
    # Detect relation keywords (English + Hindi)
    for relation in RELATION_KEYWORDS:
        if re.search(rf"\b{re.escape(relation)}\b", text.lower()):
            entities["people"].append(relation)
    # Deduplicate and return
    return {k: list(set(v)) for k, v in entities.items()}
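# Example (illustrative): for "Dinner with Rahul at the hotel on Friday", the NER model should
# tag "Rahul" as PER; "hotel" is added to places and "Friday" to dates by the keyword fallbacks
# even if the model misses them (model output can vary between runs and model versions).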
# Function to calculate urgency score based on keywords and parsed dates
def get_urgency_score(text, parsed_dates):
    urgency_keywords = ["urgent", "asap", "immediate", "must", "need to", "important", "don't forget", "right away"]
    text_lower = text.lower()
    score = 0.0
    # 1. Keyword-based boost
    if any(word in text_lower for word in urgency_keywords):
        score = 0.7
    # 2. Time-based boost
    now = datetime.now()
    for d in parsed_dates:
        dt = dateparser.parse(d)
        if dt:
            hours = (dt - now).total_seconds() / 3600
            if 0 <= hours <= 24:
                score = max(score, 1.0)
            elif 24 < hours <= 72:
                score = max(score, 0.8)
            elif 72 < hours <= 168:
                score = max(score, 0.5)
    return round(score, 2)
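# Example (illustrative): "Need to submit the report asap" with no parsed dates scores 0.7 from
# the keyword boost alone; a parsed date within the next 24 hours raises the score to 1.0.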
# Function to get meta information about the text
def get_meta_info(text: str):
    now = datetime.now()
    return {
        "word_count": len(text.strip().split()),
        "day_of_week": now.strftime('%A'),  # e.g., "Thursday"
        "hour_of_day": now.hour,            # 0 to 23
        "month": now.strftime('%B'),        # e.g., "July"
        "year": now.year
    }
def is_year_context(text_snippet):
    return bool(re.search(r"\b(?:jan|feb|march|april|may|june|july|aug|sept|oct|nov|dec|year|in|on|by|for)\b", text_snippet))
# Function to extract amounts in various currencies from text
def extract_amounts(text: str):
    currency_patterns = [
        # INR variants
        (re.compile(r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?(?:₹|rs\.?|inr)"), "INR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(rupees?|rupaye|rupiye)"), "INR"),
        # USD variants
        (re.compile(r"(?:\$)\s?(\d[\d,]*(?:\.\d+)?)"), "USD"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?\$"), "USD"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(dollars?)"), "USD"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(cents?)"), "USD"),
        # EUR variants
        (re.compile(r"(?:€|eur)\s?(\d[\d,]*(?:\.\d+)?)"), "EUR"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?€"), "EUR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(euros?)"), "EUR"),
        # GBP variants
        (re.compile(r"(?:£|gbp)\s?(\d[\d,]*(?:\.\d+)?)"), "GBP"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?£"), "GBP"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(pounds?)"), "GBP"),
        # INR large units
        (re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(crores?|crs?|cr)"), "INR"),
    ]
    results = []
    seen = set()
    text_lower = text.lower()
    for pattern, currency_code in currency_patterns:
        for match in pattern.finditer(text_lower):
            groups = match.groups()
            raw_number = next((g for g in groups if g and re.match(r"\d", g)), None)
            if not raw_number:
                continue
            # Ignore phone numbers and IDs (10+ digits)
            if len(raw_number.replace(",", "")) >= 10:
                continue
            try:
                number = float(raw_number.replace(",", ""))
                # Check for lakh/crore/cent multipliers
                if any(g in ('lac', 'lacs', 'lakh', 'lakhs') for g in groups):
                    number *= 100_000
                elif any(g in ('crore', 'crores', 'cr', 'crs') for g in groups):
                    number *= 10_000_000
                elif any(g in ('cent', 'cents') for g in groups):
                    number /= 100
            except Exception:
                continue
            key = (number, currency_code)
            if key not in seen:
                seen.add(key)
                results.append({
                    "value": round(number, 2),
                    "currency": currency_code
                })
    # Fallback matching for generic numeric phrases near expense keywords
    if not results:
        fallback_patterns = [
            re.compile(
                r"\b(?:paid|spent|buy|purchase|cost|price|add(?:ed)?|gift(?:ed)?|bill(?: of)?|recharge(?:d)?|charged|transfer(?:red)?)\b[^0-9]{0,10}(\d[\d,]*(?:\.\d+)?)"
            ),
            re.compile(r"\b(\d[\d,]{2,8})\b\s?(?:rs|inr)?")
        ]
        for fallback_pattern in fallback_patterns:
            match = fallback_pattern.search(text_lower)
            if match:
                number_str = match.group(1).replace(",", "")
                # Ignore phone numbers and IDs
                if len(number_str) >= 10:
                    continue
                try:
                    number = float(number_str)
                    # Context check for year-like numbers
                    if 2020 <= number <= 2100:
                        # Check the surrounding text for a year clue
                        span = match.span(1)
                        surrounding = text_lower[max(0, span[0]-30):span[1]+30]
                        if is_year_context(surrounding):
                            continue  # looks like a year
                    key = (number, "INR")
                    if key not in seen:
                        seen.add(key)
                        results.append({
                            "value": round(number, 2),
                            "currency": "INR"
                        })
                    break  # only extract the first match in fallback mode
                except Exception:
                    continue
    return results
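# Example (illustrative): extract_amounts("Bought a phone for rs. 45,000 on EMI") returns
# [{"value": 45000.0, "currency": "INR"}]; "2 lakh" is scaled to 200000.0, and a bare number
# such as "1200" is only picked up by the fallback patterns when no currency marker is found.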
def predict_expense_category(text, detected_stores):
    text_lower = text.lower()
    # 1. Use detected store category if available
    if detected_stores:
        best_match = max(detected_stores, key=lambda s: s.get("confidence", 1.0))
        return best_match["category"]
    # Category keyword mapping
    category_keywords = {
        "food": [
            "food", "lunch", "dinner", "breakfast", "snacks", "swiggy", "zomato", "dominos", "pizza", "kfc", "mcdonald",
            "restaurant", "hotel", "cafe", "canteen", "meal", "buffet", "thali", "tiffin", "order", "takeaway", "parcel",
            "eat", "eating", "brunch", "supper", "kitchen", "cook", "cooking", "chef", "dish", "dishes", "menu", "serve",
            "served", "serving", "food court", "food delivery", "delivery", "online order", "food app", "food bill",
            "beverage", "juice", "shake", "smoothie", "coffee", "tea", "chai", "cold drink", "soft drink", "soda", "water bottle",
            "ice cream", "dessert", "sweet", "sweets", "chocolate", "candy", "bakery", "bread", "cake", "pastry", "cookie",
            "biscuit", "chips", "fries", "burger", "sandwich", "roll", "wrap", "noodles", "pasta", "rice", "biryani", "curry",
            "gravy", "dal", "sabzi", "roti", "naan", "paratha", "chapati", "idli", "dosa", "vada", "sambar", "chutney", "samosa",
            "pakora", "chaat", "pani puri", "golgappa", "sev", "poha", "upma", "maggi", "maggie", "momos", "spring roll",
            "manchurian", "paneer", "butter chicken", "tandoori", "kebab", "shawarma", "pizza hut", "subway", "starbucks",
            # Hindi/Hinglish
            "khana", "nashta", "bhojan", "rasoi", "dabba", "paani", "jal", "kharcha khana",
            "khane ka bill", "khane ka paisa", "khane ki cheez", "khana order", "khana mangwaya", "khana khaya", "khana khud banaya",
            "khana kharch", "khana kharida", "khana diya", "khana laya", "khana banaya"
        ],
        "transport": [
            "uber", "ola", "taxi", "cab", "bus", "train", "metro", "flight", "auto", "rickshaw", "car", "gaadi", "yatra",
            "safar", "travel", "ticket", "plane", "udaan", "station", "airport", "rapido"
        ],
        "shopping": [
            "amazon", "flipkart", "myntra", "shopping", "clothes", "kapde", "apparel", "shoes", "jeans", "tshirt", "store",
            "fashion", "dukaan", "mall", "bazaar", "market", "kharida", "order diya", "le liya"
        ],
        "housing": [
            "rent", "apartment", "house", "ghar", "flat", "maintenance", "landlord", "kiraya", "makaan", "room", "hostel", "pg", "society"
        ],
        "utilities": [
            "electricity", "power", "bijli", "water", "pani", "gas", "bill", "recharge", "broadband", "wifi", "airtel", "jio",
            "phone", "mobile", "internet", "light", "cylinder", "connection"
        ],
        "entertainment": [
            "movie", "netflix", "hotstar", "bookmyshow", "spotify", "gaming", "youtube premium", "cinema", "film", "picture",
            "game", "khel", "manoranjan", "show", "concert"
        ],
        "health": [
            "medicine", "hospital", "doctor", "clinic", "pharmacy", "tablet", "surgery", "checkup", "dawai", "aspatal", "ilaaj",
            "health", "bimari", "test", "medical", "pathology", "chemist"
        ],
        "travel": [
            "trip", "travel", "tour", "vacation", "hotel", "airbnb", "booking.com", "goibibo", "makemytrip", "yatra", "safar",
            "holiday", "journey", "musafir", "booking", "trip kiya"
        ],
        "education": [
            "course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai",
            "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
        ],
        "digital_services": [
            "domain", "membership", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud",
            "saas", "subscription", "digital", "online", "app", "service", "renewal"
        ],
        "gifts_donations": [
            "gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar",
            "daan", "tohfa", "chanda", "puja", "gurudwara"
        ],
        "finance": [
            "insurance", "sip", "mutual fund", "stock", "demat", "zerodha", "investment", "trading", "upstox", "crypto",
            "policy", "premium", "loan", "emi", "fd", "rd", "paisa", "bank", "account"
        ],
        "family_kids": [
            "kid", "baby", "school", "daycare", "tuition", "books", "uniform", "toys", "creche", "baccha", "bachche",
            "parivar", "family", "beti", "beta", "child", "children"
        ],
        "stationery": [
            "pen", "pencil", "notebook", "diary", "eraser", "sharpener", "paper", "stationery", "register", "files", "file",
            "markers", "highlighter", "sticky notes", "geometry box",
            "stapler", "ink", "printer paper", "stationary shop", "stationary", "copy", "kagaz", "likhne ka saman"
        ]
    }
    # 2. Match using keyword scores
    matched = {cat: sum(1 for kw in kws if kw in text_lower) for cat, kws in category_keywords.items()}
    best_match = max(matched.items(), key=lambda x: x[1])
    if best_match[1] > 0:
        return best_match[0]
    return "miscellaneous"
def insert_text_entry(data):
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        insert_query = """
            INSERT INTO user_entries (
                uuid, user_id, raw_text, word_count, day_of_week, hour_of_day, month, year,
                type, expense_type, intent, confidence_scores, urgency_score,
                time_mentions, parsed_dates, tense, summary,
                people, mood, language, sentiment_score, tags,
                action_required, entities, amounts, stores, processing_time_ms, raw_json
            ) VALUES (
                %(uuid)s, %(user_id)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
                %(type)s, %(expense_type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
                %(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
                %(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
                %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s, %(raw_json)s
            )
            ON CONFLICT (uuid) DO NOTHING;
        """
        cur.execute(insert_query, {
            **data,
            "confidence_scores": Json(data["confidence_scores"]),
            "language": Json(data["language"]),
            "stores": Json(data["stores"]),
            "entities": Json(data["entities"]),
            "amounts": Json(data["amounts"]),
            "raw_json": Json(data["raw_json"])
        })
        conn.commit()
        cur.close()
        conn.close()
        print("Data inserted successfully")
    except Exception as e:
        print("Failed to insert data:", e)
# Basic health-check endpoint (route path assumed; adjust to your deployment)
@app.get("/")
def health_check():
    return {"message": "Hello from yourpartner/demospace - API is running!"}

# Custom error handlers (registration by status code is assumed here)
@app.exception_handler(404)
async def not_found_handler(request: Request, exc):
    return ORJSONResponse(status_code=404, content={"error": "Route not found"})

@app.exception_handler(500)
async def internal_error_handler(request: Request, exc):
    return ORJSONResponse(status_code=500, content={"error": "Internal server error: " + str(exc)})
# Search endpoint to filter user entries based on various criteria
# (route path assumed; the user ID is read from the "userid" request header)
@app.get("/search")
async def search_entries(
    userid: str = Header(..., description="User ID"),
    tags: str = "",
    query: str = "",
    startDate: str = "",
    endDate: str = "",
    type: str = ""
):
    # Validate user_id from header
    if not userid or not userid.strip():
        return ORJSONResponse(status_code=400, content={"error": "Missing or empty userid header."})
    # Build SQL filters
    filters = ["user_id = %s"]
    params = [userid]
    if type:
        filters.append("type = %s")
        params.append(type)
    if tags:
        tag_list = [t.strip() for t in tags.split(",") if t.strip()]
        filters.append("tags && %s")
        params.append(tag_list)
    if query:
        filters.append("(raw_text ILIKE %s OR summary ILIKE %s)")
        params.extend([f"%{query}%", f"%{query}%"])
    if startDate:
        try:
            start_dt = datetime.strptime(startDate, "%d-%m-%Y")
            filters.append("created_at >= %s")
            params.append(start_dt)
        except ValueError:
            return ORJSONResponse(status_code=400, content={"error": "Invalid startDate format. Use DD-MM-YYYY."})
    if endDate:
        try:
            end_dt = datetime.strptime(endDate, "%d-%m-%Y")
            filters.append("created_at <= %s")
            params.append(end_dt)
        except ValueError:
            return ORJSONResponse(status_code=400, content={"error": "Invalid endDate format. Use DD-MM-YYYY."})
    where_clause = " AND ".join(filters)
    query_sql = f"SELECT * FROM user_entries WHERE {where_clause} ORDER BY created_at DESC LIMIT 50"
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        cur.execute(query_sql, tuple(params))
        rows = cur.fetchall()
        columns = [desc[0] for desc in cur.description]
        entries = [dict(zip(columns, row)) for row in rows]
        # Remove raw_json from each entry in the results
        for entry in entries:
            entry.pop("raw_json", None)
        cur.close()
        conn.close()
    except Exception as e:
        return ORJSONResponse(status_code=500, content={"error": str(e)})
    return ORJSONResponse(content={"results": entries})
# Dashboard endpoint aggregating a user's entries
# (route path assumed; user_id is taken as a query parameter)
@app.get("/dashboard")
async def visualyse_dashboard(user_id: str):
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        # Fetch all entries for the user
        cur.execute("SELECT * FROM user_entries WHERE user_id = %s", (user_id,))
        rows = cur.fetchall()
        columns = [desc[0] for desc in cur.description]
        entries = [dict(zip(columns, row)) for row in rows]
        cur.close()
        conn.close()
    except Exception as e:
        return ORJSONResponse(status_code=500, content={"error": str(e)})
    # Section 1: Expense Overview
    expenses = [e for e in entries if e["type"] == "expense"]
    total_expense = sum(a["value"] for e in expenses for a in (e["amounts"] or []))
    expense_count = len(expenses)
    expense_by_category = {}
    for e in expenses:
        cat = e.get("expense_type", "miscellaneous")
        amt = sum(a["value"] for a in (e["amounts"] or []))
        expense_by_category[cat] = expense_by_category.get(cat, 0) + amt
    # Monthly/Weekly Trends
    monthly_trends = {}
    for e in expenses:
        key = f"{e['month']}-{e['year']}"
        amt = sum(a["value"] for a in (e["amounts"] or []))
        monthly_trends[key] = monthly_trends.get(key, 0) + amt
    # Section 2: Top Stores & Categories
    store_stats = {}
    for e in expenses:
        for s in (e["stores"] or []):
            store = s.get("store", "unknown")
            amt = sum(a["value"] for a in (e["amounts"] or []))
            if store not in store_stats:
                store_stats[store] = {"count": 0, "total": 0}
            store_stats[store]["count"] += 1
            store_stats[store]["total"] += amt
    top_categories = sorted(expense_by_category.items(), key=lambda x: x[1], reverse=True)
    # Section 3: Recent Expenses
    recent_expenses = sorted(expenses, key=lambda e: e.get("created_at", ""), reverse=True)[:7]
    # Section 4: Mood Trends
    mood_dist = {}
    for e in entries:
        mood = e.get("mood", "neutral")
        mood_dist[mood] = mood_dist.get(mood, 0) + 1
    # Section 5: Tags & Keywords
    tag_freq = {}
    for e in entries:
        for tag in (e["tags"] or []):
            tag_freq[tag] = tag_freq.get(tag, 0) + 1
    top_tags = sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)[:15]
    # Section 6: Time Analysis
    day_stats = {}
    hour_stats = {}
    for e in expenses:
        day = e.get("day_of_week", "unknown")
        hour = e.get("hour_of_day", 0)
        amt = sum(a["value"] for a in (e["amounts"] or []))
        day_stats[day] = day_stats.get(day, 0) + amt
        hour_stats[hour] = hour_stats.get(hour, 0) + amt
    # Section 7: Meta Info
    entry_count = len(entries)
    type_dist = {}
    for e in entries:
        t = e.get("type", "other")
        type_dist[t] = type_dist.get(t, 0) + 1
    dashboard = {
        "expense_overview": {
            "total_expense": total_expense,
            "expense_count": expense_count,
            "expense_by_category": expense_by_category,
            "monthly_trends": monthly_trends
        },
        "top_stores": store_stats,
        "top_categories": top_categories,
        "recent_expenses": recent_expenses,
        "mood_distribution": mood_dist,
        "top_tags": top_tags,
        "time_analysis": {
            "by_day": day_stats,
            "by_hour": hour_stats
        },
        "meta_info": {
            "entry_count": entry_count,
            "type_distribution": type_dist
        }
    }
    return ORJSONResponse(content=dashboard)
# Main analysis endpoint (route path assumed)
@app.post("/analyze")
async def analyze(input: TextInput):
    start_time = time.time()  # start timing
    text = input.text
    label_map = {
        "task (something to be done or completed)": "task",
        "event (an activity that is happening or has happened)": "event",
        "reminder (a message to remember something in the future)": "reminder",
        "meeting (a planned gathering between people to discuss something)": "meeting",
        "relationship (message about personal or emotional connection with someone)": "relationship",
        "note (general note or quick thought not related to any specific category)": "note",
        "journal (personal reflection or emotional writing about one's day or thoughts)": "journal",
        "memory (recollection or recording of a past moment or experience)": "memory",
        "status_update (current condition, feeling, or situation being shared)": "status_update",
        "sick_notice (informing about illness or not feeling well)": "sick_notice",
        "out_of_office (message about being unavailable for work or responsibilities)": "out_of_office",
        "travel_plan (planning or mentioning a trip or journey)": "travel_plan",
        "celebration (message about a festive occasion, party or achievement)": "celebration",
        "expense (money spent on something, either small or large)": "expense",
        "news (update about public events, announcements, or current affairs)": "news",
        "information (factual content or informative message not tied to user activity)": "information",
        "purchase (buying or ordering something, like a product or service)": "purchase",
        "other (does not clearly fall into any specific category)": "other"
    }
    # classification = classifier(text, labels)
    # Run the classifier in a worker thread so it does not block the event loop
    classification = await asyncio.to_thread(classifier, text, labels, hypothesis_template="This entry is about {}.")
    best_label = classification['labels'][0]
    best_label = label_map.get(best_label, best_label)
    amounts = await asyncio.to_thread(extract_amounts, text)
    # Reclassify as expense when expense keywords or amounts are present
    if (
        best_label == "task"
        and (any(word in text.lower() for word in expense_keywords) or amounts)
    ):
        best_label = "expense"
    if best_label == "purchase":
        best_label = "expense"
    if "reported" in text or "announced" in text or "collapsed" in text:
        if best_label in ["task", "reminder", "event"]:
            best_label = "news"
    scores = dict(zip(classification['labels'], classification['scores']))
    # Convert to short labels
    confidence_scores_full = {
        label_map.get(label, label): score
        for label, score in scores.items()
    }
    # Only keep top 2
    confidence_scores = dict(sorted(confidence_scores_full.items(), key=lambda x: x[1], reverse=True)[:2])
    # Note: extract_dates_with_accuracy returns (time_mentions, parsed_dates) in that order
    time_mentions, parsed_dates = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
    tenses = detect_tense(parsed_dates)
    summary = await asyncio.to_thread(generate_summary, text)
    mood = estimate_mood(text)
    tags = generate_tags(best_label, text)
    language_detected = detect_language(text)
    sentiment_score = get_sentiment_score(text)
    if sentiment_score is None or sentiment_score == "":
        sentiment_score = 0.0
    entities = await asyncio.to_thread(extract_entities, text)
    people = entities["people"]  # extracted people entities
    intent = infer_intent(best_label, text)
    urgency_score = get_urgency_score(text, parsed_dates)
    detected_stores = detect_store_category(text)
    expense_category = ""
    if best_label == "expense" or best_label == "purchase":
        expense_category = predict_expense_category(text, detected_stores)
    # Define action triggers
    ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
    action_required = False
    if any(word in text.lower() for word in ACTION_TRIGGERS):
        action_required = True
    action_required = urgency_score >= 0.6 or action_required
    meta = get_meta_info(text)
    end_time = time.time()  # stop timing
    processing_time_ms = round((end_time - start_time) * 1000)
    result = {
        "uuid": str(uuid.uuid4()),   # unique identifier for the entry
        "user_id": input.user_id,    # ID of the user who submitted the text
        "raw_text": text,
        "word_count": meta["word_count"],
        "day_of_week": meta["day_of_week"],
        "hour_of_day": meta["hour_of_day"],
        "month": meta["month"],
        "year": meta["year"],
        "type": best_label,
        "expense_type": expense_category,
        "intent": intent,
        "confidence_scores": confidence_scores,
        "urgency_score": urgency_score,
        "time_mentions": time_mentions,
        "parsed_dates": parsed_dates,
        "tense": tenses,
        "summary": summary.removeprefix("summary:").strip(),
        "people": people,
        "mood": mood,
        "language": language_detected,
        "sentiment_score": sentiment_score,
        "tags": tags,
        "action_required": action_required,
        "entities": entities,
        "amounts": amounts,
        "stores": detected_stores,
        "processing_time_ms": processing_time_ms
    }
    # Store a copy of the result without raw_json to avoid a circular reference
    raw_json_copy = result.copy()
    raw_json_copy.pop("raw_json", None)  # shouldn't be present, but for safety
    result["raw_json"] = raw_json_copy
    # Insert into the database
    await asyncio.to_thread(insert_text_entry, result)
    # Log the result
    print("Analysis complete")
    # Remove raw_json from the response
    result.pop("raw_json", None)
    # Return the result as a JSON response
    return ORJSONResponse(content=result)
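
# Minimal local-run sketch (an addition, not part of the original file). With uvicorn installed,
# running this module directly serves the app; on Hugging Face Spaces the platform typically
# starts the server itself, and 7860 is the conventional Spaces port.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)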