# demospace/main.py
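# FastAPI service that analyzes free-form user text (English/Hinglish): it
# classifies the entry, extracts amounts, dates, entities, mood, and tags,
# and stores the analyzed result in Postgres.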
import re
from fastapi import FastAPI
from fastapi import Header
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
import dateparser
from datetime import datetime
from langdetect import detect_langs
from textblob import TextBlob
from dateparser.search import search_dates
import uuid
import time
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import ORJSONResponse
from fastapi.requests import Request
from fastapi import status
import asyncio
import psycopg2
from psycopg2.extras import Json
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")
app = FastAPI(default_response_class=ORJSONResponse)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # or your domain(s)
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
CREATE_TABLE_QUERY = """
CREATE TABLE IF NOT EXISTS user_entries (
uuid UUID PRIMARY KEY,
user_id TEXT,
user_name TEXT,
user_email TEXT,
raw_text TEXT,
word_count INT,
day_of_week TEXT,
hour_of_day INT,
month TEXT,
year INT,
type TEXT,
expense_type TEXT,
intent TEXT,
confidence_scores JSONB,
urgency_score FLOAT,
time_mentions TEXT[],
parsed_dates TEXT[],
tense TEXT[],
summary TEXT,
people TEXT[],
mood TEXT,
language JSONB,
sentiment_score FLOAT,
tags TEXT[],
action_required BOOLEAN,
entities JSONB,
amounts JSONB,
stores JSONB,
processing_time_ms INT,
raw_json JSONB,
created_at TIMESTAMPTZ DEFAULT now()
);
"""
@app.on_event("startup")
def run_migrations():
try:
conn = psycopg2.connect(DATABASE_URL)
cur = conn.cursor()
cur.execute(CREATE_TABLE_QUERY)
conn.commit()
cur.close()
conn.close()
print("βœ… Table checked/created at startup.")
except Exception as e:
print("❌ Migration failed:", e)
# Load classification and summarization models
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
# summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
# Load Indic NER (or any general one)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Labels for classification
labels = [
"task (something to be done or completed)",
"event (an activity that is happening or has happened)",
"reminder (a message to remember something in the future)",
"meeting (a planned gathering between people to discuss something)",
"relationship (message about personal or emotional connection with someone)",
"note (general note or quick thought not related to any specific category)",
"journal (personal reflection or emotional writing about one's day or thoughts)",
"memory (recollection or recording of a past moment or experience)",
"status_update (current condition, feeling, or situation being shared)",
"sick_notice (informing about illness or not feeling well)",
"out_of_office (message about being unavailable for work or responsibilities)",
"travel_plan (planning or mentioning a trip or journey)",
"celebration (message about a festive occasion, party or achievement)",
"expense (money spent on something, either small or large)",
"news (update about public events, announcements, or current affairs)",
"information (factual content or informative message not tied to user activity)",
"purchase (buying or ordering something, like a product or service)",
"other (does not clearly fall into any specific category)"
]
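# Popular merchants mapped to a spending category; matched as substrings of the input text.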
POPULAR_STORES = {
"amazon": "shopping",
"flipkart": "shopping",
"myntra": "fashion",
"swiggy": "food",
"zomato": "food",
"uber": "transport",
"ola": "transport",
"bigbasket": "groceries",
"blinkit": "groceries",
"jiomart": "groceries",
"netflix": "entertainment",
"hotstar": "entertainment",
"airbnb": "travel",
"makemytrip": "travel",
"bookmyshow": "entertainment",
"dunzo": "delivery",
"meesho": "shopping",
"nykaa": "beauty",
"instamart": "groceries",
"apple": "electronics",
"google": "services"
}
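# English and Hinglish keywords that signal a payment or purchase; used to re-label "task" entries as "expense".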
expense_keywords = [
"paid", "bought", "purchased", "ordered", "spent", "payment",
"recharged", "booked", "transaction", "debit", "renewed",
"credit card", "cash", "amount", "transfer", "EMI", "wallet",
"petrol", "bill", "invoice", "kharida", "kharcha", "kharch", "bill", "paisa", "khareed", "order", "le liya", "diya", "khud diya", "khud kharida",
"expense", "cost", "buy", "buying", "purchase", "purchased", "paid for", "paid to", "paid via", "paid using",
"expense", "expenses", "costs", "costing", "bills", "bought from", "ordered from", "paid at",
"paid online", "paid cash", "paid card", "paid wallet", "paid app", "paid through", "paid via",
"khariden", "kharidi"
]
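# Request body for the /analyze endpoint.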
class TextInput(BaseModel):
text: str
user_id: str
# Function to detect popular store categories in the text
def detect_store_category(text: str):
found_stores = []
lowered = text.lower()
for store, category in POPULAR_STORES.items():
if store in lowered:
found_stores.append({
"store": store,
"category": category
})
return found_stores
# Function to extract dates and time mentions based on regex patterns
def extract_dates_with_accuracy(text: str, amounts: list = None):
amounts = amounts or []
amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
original_text = text
text_lower = text.lower()
# Step 1: Replace Hinglish phrases with English equivalents (only for parsing)
hinglish_map = {
"aaj": "today",
"kal": "tomorrow", # Assuming future
"parso": "day after tomorrow",
"abhi": "now",
"subah": "morning",
"shaam": "evening",
"raat ko": "night",
"agli baar": "next time",
"agli hafte": "next week",
"agli mahine": "next month",
"iss hafte": "this week",
"iss mahine": "this month",
"pichhle hafte": "last week",
"tareekh": "date",
"do din baad": "in 2 days",
"teen din baad": "in 3 days",
}
replaced_text = text_lower
for h_word, en_word in hinglish_map.items():
replaced_text = re.sub(rf"\b{re.escape(h_word)}\b", en_word, replaced_text)
# Step 2: Parse using dateparser
results = search_dates(replaced_text, settings={
"PREFER_DATES_FROM": "future",
"RELATIVE_BASE": datetime.now(),
"RETURN_AS_TIMEZONE_AWARE": False,
"STRICT_PARSING": True,
})
time_mentions = []
parsed_dates = []
if results:
for phrase, date in results:
clean_phrase = phrase.strip().lower()
if clean_phrase in amount_values:
continue
if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
continue
if re.fullmatch(r"\d{3,4}", clean_phrase): # skip 2025, 1200
continue
time_mentions.append(clean_phrase)
parsed_dates.append(date.isoformat())
return time_mentions, parsed_dates
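# Classify each parsed date as past, present, or future relative to now.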
def detect_tense(parsed_dates):
now = datetime.now()
tenses = set()
for d in parsed_dates:
dt = dateparser.parse(d)
if not dt:
continue
if dt < now:
tenses.add("past")
elif dt > now:
tenses.add("future")
else:
tenses.add("present")
return list(tenses) if tenses else ["unknown"]
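# Generate a short summary (up to 60 tokens, beam search) of the text with FLAN-T5.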
def generate_summary(text):
input_ids = summarizer_tokenizer("summarize: " + text, return_tensors="pt").input_ids
output_ids = summarizer_model.generate(input_ids, max_length=60, num_beams=4, early_stopping=True)
return summarizer_tokenizer.decode(output_ids[0], skip_special_tokens=True)
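# Estimate mood from keyword matches (English + Hinglish), falling back to TextBlob sentiment when no keyword hits.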
def estimate_mood(text):
text_lower = text.lower()
# Expanded mood map with Hindi/Hinglish and phrases
mood_map = {
"happy": [
"happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
"maza aa gaya", "achha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar", "achha"
],
"sad": [
"sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
"bura lag raha hai", "dukhi", "udaas", "rona", "rona aa gaya", "dil toot gaya", "nirash"
],
"angry": [
"angry", "annoyed", "frustrated", "irritated", "mad", "furious", "gussa", "gusse mein", "chidh", "naraz",
"bhadak gaya", "chidh gaya", "irritate", "irritated"
],
"nervous": [
"nervous", "anxious", "scared", "worried", "fearful", "uneasy", "tensed", "tension", "ghabrahat", "chinta",
"parishan", "dara hua", "ghabra gaya", "stress", "stressed"
],
"unwell": [
"sick", "unwell", "not feeling well", "fever", "cold", "headache", "flu", "ill", "nauseous", "dizzy",
"thak gaya", "thaka hua", "bimaar", "bimar", "bukhar", "sardard", "beemar", "kamjor", "thakan"
],
"neutral": [
"ok", "fine", "theek", "normal", "usual", "routine", "nothing special", "kuch khaas nahi", "no stress"
]
}
detected_moods = []
for mood, keywords in mood_map.items():
for kw in keywords:
if kw in text_lower:
detected_moods.append(mood)
break # Only need one match per mood
# Use sentiment as a fallback if no mood keyword matched
if not detected_moods:
sentiment = get_sentiment_score(text)
if sentiment > 0.2:
return "happy"
elif sentiment < -0.2:
return "sad"
else:
return "neutral"
# Priority: angry > sad > unwell > nervous > happy > neutral
priority = ["angry", "sad", "unwell", "nervous", "happy", "neutral"]
for mood in priority:
if mood in detected_moods:
return mood
return "neutral"
def generate_tags(label, text):
# Define stopwords manually (lightweight and fast)
stopwords = set([
"or", "to", "also", "the", "and", "a", "an", "in", "on", "of", "for",
"with", "at", "by", "from", "as", "is", "was", "are", "be", "will",
"has", "have", "it", "this", "that", "but", "if", "not", "so", "do",
"does", "did", "am", "can", "i", "me", "my", "you", "we", "they", "he", "she"
])
base_tags = [label]
# Extract keywords (only alphabetic words with 4 or more letters)
keywords = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
# Filter out stopwords
filtered_keywords = [word for word in keywords if word not in stopwords]
# Add forced tags based on context
force_tags = []
lowered = text.lower()
if any(w in lowered for w in ["sick", "unwell", "not feeling well", "fever"]):
force_tags += ["sick", "leave"]
if "work" in lowered:
force_tags.append("work")
# Merge and deduplicate tags
return list(set(base_tags + force_tags + filtered_keywords))
# Detect language using langdetect
def detect_language(text):
langs = detect_langs(text) # returns list like: [en:0.99, hi:0.01]
if langs:
top_lang = langs[0]
return {"lang": top_lang.lang, "prob": round(top_lang.prob, 6)}
return {"lang": "unknown", "prob": 0}
# Detect sentiment using TextBlob
def get_sentiment_score(text):
try:
blob = TextBlob(text)
return round(blob.sentiment.polarity, 3) # Range: -1 to 1
except Exception:
return 0.0
# Infer intent based on label
def infer_intent(label, text):
label_to_intent = {
"out_of_office": "taking_leave",
"sick_notice": "taking_leave",
"reminder": "set_reminder",
"event": "log_event",
"meeting": "schedule_meeting",
"note": "log_note",
"journal": "log_memory",
"memory": "log_memory",
"status_update": "status_update",
"task": "create_task",
"celebration": "log_event"
}
return label_to_intent.get(label, "other")
# Extract entities using NER
def extract_entities(text):
ner_results = ner_pipeline(text)
entities = {"people": [], "places": [], "organizations": [], "dates": [], "misc": []}
PLACE_KEYWORDS = [
"garden", "hotel", "resort", "mall", "restaurant", "cafe", "market",
"school", "college", "temple", "station", "airport", "hospital",
"park", "store", "shop", "gym", "theater", "cinema", "bank", "office",
"court", "salon", "studio", "museum", "library", "club", "university",
"guest house", "hostel", "canteen", "clinic", "zoo", "residency", "apartment"
]
RELATION_KEYWORDS = [
# English
"mom", "dad", "father", "mother", "sister", "brother", "sis", "bro",
"uncle", "aunt", "aunty", "cousin", "grandfather", "grandmother",
"grandpa", "grandma", "wife", "husband", "son", "daughter", "child",
"kids", "baby", "partner", "fiancΓ©", "fiancΓ©e", "in-laws", "relatives",
"friend", "colleague", "buddy", "pal", "mate", "acquaintance", "companion",
"girlfriend", "boyfriend", "lover", "spouse", "significant other",
# Hindi & Hinglish
"maa", "mummy", "papa", "pappa", "pitaji", "mataji", "didi", "behen", "bhai",
"chacha", "chachi", "mama", "mami", "tau", "tai", "nana", "nani",
"dada", "dadi", "sasur", "sasuma", "jija", "saali", "bhabhi", "devar",
"nandoi", "patni", "pati", "bachcha", "baccha", "beta", "beti", "putra", "putri",
"sambandhi", "rishtedaar", "saheli", "dost", "yara", "saathi"
]
for ent in ner_results:
word = ent["word"].replace("##", "")
if len(word) <= 2 or not word.isalpha():
continue # skip single-letter non-words
group = ent["entity_group"]
if group == "PER":
entities["people"].append(word)
elif group == "LOC":
entities["places"].append(word)
elif group == "ORG":
entities["organizations"].append(word)
elif group == "DATE":
entities["dates"].append(word)
else:
entities["misc"].append(word)
# βœ… Fallback: Add known days/dates if not already captured
day_keywords = re.findall(r'\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b', text, re.IGNORECASE)
for day in day_keywords:
if day not in entities["dates"]:
entities["dates"].append(day)
# βœ… Fallback: Add phrases like β€œproduct launch”, β€œproject”, etc. to misc
lower_text = text.lower()
if "product launch" in lower_text:
entities["misc"].append("product launch")
if "birthday" in lower_text:
entities["misc"].append("birthday")
if "project" in lower_text:
entities["misc"].append("project")
# βœ… Add keyword-based places
for place in PLACE_KEYWORDS:
if place in lower_text and place not in entities["places"]:
entities["places"].append(place)
# βœ… Detect relation keywords (English + Hindi)
for relation in RELATION_KEYWORDS:
if re.search(rf"\b{re.escape(relation)}\b", text.lower()):
entities["people"].append(relation)
# βœ… Deduplicate and return
return {k: list(set(v)) for k, v in entities.items()}
# Function to calculate urgency score based on parsed dates
def get_urgency_score(text, parsed_dates):
urgency_keywords = ["urgent", "asap", "immediate", "must", "need to", "important", "don't forget", "don’t forget", "right away"]
text_lower = text.lower()
score = 0.0
# 1. Keyword-based boost
if any(word in text_lower for word in urgency_keywords):
score = 0.7
# 2. Time-based boost
now = datetime.now()
for d in parsed_dates:
dt = dateparser.parse(d)
if dt:
hours = (dt - now).total_seconds() / 3600
if 0 <= hours <= 24:
score = max(score, 1.0)
elif 24 < hours <= 72:
score = max(score, 0.8)
elif 72 < hours <= 168:
score = max(score, 0.5)
return round(score, 2)
# Function to get meta information about the text
def get_meta_info(text: str):
now = datetime.now()
return {
"word_count": len(text.strip().split()),
"day_of_week": now.strftime('%A'), # e.g., "Thursday"
"hour_of_day": now.hour,
"month": now.strftime('%B'), # e.g., "July"
"year": now.year # 0 to 23
}
def is_year_context(text_snippet):
return bool(re.search(r"\b(?:jan|feb|march|april|may|june|july|aug|sept|oct|nov|dec|year|in|on|by|for)\b", text_snippet))
# Function to extract amounts in various currencies from text
def extract_amounts(text: str):
currency_patterns = [
# INR variants
(re.compile(r"(?:β‚Ή|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
(re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?(?:β‚Ή|rs\.?|inr)"), "INR"),
(re.compile(r"(\d+(?:\.\d+)?)\s?(rupees?|rupaye|rupiye)"), "INR"),
# USD variants
(re.compile(r"(?:\$)\s?(\d[\d,]*(?:\.\d+)?)"), "USD"),
(re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?\$"), "USD"),
(re.compile(r"(\d+(?:\.\d+)?)\s?(dollars?)"), "USD"),
(re.compile(r"(\d+(?:\.\d+)?)\s?(cents?)"), "USD"),
# EUR variants
(re.compile(r"(?:€|eur)\s?(\d[\d,]*(?:\.\d+)?)"), "EUR"),
(re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?€"), "EUR"),
(re.compile(r"(\d+(?:\.\d+)?)\s?(euros?)"), "EUR"),
# GBP variants
(re.compile(r"(?:Β£|gbp)\s?(\d[\d,]*(?:\.\d+)?)"), "GBP"),
(re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?Β£"), "GBP"),
(re.compile(r"(\d+(?:\.\d+)?)\s?(pounds?)"), "GBP"),
# INR large units
(re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
(re.compile(r"(\d+(?:\.\d+)?)\s?(crores?|crs?|cr)"), "INR"),
]
results = []
seen = set()
text_lower = text.lower()
for pattern, currency_code in currency_patterns:
for match in pattern.finditer(text_lower):
groups = match.groups()
raw_number = next((g for g in groups if re.match(r"\d", g)), None)
if not raw_number:
continue
# Ignore phone numbers and IDs (10+ digits)
if len(raw_number.replace(",", "")) >= 10:
continue
try:
number = float(raw_number.replace(",", ""))
# Check for lakh/crore/cent multipliers
if any(g in ("lac", "lacs", "lakh", "lakhs") for g in groups if g):
number *= 100_000
elif any(g in ("crore", "crores", "cr", "crs") for g in groups if g):
number *= 10_000_000
elif any(g in ("cent", "cents") for g in groups if g):
number /= 100
except Exception:
continue
key = (number, currency_code)
if key not in seen:
seen.add(key)
results.append({
"value": round(number, 2),
"currency": currency_code
})
# Fallback matching for generic numeric phrases near expense keywords
if not results:
fallback_patterns = [
re.compile(
r"\b(?:paid|spent|buy|purchase|cost|price|add(?:ed)?|gift(?:ed)?|bill(?: of)?|recharge(?:d)?|charged|transfer(?:red)?)\b[^0-9]{0,10}(\d[\d,]*(?:\.\d+)?)"
),
re.compile(r"\b(\d[\d,]{2,8})\b\s?(?:rs|inr)?")
]
for fallback_pattern in fallback_patterns:
match = fallback_pattern.search(text_lower)
if match:
number_str = match.group(1).replace(",", "")
# Ignore phone numbers and IDs
if len(number_str) >= 10:
continue
try:
number = float(number_str)
# Context check for year-like numbers
if 2020 <= number <= 2100:
# Check 5-6 words before/after for year clue
span = match.span(1)
surrounding = text_lower[max(0, span[0]-30):span[1]+30]
if is_year_context(surrounding):
continue # Looks like a year
key = (number, "INR")
if key not in seen:
seen.add(key)
results.append({
"value": round(number, 2),
"currency": "INR"
})
break # Only extract first match in fallback
except Exception:
continue
return results
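# Pick an expense category: prefer a detected store's category, otherwise score keyword matches per category; default to "miscellaneous".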
def predict_expense_category(text, detected_stores):
text_lower = text.lower()
# 1. Use detected store category if available
if detected_stores:
best_match = max(detected_stores, key=lambda s: s.get("confidence", 1.0))
return best_match["category"]
# Category keyword mapping
category_keywords = {
"food": [
"food", "lunch", "dinner", "breakfast", "snacks", "swiggy", "zomato", "dominos", "pizza", "kfc", "mcdonald",
"restaurant", "hotel", "cafe", "canteen", "meal", "buffet", "thali", "tiffin", "order", "takeaway", "parcel",
"eat", "eating", "brunch", "supper", "kitchen", "cook", "cooking", "chef", "dish", "dishes", "menu", "serve",
"served", "serving", "food court", "food delivery", "delivery", "online order", "food app", "food bill",
"beverage", "juice", "shake", "smoothie", "coffee", "tea", "chai", "cold drink", "soft drink", "soda", "water bottle",
"ice cream", "dessert", "sweet", "sweets", "chocolate", "candy", "bakery", "bread", "cake", "pastry", "cookie",
"biscuit", "chips", "fries", "burger", "sandwich", "roll", "wrap", "noodles", "pasta", "rice", "biryani", "curry",
"gravy", "dal", "sabzi", "roti", "naan", "paratha", "chapati", "idli", "dosa", "vada", "sambar", "chutney", "samosa",
"pakora", "chaat", "pani puri", "golgappa", "sev", "poha", "upma", "maggi", "maggie", "momos", "spring roll",
"manchurian", "paneer", "butter chicken", "tandoori", "kebab", "shawarma", "pizza hut", "subway", "starbucks",
# Hindi/Hinglish
"khana", "nashta", "bhojan", "rasoi", "thali", "dabba", "tiffin", "chai", "paani", "jal", "kharcha khana",
"khane ka bill", "khane ka paisa", "khane ki cheez", "khana order", "khana mangwaya", "khana khaya", "khana khud banaya",
"khana kharch", "khana kharida", "khana diya", "khana laya", "khana banaya"
],
"transport": [
"uber", "ola", "taxi", "cab", "bus", "train", "metro", "flight", "auto", "rickshaw", "car", "gaadi", "yatra", "safar", "travel", "ticket", "plane", "udaan", "station", "airport", "rapido",
],
"shopping": [
"amazon", "flipkart", "myntra", "shopping", "clothes", "kapde", "apparel", "shoes", "jeans", "tshirt", "store", "fashion", "dukaan", "mall", "bazaar", "market", "kharida", "order diya", "le liya"
],
"housing": [
"rent", "apartment", "house", "ghar", "flat", "maintenance", "landlord", "kiraya", "makaan", "room", "hostel", "pg", "society"
],
"utilities": [
"electricity", "power", "bijli", "water", "pani", "gas", "bill", "recharge", "broadband", "wifi", "airtel", "jio", "phone", "mobile", "internet", "light", "cylinder", "connection"
],
"entertainment": [
"movie", "netflix", "hotstar", "bookmyshow", "spotify", "gaming", "youtube premium", "cinema", "film", "picture", "game", "khel", "manoranjan", "show", "concert"
],
"health": [
"medicine", "hospital", "doctor", "clinic", "pharmacy", "tablet", "surgery", "checkup", "dawai", "aspatal", "ilaaj", "health", "bimari", "test", "medical", "pathology", "chemist"
],
"travel": [
"trip", "travel", "tour", "vacation", "hotel", "airbnb", "booking.com", "goibibo", "makemytrip", "yatra", "safar", "holiday", "journey", "musafir", "booking", "trip kiya"
],
"education": [
"course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
],
"digital_services": [
"domain", "membership", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
],
"gifts_donations": [
"gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
],
"finance": [
"insurance", "sip", "mutual fund", "stock", "demat", "zerodha", "investment", "trading", "upstox", "crypto", "policy", "premium", "loan", "emi", "fd", "rd", "paisa", "bank", "account"
],
"family_kids": [
"kid", "baby", "school", "daycare", "tuition", "books", "uniform", "toys", "creche", "baccha", "bachche", "parivar", "family", "beti", "beta", "child", "children"
],
"stationery": [
"pen", "pencil", "notebook", "diary", "eraser", "sharpener", "paper", "stationery", "register", "files", "file", "markers", "highlighter", "sticky notes", "geometry box",
"stapler", "ink", "printer paper", "stationary shop", "stationary", "copy", "kagaz", "likhne ka saman"
]
}
# 2. Match using keyword scores
matched = {cat: sum(1 for kw in kws if kw in text_lower) for cat, kws in category_keywords.items()}
best_match = max(matched.items(), key=lambda x: x[1])
if best_match[1] > 0:
return best_match[0]
return "miscellaneous"
def insert_text_entry(data):
try:
conn = psycopg2.connect(DATABASE_URL)
cur = conn.cursor()
insert_query = """
INSERT INTO user_entries (
uuid, user_id, raw_text, word_count, day_of_week, hour_of_day, month, year,
type, expense_type, intent, confidence_scores, urgency_score,
time_mentions, parsed_dates, tense, summary,
people, mood, language, sentiment_score, tags,
action_required, entities, amounts, stores, processing_time_ms, raw_json
) VALUES (
%(uuid)s, %(user_id)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
%(type)s, %(expense_type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
%(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
%(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
%(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s, %(raw_json)s
)
ON CONFLICT (uuid) DO NOTHING;
"""
cur.execute(insert_query, {
**data,
"confidence_scores": Json(data["confidence_scores"]),
"language": Json(data["language"]),
"stores": Json(data["stores"]),
"entities": Json(data["entities"]),
"amounts": Json(data["amounts"]),
"raw_json": Json(data["raw_json"])
})
conn.commit()
cur.close()
conn.close()
print("βœ… Data inserted successfully")
except Exception as e:
print("❌ Failed to insert data:", e)
@app.get("/health")
def health_check():
return {"message": "βœ… Hello from yourpartner/demospace β€” API is running!"}
@app.exception_handler(404)
async def not_found_handler(request: Request, exc):
return ORJSONResponse(status_code=404, content={"error": "Route not found"})
@app.exception_handler(500)
async def internal_error_handler(request: Request, exc):
return ORJSONResponse(status_code=500, content={"error": "Internal server error: " + str(exc)})
# Search endpoint to filter user entries based on various criteria
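# Example (illustrative): GET /search?tags=work,urgent&query=meeting&type=task
# with header "userid: <user id>"; startDate/endDate use DD-MM-YYYY.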
@app.get("/search", response_class=ORJSONResponse)
async def search_entries(
userid: str = Header(..., description="User ID"),
tags: str = "",
query: str = "",
startDate: str = "",
endDate: str = "",
type: str = ""
):
# Validate user_id from header
if not userid or not userid.strip():
return ORJSONResponse(status_code=400, content={"error": "Missing or empty userid header."})
# Build SQL filters
filters = ["user_id = %s"]
params = [userid]
if type:
filters.append("type = %s")
params.append(type)
if tags:
tag_list = [t.strip() for t in tags.split(",") if t.strip()]
filters.append("tags && %s")
params.append(tag_list)
if query:
filters.append("(raw_text ILIKE %s OR summary ILIKE %s)")
params.extend([f"%{query}%", f"%{query}%"])
if startDate:
try:
start_dt = datetime.strptime(startDate, "%d-%m-%Y")
filters.append("created_at >= %s")
params.append(start_dt)
except ValueError:
return ORJSONResponse(status_code=400, content={"error": "Invalid startDate format. Use DD-MM-YYYY."})
if endDate:
try:
end_dt = datetime.strptime(endDate, "%d-%m-%Y")
filters.append("created_at <= %s")
params.append(end_dt)
except ValueError:
return ORJSONResponse(status_code=400, content={"error": "Invalid endDate format. Use DD-MM-YYYY."})
where_clause = " AND ".join(filters)
query_sql = f"SELECT * FROM user_entries WHERE {where_clause} ORDER BY created_at DESC LIMIT 50"
try:
conn = psycopg2.connect(DATABASE_URL)
cur = conn.cursor()
cur.execute(query_sql, tuple(params))
rows = cur.fetchall()
columns = [desc[0] for desc in cur.description]
entries = [dict(zip(columns, row)) for row in rows]
# Remove raw_json from each entry in results
for entry in entries:
entry.pop("raw_json", None)
cur.close()
conn.close()
except Exception as e:
return ORJSONResponse(status_code=500, content={"error": str(e)})
return ORJSONResponse(content={"results": entries})
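# Aggregate one user's entries into dashboard stats: expenses by category and
# month, top stores and tags, mood distribution, and spend by day/hour.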
@app.get("/visualyse/{user_id}", response_class=ORJSONResponse)
async def visualyse_dashboard(user_id: str):
try:
conn = psycopg2.connect(DATABASE_URL)
cur = conn.cursor()
# Fetch all entries for the user
cur.execute("SELECT * FROM user_entries WHERE user_id = %s", (user_id,))
rows = cur.fetchall()
columns = [desc[0] for desc in cur.description]
entries = [dict(zip(columns, row)) for row in rows]
cur.close()
conn.close()
except Exception as e:
return ORJSONResponse(status_code=500, content={"error": str(e)})
# Section 1: Expense Overview
expenses = [e for e in entries if e["type"] == "expense"]
total_expense = sum(a["value"] for e in expenses for a in (e["amounts"] or []))
expense_count = len(expenses)
expense_by_category = {}
for e in expenses:
cat = e.get("expense_type", "miscellaneous")
amt = sum(a["value"] for a in (e["amounts"] or []))
expense_by_category[cat] = expense_by_category.get(cat, 0) + amt
# Monthly/Weekly Trends
monthly_trends = {}
for e in expenses:
key = f"{e['month']}-{e['year']}"
amt = sum(a["value"] for a in (e["amounts"] or []))
monthly_trends[key] = monthly_trends.get(key, 0) + amt
# Section 2: Top Stores & Categories
store_stats = {}
for e in expenses:
for s in (e["stores"] or []):
store = s.get("store", "unknown")
amt = sum(a["value"] for a in (e["amounts"] or []))
if store not in store_stats:
store_stats[store] = {"count": 0, "total": 0}
store_stats[store]["count"] += 1
store_stats[store]["total"] += amt
top_categories = sorted(expense_by_category.items(), key=lambda x: x[1], reverse=True)
# Section 3: Recent Expenses
recent_expenses = sorted(expenses, key=lambda e: e.get("created_at", ""), reverse=True)[:7]
# Section 4: Mood Trends
mood_dist = {}
for e in entries:
mood = e.get("mood", "neutral")
mood_dist[mood] = mood_dist.get(mood, 0) + 1
# Section 5: Tags & Keywords
tag_freq = {}
for e in entries:
for tag in (e["tags"] or []):
tag_freq[tag] = tag_freq.get(tag, 0) + 1
top_tags = sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)[:15]
# Section 6: Time Analysis
day_stats = {}
hour_stats = {}
for e in expenses:
day = e.get("day_of_week", "unknown")
hour = e.get("hour_of_day", 0)
amt = sum(a["value"] for a in (e["amounts"] or []))
day_stats[day] = day_stats.get(day, 0) + amt
hour_stats[hour] = hour_stats.get(hour, 0) + amt
# Section 7: Meta Info
entry_count = len(entries)
type_dist = {}
for e in entries:
t = e.get("type", "other")
type_dist[t] = type_dist.get(t, 0) + 1
dashboard = {
"expense_overview": {
"total_expense": total_expense,
"expense_count": expense_count,
"expense_by_category": expense_by_category,
"monthly_trends": monthly_trends
},
"top_stores": store_stats,
"top_categories": top_categories,
"recent_expenses": recent_expenses,
"mood_distribution": mood_dist,
"top_tags": top_tags,
"time_analysis": {
"by_day": day_stats,
"by_hour": hour_stats
},
"meta_info": {
"entry_count": entry_count,
"type_distribution": type_dist
}
}
return ORJSONResponse(content=dashboard)
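# Main analysis pipeline: classify the text, extract amounts/dates/entities,
# score urgency and sentiment, persist the row, and return the result.
# Example (illustrative) request body: {"text": "Paid 500 rs for lunch", "user_id": "u123"}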
@app.post("/analyze", response_class=ORJSONResponse)
async def analyze(input: TextInput):
start_time = time.time() # ⏱️ start
text = input.text
label_map = {
"task (something to be done or completed)": "task",
"event (an activity that is happening or has happened)": "event",
"reminder (a message to remember something in the future)": "reminder",
"meeting (a planned gathering between people to discuss something)": "meeting",
"relationship (message about personal or emotional connection with someone)": "relationship",
"note (general note or quick thought not related to any specific category)": "note",
"journal (personal reflection or emotional writing about one's day or thoughts)": "journal",
"memory (recollection or recording of a past moment or experience)": "memory",
"status_update (current condition, feeling, or situation being shared)": "status_update",
"sick_notice (informing about illness or not feeling well)": "sick_notice",
"out_of_office (message about being unavailable for work or responsibilities)": "out_of_office",
"travel_plan (planning or mentioning a trip or journey)": "travel_plan",
"celebration (message about a festive occasion, party or achievement)": "celebration",
"expense (money spent on something, either small or large)": "expense",
"news (update about public events, announcements, or current affairs)": "news",
"information (factual content or informative message not tied to user activity)": "information",
"purchase (buying or ordering something, like a product or service)": "purchase",
"other (does not clearly fall into any specific category)": "other"
}
# classification = classifier(text, labels)
# Async call to classifier
classification = await asyncio.to_thread(classifier, text, labels, hypothesis_template="This entry is about {}.")
best_label = classification['labels'][0]
best_label = label_map.get(best_label, best_label)
amounts = await asyncio.to_thread(extract_amounts, text)
# Check if the best label is expense or purchase based on keywords
if (
best_label == "task"
and (any(word in text.lower() for word in expense_keywords) or amounts)
):
best_label = "expense"
if best_label == "purchase":
best_label = "expense"
if "reported" in text or "announced" in text or "collapsed" in text:
if best_label in ["task", "reminder", "event"]:
best_label = "news"
scores = dict(zip(classification['labels'], classification['scores']))
# Convert verbose labels to short ones
confidence_scores_full = {
label_map.get(label, label): score
for label, score in scores.items()
}
# Only keep top 2
confidence_scores = dict(sorted(confidence_scores_full.items(), key=lambda x: x[1], reverse=True)[:2])
time_mentions, parsed_dates = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
tenses = detect_tense(parsed_dates)
summary = await asyncio.to_thread(generate_summary, text)
mood = estimate_mood(text)
tags = generate_tags(best_label, text)
language_detected = detect_language(text)
sentiment_score = get_sentiment_score(text)
if sentiment_score is None or sentiment_score == "":
sentiment_score = 0.0
entities = await asyncio.to_thread(extract_entities, text)
people = entities["people"] # Extracted people entities
intent = infer_intent(best_label, text)
urgency_score = get_urgency_score(text, parsed_dates)
detected_stores = detect_store_category(text)
expense_category = ""
if best_label == "expense" or best_label == "purchase":
expense_category = predict_expense_category(text, detected_stores)
# Define action triggers
ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
action_required = any(word in text.lower() for word in ACTION_TRIGGERS) or urgency_score >= 0.6
meta = get_meta_info(text)
end_time = time.time() # ⏱️ end
processing_time_ms = round((end_time - start_time) * 1000)
result = {
"uuid": str(uuid.uuid4()), # Unique identifier for the request
"user_id": input.user_id, # Unique identifier for the request
"raw_text": text,
"word_count": meta["word_count"],
"day_of_week": meta["day_of_week"],
"hour_of_day": meta["hour_of_day"],
"month": meta["month"],
"year": meta["year"],
"type": best_label,
"expense_type": expense_category,
"intent": intent,
"confidence_scores": confidence_scores,
"urgency_score": urgency_score,
"time_mentions": time_mentions,
"parsed_dates": parsed_dates,
"tense": tenses,
"summary": summary.removeprefix("summary:").strip(),
"people": people,
"mood": mood,
"language": language_detected,
"sentiment_score": sentiment_score,
"tags": tags,
"action_required": action_required,
"entities": entities,
"amounts": amounts,
"stores": detected_stores,
"processing_time_ms": processing_time_ms
}
# Store a copy of result without raw_json to avoid circular reference
raw_json_copy = result.copy()
# Remove raw_json if present (shouldn't be, but for safety)
raw_json_copy.pop("raw_json", None)
result["raw_json"] = raw_json_copy
# Insert into database
await asyncio.to_thread(insert_text_entry, result)
# Log the result
print("βœ… Analysis complete")
# Remove raw_json from response
result.pop("raw_json", None)
# Return the result as JSON response
return ORJSONResponse(content=result)