import re
from fastapi import FastAPI
from fastapi import Header
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
import dateparser
from datetime import datetime
from langdetect import detect_langs
from textblob import TextBlob
from dateparser.search import search_dates
import uuid
import time
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import ORJSONResponse
from fastapi.requests import Request
from fastapi import status
import asyncio
import psycopg2
from psycopg2.extras import Json
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")

app = FastAPI(default_response_class=ORJSONResponse)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # or your domain(s)
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
CREATE_TABLE_QUERY = """
CREATE TABLE IF NOT EXISTS user_entries (
    uuid UUID PRIMARY KEY,
    user_id TEXT,
    user_name TEXT,
    user_email TEXT,
    raw_text TEXT,
    word_count INT,
    day_of_week TEXT,
    hour_of_day INT,
    month TEXT,
    year INT,
    type TEXT,
    expense_type TEXT,
    intent TEXT,
    confidence_scores JSONB,
    urgency_score FLOAT,  -- 0.0 to 1.0, as returned by get_urgency_score
    time_mentions TEXT[],
    parsed_dates TEXT[],
    tense TEXT[],
    summary TEXT,
    people TEXT[],
    mood TEXT,
    language JSONB,
    sentiment_score FLOAT,
    tags TEXT[],
    action_required BOOLEAN,
    entities JSONB,
    amounts JSONB,
    stores JSONB,
    processing_time_ms INT,
    raw_json JSONB,
    created_at TIMESTAMPTZ DEFAULT now()
);
"""
# Ensure the table exists before serving requests
@app.on_event("startup")  # registration as a startup hook is assumed here
def run_migrations():
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        cur.execute(CREATE_TABLE_QUERY)
        conn.commit()
        cur.close()
        conn.close()
        print("Table checked/created at startup.")
    except Exception as e:
        print("Migration failed:", e)
# Load classification and summarization models
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
# summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Load NER model (dslim/bert-base-NER is a general-purpose English NER)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Labels for classification
labels = [
    "task (something to be done or completed)",
    "event (an activity that is happening or has happened)",
    "reminder (a message to remember something in the future)",
    "meeting (a planned gathering between people to discuss something)",
    "relationship (message about personal or emotional connection with someone)",
    "note (general note or quick thought not related to any specific category)",
    "journal (personal reflection or emotional writing about one's day or thoughts)",
    "memory (recollection or recording of a past moment or experience)",
    "status_update (current condition, feeling, or situation being shared)",
    "sick_notice (informing about illness or not feeling well)",
    "out_of_office (message about being unavailable for work or responsibilities)",
    "travel_plan (planning or mentioning a trip or journey)",
    "celebration (message about a festive occasion, party or achievement)",
    "expense (money spent on something, either small or large)",
    "news (update about public events, announcements, or current affairs)",
    "information (factual content or informative message not tied to user activity)",
    "purchase (buying or ordering something, like a product or service)",
    "other (does not clearly fall into any specific category)"
]
POPULAR_STORES = {
    "amazon": "shopping",
    "flipkart": "shopping",
    "myntra": "fashion",
    "swiggy": "food",
    "zomato": "food",
    "uber": "transport",
    "ola": "transport",
    "bigbasket": "groceries",
    "blinkit": "groceries",
    "jiomart": "groceries",
    "netflix": "entertainment",
    "hotstar": "entertainment",
    "airbnb": "travel",
    "makemytrip": "travel",
    "bookmyshow": "entertainment",
    "dunzo": "delivery",
    "meesho": "shopping",
    "nykaa": "beauty",
    "instamart": "groceries",
    "apple": "electronics",
    "google": "services"
}
expense_keywords = [
    "paid", "bought", "purchased", "ordered", "spent", "payment",
    "recharged", "booked", "transaction", "debit", "renewed",
    "credit card", "cash", "amount", "transfer", "EMI", "wallet",
    "petrol", "bill", "invoice", "kharida", "kharcha", "kharch", "paisa", "khareed", "order",
    "le liya", "diya", "khud diya", "khud kharida",
    "expense", "expenses", "cost", "costs", "costing", "buy", "buying", "purchase",
    "paid for", "paid to", "paid via", "paid using", "paid at",
    "bills", "bought from", "ordered from",
    "paid online", "paid cash", "paid card", "paid wallet", "paid app", "paid through",
    "khariden", "kharidi"
]
class TextInput(BaseModel):
    text: str
    user_id: str
# Function to detect popular store categories in the text
def detect_store_category(text: str):
    found_stores = []
    lowered = text.lower()
    for store, category in POPULAR_STORES.items():
        if store in lowered:
            found_stores.append({
                "store": store,
                "category": category
            })
    return found_stores
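# Example (illustrative): detect_store_category("Ordered dinner from Swiggy and paid 300")
# returns [{"store": "swiggy", "category": "food"}], since matching is a simple
# case-insensitive substring check against POPULAR_STORES.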
# Function to extract dates and time mentions based on regex patterns
def extract_dates_with_accuracy(text: str, amounts: list = None):
    amounts = amounts or []
    amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
    original_text = text
    text_lower = text.lower()
    # Step 1: Replace Hinglish phrases with English equivalents (only for parsing)
    hinglish_map = {
        "aaj": "today",
        "kal": "tomorrow",  # assuming future
        "parso": "day after tomorrow",
        "abhi": "now",
        "subah": "morning",
        "shaam": "evening",
        "raat ko": "night",
        "agli baar": "next time",
        "agli hafte": "next week",
        "agli mahine": "next month",
        "iss hafte": "this week",
        "iss mahine": "this month",
        "pichhle hafte": "last week",
        "tareekh": "date",
        "do din baad": "in 2 days",
        "teen din baad": "in 3 days",
    }
    replaced_text = text_lower
    for h_word, en_word in hinglish_map.items():
        replaced_text = re.sub(rf"\b{re.escape(h_word)}\b", en_word, replaced_text)
    # Step 2: Parse using dateparser
    results = search_dates(replaced_text, settings={
        "PREFER_DATES_FROM": "future",
        "RELATIVE_BASE": datetime.now(),
        "RETURN_AS_TIMEZONE_AWARE": False,
        "STRICT_PARSING": True,
    })
    time_mentions = []
    parsed_dates = []
    if results:
        for phrase, date in results:
            clean_phrase = phrase.strip().lower()
            if clean_phrase in amount_values:
                continue
            if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
                continue
            if re.fullmatch(r"\d{3,4}", clean_phrase):  # skip bare numbers like 2025 or 1200
                continue
            time_mentions.append(clean_phrase)
            parsed_dates.append(date.isoformat())
    return time_mentions, parsed_dates
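# Example (illustrative): for "Meeting kal subah 10 baje", the Hinglish map rewrites the text
# to "meeting tomorrow morning 10 baje" before dateparser.search_dates runs, so the returned
# time_mentions/parsed_dates should reflect tomorrow's date; the exact output depends on
# dateparser's behavior under these settings (including STRICT_PARSING).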
def detect_tense(parsed_dates):
    now = datetime.now()
    tenses = set()
    for d in parsed_dates:
        dt = dateparser.parse(d)
        if not dt:
            continue
        if dt < now:
            tenses.add("past")
        elif dt > now:
            tenses.add("future")
        else:
            tenses.add("present")
    return list(tenses) if tenses else ["unknown"]
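# Example (illustrative): detect_tense(["2020-01-01T09:00:00"]) returns ["past"], while an ISO
# string a few days ahead of now yields ["future"]; strings dateparser cannot parse are skipped.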
def generate_summary(text):
    input_ids = summarizer_tokenizer("summarize: " + text, return_tensors="pt").input_ids
    output_ids = summarizer_model.generate(input_ids, max_length=60, num_beams=4, early_stopping=True)
    return summarizer_tokenizer.decode(output_ids[0], skip_special_tokens=True)
def estimate_mood(text):
    text_lower = text.lower()
    # Expanded mood map with Hindi/Hinglish words and phrases
    mood_map = {
        "happy": [
            "happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
            "maza aa gaya", "achha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar", "achha"
        ],
        "sad": [
            "sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
            "bura lag raha hai", "dukhi", "udaas", "rona", "rona aa gaya", "dil toot gaya", "nirash"
        ],
        "angry": [
            "angry", "annoyed", "frustrated", "irritated", "mad", "furious", "gussa", "gusse mein", "chidh", "naraz",
            "bhadak gaya", "chidh gaya", "irritate", "irritated"
        ],
        "nervous": [
            "nervous", "anxious", "scared", "worried", "fearful", "uneasy", "tensed", "tension", "ghabrahat", "chinta",
            "parishan", "dara hua", "ghabra gaya", "stress", "stressed"
        ],
        "unwell": [
            "sick", "unwell", "not feeling well", "fever", "cold", "headache", "flu", "ill", "nauseous", "dizzy",
            "thak gaya", "thaka hua", "bimaar", "bimar", "bukhar", "sardard", "beemar", "kamjor", "thakan"
        ],
        "neutral": [
            "ok", "fine", "theek", "normal", "usual", "routine", "nothing special", "kuch khaas nahi", "no stress"
        ]
    }
    detected_moods = []
    for mood, keywords in mood_map.items():
        for kw in keywords:
            if kw in text_lower:
                detected_moods.append(mood)
                break  # only need one match per mood
    # Use sentiment as a fallback if no mood keyword matched
    if not detected_moods:
        sentiment = get_sentiment_score(text)
        if sentiment > 0.2:
            return "happy"
        elif sentiment < -0.2:
            return "sad"
        else:
            return "neutral"
    # Priority: angry > sad > unwell > nervous > happy > neutral
    priority = ["angry", "sad", "unwell", "nervous", "happy", "neutral"]
    for mood in priority:
        if mood in detected_moods:
            return mood
    return "neutral"
def generate_tags(label, text):
    # Define stopwords manually (lightweight and fast)
    stopwords = set([
        "or", "to", "also", "the", "and", "a", "an", "in", "on", "of", "for",
        "with", "at", "by", "from", "as", "is", "was", "are", "be", "will",
        "has", "have", "it", "this", "that", "but", "if", "not", "so", "do",
        "does", "did", "am", "can", "i", "me", "my", "you", "we", "they", "he", "she"
    ])
    base_tags = [label]
    # Extract keywords (only alphabetic words with 4 or more letters)
    keywords = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
    # Filter out stopwords
    filtered_keywords = [word for word in keywords if word not in stopwords]
    # Add forced tags based on context
    force_tags = []
    lowered = text.lower()
    if any(w in lowered for w in ["sick", "unwell", "not feeling well", "fever"]):
        force_tags += ["sick", "leave"]
    if "work" in lowered:
        force_tags.append("work")
    # Merge and deduplicate tags
    return list(set(base_tags + force_tags + filtered_keywords))
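# Example (illustrative): generate_tags("sick_notice", "Not feeling well, taking leave from work today")
# returns a deduplicated list containing "sick_notice", the forced tags "sick", "leave" and "work",
# plus 4+ letter keywords such as "feeling", "taking" and "today".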
# Detect language using langdetect
def detect_language(text):
    try:
        langs = detect_langs(text)  # returns a list like [en:0.99, hi:0.01]
    except Exception:
        # langdetect raises on empty or featureless input (e.g. only digits/punctuation)
        return {"lang": "unknown", "prob": 0}
    if langs:
        top_lang = langs[0]
        return {"lang": top_lang.lang, "prob": round(top_lang.prob, 6)}
    return {"lang": "unknown", "prob": 0}

# Detect sentiment using TextBlob
def get_sentiment_score(text):
    try:
        blob = TextBlob(text)
        return round(blob.sentiment.polarity, 3)  # range: -1 to 1
    except Exception:
        return 0.0
# Infer intent based on label
def infer_intent(label, text):
    label_to_intent = {
        "out_of_office": "taking_leave",
        "sick_notice": "taking_leave",
        "reminder": "set_reminder",
        "event": "log_event",
        "meeting": "schedule_meeting",
        "note": "log_note",
        "journal": "log_memory",
        "memory": "log_memory",
        "status_update": "status_update",
        "task": "create_task",
        "celebration": "log_event"
    }
    return label_to_intent.get(label, "other")
# Extract entities using NER
def extract_entities(text):
    ner_results = ner_pipeline(text)
    entities = {"people": [], "places": [], "organizations": [], "dates": [], "misc": []}
    PLACE_KEYWORDS = [
        "garden", "hotel", "resort", "mall", "restaurant", "cafe", "market",
        "school", "college", "temple", "station", "airport", "hospital",
        "park", "store", "shop", "gym", "theater", "cinema", "bank", "office",
        "court", "salon", "studio", "museum", "library", "club", "university",
        "guest house", "hostel", "canteen", "clinic", "zoo", "residency", "apartment"
    ]
    RELATION_KEYWORDS = [
        # English
        "mom", "dad", "father", "mother", "sister", "brother", "sis", "bro",
        "uncle", "aunt", "aunty", "cousin", "grandfather", "grandmother",
        "grandpa", "grandma", "wife", "husband", "son", "daughter", "child",
        "kids", "baby", "partner", "fiancé", "fiancée", "in-laws", "relatives",
        "friend", "colleague", "buddy", "pal", "mate", "acquaintance", "companion",
        "girlfriend", "boyfriend", "lover", "spouse", "significant other",
        # Hindi & Hinglish
        "maa", "mummy", "papa", "pappa", "pitaji", "mataji", "didi", "behen", "bhai",
        "chacha", "chachi", "mama", "mami", "tau", "tai", "nana", "nani",
        "dada", "dadi", "sasur", "sasuma", "jija", "saali", "bhabhi", "devar",
        "nandoi", "patni", "pati", "bachcha", "baccha", "beta", "beti", "putra", "putri",
        "sambandhi", "rishtedaar", "saheli", "dost", "yara", "saathi"
    ]
    for ent in ner_results:
        word = ent["word"].replace("##", "")
        if len(word) <= 2 or not word.isalpha():
            continue  # skip very short tokens and non-alphabetic fragments
        group = ent["entity_group"]
        if group == "PER":
            entities["people"].append(word)
        elif group == "LOC":
            entities["places"].append(word)
        elif group == "ORG":
            entities["organizations"].append(word)
        elif group == "DATE":
            entities["dates"].append(word)
        else:
            entities["misc"].append(word)
    # Fallback: add known days/dates if not already captured
    day_keywords = re.findall(r'\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b', text, re.IGNORECASE)
    for day in day_keywords:
        if day not in entities["dates"]:
            entities["dates"].append(day)
    # Fallback: add phrases like "product launch", "project", etc. to misc
    lower_text = text.lower()
    if "product launch" in lower_text:
        entities["misc"].append("product launch")
    if "birthday" in lower_text:
        entities["misc"].append("birthday")
    if "project" in lower_text:
        entities["misc"].append("project")
    # Add keyword-based places
    for place in PLACE_KEYWORDS:
        if place in lower_text and place not in entities["places"]:
            entities["places"].append(place)
    # Detect relation keywords (English + Hindi)
    for relation in RELATION_KEYWORDS:
        if re.search(rf"\b{re.escape(relation)}\b", text.lower()):
            entities["people"].append(relation)
    # Deduplicate and return
    return {k: list(set(v)) for k, v in entities.items()}
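# Example (illustrative): for "Dinner with Rahul at the hotel on Friday", the NER model should
# tag "Rahul" as PER; "hotel" is added to places and "Friday" to dates by the keyword fallbacks
# even if the model misses them (model output can vary between runs and model versions).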
# Function to calculate urgency score based on keywords and parsed dates
def get_urgency_score(text, parsed_dates):
    urgency_keywords = ["urgent", "asap", "immediate", "must", "need to", "important", "don't forget", "right away"]
    text_lower = text.lower()
    score = 0.0
    # 1. Keyword-based boost
    if any(word in text_lower for word in urgency_keywords):
        score = 0.7
    # 2. Time-based boost
    now = datetime.now()
    for d in parsed_dates:
        dt = dateparser.parse(d)
        if dt:
            hours = (dt - now).total_seconds() / 3600
            if 0 <= hours <= 24:
                score = max(score, 1.0)
            elif 24 < hours <= 72:
                score = max(score, 0.8)
            elif 72 < hours <= 168:
                score = max(score, 0.5)
    return round(score, 2)
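# Example (illustrative): "Need to submit the report asap" with no parsed dates scores 0.7 from
# the keyword boost alone; a parsed date within the next 24 hours raises the score to 1.0.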
# Function to get meta information about the text
def get_meta_info(text: str):
    now = datetime.now()
    return {
        "word_count": len(text.strip().split()),
        "day_of_week": now.strftime('%A'),  # e.g., "Thursday"
        "hour_of_day": now.hour,            # 0 to 23
        "month": now.strftime('%B'),        # e.g., "July"
        "year": now.year
    }
def is_year_context(text_snippet):
    return bool(re.search(r"\b(?:jan|feb|march|april|may|june|july|aug|sept|oct|nov|dec|year|in|on|by|for)\b", text_snippet))
# Function to extract amounts in various currencies from text
def extract_amounts(text: str):
    currency_patterns = [
        # INR variants
        (re.compile(r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?(?:₹|rs\.?|inr)"), "INR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(rupees?|rupaye|rupiye)"), "INR"),
        # USD variants
        (re.compile(r"(?:\$)\s?(\d[\d,]*(?:\.\d+)?)"), "USD"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?\$"), "USD"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(dollars?)"), "USD"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(cents?)"), "USD"),
        # EUR variants
        (re.compile(r"(?:€|eur)\s?(\d[\d,]*(?:\.\d+)?)"), "EUR"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?€"), "EUR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(euros?)"), "EUR"),
        # GBP variants
        (re.compile(r"(?:£|gbp)\s?(\d[\d,]*(?:\.\d+)?)"), "GBP"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?£"), "GBP"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(pounds?)"), "GBP"),
        # INR large units
        (re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(crores?|crs?|cr)"), "INR"),
    ]
    results = []
    seen = set()
    text_lower = text.lower()
    for pattern, currency_code in currency_patterns:
        for match in pattern.finditer(text_lower):
            groups = match.groups()
            raw_number = next((g for g in groups if g and re.match(r"\d", g)), None)
            if not raw_number:
                continue
            # Ignore phone numbers and IDs (10+ digits)
            if len(raw_number.replace(",", "")) >= 10:
                continue
            try:
                number = float(raw_number.replace(",", ""))
                # Check for lakh/crore/cent multipliers
                if any(g in ('lac', 'lacs', 'lakh', 'lakhs') for g in groups):
                    number *= 100_000
                elif any(g in ('crore', 'crores', 'cr', 'crs') for g in groups):
                    number *= 10_000_000
                elif any(g in ('cent', 'cents') for g in groups):
                    number /= 100
            except Exception:
                continue
            key = (number, currency_code)
            if key not in seen:
                seen.add(key)
                results.append({
                    "value": round(number, 2),
                    "currency": currency_code
                })
    # Fallback matching for generic numeric phrases near expense keywords
    if not results:
        fallback_patterns = [
            re.compile(
                r"\b(?:paid|spent|buy|purchase|cost|price|add(?:ed)?|gift(?:ed)?|bill(?: of)?|recharge(?:d)?|charged|transfer(?:red)?)\b[^0-9]{0,10}(\d[\d,]*(?:\.\d+)?)"
            ),
            re.compile(r"\b(\d[\d,]{2,8})\b\s?(?:rs|inr)?")
        ]
        for fallback_pattern in fallback_patterns:
            match = fallback_pattern.search(text_lower)
            if match:
                number_str = match.group(1).replace(",", "")
                # Ignore phone numbers and IDs
                if len(number_str) >= 10:
                    continue
                try:
                    number = float(number_str)
                    # Context check for year-like numbers
                    if 2020 <= number <= 2100:
                        # Check the surrounding text for a year clue
                        span = match.span(1)
                        surrounding = text_lower[max(0, span[0]-30):span[1]+30]
                        if is_year_context(surrounding):
                            continue  # looks like a year
                    key = (number, "INR")
                    if key not in seen:
                        seen.add(key)
                        results.append({
                            "value": round(number, 2),
                            "currency": "INR"
                        })
                    break  # only extract the first match in fallback mode
                except Exception:
                    continue
    return results
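# Example (illustrative): extract_amounts("Bought a phone for rs. 45,000 on EMI") returns
# [{"value": 45000.0, "currency": "INR"}]; "2 lakh" is scaled to 200000.0, and a bare number
# such as "1200" is only picked up by the fallback patterns when no currency marker is found.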
def predict_expense_category(text, detected_stores):
    text_lower = text.lower()
    # 1. Use detected store category if available
    if detected_stores:
        best_match = max(detected_stores, key=lambda s: s.get("confidence", 1.0))
        return best_match["category"]
    # Category keyword mapping
    category_keywords = {
        "food": [
            "food", "lunch", "dinner", "breakfast", "snacks", "swiggy", "zomato", "dominos", "pizza", "kfc", "mcdonald",
            "restaurant", "hotel", "cafe", "canteen", "meal", "buffet", "thali", "tiffin", "order", "takeaway", "parcel",
            "eat", "eating", "brunch", "supper", "kitchen", "cook", "cooking", "chef", "dish", "dishes", "menu", "serve",
            "served", "serving", "food court", "food delivery", "delivery", "online order", "food app", "food bill",
            "beverage", "juice", "shake", "smoothie", "coffee", "tea", "chai", "cold drink", "soft drink", "soda", "water bottle",
            "ice cream", "dessert", "sweet", "sweets", "chocolate", "candy", "bakery", "bread", "cake", "pastry", "cookie",
            "biscuit", "chips", "fries", "burger", "sandwich", "roll", "wrap", "noodles", "pasta", "rice", "biryani", "curry",
            "gravy", "dal", "sabzi", "roti", "naan", "paratha", "chapati", "idli", "dosa", "vada", "sambar", "chutney", "samosa",
            "pakora", "chaat", "pani puri", "golgappa", "sev", "poha", "upma", "maggi", "maggie", "momos", "spring roll",
            "manchurian", "paneer", "butter chicken", "tandoori", "kebab", "shawarma", "pizza hut", "subway", "starbucks",
            # Hindi/Hinglish
            "khana", "nashta", "bhojan", "rasoi", "dabba", "paani", "jal", "kharcha khana",
            "khane ka bill", "khane ka paisa", "khane ki cheez", "khana order", "khana mangwaya", "khana khaya", "khana khud banaya",
            "khana kharch", "khana kharida", "khana diya", "khana laya", "khana banaya"
        ],
        "transport": [
            "uber", "ola", "taxi", "cab", "bus", "train", "metro", "flight", "auto", "rickshaw", "car", "gaadi", "yatra",
            "safar", "travel", "ticket", "plane", "udaan", "station", "airport", "rapido"
        ],
        "shopping": [
            "amazon", "flipkart", "myntra", "shopping", "clothes", "kapde", "apparel", "shoes", "jeans", "tshirt", "store",
            "fashion", "dukaan", "mall", "bazaar", "market", "kharida", "order diya", "le liya"
        ],
        "housing": [
            "rent", "apartment", "house", "ghar", "flat", "maintenance", "landlord", "kiraya", "makaan", "room", "hostel", "pg", "society"
        ],
        "utilities": [
            "electricity", "power", "bijli", "water", "pani", "gas", "bill", "recharge", "broadband", "wifi", "airtel", "jio",
            "phone", "mobile", "internet", "light", "cylinder", "connection"
        ],
        "entertainment": [
            "movie", "netflix", "hotstar", "bookmyshow", "spotify", "gaming", "youtube premium", "cinema", "film", "picture",
            "game", "khel", "manoranjan", "show", "concert"
        ],
        "health": [
            "medicine", "hospital", "doctor", "clinic", "pharmacy", "tablet", "surgery", "checkup", "dawai", "aspatal", "ilaaj",
            "health", "bimari", "test", "medical", "pathology", "chemist"
        ],
        "travel": [
            "trip", "travel", "tour", "vacation", "hotel", "airbnb", "booking.com", "goibibo", "makemytrip", "yatra", "safar",
            "holiday", "journey", "musafir", "booking", "trip kiya"
        ],
        "education": [
            "course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai",
            "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
        ],
        "digital_services": [
            "domain", "membership", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud",
            "saas", "subscription", "digital", "online", "app", "service", "renewal"
        ],
        "gifts_donations": [
            "gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar",
            "daan", "tohfa", "chanda", "puja", "gurudwara"
        ],
        "finance": [
            "insurance", "sip", "mutual fund", "stock", "demat", "zerodha", "investment", "trading", "upstox", "crypto",
            "policy", "premium", "loan", "emi", "fd", "rd", "paisa", "bank", "account"
        ],
        "family_kids": [
            "kid", "baby", "school", "daycare", "tuition", "books", "uniform", "toys", "creche", "baccha", "bachche",
            "parivar", "family", "beti", "beta", "child", "children"
        ],
        "stationery": [
            "pen", "pencil", "notebook", "diary", "eraser", "sharpener", "paper", "stationery", "register", "files", "file",
            "markers", "highlighter", "sticky notes", "geometry box",
            "stapler", "ink", "printer paper", "stationary shop", "stationary", "copy", "kagaz", "likhne ka saman"
        ]
    }
    # 2. Match using keyword scores
    matched = {cat: sum(1 for kw in kws if kw in text_lower) for cat, kws in category_keywords.items()}
    best_match = max(matched.items(), key=lambda x: x[1])
    if best_match[1] > 0:
        return best_match[0]
    return "miscellaneous"
def insert_text_entry(data):
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        insert_query = """
            INSERT INTO user_entries (
                uuid, user_id, raw_text, word_count, day_of_week, hour_of_day, month, year,
                type, expense_type, intent, confidence_scores, urgency_score,
                time_mentions, parsed_dates, tense, summary,
                people, mood, language, sentiment_score, tags,
                action_required, entities, amounts, stores, processing_time_ms, raw_json
            ) VALUES (
                %(uuid)s, %(user_id)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
                %(type)s, %(expense_type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
                %(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
                %(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
                %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s, %(raw_json)s
            )
            ON CONFLICT (uuid) DO NOTHING;
        """
        cur.execute(insert_query, {
            **data,
            "confidence_scores": Json(data["confidence_scores"]),
            "language": Json(data["language"]),
            "stores": Json(data["stores"]),
            "entities": Json(data["entities"]),
            "amounts": Json(data["amounts"]),
            "raw_json": Json(data["raw_json"])
        })
        conn.commit()
        cur.close()
        conn.close()
        print("Data inserted successfully")
    except Exception as e:
        print("Failed to insert data:", e)
# Basic health-check endpoint (route path assumed; adjust to your deployment)
@app.get("/")
def health_check():
    return {"message": "Hello from yourpartner/demospace - API is running!"}

# Custom error handlers (registration by status code is assumed here)
@app.exception_handler(404)
async def not_found_handler(request: Request, exc):
    return ORJSONResponse(status_code=404, content={"error": "Route not found"})

@app.exception_handler(500)
async def internal_error_handler(request: Request, exc):
    return ORJSONResponse(status_code=500, content={"error": "Internal server error: " + str(exc)})
# Search endpoint to filter user entries based on various criteria
# (route path assumed; the user ID is read from the "userid" request header)
@app.get("/search")
async def search_entries(
    userid: str = Header(..., description="User ID"),
    tags: str = "",
    query: str = "",
    startDate: str = "",
    endDate: str = "",
    type: str = ""
):
    # Validate user_id from header
    if not userid or not userid.strip():
        return ORJSONResponse(status_code=400, content={"error": "Missing or empty userid header."})
    # Build SQL filters
    filters = ["user_id = %s"]
    params = [userid]
    if type:
        filters.append("type = %s")
        params.append(type)
    if tags:
        tag_list = [t.strip() for t in tags.split(",") if t.strip()]
        filters.append("tags && %s")
        params.append(tag_list)
    if query:
        filters.append("(raw_text ILIKE %s OR summary ILIKE %s)")
        params.extend([f"%{query}%", f"%{query}%"])
    if startDate:
        try:
            start_dt = datetime.strptime(startDate, "%d-%m-%Y")
            filters.append("created_at >= %s")
            params.append(start_dt)
        except ValueError:
            return ORJSONResponse(status_code=400, content={"error": "Invalid startDate format. Use DD-MM-YYYY."})
    if endDate:
        try:
            end_dt = datetime.strptime(endDate, "%d-%m-%Y")
            filters.append("created_at <= %s")
            params.append(end_dt)
        except ValueError:
            return ORJSONResponse(status_code=400, content={"error": "Invalid endDate format. Use DD-MM-YYYY."})
    where_clause = " AND ".join(filters)
    query_sql = f"SELECT * FROM user_entries WHERE {where_clause} ORDER BY created_at DESC LIMIT 50"
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        cur.execute(query_sql, tuple(params))
        rows = cur.fetchall()
        columns = [desc[0] for desc in cur.description]
        entries = [dict(zip(columns, row)) for row in rows]
        # Remove raw_json from each entry in the results
        for entry in entries:
            entry.pop("raw_json", None)
        cur.close()
        conn.close()
    except Exception as e:
        return ORJSONResponse(status_code=500, content={"error": str(e)})
    return ORJSONResponse(content={"results": entries})
# Dashboard endpoint aggregating a user's entries
# (route path assumed; user_id is taken as a query parameter)
@app.get("/dashboard")
async def visualyse_dashboard(user_id: str):
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        # Fetch all entries for the user
        cur.execute("SELECT * FROM user_entries WHERE user_id = %s", (user_id,))
        rows = cur.fetchall()
        columns = [desc[0] for desc in cur.description]
        entries = [dict(zip(columns, row)) for row in rows]
        cur.close()
        conn.close()
    except Exception as e:
        return ORJSONResponse(status_code=500, content={"error": str(e)})
    # Section 1: Expense Overview
    expenses = [e for e in entries if e["type"] == "expense"]
    total_expense = sum(a["value"] for e in expenses for a in (e["amounts"] or []))
    expense_count = len(expenses)
    expense_by_category = {}
    for e in expenses:
        cat = e.get("expense_type", "miscellaneous")
        amt = sum(a["value"] for a in (e["amounts"] or []))
        expense_by_category[cat] = expense_by_category.get(cat, 0) + amt
    # Monthly/Weekly Trends
    monthly_trends = {}
    for e in expenses:
        key = f"{e['month']}-{e['year']}"
        amt = sum(a["value"] for a in (e["amounts"] or []))
        monthly_trends[key] = monthly_trends.get(key, 0) + amt
    # Section 2: Top Stores & Categories
    store_stats = {}
    for e in expenses:
        for s in (e["stores"] or []):
            store = s.get("store", "unknown")
            amt = sum(a["value"] for a in (e["amounts"] or []))
            if store not in store_stats:
                store_stats[store] = {"count": 0, "total": 0}
            store_stats[store]["count"] += 1
            store_stats[store]["total"] += amt
    top_categories = sorted(expense_by_category.items(), key=lambda x: x[1], reverse=True)
    # Section 3: Recent Expenses
    recent_expenses = sorted(expenses, key=lambda e: e.get("created_at", ""), reverse=True)[:7]
    # Section 4: Mood Trends
    mood_dist = {}
    for e in entries:
        mood = e.get("mood", "neutral")
        mood_dist[mood] = mood_dist.get(mood, 0) + 1
    # Section 5: Tags & Keywords
    tag_freq = {}
    for e in entries:
        for tag in (e["tags"] or []):
            tag_freq[tag] = tag_freq.get(tag, 0) + 1
    top_tags = sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)[:15]
    # Section 6: Time Analysis
    day_stats = {}
    hour_stats = {}
    for e in expenses:
        day = e.get("day_of_week", "unknown")
        hour = e.get("hour_of_day", 0)
        amt = sum(a["value"] for a in (e["amounts"] or []))
        day_stats[day] = day_stats.get(day, 0) + amt
        hour_stats[hour] = hour_stats.get(hour, 0) + amt
    # Section 7: Meta Info
    entry_count = len(entries)
    type_dist = {}
    for e in entries:
        t = e.get("type", "other")
        type_dist[t] = type_dist.get(t, 0) + 1
    dashboard = {
        "expense_overview": {
            "total_expense": total_expense,
            "expense_count": expense_count,
            "expense_by_category": expense_by_category,
            "monthly_trends": monthly_trends
        },
        "top_stores": store_stats,
        "top_categories": top_categories,
        "recent_expenses": recent_expenses,
        "mood_distribution": mood_dist,
        "top_tags": top_tags,
        "time_analysis": {
            "by_day": day_stats,
            "by_hour": hour_stats
        },
        "meta_info": {
            "entry_count": entry_count,
            "type_distribution": type_dist
        }
    }
    return ORJSONResponse(content=dashboard)
# Main analysis endpoint (route path assumed)
@app.post("/analyze")
async def analyze(input: TextInput):
    start_time = time.time()  # start timing
    text = input.text
    label_map = {
        "task (something to be done or completed)": "task",
        "event (an activity that is happening or has happened)": "event",
        "reminder (a message to remember something in the future)": "reminder",
        "meeting (a planned gathering between people to discuss something)": "meeting",
        "relationship (message about personal or emotional connection with someone)": "relationship",
        "note (general note or quick thought not related to any specific category)": "note",
        "journal (personal reflection or emotional writing about one's day or thoughts)": "journal",
        "memory (recollection or recording of a past moment or experience)": "memory",
        "status_update (current condition, feeling, or situation being shared)": "status_update",
        "sick_notice (informing about illness or not feeling well)": "sick_notice",
        "out_of_office (message about being unavailable for work or responsibilities)": "out_of_office",
        "travel_plan (planning or mentioning a trip or journey)": "travel_plan",
        "celebration (message about a festive occasion, party or achievement)": "celebration",
        "expense (money spent on something, either small or large)": "expense",
        "news (update about public events, announcements, or current affairs)": "news",
        "information (factual content or informative message not tied to user activity)": "information",
        "purchase (buying or ordering something, like a product or service)": "purchase",
        "other (does not clearly fall into any specific category)": "other"
    }
    # classification = classifier(text, labels)
    # Run the classifier in a worker thread so it does not block the event loop
    classification = await asyncio.to_thread(classifier, text, labels, hypothesis_template="This entry is about {}.")
    best_label = classification['labels'][0]
    best_label = label_map.get(best_label, best_label)
    amounts = await asyncio.to_thread(extract_amounts, text)
    # Reclassify as expense when expense keywords or amounts are present
    if (
        best_label == "task"
        and (any(word in text.lower() for word in expense_keywords) or amounts)
    ):
        best_label = "expense"
    if best_label == "purchase":
        best_label = "expense"
    if "reported" in text or "announced" in text or "collapsed" in text:
        if best_label in ["task", "reminder", "event"]:
            best_label = "news"
    scores = dict(zip(classification['labels'], classification['scores']))
    # Convert to short labels
    confidence_scores_full = {
        label_map.get(label, label): score
        for label, score in scores.items()
    }
    # Only keep top 2
    confidence_scores = dict(sorted(confidence_scores_full.items(), key=lambda x: x[1], reverse=True)[:2])
    # Note: extract_dates_with_accuracy returns (time_mentions, parsed_dates) in that order
    time_mentions, parsed_dates = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
    tenses = detect_tense(parsed_dates)
    summary = await asyncio.to_thread(generate_summary, text)
    mood = estimate_mood(text)
    tags = generate_tags(best_label, text)
    language_detected = detect_language(text)
    sentiment_score = get_sentiment_score(text)
    if sentiment_score is None or sentiment_score == "":
        sentiment_score = 0.0
    entities = await asyncio.to_thread(extract_entities, text)
    people = entities["people"]  # extracted people entities
    intent = infer_intent(best_label, text)
    urgency_score = get_urgency_score(text, parsed_dates)
    detected_stores = detect_store_category(text)
    expense_category = ""
    if best_label == "expense" or best_label == "purchase":
        expense_category = predict_expense_category(text, detected_stores)
    # Define action triggers
    ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
    action_required = False
    if any(word in text.lower() for word in ACTION_TRIGGERS):
        action_required = True
    action_required = urgency_score >= 0.6 or action_required
    meta = get_meta_info(text)
    end_time = time.time()  # stop timing
    processing_time_ms = round((end_time - start_time) * 1000)
    result = {
        "uuid": str(uuid.uuid4()),   # unique identifier for the entry
        "user_id": input.user_id,    # ID of the user who submitted the text
        "raw_text": text,
        "word_count": meta["word_count"],
        "day_of_week": meta["day_of_week"],
        "hour_of_day": meta["hour_of_day"],
        "month": meta["month"],
        "year": meta["year"],
        "type": best_label,
        "expense_type": expense_category,
        "intent": intent,
        "confidence_scores": confidence_scores,
        "urgency_score": urgency_score,
        "time_mentions": time_mentions,
        "parsed_dates": parsed_dates,
        "tense": tenses,
        "summary": summary.removeprefix("summary:").strip(),
        "people": people,
        "mood": mood,
        "language": language_detected,
        "sentiment_score": sentiment_score,
        "tags": tags,
        "action_required": action_required,
        "entities": entities,
        "amounts": amounts,
        "stores": detected_stores,
        "processing_time_ms": processing_time_ms
    }
    # Store a copy of the result without raw_json to avoid a circular reference
    raw_json_copy = result.copy()
    raw_json_copy.pop("raw_json", None)  # shouldn't be present, but for safety
    result["raw_json"] = raw_json_copy
    # Insert into the database
    await asyncio.to_thread(insert_text_entry, result)
    # Log the result
    print("Analysis complete")
    # Remove raw_json from the response
    result.pop("raw_json", None)
    # Return the result as a JSON response
    return ORJSONResponse(content=result)
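
# Minimal local-run sketch (an addition, not part of the original file). With uvicorn installed,
# running this module directly serves the app; on Hugging Face Spaces the platform typically
# starts the server itself, and 7860 is the conventional Spaces port.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)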