Muhammad Abdur Rahman Saad
commited on
Commit
·
25a7cd3
1
Parent(s):
4fa3bf0
Issue #35
Browse files- app/app.py +3 -2
- app/jobs.json +1 -1
- app/models/database/__init__.py +6 -1
- app/models/database/astra.py +78 -0
- app/requirements.txt +1 -0
- app/routes/knowledge_base.py +59 -0
app/app.py
CHANGED
|
@@ -10,7 +10,7 @@ from fastapi.responses import JSONResponse
|
|
| 10 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
| 11 |
from apscheduler.triggers.cron import CronTrigger
|
| 12 |
|
| 13 |
-
from routes import category, summary, keyword, lda # pylint: disable=import-error
|
| 14 |
|
| 15 |
|
| 16 |
class Config: # pylint: disable=too-few-public-methods
|
|
@@ -63,7 +63,7 @@ async def lifespan(_app: FastAPI):
|
|
| 63 |
logging.basicConfig(
|
| 64 |
format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
|
| 65 |
)
|
| 66 |
-
logging.getLogger().setLevel(logging.
|
| 67 |
|
| 68 |
#start scheduler anf add jobs
|
| 69 |
scheduler.start()
|
|
@@ -172,6 +172,7 @@ app.include_router(category.router)
|
|
| 172 |
app.include_router(summary.router)
|
| 173 |
app.include_router(keyword.router)
|
| 174 |
app.include_router(lda.router)
|
|
|
|
| 175 |
|
| 176 |
@app.get("/_health")
|
| 177 |
def health():
|
|
|
|
| 10 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
| 11 |
from apscheduler.triggers.cron import CronTrigger
|
| 12 |
|
| 13 |
+
from routes import category, summary, keyword, lda, knowledge_base # pylint: disable=import-error
|
| 14 |
|
| 15 |
|
| 16 |
class Config: # pylint: disable=too-few-public-methods
|
|
|
|
| 63 |
logging.basicConfig(
|
| 64 |
format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
|
| 65 |
)
|
| 66 |
+
logging.getLogger().setLevel(logging.ERROR)
|
| 67 |
|
| 68 |
#start scheduler anf add jobs
|
| 69 |
scheduler.start()
|
|
|
|
| 172 |
app.include_router(summary.router)
|
| 173 |
app.include_router(keyword.router)
|
| 174 |
app.include_router(lda.router)
|
| 175 |
+
app.include_router(knowledge_base.router)
|
| 176 |
|
| 177 |
@app.get("/_health")
|
| 178 |
def health():
|
app/jobs.json
CHANGED
|
@@ -32,7 +32,7 @@
|
|
| 32 |
"func": "collectors.finfast.lda:collect",
|
| 33 |
"trigger": "cron",
|
| 34 |
"hour": 23,
|
| 35 |
-
"minute":
|
| 36 |
}
|
| 37 |
|
| 38 |
]
|
|
|
|
| 32 |
"func": "collectors.finfast.lda:collect",
|
| 33 |
"trigger": "cron",
|
| 34 |
"hour": 23,
|
| 35 |
+
"minute": 45
|
| 36 |
}
|
| 37 |
|
| 38 |
]
|
app/models/database/__init__.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
| 1 |
"""Module for Mongodb database"""
|
| 2 |
from .mongodb import (article_collection, entity_collection, category_collection,
|
| 3 |
keywords_collection, summary_collection, lda_collection)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
__all__ = [
|
| 6 |
"article_collection",
|
|
@@ -8,5 +12,6 @@ __all__ = [
|
|
| 8 |
"category_collection",
|
| 9 |
"keywords_collection",
|
| 10 |
"summary_collection",
|
| 11 |
-
"lda_collection"
|
|
|
|
| 12 |
]
|
|
|
|
| 1 |
"""Module for Mongodb database"""
|
| 2 |
from .mongodb import (article_collection, entity_collection, category_collection,
|
| 3 |
keywords_collection, summary_collection, lda_collection)
|
| 4 |
+
from .astra import KnowledgeBase
|
| 5 |
+
|
| 6 |
+
# Initialize KnowledgeBase instance
|
| 7 |
+
knowledge_base = KnowledgeBase()
|
| 8 |
|
| 9 |
__all__ = [
|
| 10 |
"article_collection",
|
|
|
|
| 12 |
"category_collection",
|
| 13 |
"keywords_collection",
|
| 14 |
"summary_collection",
|
| 15 |
+
"lda_collection",
|
| 16 |
+
"knowledge_base"
|
| 17 |
]
|
app/models/database/astra.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Module for AstraDB database"""
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
import astrapy
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
logging.basicConfig(
|
| 11 |
+
format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
|
| 12 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
| 13 |
+
level=logging.ERROR)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class KnowledgeBase:
|
| 17 |
+
"""
|
| 18 |
+
AstraDB class for direct collection operations.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
def __init__(self):
|
| 22 |
+
"""Initialize AstraDB connection."""
|
| 23 |
+
self.collection = astrapy.DataAPIClient(
|
| 24 |
+
os.environ["ASTRA_DB_APPLICATION_TOKEN"]).get_database(
|
| 25 |
+
os.environ["ASTRA_DB_API_ENDPOINT"]).documents
|
| 26 |
+
|
| 27 |
+
def get_doc_count(self, user_id: str) -> dict:
|
| 28 |
+
"""
|
| 29 |
+
Count unique emails and files for a specific user.
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
user_id (str): The user's email address
|
| 33 |
+
|
| 34 |
+
Returns:
|
| 35 |
+
dict: {"emails": count, "files": count, "total_documents": count}
|
| 36 |
+
|
| 37 |
+
Raises:
|
| 38 |
+
ValueError: If user_id is invalid
|
| 39 |
+
Exception: If database query fails
|
| 40 |
+
"""
|
| 41 |
+
if not user_id or not isinstance(user_id, str):
|
| 42 |
+
raise ValueError("user_id must be a non-empty string")
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
# Get all documents for the user with type gmail or file
|
| 46 |
+
filter_criteria = {
|
| 47 |
+
"metadata.userId": user_id,
|
| 48 |
+
"metadata.type": {"$in": ["gmail", "file"]}
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
# Use direct collection access
|
| 52 |
+
results = list(self.collection.find(filter=filter_criteria))
|
| 53 |
+
|
| 54 |
+
# Group by metadata.id to get unique documents
|
| 55 |
+
unique_docs = {}
|
| 56 |
+
for doc in results:
|
| 57 |
+
doc_id = doc.get("metadata", {}).get("id")
|
| 58 |
+
doc_type = doc.get("metadata", {}).get("type")
|
| 59 |
+
|
| 60 |
+
if doc_id and doc_type:
|
| 61 |
+
if doc_id not in unique_docs:
|
| 62 |
+
unique_docs[doc_id] = doc_type
|
| 63 |
+
|
| 64 |
+
# Count by type
|
| 65 |
+
email_count = sum(1 for doc_type in unique_docs.values() if doc_type == "gmail")
|
| 66 |
+
file_count = sum(1 for doc_type in unique_docs.values() if doc_type == "file")
|
| 67 |
+
total_count = len(unique_docs)
|
| 68 |
+
|
| 69 |
+
return {
|
| 70 |
+
"emails": email_count,
|
| 71 |
+
"files": file_count,
|
| 72 |
+
"total_documents": total_count
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
except Exception as e: # pylint: disable=broad-exception-caught
|
| 76 |
+
logging.error("Failed to get document count for user %s: %s", user_id, str(e))
|
| 77 |
+
# pylint: disable=raise-missing-from
|
| 78 |
+
raise Exception(f"Database query failed: {str(e)}") # pylint: disable=broad-exception-raised
|
app/requirements.txt
CHANGED
|
@@ -31,6 +31,7 @@ boto3==1.38.43
|
|
| 31 |
nltk==3.9.1
|
| 32 |
langchain==0.3.27
|
| 33 |
langchain-openai==0.3.28
|
|
|
|
| 34 |
gensim==4.3.3
|
| 35 |
pyLDAvis==3.4.1
|
| 36 |
scikit-learn==1.7.1
|
|
|
|
| 31 |
nltk==3.9.1
|
| 32 |
langchain==0.3.27
|
| 33 |
langchain-openai==0.3.28
|
| 34 |
+
langchain-astradb==0.6.0
|
| 35 |
gensim==4.3.3
|
| 36 |
pyLDAvis==3.4.1
|
| 37 |
scikit-learn==1.7.1
|
app/routes/knowledge_base.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""This module defines the /knowledge-base route for the FastAPI application."""
|
| 2 |
+
import logging
|
| 3 |
+
from fastapi import APIRouter, HTTPException, Path
|
| 4 |
+
from fastapi.responses import JSONResponse
|
| 5 |
+
from models.database import knowledge_base # pylint: disable=import-error
|
| 6 |
+
|
| 7 |
+
router = APIRouter(prefix="/knowledge-base", tags=["knowledge-base"])
|
| 8 |
+
|
| 9 |
+
@router.get("/{user_id}")
|
| 10 |
+
async def get_user_document_stats(
|
| 11 |
+
user_id: str = Path(..., description="User's email address")
|
| 12 |
+
) -> JSONResponse:
|
| 13 |
+
"""
|
| 14 |
+
Get document statistics for a specific user.
|
| 15 |
+
|
| 16 |
+
This endpoint counts the number of unique emails and files uploaded by the user.
|
| 17 |
+
It groups documents by metadata.id to ensure unique document counting (not chunks).
|
| 18 |
+
|
| 19 |
+
Args:
|
| 20 |
+
user_id (str): The user's email address
|
| 21 |
+
|
| 22 |
+
Returns:
|
| 23 |
+
JSONResponse: A JSON response containing document counts:
|
| 24 |
+
{
|
| 25 |
+
"user_id": "[email protected]",
|
| 26 |
+
"emails": 5,
|
| 27 |
+
"files": 12,
|
| 28 |
+
"total_documents": 17
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
Raises:
|
| 32 |
+
HTTPException: 400 for invalid user_id, 500 for database errors
|
| 33 |
+
"""
|
| 34 |
+
try:
|
| 35 |
+
# Validate user_id format (basic email validation)
|
| 36 |
+
if not user_id or "@" not in user_id or "." not in user_id:
|
| 37 |
+
raise HTTPException(
|
| 38 |
+
status_code=400,
|
| 39 |
+
detail="Invalid user_id format. Must be a valid email address."
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
# Get document counts from database
|
| 43 |
+
result = knowledge_base.get_doc_count(user_id)
|
| 44 |
+
|
| 45 |
+
# Add user_id to response
|
| 46 |
+
result["user_id"] = user_id
|
| 47 |
+
|
| 48 |
+
return JSONResponse(content=result, status_code=200)
|
| 49 |
+
|
| 50 |
+
except ValueError as e:
|
| 51 |
+
logging.error("Validation error for user %s: %s", user_id, str(e))
|
| 52 |
+
raise HTTPException(status_code=400, detail=str(e)) # pylint: disable=raise-missing-from
|
| 53 |
+
|
| 54 |
+
except Exception as e: # pylint: disable=broad-exception-caught
|
| 55 |
+
logging.error("Database error for user %s: %s", user_id, str(e))
|
| 56 |
+
raise HTTPException( # pylint: disable=raise-missing-from
|
| 57 |
+
status_code=500,
|
| 58 |
+
detail="Internal server error while retrieving document statistics."
|
| 59 |
+
)
|