Spaces: Sleeping
Quintino Fernandes committed
Commit · 379003a
1 Parent(s): 4bbe44e
Initial
- Dockerfile +37 -0
- README.md +4 -5
- app.py +164 -0
- database/query.py +164 -0
- main.py +135 -0
- models/LexRank.py +121 -0
- models/embedding.py +10 -0
- models/nlp.py +35 -0
- models/summarization.py +27 -0
- requirements.txt +12 -0
Dockerfile
ADDED
@@ -0,0 +1,37 @@
+FROM python:3.12-slim
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libpq-dev \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Add a non-root user
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+# Copy and install Python dependencies
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Pre-download Spacy model
+RUN python -m spacy download pt_core_news_md
+
+# Pre-download T5 models
+RUN python -c "from transformers import T5Tokenizer, T5ForConditionalGeneration; \
+    T5Tokenizer.from_pretrained('unicamp-dl/ptt5-base-portuguese-vocab'); \
+    T5ForConditionalGeneration.from_pretrained('recogna-nlp/ptt5-base-summ')"
+
+# Copy application code
+COPY --chown=user . /app
+
+# Expose the application port
+EXPOSE 7860
+
+# Run the application
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,11 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Test
+emoji: 📈
+colorFrom: red
+colorTo: indigo
 sdk: docker
 pinned: false
-short_description: Summary Snapshots of multiple Portuguese Articles
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,164 @@
+import logging
+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import Dict, Optional, List, Any
+import uuid
+from datetime import datetime
+from contextlib import asynccontextmanager
+
+from models.embedding import EmbeddingModel
+from models.summarization import SummarizationModel
+from models.nlp import NLPModel
+from database.query import DatabaseService
+from main import QueryProcessor  # main.py sits at the repo root in this commit
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler()]
+)
+logger = logging.getLogger(__name__)
+
+# Initialize models
+embedding_model = None
+summarization_model = None
+nlp_model = None
+db_service = None
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global embedding_model, summarization_model, nlp_model, db_service
+
+    # Model initialization
+    logger.info("Initializing models...")
+    try:
+        embedding_model = EmbeddingModel()
+        summarization_model = SummarizationModel()
+        nlp_model = NLPModel()
+        db_service = DatabaseService()
+        logger.info("All models initialized successfully")
+    except Exception as e:
+        logger.error(f"Model initialization failed: {str(e)}")
+        raise
+
+    yield
+
+    # Cleanup
+    logger.info("Shutting down application...")
+    if db_service:
+        try:
+            await db_service.close()
+            logger.info("Database connection closed successfully")
+        except Exception as e:
+            logger.error(f"Error closing database connection: {str(e)}")
+
+app = FastAPI(
+    title="Kairos News API",
+    version="1.0",
+    lifespan=lifespan
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# In-memory job storage
+jobs_db: Dict[str, Dict] = {}
+
+class PostRequest(BaseModel):
+    query: str
+    topic: Optional[str] = None
+    start_date: Optional[str] = None
+    end_date: Optional[str] = None
+
+class JobStatus(BaseModel):
+    id: str
+    status: str
+    created_at: datetime
+    completed_at: Optional[datetime] = None
+    request: PostRequest
+    result: Optional[Dict[str, Any]] = None
+
+@app.post("/index", response_model=JobStatus)
+async def create_job(request: PostRequest, background_tasks: BackgroundTasks):
+    job_id = str(uuid.uuid4())
+    logger.info(f"Creating new job {job_id} with request: {request.dict()}")
+
+    jobs_db[job_id] = {
+        "id": job_id,
+        "status": "processing",
+        "created_at": datetime.now(),
+        "completed_at": None,
+        "request": request.dict(),
+        "result": None
+    }
+
+    background_tasks.add_task(
+        process_job,
+        job_id,
+        request,
+        embedding_model,
+        summarization_model,
+        nlp_model,
+        db_service
+    )
+
+    logger.info(f"Job {job_id} created and processing started")
+    return jobs_db[job_id]
+
+@app.get("/loading", response_model=JobStatus)
+async def get_job_status(id: str):
+    logger.info(f"Checking status for job {id}")
+    if id not in jobs_db:
+        logger.warning(f"Job {id} not found")
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    logger.info(f"Returning status for job {id}: {jobs_db[id]['status']}")
+    return jobs_db[id]
+
+async def process_job(
+    job_id: str,
+    request: PostRequest,
+    embedding_model: EmbeddingModel,
+    summarization_model: SummarizationModel,
+    nlp_model: NLPModel,
+    db_service: DatabaseService
+):
+    try:
+        logger.info(f"Starting processing for job {job_id}")
+
+        processor = QueryProcessor(
+            embedding_model=embedding_model,
+            summarization_model=summarization_model,
+            nlp_model=nlp_model,
+            db_service=db_service
+        )
+
+        logger.debug(f"Processing query: {request.query}")
+        result = await processor.process(
+            query=request.query,
+            topic=request.topic,
+            start_date=request.start_date,
+            end_date=request.end_date
+        )
+
+        jobs_db[job_id].update({
+            "status": "completed",
+            "completed_at": datetime.now(),
+            "result": result if result else {"message": "No results found"}
+        })
+        logger.info(f"Job {job_id} completed successfully")
+
+    except Exception as e:
+        logger.error(f"Error processing job {job_id}: {str(e)}", exc_info=True)
+        jobs_db[job_id].update({
+            "status": "failed",
+            "completed_at": datetime.now(),
+            "result": {"error": str(e)}
+        })
+        logger.info(f"Job {job_id} marked as failed")
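For reference, the two endpoints added above can be exercised with a plain-stdlib client. This is a minimal sketch and not part of the commit; it assumes the container is running locally on port 7860 and follows the PostRequest/JobStatus schemas defined in app.py, with a made-up query.

# Minimal client sketch (not part of the commit).
import json
import time
import urllib.request

BASE_URL = "http://localhost:7860"  # assumption: local Docker run mapped to port 7860

payload = json.dumps({
    "query": "eleições em Portugal",   # hypothetical example query
    "topic": None,
    "start_date": "2024-01-01",
    "end_date": "2024-12-31",
}).encode("utf-8")

req = urllib.request.Request(
    f"{BASE_URL}/index",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    job = json.load(resp)

# Poll /loading until the background task finishes.
while job["status"] == "processing":
    time.sleep(2)
    with urllib.request.urlopen(f"{BASE_URL}/loading?id={job['id']}") as resp:
        job = json.load(resp)

print(job["status"], (job.get("result") or {}).get("summary"))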
database/query.py
ADDED
@@ -0,0 +1,164 @@
+import os
+from typing import List, Dict, Optional, Tuple
+from datetime import datetime
+import psycopg2
+from psycopg2 import sql
+
+class DatabaseService:
+    def __init__(self):
+        # Connection parameters
+        self.DB_HOST = os.getenv("SUPABASE_HOST", "aws-0-eu-west-3.pooler.supabase.com")
+        self.DB_PORT = os.getenv("DB_PORT", "6543")
+        self.DB_NAME = os.getenv("DB_NAME", "postgres")
+        self.DB_USER = os.getenv("DB_USER")
+        self.DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+    async def semantic_search(
+        self,
+        query_embedding: List[float],
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        topic: Optional[str] = None,
+        entities: Optional[List[Tuple[str, str]]] = None,
+        limit: int = 10
+    ) -> List[Dict[str, any]]:
+
+        print(f"Extracted entities2: {entities}")
+        try:
+            with psycopg2.connect(
+                user=self.DB_USER,
+                password=self.DB_PASSWORD,
+                host=self.DB_HOST,
+                port=self.DB_PORT,
+                dbname=self.DB_NAME
+            ) as conn:
+                with conn.cursor() as cursor:
+                    # Base query with date range and topic filters
+                    base_query = sql.SQL('''
+                        WITH filtered_articles AS (
+                            SELECT article_id
+                            FROM articles.articles
+                            WHERE 1=1
+                    ''')
+
+                    # Add date range filter (if both start and end dates provided)
+                    if start_date and end_date:
+                        base_query = sql.SQL('{}{}').format(
+                            base_query,
+                            sql.SQL(' AND date BETWEEN {} AND {}').format(
+                                sql.Literal(start_date),
+                                sql.Literal(end_date)
+                            )
+                        )
+
+                    # Add topic filter (if provided)
+                    if topic:
+                        base_query = sql.SQL('{}{}').format(
+                            base_query,
+                            sql.SQL(' AND topic = {}').format(sql.Literal(topic))
+                        )
+
+                    base_query = sql.SQL('{} {}').format(
+                        base_query,
+                        sql.SQL(')')
+                    )
+
+                    # Add entity filter (if entities exist)
+                    if entities:
+                        entity_conditions = sql.SQL(" OR ").join(
+                            sql.SQL("""
+                                (LOWER(UNACCENT(word)) = LOWER(UNACCENT({}))
+                                AND entity_group = {})
+                            """).format(
+                                sql.Literal(e[0]),  # Lowercase + unaccented entity text
+                                sql.Literal(e[1])   # Original entity label (case-sensitive)
+                            ) for e in entities
+                        )
+
+                        final_query = sql.SQL('''
+                            {base_query},
+                            target_articles AS (
+                                SELECT DISTINCT article_id
+                                FROM articles.ner
+                                WHERE ({entity_conditions})
+                                AND article_id IN (SELECT article_id FROM filtered_articles)
+                            )
+                            SELECT
+                                a.content,
+                                a.embedding <=> {embedding}::vector AS distance,
+                                a.date,
+                                a.topic,
+                                a.url
+                            FROM articles.articles a
+                            JOIN target_articles t ON a.article_id = t.article_id
+                            ORDER BY distance
+                            LIMIT {limit}
+                        ''').format(
+                            base_query=base_query,
+                            entity_conditions=entity_conditions,
+                            embedding=sql.Literal(query_embedding),
+                            limit=sql.Literal(limit)
+                        )
+                    else:
+                        final_query = sql.SQL('''
+                            {base_query}
+                            SELECT
+                                a.content,
+                                a.embedding <=> {embedding}::vector AS distance,
+                                a.date,
+                                a.topic,
+                                a.url
+                            FROM articles.articles a
+                            JOIN filtered_articles f ON a.article_id = f.article_id
+                            ORDER BY distance
+                            LIMIT {limit}
+                        ''').format(
+                            base_query=base_query,
+                            embedding=sql.Literal(query_embedding),
+                            limit=sql.Literal(limit)
+                        )
+
+                    cursor.execute(final_query)
+                    articles = cursor.fetchall()
+
+                    # Fallback: Retry with fewer filters if no results
+                    if not articles:
+                        print("No articles found with entities...")
+                        fallback_query = sql.SQL('''
+                            SELECT
+                                content,
+                                embedding <=> {}::vector AS distance,
+                                date,
+                                topic,
+                                url
+                            FROM articles.articles
+                            ORDER BY distance
+                            LIMIT {limit}
+                        ''').format(
+                            sql.Literal(query_embedding),
+                            limit=sql.Literal(limit)
+                        )
+                        cursor.execute(fallback_query)
+                        articles = cursor.fetchall()
+
+                    # Format results
+                    formatted_results = [
+                        {
+                            "content": content,
+                            "distance": distance,
+                            "date": art_date,
+                            "topic": art_topic,
+                            "url": url,
+                        }
+                        for content, distance, art_date, art_topic, url in articles
+                    ]
+
+                    return formatted_results
+
+        except Exception as e:
+            print(f"Database query error: {e}")
+            return []
+
+    async def close(self):
+        # No persistent connection to close in psycopg2
+        pass
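DatabaseService reads its connection settings from environment variables (DB_USER, DB_PASSWORD, plus optional SUPABASE_HOST, DB_PORT and DB_NAME). The sketch below is not part of the commit; it shows one way to wire those up for a quick local call. The credentials, topic and entity values are placeholders, and the zero vector merely stands in for a real 384-dimensional query embedding produced by EmbeddingModel.encode(query).tolist().

# Sketch only: placeholder credentials, not part of the commit.
import asyncio
import os

os.environ["DB_USER"] = "postgres.example"   # placeholder
os.environ["DB_PASSWORD"] = "change-me"      # placeholder
# SUPABASE_HOST, DB_PORT and DB_NAME fall back to the defaults in DatabaseService.

from database.query import DatabaseService

async def demo():
    db = DatabaseService()
    results = await db.semantic_search(
        query_embedding=[0.0] * 384,       # stand-in for a real sentence embedding
        topic="politica",                  # hypothetical topic value
        entities=[("lisboa", "LOC")],      # (text, label) pairs as produced by NLPModel
        limit=5,
    )
    for row in results:
        print(row["url"], row["distance"])

asyncio.run(demo())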
main.py
ADDED
@@ -0,0 +1,135 @@
+import datetime
+from typing import List, Dict, Any, Optional, Tuple
+import numpy as np
+from models.LexRank import degree_centrality_scores
+import logging
+from datetime import datetime as dt
+
+logger = logging.getLogger(__name__)
+
+class QueryProcessor:
+    def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
+        self.embedding_model = embedding_model
+        self.summarization_model = summarization_model
+        self.nlp_model = nlp_model
+        self.db_service = db_service
+        logger.info("QueryProcessor initialized")
+
+    async def process(
+        self,
+        query: str,
+        topic: Optional[str] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None
+    ) -> Dict[str, Any]:
+        try:
+            # Date handling
+            start_dt = self._parse_date(start_date) if start_date else None
+            end_dt = self._parse_date(end_date) if end_date else None
+
+            # Query processing
+            query_embedding = self.embedding_model.encode(query).tolist()
+            entities = self.nlp_model.extract_entities(query)
+            print(f"Extracted entities: {entities}")
+
+            # Database search
+            articles = await self._execute_semantic_search(
+                query_embedding,
+                start_dt,
+                end_dt,
+                topic,
+                entities
+            )
+
+            if not articles:
+                return {"message": "No articles found", "articles": []}
+
+            # Summary generation
+            print("Starting summary generation")
+            summary_data = self._generate_summary(articles)
+            return {
+                "summary": summary_data["summary"],
+                "articles": articles,
+                "entities": entities
+            }
+
+        except Exception as e:
+            logger.error(f"Processing failed: {str(e)}", exc_info=True)
+            return {"error": str(e)}
+
+    def _parse_date(self, date_str: str) -> dt:
+        """Safe date parsing with validation"""
+        try:
+            return dt.strptime(date_str, "%Y-%m-%d")
+        except ValueError as e:
+            logger.error(f"Invalid date format: {date_str}")
+            raise ValueError(f"Invalid date format. Expected YYYY-MM-DD, got {date_str}")
+
+    def _extract_entities_safely(self, text: str) -> List[Tuple[str, str]]:
+        """Robust entity extraction handling both strings and lists"""
+        try:
+            if isinstance(text, list):
+                logger.warning("Received list input for entity extraction, joining to string")
+                text = " ".join(text)
+            return self.nlp_model.extract_entities(text)
+        except Exception as e:
+            logger.error(f"Entity extraction failed: {str(e)}")
+            return []
+
+    async def _execute_semantic_search(
+        self,
+        query_embedding: List[float],
+        start_date: Optional[dt],
+        end_date: Optional[dt],
+        topic: Optional[str],
+        entities: List[Tuple[str, str]]
+    ) -> List[Dict[str, Any]]:
+        """Execute search with proper error handling"""
+        try:
+            return await self.db_service.semantic_search(
+                query_embedding=query_embedding,
+                start_date=start_date,
+                end_date=end_date,
+                topic=topic,
+                entities=entities
+            )
+        except Exception as e:
+            logger.error(f"Semantic search failed: {str(e)}")
+            raise
+
+    def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Generate summary from articles with fallback handling"""
+        try:
+            contents = [article["content"] for article in articles[:5]]
+            sentences = []
+
+            for content in contents:
+                if content:
+                    sentences.extend(self.nlp_model.tokenize_sentences(content))
+
+            if not sentences:
+                logger.warning("No sentences available for summarization")
+                return {
+                    "summary": "No content available for summarization",
+                }
+
+            print("Starting first summary generation")
+            embeddings = self.embedding_model.encode(sentences)
+            similarity_matrix = np.dot(embeddings, embeddings.T) / (np.linalg.norm(embeddings, axis=1, keepdims=True) * np.linalg.norm(embeddings, axis=1, keepdims=True).T)
+            centrality_scores = degree_centrality_scores(similarity_matrix, threshold=0.1)
+
+            top_indices = np.argsort(-centrality_scores)[:10]
+            key_sentences = [sentences[idx].strip() for idx in top_indices]
+            combined_text = ' '.join(key_sentences)
+
+            print(f"First summary done with: {len(key_sentences)} sentences")
+
+            return {
+                "summary": self.summarization_model.summarize(combined_text),
+            }
+
+        except Exception as e:
+            logger.error(f"Summary generation failed: {str(e)}")
+            return {
+                "summary": "Summary generation failed",
+            }
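The one-line expression in _generate_summary builds a cosine-similarity matrix: the Gram matrix of the sentence embeddings divided by the outer product of their norms. A small self-contained check with made-up vectors (not part of the commit):

# Illustration only: the cosine-similarity matrix as computed in _generate_summary.
import numpy as np

embeddings = np.array([
    [1.0, 0.0, 0.0],
    [0.5, 0.5, 0.0],
    [0.0, 0.0, 2.0],
])  # made-up sentence embeddings, one row per sentence

norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
similarity_matrix = np.dot(embeddings, embeddings.T) / (norms * norms.T)

print(np.round(similarity_matrix, 3))  # diagonal is 1.0; entry (0, 1) is cos(45°) ≈ 0.707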
models/LexRank.py
ADDED
@@ -0,0 +1,121 @@
+"""
+LexRank implementation
+Source: https://github.com/crabcamp/lexrank/tree/dev
+"""
+
+import logging
+
+import numpy as np
+from scipy.sparse.csgraph import connected_components
+from scipy.special import softmax
+
+logger = logging.getLogger(__name__)
+
+
+def degree_centrality_scores(
+    similarity_matrix,
+    threshold=None,
+    increase_power=True,
+):
+    if not (threshold is None or isinstance(threshold, float) and 0 <= threshold < 1):
+        raise ValueError(
+            "'threshold' should be a floating-point number from the interval [0, 1) or None",
+        )
+
+    if threshold is None:
+        markov_matrix = create_markov_matrix(similarity_matrix)
+
+    else:
+        markov_matrix = create_markov_matrix_discrete(
+            similarity_matrix,
+            threshold,
+        )
+
+    scores = stationary_distribution(
+        markov_matrix,
+        increase_power=increase_power,
+        normalized=False,
+    )
+
+    return scores
+
+
+def _power_method(transition_matrix, increase_power=True, max_iter=10000):
+    eigenvector = np.ones(len(transition_matrix))
+
+    if len(eigenvector) == 1:
+        return eigenvector
+
+    transition = transition_matrix.transpose()
+
+    for _ in range(max_iter):
+        eigenvector_next = np.dot(transition, eigenvector)
+
+        if np.allclose(eigenvector_next, eigenvector):
+            return eigenvector_next
+
+        eigenvector = eigenvector_next
+
+        if increase_power:
+            transition = np.dot(transition, transition)
+
+    logger.warning("Maximum number of iterations for power method exceeded without convergence!")
+    return eigenvector_next
+
+
+def connected_nodes(matrix):
+    _, labels = connected_components(matrix)
+
+    groups = []
+
+    for tag in np.unique(labels):
+        group = np.where(labels == tag)[0]
+        groups.append(group)
+
+    return groups
+
+
+def create_markov_matrix(weights_matrix):
+    n_1, n_2 = weights_matrix.shape
+    if n_1 != n_2:
+        raise ValueError("'weights_matrix' should be square")
+
+    row_sum = weights_matrix.sum(axis=1, keepdims=True)
+
+    # normalize probability distribution differently if we have negative transition values
+    if np.min(weights_matrix) <= 0:
+        return softmax(weights_matrix, axis=1)
+
+    return weights_matrix / row_sum
+
+
+def create_markov_matrix_discrete(weights_matrix, threshold):
+    discrete_weights_matrix = np.zeros(weights_matrix.shape)
+    ixs = np.where(weights_matrix >= threshold)
+    discrete_weights_matrix[ixs] = 1
+
+    return create_markov_matrix(discrete_weights_matrix)
+
+
+def stationary_distribution(
+    transition_matrix,
+    increase_power=True,
+    normalized=True,
+):
+    n_1, n_2 = transition_matrix.shape
+    if n_1 != n_2:
+        raise ValueError("'transition_matrix' should be square")
+
+    distribution = np.zeros(n_1)
+
+    grouped_indices = connected_nodes(transition_matrix)
+
+    for group in grouped_indices:
+        t_matrix = transition_matrix[np.ix_(group, group)]
+        eigenvector = _power_method(t_matrix, increase_power=increase_power)
+        distribution[group] = eigenvector
+
+    if normalized:
+        distribution /= n_1
+
+    return distribution
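To illustrate how degree_centrality_scores is consumed (mirroring _generate_summary in main.py, which calls it with threshold=0.1 on real sentence-embedding similarities), here is a toy example on an invented similarity matrix. It is a sketch, not part of the commit.

# Toy illustration (not part of the commit): rank three "sentences" by centrality.
import numpy as np
from models.LexRank import degree_centrality_scores

# Made-up cosine-similarity matrix for three sentences; 0 and 1 are close, 2 is an outlier.
similarity = np.array([
    [1.0, 0.8, 0.3],
    [0.8, 1.0, 0.4],
    [0.3, 0.4, 1.0],
])

scores = degree_centrality_scores(similarity, threshold=0.5)
ranking = np.argsort(-scores)  # indices of the most central sentences first
print(scores, ranking)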
models/embedding.py
ADDED
@@ -0,0 +1,10 @@
+from sentence_transformers import SentenceTransformer
+import torch
+
+class EmbeddingModel:
+    def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+
+    def encode(self, text: str):
+        return self.model.encode(text, device=self.device)
models/nlp.py
ADDED
@@ -0,0 +1,35 @@
+import spacy
+from typing import List, Union
+import logging
+
+logger = logging.getLogger(__name__)
+
+class NLPModel:
+    def __init__(self):
+        try:
+            # Load spaCy model only
+            self.nlp = spacy.load("pt_core_news_md")
+            logger.info("spaCy model initialized successfully")
+        except Exception as e:
+            logger.error(f"Failed to initialize spaCy model: {str(e)}")
+            raise
+
+    def extract_entities(self, text: Union[str, List[str]]) -> List[tuple]:
+        """Entity extraction using spaCy"""
+        try:
+            if isinstance(text, list):
+                text = " ".join(text)
+            doc = self.nlp(text)
+            return [(ent.text.lower(), ent.label_) for ent in doc.ents]
+        except Exception as e:
+            logger.error(f"Entity extraction failed: {str(e)}")
+            return []
+
+    def tokenize_sentences(self, text: str) -> List[str]:
+        """Sentence tokenization using spaCy"""
+        try:
+            doc = self.nlp(text)
+            return [sent.text for sent in doc.sents]
+        except Exception as e:
+            logger.error(f"Sentence tokenization failed: {str(e)}")
+            return [text]  # Fallback to returning whole text
models/summarization.py
ADDED
@@ -0,0 +1,27 @@
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+import torch
+
+class SummarizationModel:
+    def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tokenizer = T5Tokenizer.from_pretrained('unicamp-dl/ptt5-base-portuguese-vocab')
+        self.model = T5ForConditionalGeneration.from_pretrained('recogna-nlp/ptt5-base-summ').to(self.device)
+
+    def summarize(self, text: str, max_length: int = 256, min_length: int = 128) -> str:
+        inputs = self.tokenizer.encode(
+            text,
+            max_length=512,
+            truncation=True,
+            return_tensors='pt'
+        ).to(self.device)
+
+        summary_ids = self.model.generate(
+            inputs,
+            max_length=max_length,
+            min_length=min_length,
+            num_beams=4,
+            no_repeat_ngram_size=3,
+            early_stopping=True,
+        )
+
+        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
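A minimal standalone usage sketch for the summarizer, not part of the commit: the first call downloads the PTT5 checkpoints unless the Dockerfile pre-cached them, and the input text is only a placeholder.

# Sketch: using SummarizationModel outside the API.
from models.summarization import SummarizationModel

summarizer = SummarizationModel()
texto = (
    "O governo português anunciou novas medidas económicas esta semana. "
    "As medidas incluem apoios às famílias e às pequenas empresas."
)  # placeholder text
print(summarizer.summarize(texto, max_length=128, min_length=30))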
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+fastapi
+uvicorn[standard]
+transformers
+torch
+sentence_transformers
+nltk
+spacy
+numpy
+pandas
+scipy
+psycopg2
+sentencepiece