Spaces: Sleeping
Quintino Fernandes committed
Commit · 379003a
1 Parent(s): 4bbe44e
Initial
- Dockerfile +37 -0
- README.md +4 -5
- app.py +164 -0
- database/query.py +164 -0
- main.py +135 -0
- models/LexRank.py +121 -0
- models/embedding.py +10 -0
- models/nlp.py +35 -0
- models/summarization.py +27 -0
- requirements.txt +12 -0
Dockerfile
ADDED
@@ -0,0 +1,37 @@
+FROM python:3.12-slim
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libpq-dev \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Add a non-root user
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+# Copy and install Python dependencies
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Pre-download Spacy model
+RUN python -m spacy download pt_core_news_md
+
+# Pre-download T5 models
+RUN python -c "from transformers import T5Tokenizer, T5ForConditionalGeneration; \
+    T5Tokenizer.from_pretrained('unicamp-dl/ptt5-base-portuguese-vocab'); \
+    T5ForConditionalGeneration.from_pretrained('recogna-nlp/ptt5-base-summ')"
+
+# Copy application code
+COPY --chown=user . /app
+
+# Expose the application port
+EXPOSE 7860
+
+# Run the application
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,11 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Test
+emoji: 📈
+colorFrom: red
+colorTo: indigo
 sdk: docker
 pinned: false
-short_description: Summary Snapshots of multiple Portuguese Articles
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,164 @@
+import logging
+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import Dict, Optional, List, Any
+import uuid
+from datetime import datetime
+from contextlib import asynccontextmanager
+
+from models.embedding import EmbeddingModel
+from models.summarization import SummarizationModel
+from models.nlp import NLPModel
+from database.query import DatabaseService
+from main import QueryProcessor  # main.py sits at the repo root in this commit
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler()]
+)
+logger = logging.getLogger(__name__)
+
+# Initialize models
+embedding_model = None
+summarization_model = None
+nlp_model = None
+db_service = None
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global embedding_model, summarization_model, nlp_model, db_service
+
+    # Model initialization
+    logger.info("Initializing models...")
+    try:
+        embedding_model = EmbeddingModel()
+        summarization_model = SummarizationModel()
+        nlp_model = NLPModel()
+        db_service = DatabaseService()
+        logger.info("All models initialized successfully")
+    except Exception as e:
+        logger.error(f"Model initialization failed: {str(e)}")
+        raise
+
+    yield
+
+    # Cleanup
+    logger.info("Shutting down application...")
+    if db_service:
+        try:
+            await db_service.close()
+            logger.info("Database connection closed successfully")
+        except Exception as e:
+            logger.error(f"Error closing database connection: {str(e)}")
+
+app = FastAPI(
+    title="Kairos News API",
+    version="1.0",
+    lifespan=lifespan
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# In-memory job storage
+jobs_db: Dict[str, Dict] = {}
+
+class PostRequest(BaseModel):
+    query: str
+    topic: Optional[str] = None
+    start_date: Optional[str] = None
+    end_date: Optional[str] = None
+
+class JobStatus(BaseModel):
+    id: str
+    status: str
+    created_at: datetime
+    completed_at: Optional[datetime] = None
+    request: PostRequest
+    result: Optional[Dict[str, Any]] = None
+
+@app.post("/index", response_model=JobStatus)
+async def create_job(request: PostRequest, background_tasks: BackgroundTasks):
+    job_id = str(uuid.uuid4())
+    logger.info(f"Creating new job {job_id} with request: {request.dict()}")
+
+    jobs_db[job_id] = {
+        "id": job_id,
+        "status": "processing",
+        "created_at": datetime.now(),
+        "completed_at": None,
+        "request": request.dict(),
+        "result": None
+    }
+
+    background_tasks.add_task(
+        process_job,
+        job_id,
+        request,
+        embedding_model,
+        summarization_model,
+        nlp_model,
+        db_service
+    )
+
+    logger.info(f"Job {job_id} created and processing started")
+    return jobs_db[job_id]
+
+@app.get("/loading", response_model=JobStatus)
+async def get_job_status(id: str):
+    logger.info(f"Checking status for job {id}")
+    if id not in jobs_db:
+        logger.warning(f"Job {id} not found")
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    logger.info(f"Returning status for job {id}: {jobs_db[id]['status']}")
+    return jobs_db[id]
+
+async def process_job(
+    job_id: str,
+    request: PostRequest,
+    embedding_model: EmbeddingModel,
+    summarization_model: SummarizationModel,
+    nlp_model: NLPModel,
+    db_service: DatabaseService
+):
+    try:
+        logger.info(f"Starting processing for job {job_id}")
+
+        processor = QueryProcessor(
+            embedding_model=embedding_model,
+            summarization_model=summarization_model,
+            nlp_model=nlp_model,
+            db_service=db_service
+        )
+
+        logger.debug(f"Processing query: {request.query}")
+        result = await processor.process(
+            query=request.query,
+            topic=request.topic,
+            start_date=request.start_date,
+            end_date=request.end_date
+        )
+
+        jobs_db[job_id].update({
+            "status": "completed",
+            "completed_at": datetime.now(),
+            "result": result if result else {"message": "No results found"}
+        })
+        logger.info(f"Job {job_id} completed successfully")
+
+    except Exception as e:
+        logger.error(f"Error processing job {job_id}: {str(e)}", exc_info=True)
+        jobs_db[job_id].update({
+            "status": "failed",
+            "completed_at": datetime.now(),
+            "result": {"error": str(e)}
+        })
+        logger.info(f"Job {job_id} marked as failed")
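For reference, the two endpoints added above can be exercised with a plain-stdlib client. This is a minimal sketch and not part of the commit; it assumes the container is running locally on port 7860 and follows the PostRequest/JobStatus schemas defined in app.py, with a made-up query.

# Minimal client sketch (not part of the commit).
import json
import time
import urllib.request

BASE_URL = "http://localhost:7860"  # assumption: local Docker run mapped to port 7860

payload = json.dumps({
    "query": "eleições em Portugal",   # hypothetical example query
    "topic": None,
    "start_date": "2024-01-01",
    "end_date": "2024-12-31",
}).encode("utf-8")

req = urllib.request.Request(
    f"{BASE_URL}/index",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    job = json.load(resp)

# Poll /loading until the background task finishes.
while job["status"] == "processing":
    time.sleep(2)
    with urllib.request.urlopen(f"{BASE_URL}/loading?id={job['id']}") as resp:
        job = json.load(resp)

print(job["status"], (job.get("result") or {}).get("summary"))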
database/query.py
ADDED
@@ -0,0 +1,164 @@
+import os
+from typing import List, Dict, Optional, Tuple
+from datetime import datetime
+import psycopg2
+from psycopg2 import sql
+
+class DatabaseService:
+    def __init__(self):
+        # Connection parameters
+        self.DB_HOST = os.getenv("SUPABASE_HOST", "aws-0-eu-west-3.pooler.supabase.com")
+        self.DB_PORT = os.getenv("DB_PORT", "6543")
+        self.DB_NAME = os.getenv("DB_NAME", "postgres")
+        self.DB_USER = os.getenv("DB_USER")
+        self.DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+    async def semantic_search(
+        self,
+        query_embedding: List[float],
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        topic: Optional[str] = None,
+        entities: Optional[List[Tuple[str, str]]] = None,
+        limit: int = 10
+    ) -> List[Dict[str, any]]:
+
+        print(f"Extracted entities2: {entities}")
+        try:
+            with psycopg2.connect(
+                user=self.DB_USER,
+                password=self.DB_PASSWORD,
+                host=self.DB_HOST,
+                port=self.DB_PORT,
+                dbname=self.DB_NAME
+            ) as conn:
+                with conn.cursor() as cursor:
+                    # Base query with date range and topic filters
+                    base_query = sql.SQL('''
+                        WITH filtered_articles AS (
+                            SELECT article_id
+                            FROM articles.articles
+                            WHERE 1=1
+                    ''')
+
+                    # Add date range filter (if both start and end dates provided)
+                    if start_date and end_date:
+                        base_query = sql.SQL('{}{}').format(
+                            base_query,
+                            sql.SQL(' AND date BETWEEN {} AND {}').format(
+                                sql.Literal(start_date),
+                                sql.Literal(end_date)
+                            )
+                        )
+
+                    # Add topic filter (if provided)
+                    if topic:
+                        base_query = sql.SQL('{}{}').format(
+                            base_query,
+                            sql.SQL(' AND topic = {}').format(sql.Literal(topic))
+                        )
+
+                    base_query = sql.SQL('{} {}').format(
+                        base_query,
+                        sql.SQL(')')
+                    )
+
+                    # Add entity filter (if entities exist)
+                    if entities:
+                        entity_conditions = sql.SQL(" OR ").join(
+                            sql.SQL("""
+                                (LOWER(UNACCENT(word)) = LOWER(UNACCENT({}))
+                                AND entity_group = {})
+                            """).format(
+                                sql.Literal(e[0]),  # Lowercase + unaccented entity text
+                                sql.Literal(e[1])   # Original entity label (case-sensitive)
+                            ) for e in entities
+                        )
+
+                        final_query = sql.SQL('''
+                            {base_query},
+                            target_articles AS (
+                                SELECT DISTINCT article_id
+                                FROM articles.ner
+                                WHERE ({entity_conditions})
+                                AND article_id IN (SELECT article_id FROM filtered_articles)
+                            )
+                            SELECT
+                                a.content,
+                                a.embedding <=> {embedding}::vector AS distance,
+                                a.date,
+                                a.topic,
+                                a.url
+                            FROM articles.articles a
+                            JOIN target_articles t ON a.article_id = t.article_id
+                            ORDER BY distance
+                            LIMIT {limit}
+                        ''').format(
+                            base_query=base_query,
+                            entity_conditions=entity_conditions,
+                            embedding=sql.Literal(query_embedding),
+                            limit=sql.Literal(limit)
+                        )
+                    else:
+                        final_query = sql.SQL('''
+                            {base_query}
+                            SELECT
+                                a.content,
+                                a.embedding <=> {embedding}::vector AS distance,
+                                a.date,
+                                a.topic,
+                                a.url
+                            FROM articles.articles a
+                            JOIN filtered_articles f ON a.article_id = f.article_id
+                            ORDER BY distance
+                            LIMIT {limit}
+                        ''').format(
+                            base_query=base_query,
+                            embedding=sql.Literal(query_embedding),
+                            limit=sql.Literal(limit)
+                        )
+
+                    cursor.execute(final_query)
+                    articles = cursor.fetchall()
+
+                    # Fallback: Retry with fewer filters if no results
+                    if not articles:
+                        print("No articles found with entities...")
+                        fallback_query = sql.SQL('''
+                            SELECT
+                                content,
+                                embedding <=> {}::vector AS distance,
+                                date,
+                                topic,
+                                url
+                            FROM articles.articles
+                            ORDER BY distance
+                            LIMIT {limit}
+                        ''').format(
+                            sql.Literal(query_embedding),
+                            limit=sql.Literal(limit)
+                        )
+                        cursor.execute(fallback_query)
+                        articles = cursor.fetchall()
+
+                    # Format results
+                    formatted_results = [
+                        {
+                            "content": content,
+                            "distance": distance,
+                            "date": art_date,
+                            "topic": art_topic,
+                            "url": url,
+                        }
+                        for content, distance, art_date, art_topic, url in articles
+                    ]
+
+                    return formatted_results
+
+        except Exception as e:
+            print(f"Database query error: {e}")
+            return []
+
+    async def close(self):
+        # No persistent connection to close in psycopg2
+        pass
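DatabaseService reads its connection settings from environment variables (DB_USER, DB_PASSWORD, plus optional SUPABASE_HOST, DB_PORT and DB_NAME). The sketch below is not part of the commit; it shows one way to wire those up for a quick local call. The credentials, topic and entity values are placeholders, and the zero vector merely stands in for a real 384-dimensional query embedding produced by EmbeddingModel.encode(query).tolist().

# Sketch only: placeholder credentials, not part of the commit.
import asyncio
import os

os.environ["DB_USER"] = "postgres.example"   # placeholder
os.environ["DB_PASSWORD"] = "change-me"      # placeholder
# SUPABASE_HOST, DB_PORT and DB_NAME fall back to the defaults in DatabaseService.

from database.query import DatabaseService

async def demo():
    db = DatabaseService()
    results = await db.semantic_search(
        query_embedding=[0.0] * 384,       # stand-in for a real sentence embedding
        topic="politica",                  # hypothetical topic value
        entities=[("lisboa", "LOC")],      # (text, label) pairs as produced by NLPModel
        limit=5,
    )
    for row in results:
        print(row["url"], row["distance"])

asyncio.run(demo())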
main.py
ADDED
@@ -0,0 +1,135 @@
+import datetime
+from typing import List, Dict, Any, Optional, Tuple
+import numpy as np
+from models.LexRank import degree_centrality_scores
+import logging
+from datetime import datetime as dt
+
+logger = logging.getLogger(__name__)
+
+class QueryProcessor:
+    def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
+        self.embedding_model = embedding_model
+        self.summarization_model = summarization_model
+        self.nlp_model = nlp_model
+        self.db_service = db_service
+        logger.info("QueryProcessor initialized")
+
+    async def process(
+        self,
+        query: str,
+        topic: Optional[str] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None
+    ) -> Dict[str, Any]:
+        try:
+            # Date handling
+            start_dt = self._parse_date(start_date) if start_date else None
+            end_dt = self._parse_date(end_date) if end_date else None
+
+            # Query processing
+            query_embedding = self.embedding_model.encode(query).tolist()
+            entities = self.nlp_model.extract_entities(query)
+            print(f"Extracted entities: {entities}")
+
+            # Database search
+            articles = await self._execute_semantic_search(
+                query_embedding,
+                start_dt,
+                end_dt,
+                topic,
+                entities
+            )
+
+            if not articles:
+                return {"message": "No articles found", "articles": []}
+
+            # Summary generation
+            print("Starting summary generation")
+            summary_data = self._generate_summary(articles)
+            return {
+                "summary": summary_data["summary"],
+                "articles": articles,
+                "entities": entities
+            }
+
+        except Exception as e:
+            logger.error(f"Processing failed: {str(e)}", exc_info=True)
+            return {"error": str(e)}
+
+    def _parse_date(self, date_str: str) -> dt:
+        """Safe date parsing with validation"""
+        try:
+            return dt.strptime(date_str, "%Y-%m-%d")
+        except ValueError as e:
+            logger.error(f"Invalid date format: {date_str}")
+            raise ValueError(f"Invalid date format. Expected YYYY-MM-DD, got {date_str}")
+
+    def _extract_entities_safely(self, text: str) -> List[Tuple[str, str]]:
+        """Robust entity extraction handling both strings and lists"""
+        try:
+            if isinstance(text, list):
+                logger.warning("Received list input for entity extraction, joining to string")
+                text = " ".join(text)
+            return self.nlp_model.extract_entities(text)
+        except Exception as e:
+            logger.error(f"Entity extraction failed: {str(e)}")
+            return []
+
+    async def _execute_semantic_search(
+        self,
+        query_embedding: List[float],
+        start_date: Optional[dt],
+        end_date: Optional[dt],
+        topic: Optional[str],
+        entities: List[Tuple[str, str]]
+    ) -> List[Dict[str, Any]]:
+        """Execute search with proper error handling"""
+        try:
+            return await self.db_service.semantic_search(
+                query_embedding=query_embedding,
+                start_date=start_date,
+                end_date=end_date,
+                topic=topic,
+                entities=entities
+            )
+        except Exception as e:
+            logger.error(f"Semantic search failed: {str(e)}")
+            raise
+
+    def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Generate summary from articles with fallback handling"""
+        try:
+            contents = [article["content"] for article in articles[:5]]
+            sentences = []
+
+            for content in contents:
+                if content:
+                    sentences.extend(self.nlp_model.tokenize_sentences(content))
+
+            if not sentences:
+                logger.warning("No sentences available for summarization")
+                return {
+                    "summary": "No content available for summarization",
+                }
+
+            print("Starting first summary generation")
+            embeddings = self.embedding_model.encode(sentences)
+            similarity_matrix = np.dot(embeddings, embeddings.T) / (np.linalg.norm(embeddings, axis=1, keepdims=True) * np.linalg.norm(embeddings, axis=1, keepdims=True).T)
+            centrality_scores = degree_centrality_scores(similarity_matrix, threshold=0.1)
+
+            top_indices = np.argsort(-centrality_scores)[:10]
+            key_sentences = [sentences[idx].strip() for idx in top_indices]
+            combined_text = ' '.join(key_sentences)
+
+            print(f"First summary done with: {len(key_sentences)} sentences")
+
+            return {
+                "summary": self.summarization_model.summarize(combined_text),
+            }
+
+        except Exception as e:
+            logger.error(f"Summary generation failed: {str(e)}")
+            return {
+                "summary": "Summary generation failed",
+            }
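The one-line expression in _generate_summary builds a cosine-similarity matrix: the Gram matrix of the sentence embeddings divided by the outer product of their norms. A small self-contained check with made-up vectors (not part of the commit):

# Illustration only: the cosine-similarity matrix as computed in _generate_summary.
import numpy as np

embeddings = np.array([
    [1.0, 0.0, 0.0],
    [0.5, 0.5, 0.0],
    [0.0, 0.0, 2.0],
])  # made-up sentence embeddings, one row per sentence

norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
similarity_matrix = np.dot(embeddings, embeddings.T) / (norms * norms.T)

print(np.round(similarity_matrix, 3))  # diagonal is 1.0; entry (0, 1) is cos(45°) ≈ 0.707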
models/LexRank.py
ADDED
@@ -0,0 +1,121 @@
+"""
+LexRank implementation
+Source: https://github.com/crabcamp/lexrank/tree/dev
+"""
+
+import logging
+
+import numpy as np
+from scipy.sparse.csgraph import connected_components
+from scipy.special import softmax
+
+logger = logging.getLogger(__name__)
+
+
+def degree_centrality_scores(
+    similarity_matrix,
+    threshold=None,
+    increase_power=True,
+):
+    if not (threshold is None or isinstance(threshold, float) and 0 <= threshold < 1):
+        raise ValueError(
+            "'threshold' should be a floating-point number from the interval [0, 1) or None",
+        )
+
+    if threshold is None:
+        markov_matrix = create_markov_matrix(similarity_matrix)
+
+    else:
+        markov_matrix = create_markov_matrix_discrete(
+            similarity_matrix,
+            threshold,
+        )
+
+    scores = stationary_distribution(
+        markov_matrix,
+        increase_power=increase_power,
+        normalized=False,
+    )
+
+    return scores
+
+
+def _power_method(transition_matrix, increase_power=True, max_iter=10000):
+    eigenvector = np.ones(len(transition_matrix))
+
+    if len(eigenvector) == 1:
+        return eigenvector
+
+    transition = transition_matrix.transpose()
+
+    for _ in range(max_iter):
+        eigenvector_next = np.dot(transition, eigenvector)
+
+        if np.allclose(eigenvector_next, eigenvector):
+            return eigenvector_next
+
+        eigenvector = eigenvector_next
+
+        if increase_power:
+            transition = np.dot(transition, transition)
+
+    logger.warning("Maximum number of iterations for power method exceeded without convergence!")
+    return eigenvector_next
+
+
+def connected_nodes(matrix):
+    _, labels = connected_components(matrix)
+
+    groups = []
+
+    for tag in np.unique(labels):
+        group = np.where(labels == tag)[0]
+        groups.append(group)
+
+    return groups
+
+
+def create_markov_matrix(weights_matrix):
+    n_1, n_2 = weights_matrix.shape
+    if n_1 != n_2:
+        raise ValueError("'weights_matrix' should be square")
+
+    row_sum = weights_matrix.sum(axis=1, keepdims=True)
+
+    # normalize probability distribution differently if we have negative transition values
+    if np.min(weights_matrix) <= 0:
+        return softmax(weights_matrix, axis=1)
+
+    return weights_matrix / row_sum
+
+
+def create_markov_matrix_discrete(weights_matrix, threshold):
+    discrete_weights_matrix = np.zeros(weights_matrix.shape)
+    ixs = np.where(weights_matrix >= threshold)
+    discrete_weights_matrix[ixs] = 1
+
+    return create_markov_matrix(discrete_weights_matrix)
+
+
+def stationary_distribution(
+    transition_matrix,
+    increase_power=True,
+    normalized=True,
+):
+    n_1, n_2 = transition_matrix.shape
+    if n_1 != n_2:
+        raise ValueError("'transition_matrix' should be square")
+
+    distribution = np.zeros(n_1)
+
+    grouped_indices = connected_nodes(transition_matrix)
+
+    for group in grouped_indices:
+        t_matrix = transition_matrix[np.ix_(group, group)]
+        eigenvector = _power_method(t_matrix, increase_power=increase_power)
+        distribution[group] = eigenvector
+
+    if normalized:
+        distribution /= n_1
+
+    return distribution
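To illustrate how degree_centrality_scores is consumed (mirroring _generate_summary in main.py, which calls it with threshold=0.1 on real sentence-embedding similarities), here is a toy example on an invented similarity matrix. It is a sketch, not part of the commit.

# Toy illustration (not part of the commit): rank three "sentences" by centrality.
import numpy as np
from models.LexRank import degree_centrality_scores

# Made-up cosine-similarity matrix for three sentences; 0 and 1 are close, 2 is an outlier.
similarity = np.array([
    [1.0, 0.8, 0.3],
    [0.8, 1.0, 0.4],
    [0.3, 0.4, 1.0],
])

scores = degree_centrality_scores(similarity, threshold=0.5)
ranking = np.argsort(-scores)  # indices of the most central sentences first
print(scores, ranking)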
models/embedding.py
ADDED
@@ -0,0 +1,10 @@
+from sentence_transformers import SentenceTransformer
+import torch
+
+class EmbeddingModel:
+    def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+
+    def encode(self, text: str):
+        return self.model.encode(text, device=self.device)
models/nlp.py
ADDED
@@ -0,0 +1,35 @@
+import spacy
+from typing import List, Union
+import logging
+
+logger = logging.getLogger(__name__)
+
+class NLPModel:
+    def __init__(self):
+        try:
+            # Load spaCy model only
+            self.nlp = spacy.load("pt_core_news_md")
+            logger.info("spaCy model initialized successfully")
+        except Exception as e:
+            logger.error(f"Failed to initialize spaCy model: {str(e)}")
+            raise
+
+    def extract_entities(self, text: Union[str, List[str]]) -> List[tuple]:
+        """Entity extraction using spaCy"""
+        try:
+            if isinstance(text, list):
+                text = " ".join(text)
+            doc = self.nlp(text)
+            return [(ent.text.lower(), ent.label_) for ent in doc.ents]
+        except Exception as e:
+            logger.error(f"Entity extraction failed: {str(e)}")
+            return []
+
+    def tokenize_sentences(self, text: str) -> List[str]:
+        """Sentence tokenization using spaCy"""
+        try:
+            doc = self.nlp(text)
+            return [sent.text for sent in doc.sents]
+        except Exception as e:
+            logger.error(f"Sentence tokenization failed: {str(e)}")
+            return [text]  # Fallback to returning whole text
models/summarization.py
ADDED
@@ -0,0 +1,27 @@
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+import torch
+
+class SummarizationModel:
+    def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tokenizer = T5Tokenizer.from_pretrained('unicamp-dl/ptt5-base-portuguese-vocab')
+        self.model = T5ForConditionalGeneration.from_pretrained('recogna-nlp/ptt5-base-summ').to(self.device)
+
+    def summarize(self, text: str, max_length: int = 256, min_length: int = 128) -> str:
+        inputs = self.tokenizer.encode(
+            text,
+            max_length=512,
+            truncation=True,
+            return_tensors='pt'
+        ).to(self.device)
+
+        summary_ids = self.model.generate(
+            inputs,
+            max_length=max_length,
+            min_length=min_length,
+            num_beams=4,
+            no_repeat_ngram_size=3,
+            early_stopping=True,
+        )
+
+        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
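A minimal standalone usage sketch for the summarizer, not part of the commit: the first call downloads the PTT5 checkpoints unless the Dockerfile pre-cached them, and the input text is only a placeholder.

# Sketch: using SummarizationModel outside the API.
from models.summarization import SummarizationModel

summarizer = SummarizationModel()
texto = (
    "O governo português anunciou novas medidas económicas esta semana. "
    "As medidas incluem apoios às famílias e às pequenas empresas."
)  # placeholder text
print(summarizer.summarize(texto, max_length=128, min_length=30))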
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+fastapi
+uvicorn[standard]
+transformers
+torch
+sentence_transformers
+nltk
+spacy
+numpy
+pandas
+scipy
+psycopg2
+sentencepiece