Quintino Fernandes committed on
Commit 379003a · 1 Parent(s): 4bbe44e
Files changed (10)
  1. Dockerfile +37 -0
  2. README.md +4 -5
  3. app.py +164 -0
  4. database/query.py +164 -0
  5. main.py +135 -0
  6. models/LexRank.py +121 -0
  7. models/embedding.py +10 -0
  8. models/nlp.py +35 -0
  9. models/summarization.py +27 -0
  10. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,37 @@
+ FROM python:3.12-slim
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     libpq-dev \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Add a non-root user
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ # Copy and install Python dependencies
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Pre-download the spaCy model
+ RUN python -m spacy download pt_core_news_md
+
+ # Pre-download the T5 tokenizer and summarization model
+ RUN python -c "from transformers import T5Tokenizer, T5ForConditionalGeneration; \
+     T5Tokenizer.from_pretrained('unicamp-dl/ptt5-base-portuguese-vocab'); \
+     T5ForConditionalGeneration.from_pretrained('recogna-nlp/ptt5-base-summ')"
+
+ # Copy application code
+ COPY --chown=user . /app
+
+ # Expose the application port
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,10 @@
  ---
- title: KairosNews
- emoji: 🏃
- colorFrom: pink
- colorTo: yellow
+ title: Test
+ emoji: 📈
+ colorFrom: red
+ colorTo: indigo
  sdk: docker
  pinned: false
- short_description: Summary Snapshots of multiple Portuguese Articles
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,164 @@
+ import logging
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ from typing import Dict, Optional, List, Any
+ import uuid
+ from datetime import datetime
+ from contextlib import asynccontextmanager
+
+ from models.embedding import EmbeddingModel
+ from models.summarization import SummarizationModel
+ from models.nlp import NLPModel
+ from database.query import DatabaseService
+ from main import QueryProcessor
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler()]
+ )
+ logger = logging.getLogger(__name__)
+
+ # Initialize models
+ embedding_model = None
+ summarization_model = None
+ nlp_model = None
+ db_service = None
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     global embedding_model, summarization_model, nlp_model, db_service
+
+     # Model initialization
+     logger.info("Initializing models...")
+     try:
+         embedding_model = EmbeddingModel()
+         summarization_model = SummarizationModel()
+         nlp_model = NLPModel()
+         db_service = DatabaseService()
+         logger.info("All models initialized successfully")
+     except Exception as e:
+         logger.error(f"Model initialization failed: {str(e)}")
+         raise
+
+     yield
+
+     # Cleanup
+     logger.info("Shutting down application...")
+     if db_service:
+         try:
+             await db_service.close()
+             logger.info("Database connection closed successfully")
+         except Exception as e:
+             logger.error(f"Error closing database connection: {str(e)}")
+
+ app = FastAPI(
+     title="Kairos News API",
+     version="1.0",
+     lifespan=lifespan
+ )
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # In-memory job storage
+ jobs_db: Dict[str, Dict] = {}
+
+ class PostRequest(BaseModel):
+     query: str
+     topic: Optional[str] = None
+     start_date: Optional[str] = None
+     end_date: Optional[str] = None
+
+ class JobStatus(BaseModel):
+     id: str
+     status: str
+     created_at: datetime
+     completed_at: Optional[datetime] = None
+     request: PostRequest
+     result: Optional[Dict[str, Any]] = None
+
+ @app.post("/index", response_model=JobStatus)
+ async def create_job(request: PostRequest, background_tasks: BackgroundTasks):
+     job_id = str(uuid.uuid4())
+     logger.info(f"Creating new job {job_id} with request: {request.dict()}")
+
+     jobs_db[job_id] = {
+         "id": job_id,
+         "status": "processing",
+         "created_at": datetime.now(),
+         "completed_at": None,
+         "request": request.dict(),
+         "result": None
+     }
+
+     background_tasks.add_task(
+         process_job,
+         job_id,
+         request,
+         embedding_model,
+         summarization_model,
+         nlp_model,
+         db_service
+     )
+
+     logger.info(f"Job {job_id} created and processing started")
+     return jobs_db[job_id]
+
+ @app.get("/loading", response_model=JobStatus)
+ async def get_job_status(id: str):
+     logger.info(f"Checking status for job {id}")
+     if id not in jobs_db:
+         logger.warning(f"Job {id} not found")
+         raise HTTPException(status_code=404, detail="Job not found")
+
+     logger.info(f"Returning status for job {id}: {jobs_db[id]['status']}")
+     return jobs_db[id]
+
+ async def process_job(
+     job_id: str,
+     request: PostRequest,
+     embedding_model: EmbeddingModel,
+     summarization_model: SummarizationModel,
+     nlp_model: NLPModel,
+     db_service: DatabaseService
+ ):
+     try:
+         logger.info(f"Starting processing for job {job_id}")
+
+         processor = QueryProcessor(
+             embedding_model=embedding_model,
+             summarization_model=summarization_model,
+             nlp_model=nlp_model,
+             db_service=db_service
+         )
+
+         logger.debug(f"Processing query: {request.query}")
+         result = await processor.process(
+             query=request.query,
+             topic=request.topic,
+             start_date=request.start_date,
+             end_date=request.end_date
+         )
+
+         jobs_db[job_id].update({
+             "status": "completed",
+             "completed_at": datetime.now(),
+             "result": result if result else {"message": "No results found"}
+         })
+         logger.info(f"Job {job_id} completed successfully")
+
+     except Exception as e:
+         logger.error(f"Error processing job {job_id}: {str(e)}", exc_info=True)
+         jobs_db[job_id].update({
+             "status": "failed",
+             "completed_at": datetime.now(),
+             "result": {"error": str(e)}
+         })
+         logger.info(f"Job {job_id} marked as failed")
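For reference, a minimal client flow against these two endpoints could look like the sketch below. It assumes the Space is reachable at http://localhost:7860 and that the `requests` package is available on the client side (it is not part of this commit's requirements); the query values are made up.

import time
import requests  # assumed client-side dependency, not in requirements.txt

BASE_URL = "http://localhost:7860"  # hypothetical local deployment of this Space

# Submit a query; the API answers immediately with a job record.
job = requests.post(f"{BASE_URL}/index", json={
    "query": "eleições legislativas em Portugal",
    "topic": "politics",
    "start_date": "2024-01-01",
    "end_date": "2024-03-31",
}).json()

# Poll /loading until the background task finishes or fails.
while job["status"] == "processing":
    time.sleep(2)
    job = requests.get(f"{BASE_URL}/loading", params={"id": job["id"]}).json()

print(job["status"])
print(job["result"])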
database/query.py ADDED
@@ -0,0 +1,164 @@
+ import os
+ from typing import Any, Dict, List, Optional, Tuple
+ from datetime import datetime
+ import psycopg2
+ from psycopg2 import sql
+
+ class DatabaseService:
+     def __init__(self):
+         # Connection parameters
+         self.DB_HOST = os.getenv("SUPABASE_HOST", "aws-0-eu-west-3.pooler.supabase.com")
+         self.DB_PORT = os.getenv("DB_PORT", "6543")
+         self.DB_NAME = os.getenv("DB_NAME", "postgres")
+         self.DB_USER = os.getenv("DB_USER")
+         self.DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+     async def semantic_search(
+         self,
+         query_embedding: List[float],
+         start_date: Optional[datetime] = None,
+         end_date: Optional[datetime] = None,
+         topic: Optional[str] = None,
+         entities: Optional[List[Tuple[str, str]]] = None,
+         limit: int = 10
+     ) -> List[Dict[str, Any]]:
+
+         print(f"Extracted entities2: {entities}")
+         try:
+             with psycopg2.connect(
+                 user=self.DB_USER,
+                 password=self.DB_PASSWORD,
+                 host=self.DB_HOST,
+                 port=self.DB_PORT,
+                 dbname=self.DB_NAME
+             ) as conn:
+                 with conn.cursor() as cursor:
+                     # Base query with date range and topic filters
+                     base_query = sql.SQL('''
+                         WITH filtered_articles AS (
+                             SELECT article_id
+                             FROM articles.articles
+                             WHERE 1=1
+                     ''')
+
+                     # Add date range filter (if both start and end dates provided)
+                     if start_date and end_date:
+                         base_query = sql.SQL('{}{}').format(
+                             base_query,
+                             sql.SQL(' AND date BETWEEN {} AND {}').format(
+                                 sql.Literal(start_date),
+                                 sql.Literal(end_date)
+                             )
+                         )
+
+                     # Add topic filter (if provided)
+                     if topic:
+                         base_query = sql.SQL('{}{}').format(
+                             base_query,
+                             sql.SQL(' AND topic = {}').format(sql.Literal(topic))
+                         )
+
+                     base_query = sql.SQL('{} {}').format(
+                         base_query,
+                         sql.SQL(')')
+                     )
+
+                     # Add entity filter (if entities exist)
+                     if entities:
+                         entity_conditions = sql.SQL(" OR ").join(
+                             sql.SQL("""
+                                 (LOWER(UNACCENT(word)) = LOWER(UNACCENT({}))
+                                 AND entity_group = {})
+                             """).format(
+                                 sql.Literal(e[0]),  # entity text (lowercased/unaccented in SQL)
+                                 sql.Literal(e[1])   # original entity label (case-sensitive)
+                             ) for e in entities
+                         )
+
+                         final_query = sql.SQL('''
+                             {base_query},
+                             target_articles AS (
+                                 SELECT DISTINCT article_id
+                                 FROM articles.ner
+                                 WHERE ({entity_conditions})
+                                 AND article_id IN (SELECT article_id FROM filtered_articles)
+                             )
+                             SELECT
+                                 a.content,
+                                 a.embedding <=> {embedding}::vector AS distance,
+                                 a.date,
+                                 a.topic,
+                                 a.url
+                             FROM articles.articles a
+                             JOIN target_articles t ON a.article_id = t.article_id
+                             ORDER BY distance
+                             LIMIT {limit}
+                         ''').format(
+                             base_query=base_query,
+                             entity_conditions=entity_conditions,
+                             embedding=sql.Literal(query_embedding),
+                             limit=sql.Literal(limit)
+                         )
+                     else:
+                         final_query = sql.SQL('''
+                             {base_query}
+                             SELECT
+                                 a.content,
+                                 a.embedding <=> {embedding}::vector AS distance,
+                                 a.date,
+                                 a.topic,
+                                 a.url
+                             FROM articles.articles a
+                             JOIN filtered_articles f ON a.article_id = f.article_id
+                             ORDER BY distance
+                             LIMIT {limit}
+                         ''').format(
+                             base_query=base_query,
+                             embedding=sql.Literal(query_embedding),
+                             limit=sql.Literal(limit)
+                         )
+
+                     cursor.execute(final_query)
+                     articles = cursor.fetchall()
+
+                     # Fallback: retry with fewer filters if no results
+                     if not articles:
+                         print("No articles found with entities...")
+                         fallback_query = sql.SQL('''
+                             SELECT
+                                 content,
+                                 embedding <=> {}::vector AS distance,
+                                 date,
+                                 topic,
+                                 url
+                             FROM articles.articles
+                             ORDER BY distance
+                             LIMIT {limit}
+                         ''').format(
+                             sql.Literal(query_embedding),
+                             limit=sql.Literal(limit)
+                         )
+                         cursor.execute(fallback_query)
+                         articles = cursor.fetchall()
+
+                     # Format results
+                     formatted_results = [
+                         {
+                             "content": content,
+                             "distance": distance,
+                             "date": art_date,
+                             "topic": art_topic,
+                             "url": url,
+                         }
+                         for content, distance, art_date, art_topic, url in articles
+                     ]
+
+                     return formatted_results
+
+         except Exception as e:
+             print(f"Database query error: {e}")
+             return []
+
+     async def close(self):
+         # No persistent connection to close in psycopg2
+         pass
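Note that the entity filter relies on UNACCENT, so the unaccent extension must be enabled in the target Postgres database, and the distance operator requires pgvector. A standalone check of the search path might look like the sketch below; it assumes DB_USER/DB_PASSWORD (and optionally SUPABASE_HOST) are exported and that the articles schema exists, and the topic and entity values are hypothetical.

import asyncio
from database.query import DatabaseService
from models.embedding import EmbeddingModel

async def main():
    db = DatabaseService()
    # Embed an example query with the same model the API uses.
    query_embedding = EmbeddingModel().encode("greve dos professores").tolist()
    rows = await db.semantic_search(
        query_embedding=query_embedding,
        topic="education",                    # hypothetical topic value
        entities=[("professores", "MISC")],   # (word, entity_group) pairs, as stored in articles.ner
        limit=5,
    )
    for row in rows:
        print(row["distance"], row["url"])

asyncio.run(main())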
main.py ADDED
@@ -0,0 +1,135 @@
+ import datetime
+ from typing import List, Dict, Any, Optional, Tuple, Union
+ import numpy as np
+ from models.LexRank import degree_centrality_scores
+ import logging
+ from datetime import datetime as dt
+
+ logger = logging.getLogger(__name__)
+
+ class QueryProcessor:
+     def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
+         self.embedding_model = embedding_model
+         self.summarization_model = summarization_model
+         self.nlp_model = nlp_model
+         self.db_service = db_service
+         logger.info("QueryProcessor initialized")
+
+     async def process(
+         self,
+         query: str,
+         topic: Optional[str] = None,
+         start_date: Optional[str] = None,
+         end_date: Optional[str] = None
+     ) -> Dict[str, Any]:
+         try:
+             # Date handling
+             start_dt = self._parse_date(start_date) if start_date else None
+             end_dt = self._parse_date(end_date) if end_date else None
+
+             # Query processing
+             query_embedding = self.embedding_model.encode(query).tolist()
+             entities = self.nlp_model.extract_entities(query)
+             print(f"Extracted entities: {entities}")
+
+             # Database search
+             articles = await self._execute_semantic_search(
+                 query_embedding,
+                 start_dt,
+                 end_dt,
+                 topic,
+                 entities
+             )
+
+             if not articles:
+                 return {"message": "No articles found", "articles": []}
+
+             # Summary generation
+             print("Starting summary generation")
+             summary_data = self._generate_summary(articles)
+             return {
+                 "summary": summary_data["summary"],
+                 "articles": articles,
+                 "entities": entities
+             }
+
+         except Exception as e:
+             logger.error(f"Processing failed: {str(e)}", exc_info=True)
+             return {"error": str(e)}
+
+     def _parse_date(self, date_str: str) -> dt:
+         """Safe date parsing with validation"""
+         try:
+             return dt.strptime(date_str, "%Y-%m-%d")
+         except ValueError:
+             logger.error(f"Invalid date format: {date_str}")
+             raise ValueError(f"Invalid date format. Expected YYYY-MM-DD, got {date_str}")
+
+     def _extract_entities_safely(self, text: Union[str, List[str]]) -> List[Tuple[str, str]]:
+         """Robust entity extraction handling both strings and lists"""
+         try:
+             if isinstance(text, list):
+                 logger.warning("Received list input for entity extraction, joining to string")
+                 text = " ".join(text)
+             return self.nlp_model.extract_entities(text)
+         except Exception as e:
+             logger.error(f"Entity extraction failed: {str(e)}")
+             return []
+
+     async def _execute_semantic_search(
+         self,
+         query_embedding: List[float],
+         start_date: Optional[dt],
+         end_date: Optional[dt],
+         topic: Optional[str],
+         entities: List[Tuple[str, str]]
+     ) -> List[Dict[str, Any]]:
+         """Execute search with proper error handling"""
+         try:
+             return await self.db_service.semantic_search(
+                 query_embedding=query_embedding,
+                 start_date=start_date,
+                 end_date=end_date,
+                 topic=topic,
+                 entities=entities
+             )
+         except Exception as e:
+             logger.error(f"Semantic search failed: {str(e)}")
+             raise
+
+     def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """Generate summary from articles with fallback handling"""
+         try:
+             contents = [article["content"] for article in articles[:5]]
+             sentences = []
+
+             for content in contents:
+                 if content:
+                     sentences.extend(self.nlp_model.tokenize_sentences(content))
+
+             if not sentences:
+                 logger.warning("No sentences available for summarization")
+                 return {
+                     "summary": "No content available for summarization",
+                 }
+
+             print("Starting first summary generation")
+             embeddings = self.embedding_model.encode(sentences)
+             similarity_matrix = np.dot(embeddings, embeddings.T) / (np.linalg.norm(embeddings, axis=1, keepdims=True) * np.linalg.norm(embeddings, axis=1, keepdims=True).T)
+             centrality_scores = degree_centrality_scores(similarity_matrix, threshold=0.1)
+
+             top_indices = np.argsort(-centrality_scores)[:10]
+             key_sentences = [sentences[idx].strip() for idx in top_indices]
+             combined_text = ' '.join(key_sentences)
+
+             print(f"First summary done with: {len(key_sentences)} sentences")
+
+             return {
+                 "summary": self.summarization_model.summarize(combined_text),
+             }
+
+         except Exception as e:
+             logger.error(f"Summary generation failed: {str(e)}")
+             return {
+                 "summary": "Summary generation failed",
+             }
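The extractive step inside _generate_summary is: build a cosine-similarity matrix over sentence embeddings, score sentences with LexRank degree centrality, and keep the top-scoring ones before handing the concatenation to the abstractive model. The sketch below isolates that step with random vectors standing in for real sentence embeddings (384 dimensions is the output size of paraphrase-multilingual-MiniLM-L12-v2); it is illustrative only.

import numpy as np
from models.LexRank import degree_centrality_scores

sentences = [f"sentence {i}" for i in range(6)]               # placeholder sentences
embeddings = np.random.default_rng(0).normal(size=(6, 384))   # stand-in for model.encode(sentences)

# Cosine similarity between every pair of sentences
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
similarity_matrix = (embeddings @ embeddings.T) / (norms * norms.T)

# LexRank centrality, then keep the most central sentences
scores = degree_centrality_scores(similarity_matrix, threshold=0.1)
top_indices = np.argsort(-scores)[:3]
print([sentences[i] for i in top_indices])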
models/LexRank.py ADDED
@@ -0,0 +1,121 @@
+ """
+ LexRank implementation
+ Source: https://github.com/crabcamp/lexrank/tree/dev
+ """
+
+ import logging
+
+ import numpy as np
+ from scipy.sparse.csgraph import connected_components
+ from scipy.special import softmax
+
+ logger = logging.getLogger(__name__)
+
+
+ def degree_centrality_scores(
+     similarity_matrix,
+     threshold=None,
+     increase_power=True,
+ ):
+     if not (threshold is None or isinstance(threshold, float) and 0 <= threshold < 1):
+         raise ValueError(
+             "'threshold' should be a floating-point number from the interval [0, 1) or None",
+         )
+
+     if threshold is None:
+         markov_matrix = create_markov_matrix(similarity_matrix)
+
+     else:
+         markov_matrix = create_markov_matrix_discrete(
+             similarity_matrix,
+             threshold,
+         )
+
+     scores = stationary_distribution(
+         markov_matrix,
+         increase_power=increase_power,
+         normalized=False,
+     )
+
+     return scores
+
+
+ def _power_method(transition_matrix, increase_power=True, max_iter=10000):
+     eigenvector = np.ones(len(transition_matrix))
+
+     if len(eigenvector) == 1:
+         return eigenvector
+
+     transition = transition_matrix.transpose()
+
+     for _ in range(max_iter):
+         eigenvector_next = np.dot(transition, eigenvector)
+
+         if np.allclose(eigenvector_next, eigenvector):
+             return eigenvector_next
+
+         eigenvector = eigenvector_next
+
+         if increase_power:
+             transition = np.dot(transition, transition)
+
+     logger.warning("Maximum number of iterations for power method exceeded without convergence!")
+     return eigenvector_next
+
+
+ def connected_nodes(matrix):
+     _, labels = connected_components(matrix)
+
+     groups = []
+
+     for tag in np.unique(labels):
+         group = np.where(labels == tag)[0]
+         groups.append(group)
+
+     return groups
+
+
+ def create_markov_matrix(weights_matrix):
+     n_1, n_2 = weights_matrix.shape
+     if n_1 != n_2:
+         raise ValueError("'weights_matrix' should be square")
+
+     row_sum = weights_matrix.sum(axis=1, keepdims=True)
+
+     # normalize probability distribution differently if we have negative transition values
+     if np.min(weights_matrix) <= 0:
+         return softmax(weights_matrix, axis=1)
+
+     return weights_matrix / row_sum
+
+
+ def create_markov_matrix_discrete(weights_matrix, threshold):
+     discrete_weights_matrix = np.zeros(weights_matrix.shape)
+     ixs = np.where(weights_matrix >= threshold)
+     discrete_weights_matrix[ixs] = 1
+
+     return create_markov_matrix(discrete_weights_matrix)
+
+
+ def stationary_distribution(
+     transition_matrix,
+     increase_power=True,
+     normalized=True,
+ ):
+     n_1, n_2 = transition_matrix.shape
+     if n_1 != n_2:
+         raise ValueError("'transition_matrix' should be square")
+
+     distribution = np.zeros(n_1)
+
+     grouped_indices = connected_nodes(transition_matrix)
+
+     for group in grouped_indices:
+         t_matrix = transition_matrix[np.ix_(group, group)]
+         eigenvector = _power_method(t_matrix, increase_power=increase_power)
+         distribution[group] = eigenvector
+
+     if normalized:
+         distribution /= n_1
+
+     return distribution
+
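As a quick sanity check of this module, centrality scores for a small hand-written similarity matrix can be computed directly; the values below are made up for illustration.

import numpy as np
from models.LexRank import degree_centrality_scores

# Toy 3x3 cosine-similarity matrix: sentences 0 and 1 are near-duplicates,
# sentence 2 is only weakly related to the other two.
sim = np.array([
    [1.0, 0.9, 0.1],
    [0.9, 1.0, 0.2],
    [0.1, 0.2, 1.0],
])

scores = degree_centrality_scores(sim, threshold=None)
print(scores)  # the first two sentences receive the highest centrality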
models/embedding.py ADDED
@@ -0,0 +1,10 @@
+ from sentence_transformers import SentenceTransformer
+ import torch
+
+ class EmbeddingModel:
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+
+     def encode(self, text: str):
+         return self.model.encode(text, device=self.device)
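Used on its own, the wrapper returns NumPy arrays per the sentence-transformers default: one 384-dimensional vector for a single string, one vector per item for a list. A minimal sketch, assuming the model weights can be downloaded:

from models.embedding import EmbeddingModel

model = EmbeddingModel()
vec = model.encode("O governo aprovou o orçamento.")        # shape (384,)
mat = model.encode(["Primeira frase.", "Segunda frase."])   # shape (2, 384)
print(vec.shape, mat.shape)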
models/nlp.py ADDED
@@ -0,0 +1,35 @@
+ import spacy
+ from typing import List, Union
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class NLPModel:
+     def __init__(self):
+         try:
+             # Load spaCy model only
+             self.nlp = spacy.load("pt_core_news_md")
+             logger.info("spaCy model initialized successfully")
+         except Exception as e:
+             logger.error(f"Failed to initialize spaCy model: {str(e)}")
+             raise
+
+     def extract_entities(self, text: Union[str, List[str]]) -> List[tuple]:
+         """Entity extraction using spaCy"""
+         try:
+             if isinstance(text, list):
+                 text = " ".join(text)
+             doc = self.nlp(text)
+             return [(ent.text.lower(), ent.label_) for ent in doc.ents]
+         except Exception as e:
+             logger.error(f"Entity extraction failed: {str(e)}")
+             return []
+
+     def tokenize_sentences(self, text: str) -> List[str]:
+         """Sentence tokenization using spaCy"""
+         try:
+             doc = self.nlp(text)
+             return [sent.text for sent in doc.sents]
+         except Exception as e:
+             logger.error(f"Sentence tokenization failed: {str(e)}")
+             return [text]  # Fallback to returning whole text
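A short usage sketch, assuming the pt_core_news_md model is installed (the Dockerfile pre-downloads it); the example sentences and the exact labels in the comments are illustrative.

from models.nlp import NLPModel

nlp = NLPModel()

# Entities come back as (lowercased text, spaCy label) tuples, the same shape
# that database/query.py consumes as (word, entity_group) pairs.
print(nlp.extract_entities("O António Costa visitou o Porto."))
# e.g. [('antónio costa', 'PER'), ('porto', 'LOC')]

print(nlp.tokenize_sentences("Primeira frase. Segunda frase."))
# ['Primeira frase.', 'Segunda frase.']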
models/summarization.py ADDED
@@ -0,0 +1,27 @@
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ import torch
+
+ class SummarizationModel:
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.tokenizer = T5Tokenizer.from_pretrained('unicamp-dl/ptt5-base-portuguese-vocab')
+         self.model = T5ForConditionalGeneration.from_pretrained('recogna-nlp/ptt5-base-summ').to(self.device)
+
+     def summarize(self, text: str, max_length: int = 256, min_length: int = 128) -> str:
+         inputs = self.tokenizer.encode(
+             text,
+             max_length=512,
+             truncation=True,
+             return_tensors='pt'
+         ).to(self.device)
+
+         summary_ids = self.model.generate(
+             inputs,
+             max_length=max_length,
+             min_length=min_length,
+             num_beams=4,
+             no_repeat_ngram_size=3,
+             early_stopping=True,
+         )
+
+         return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
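A minimal usage sketch; the first instantiation downloads the PTT5 weights unless they were pre-fetched (the Dockerfile above bakes them into the image), and the input text here is only an example.

from models.summarization import SummarizationModel

summarizer = SummarizationModel()

text = (
    "O LexRank seleciona as frases mais centrais dos artigos recuperados. "
    "Esse texto combinado é depois passado ao modelo PTT5 para gerar um resumo abstrativo."
)
# Shorter bounds than the defaults keep the toy example quick.
print(summarizer.summarize(text, max_length=128, min_length=30))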
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi
+ uvicorn[standard]
+ transformers
+ torch
+ sentence_transformers
+ nltk
+ spacy
+ numpy
+ pandas
+ scipy
+ psycopg2
+ sentencepiece