OxbridgeEconomics committed on
Commit
88c1f77
·
unverified ·
2 Parent(s): 9141a95 7e15dd1

Merge pull request #43 from oxbridge-econ/dev

Browse files
app/app.py CHANGED
@@ -10,7 +10,7 @@ from fastapi.responses import JSONResponse
10
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
11
  from apscheduler.triggers.cron import CronTrigger
12
 
13
- from routes import category, summary, keyword, lda, knowledge_base # pylint: disable=import-error
14
 
15
 
16
  class Config: # pylint: disable=too-few-public-methods
@@ -172,7 +172,7 @@ app.include_router(category.router)
172
  app.include_router(summary.router)
173
  app.include_router(keyword.router)
174
  app.include_router(lda.router)
175
- app.include_router(knowledge_base.router)
176
 
177
  @app.get("/_health")
178
  def health():
 
10
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
11
  from apscheduler.triggers.cron import CronTrigger
12
 
13
+ from routes import category, summary, keyword, lda, entity # pylint: disable=import-error
14
 
15
 
16
  class Config: # pylint: disable=too-few-public-methods
 
172
  app.include_router(summary.router)
173
  app.include_router(keyword.router)
174
  app.include_router(lda.router)
175
+ app.include_router(entity.router)
176
 
177
  @app.get("/_health")
178
  def health():
app/collectors/finfast/keyword_analysis.py CHANGED
@@ -140,4 +140,3 @@ def _perform_analysis_for_all_periods() -> None:
140
  except Exception as e: # pylint: disable=broad-exception-caught
141
  logger.error("Error analyzing %s period: %s", period, str(e))
142
  # Continue with other periods even if one fails
143
-
 
140
  except Exception as e: # pylint: disable=broad-exception-caught
141
  logger.error("Error analyzing %s period: %s", period, str(e))
142
  # Continue with other periods even if one fails
 
app/controllers/entity.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Controller for entity-related operations."""
2
+ from models.database import entity_collection
3
+
4
def retrieve_hot_entity(limit: int = 200) -> dict:
    """Return the most frequently occurring entities.

    Queries ``entity_collection`` with an empty filter, sorts the matches by
    the ``occurrence`` field in descending order and keeps at most ``limit``
    of them.

    Args:
        limit: Maximum number of documents to return. Defaults to 200, the
            value that used to be hard-coded, so existing callers that pass
            no argument get the same result as before.

    Returns:
        dict: A dictionary with two keys:
            Count (int): Number of documents actually retrieved.
            Items (list): One dict per document holding ``entity``,
                ``entityType`` and ``total_occurrence`` (the document's
                ``occurrence`` value). ``_id`` is excluded.
    """
    projection = {
        'entity': 1,
        'entityType': 1,
        # Expose the stored ``occurrence`` field under the public name
        # ``total_occurrence``.
        # NOTE(review): field-path aliases in a find() projection presumably
        # need MongoDB 4.4+ - confirm against the deployed server version.
        'total_occurrence': '$occurrence',
        '_id': 0,
    }
    # -1 requests descending order, i.e. highest occurrence first.
    cursor = entity_collection.find({}, projection).sort('occurrence', -1).limit(limit)
    items = list(cursor)
    return {
        "Count": len(items),
        "Items": items,
    }
app/controllers/keyword.py CHANGED
@@ -661,7 +661,7 @@ def invoke_llm_for_batch(articles_in_batch: list, historical_keywords: set) -> L
661
 
662
  try:
663
  categories_text = "\n".join([
664
- f"- {cat}: {', '.join(keywords[:10])}" + ("..." if len(keywords) > 10 else "")
665
  for cat, keywords in FIN_KEYWORDS.items()
666
  ])
667
 
 
661
 
662
  try:
663
  categories_text = "\n".join([
664
+ f"- {cat}: {', '.join(keywords[:10])}" + ("..." if len(keywords) > 10 else "")
665
  for cat, keywords in FIN_KEYWORDS.items()
666
  ])
667
 
app/models/database/astra.py CHANGED
@@ -67,9 +67,9 @@ class KnowledgeBase: # pylint: disable=too-few-public-methods
67
  total_count = len(unique_docs)
68
 
69
  return {
70
- "emails": email_count,
71
- "files": file_count,
72
- "total_documents": total_count
73
  }
74
 
75
  except Exception as e: # pylint: disable=broad-exception-caught
 
67
  total_count = len(unique_docs)
68
 
69
  return {
70
+ "gmail": email_count,
71
+ "file": file_count,
72
+ "total": total_count
73
  }
74
 
75
  except Exception as e: # pylint: disable=broad-exception-caught
app/models/llm/azure.py CHANGED
@@ -32,4 +32,3 @@ class GPTModel(AzureChatOpenAI):
32
  streaming=True,
33
  temperature=0
34
  )
35
-
 
32
  streaming=True,
33
  temperature=0
34
  )
 
app/routes/entity.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This module defines the /entity route for the FastAPI application."""
2
+
3
+ import logging
4
+ from fastapi import APIRouter
5
+ from fastapi.responses import JSONResponse
6
+ from controllers.entity import retrieve_hot_entity # pylint: disable=import-error
7
+
8
+ # Configure logger
9
+ logger = logging.getLogger(__name__)
10
+
11
+ # Create FastAPI Router
12
+ router = APIRouter(prefix="/entity", tags=["entity"])
13
+
14
@router.get('/hot')
async def get_hot_entity():
    """
    Handles GET requests to retrieve the hot entities.

    Returns:
        JSONResponse: On success, the dictionary produced by
            ``retrieve_hot_entity`` (a count plus a list of entities).
            On failure, ``{"error": ...}`` with HTTP status 500.
    """
    # Imported locally so this handler does not depend on edits to the
    # module's import block.
    from fastapi.concurrency import run_in_threadpool  # pylint: disable=import-outside-toplevel

    try:
        # retrieve_hot_entity runs a synchronous database query; calling it
        # directly inside an ``async def`` handler would block the event loop
        # for every other request, so it is run in the worker threadpool.
        payload = await run_in_threadpool(retrieve_hot_entity)
    except Exception:  # pylint: disable=broad-exception-caught
        # Log the full traceback server-side but keep the client message
        # generic so internal details are not leaked.
        logger.exception("Failed to retrieve hot entities")
        return JSONResponse(
            content={"error": "Failed to retrieve hot entities."},
            status_code=500
        )
    return JSONResponse(content=payload)
app/routes/knowledge_base.py DELETED
@@ -1,59 +0,0 @@
1
- """This module defines the /knowledge-base route for the FastAPI application."""
2
- import logging
3
- from fastapi import APIRouter, HTTPException, Path
4
- from fastapi.responses import JSONResponse
5
- from models.database import knowledge_base # pylint: disable=import-error
6
-
7
- router = APIRouter(prefix="/knowledge-base", tags=["knowledge-base"])
8
-
9
- @router.get("/{user_id}")
10
- async def get_user_document_stats(
11
- user_id: str = Path(..., description="User's email address")
12
- ) -> JSONResponse:
13
- """
14
- Get document statistics for a specific user.
15
-
16
- This endpoint counts the number of unique emails and files uploaded by the user.
17
- It groups documents by metadata.id to ensure unique document counting (not chunks).
18
-
19
- Args:
20
- user_id (str): The user's email address
21
-
22
- Returns:
23
- JSONResponse: A JSON response containing document counts:
24
- {
25
- "user_id": "[email protected]",
26
- "emails": 5,
27
- "files": 12,
28
- "total_documents": 17
29
- }
30
-
31
- Raises:
32
- HTTPException: 400 for invalid user_id, 500 for database errors
33
- """
34
- try:
35
- # Validate user_id format (basic email validation)
36
- if not user_id or "@" not in user_id or "." not in user_id:
37
- raise HTTPException(
38
- status_code=400,
39
- detail="Invalid user_id format. Must be a valid email address."
40
- )
41
-
42
- # Get document counts from database
43
- result = knowledge_base.get_doc_count(user_id)
44
-
45
- # Add user_id to response
46
- result["user_id"] = user_id
47
-
48
- return JSONResponse(content=result, status_code=200)
49
-
50
- except ValueError as e:
51
- logging.error("Validation error for user %s: %s", user_id, str(e))
52
- raise HTTPException(status_code=400, detail=str(e)) # pylint: disable=raise-missing-from
53
-
54
- except Exception as e: # pylint: disable=broad-exception-caught
55
- logging.error("Database error for user %s: %s", user_id, str(e))
56
- raise HTTPException( # pylint: disable=raise-missing-from
57
- status_code=500,
58
- detail="Internal server error while retrieving document statistics."
59
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/routes/summary.py CHANGED
@@ -1,12 +1,67 @@
1
  """This module defines the /summary route for the FastAPI application."""
2
 
3
  import importlib
4
- from fastapi import APIRouter
 
 
5
  from fastapi.responses import JSONResponse
6
  from controllers.summary import get_summary_data # pylint: disable=import-error
 
7
 
8
  router = APIRouter(prefix="/summary", tags=["summary"])
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  @router.get('')
11
  async def get_summary() -> JSONResponse:
12
  """
@@ -31,7 +86,6 @@ async def get_summary() -> JSONResponse:
31
  except Exception as e: #pylint: disable=broad-except
32
  return JSONResponse(content={"error": str(e)}, status_code=500)
33
 
34
-
35
  @router.get("/{module}/{chart_id}")
36
  def get_summary_chart(module: str, chart_id: str) -> JSONResponse:
37
  """
@@ -59,7 +113,6 @@ def get_summary_chart(module: str, chart_id: str) -> JSONResponse:
59
  except AttributeError as e:
60
  return JSONResponse(content={"error": str(e)}, status_code=500)
61
 
62
-
63
  @router.get("/{module}")
64
  async def get_summary_module(module: str) -> JSONResponse:
65
  """
@@ -79,3 +132,55 @@ async def get_summary_module(module: str) -> JSONResponse:
79
  return JSONResponse(content={"error": str(e)}, status_code=404)
80
  except Exception as e: #pylint: disable=broad-except
81
  return JSONResponse(content={"error": str(e)}, status_code=500)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """This module defines the /summary route for the FastAPI application."""
2
 
3
  import importlib
4
+ from venv import logger
5
+ import re
6
+ from fastapi import APIRouter, HTTPException, Path
7
  from fastapi.responses import JSONResponse
8
  from controllers.summary import get_summary_data # pylint: disable=import-error
9
+ from models.database import knowledge_base # pylint: disable=import-error
10
 
11
  router = APIRouter(prefix="/summary", tags=["summary"])
12
 
13
@router.get("/{email}")
async def get_user_document_stats(
    email: str = Path(..., description="User's email address")
) -> JSONResponse:
    """
    Get document statistics for a specific user.

    Validates that ``email`` looks like an email address, asks
    ``knowledge_base.get_doc_count`` for the user's document counts and
    returns them with the ``email`` added to the payload.

    NOTE(review): this route is registered on the same router as
    ``/{module}`` and, if it is declared first, it will also match every
    ``/summary/<module>`` request and answer 400 for non-email values.
    Confirm the intended path/ordering.

    Args:
        email (str): The user's email address

    Returns:
        JSONResponse: The dictionary returned by
            ``knowledge_base.get_doc_count`` plus an ``"email"`` key, e.g.
            (key names presumably match the knowledge-base model - verify):
            {
                "email": "user@example.com",
                "gmail": 5,
                "file": 12,
                "total": 17
            }

    Raises:
        HTTPException: 400 for invalid email, 500 for database errors
    """
    import logging  # pylint: disable=import-outside-toplevel

    # Use this module's own logger instead of one borrowed from an unrelated
    # standard-library module.
    log = logging.getLogger(__name__)

    # Validate before entering the try block: an HTTPException raised inside
    # it would be caught by the broad ``except Exception`` below and turned
    # into a misleading 500 response.
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch (unlike match with ``$``) also rejects a trailing newline.
    if not email or not re.fullmatch(email_pattern, email):
        raise HTTPException(
            status_code=400,
            detail="Invalid email format. Must be a valid email address."
        )

    try:
        # Get document counts from database
        result = knowledge_base.get_doc_count(email)

        # Add email to response
        result["email"] = email

        return JSONResponse(content=result, status_code=200)

    except ValueError as e:
        log.error("Validation error for user %s: %s", email, str(e))
        raise HTTPException(status_code=400, detail=str(e)) from e

    except Exception as e:  # pylint: disable=broad-exception-caught
        log.error("Database error for user %s: %s", email, str(e))
        raise HTTPException(
            status_code=500,
            detail="Internal server error while retrieving document statistics."
        ) from e
64
+
65
  @router.get('')
66
  async def get_summary() -> JSONResponse:
67
  """
 
86
  except Exception as e: #pylint: disable=broad-except
87
  return JSONResponse(content={"error": str(e)}, status_code=500)
88
 
 
89
  @router.get("/{module}/{chart_id}")
90
  def get_summary_chart(module: str, chart_id: str) -> JSONResponse:
91
  """
 
113
  except AttributeError as e:
114
  return JSONResponse(content={"error": str(e)}, status_code=500)
115
 
 
116
  @router.get("/{module}")
117
  async def get_summary_module(module: str) -> JSONResponse:
118
  """
 
132
  return JSONResponse(content={"error": str(e)}, status_code=404)
133
  except Exception as e: #pylint: disable=broad-except
134
  return JSONResponse(content={"error": str(e)}, status_code=500)
135
+
136
+ # @router.get("/{email}")
137
+ # async def get_user_document_stats(
138
+ # email: str = Path(..., description="User's email address")
139
+ # ) -> JSONResponse:
140
+ # """
141
+ # Get document statistics for a specific user.
142
+
143
+ # This endpoint counts the number of unique emails and files uploaded by the user.
144
+ # It groups documents by metadata.id to ensure unique document counting (not chunks).
145
+
146
+ # Args:
147
+ # email (str): The user's email address
148
+
149
+ # Returns:
150
+ # JSONResponse: A JSON response containing document counts:
151
+ # {
152
+ # "email": "[email protected]",
153
+ # "emails": 5,
154
+ # "files": 12,
155
+ # "total_documents": 17
156
+ # }
157
+
158
+ # Raises:
159
+ # HTTPException: 400 for invalid email, 500 for database errors
160
+ # """
161
+ # try:
162
+ # email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
163
+ # if not email or not re.match(email_pattern, email):
164
+ # raise HTTPException(
165
+ # status_code=400,
166
+ # detail="Invalid email format. Must be a valid email address."
167
+ # )
168
+
169
+ # # Get document counts from database
170
+ # result = knowledge_base.get_doc_count(email)
171
+
172
+ # # Add email to response
173
+ # result["email"] = email
174
+
175
+ # return JSONResponse(content=result, status_code=200)
176
+
177
+ # except ValueError as e:
178
+ # logger.error("Validation error for user %s: %s", email, str(e))
179
+ # raise HTTPException(status_code=400, detail=str(e)) # pylint: disable=raise-missing-from
180
+
181
+ # except Exception as e: # pylint: disable=broad-exception-caught
182
+ # logger.error("Database error for user %s: %s", email, str(e))
183
+ # raise HTTPException( # pylint: disable=raise-missing-from
184
+ # status_code=500,
185
+ # detail="Internal server error while retrieving document statistics."
186
+ # )