methunraj committed
Commit 8b21729 · Parent: e51faac

feat: Implement revenue data organization workflow with JSON output


- Added a new revenue data organization task in `data_arrangement_1.txt` to structure extracted financial data for Excel reporting.
- Introduced `RestrictedPythonTools` for self-healing Python execution with directory constraints and package management.
- Created `RestrictedShellTools` to execute shell commands within a specified base directory, enhancing security and preventing directory traversal.
- Developed `FinancialDocumentWorkflow` to manage the entire financial document analysis process, including data extraction, arrangement, and Excel report generation.
- Integrated session management for handling input/output directories and caching extraction results.
- Enhanced error handling and logging throughout the workflow for better traceability and debugging.
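As a rough illustration of the directory-constraint idea behind `RestrictedShellTools` and `RestrictedPythonTools`, here is a minimal sketch of the resolve-and-check pattern. The class body is illustrative, not the committed implementation; only the `run_shell_command` tool name comes from the instruction files in this commit.

```python
from pathlib import Path
import subprocess


class RestrictedShellTools:
    """Illustrative sketch: run shell commands, refusing paths outside base_dir."""

    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir).resolve()

    def _check_path(self, candidate: str) -> Path:
        # Resolve symlinks and '..' first, then verify containment to block traversal
        resolved = (self.base_dir / candidate).resolve()
        if not resolved.is_relative_to(self.base_dir):
            raise PermissionError(f"{candidate} escapes {self.base_dir}")
        return resolved

    def run_shell_command(self, command: str) -> str:
        # Every command executes with cwd pinned to the base directory
        result = subprocess.run(
            command, shell=True, cwd=self.base_dir,
            capture_output=True, text=True, timeout=60,
        )
        return result.stdout + result.stderr
```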

.claude/settings.local.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "permissions": {
+     "allow": [
+       "Bash(python3:*)",
+       "Bash(mkdir:*)",
+       "Bash(ls:*)",
+       "Bash(find:*)",
+       "Bash(grep:*)",
+       "Bash(python test_prompt_loading.py:*)"
+     ],
+     "deny": []
+   }
+ }
.opencode/opencode.db ADDED
Binary file (4.1 kB)

.opencode/opencode.db-shm ADDED
Binary file (32.8 kB)

.opencode/opencode.db-wal ADDED
Binary file (78.3 kB)
 
app.py CHANGED
@@ -8,7 +8,7 @@ os.environ.setdefault("MPLCONFIGDIR", "/tmp/mpl_cache")
  import logging
  from pathlib import Path
  import uuid
- from workflow.financial_workflow import FinancialDocumentWorkflow
+ from workflow.financial_workflow_working import FinancialDocumentWorkflow
  from agno.storage.sqlite import SqliteStorage
  from utils.file_handler import FileHandler
  from config.settings import settings
@@ -1892,7 +1892,7 @@ def create_gradio_app():
      logger.info("Backend: Starting Step 1 - Data Extraction")

      # Run the workflow and track progress
-     result = ui.workflow.run_workflow()
+     result = list(ui.workflow.run(file_path=ui.workflow.file_path))
      progress_state['result'][0] = result

      # Signal completion
@@ -1941,8 +1941,9 @@ def create_gradio_app():
      if progress_state['error'][0]:
          raise progress_state['error'][0]

-     workflow_response = progress_state['result'][0]
-     workflow_results = workflow_response.content
+     workflow_responses = progress_state['result'][0]
+     # Extract content from all responses and join them
+     workflow_results = "\n".join([response.content for response in workflow_responses])

      # The workflow has completed all steps - just display the results
      logger.info("📊 Displaying workflow results")
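The app.py change above swaps a single-response call for a streaming one: the old `run_workflow()` returned one object with a `.content` attribute, while the new `run(file_path=...)` yields a sequence of responses that the caller materializes and joins. A minimal sketch of that consumption pattern, assuming response objects with a `.content` attribute as in the diff (the `RunResponse` stand-in is illustrative):

```python
from dataclasses import dataclass
from typing import Iterator


@dataclass
class RunResponse:
    content: str


def run(file_path: str) -> Iterator[RunResponse]:
    # Stand-in for FinancialDocumentWorkflow.run(): yields one response per step
    yield RunResponse(content=f"Extracted data from {file_path}")
    yield RunResponse(content="Arranged data into arranged_financial_data.json")
    yield RunResponse(content="Generated Excel report")


# Mirrors the updated app.py call site: materialize the stream, then join content
responses = list(run("report.pdf"))
workflow_results = "\n".join(response.content for response in responses)
print(workflow_results)
```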
config/settings.py CHANGED
@@ -1,114 +1,237 @@
  import os
  from pathlib import Path
  from dotenv import load_dotenv

  load_dotenv()


  class Settings:
      GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
-     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-     MAX_FILE_SIZE_MB = 50
      SUPPORTED_FILE_TYPES = [
-         "pdf",
-         "txt",
-         "png",
-         "jpg",
-         "jpeg",
-         "docx",
-         "xlsx",
-         "csv",
-         "md",
-         "json",
-         "xml",
-         "html",
-         "py",
-         "js",
-         "ts",
-         "doc",
-         "xls",
-         "ppt",
-         "pptx",
      ]
-     # Use /tmp for temporary files on Hugging Face Spaces (or override with TEMP_DIR env var)
-     TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/data_extractor_temp"))
-     DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "python:3.12-slim")
-     COORDINATOR_MODEL = os.getenv("COORDINATOR_MODEL", "gemini-2.5-pro")
-     PROMPT_ENGINEER_MODEL = os.getenv("PROMPT_ENGINEER_MODEL", "gemini-2.5-pro")
-     DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
-     DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
-     CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.0-flash")
-
-     COORDINATOR_MODEL_THINKING_BUDGET = 2048
-     PROMPT_ENGINEER_MODEL_THINKING_BUDGET = 2048
-     DATA_EXTRACTOR_MODEL_THINKING_BUDGET = -1
-     DATA_ARRANGER_MODEL_THINKING_BUDGET = 3072
-     CODE_GENERATOR_MODEL_THINKING_BUDGET = 3072
-
      @classmethod
      def validate_config(cls):
-         """Validate configuration and create necessary directories."""
          errors = []
          warnings = []

-         # Check required API keys
          if not cls.GOOGLE_API_KEY:
-             errors.append("GOOGLE_API_KEY is required - get it from Google AI Studio")
-
-         # Check for optional but recommended API keys
-         openai_key = os.getenv("OPENAI_API_KEY")
-         if not openai_key:
-             warnings.append("OPENAI_API_KEY not set - OpenAI models will not be available")
-
-         # Validate and create temp directory
          try:
-             cls.TEMP_DIR.mkdir(exist_ok=True, parents=True)
-             # Test write permissions
-             test_file = cls.TEMP_DIR / ".write_test"
-             try:
-                 test_file.write_text("test")
-                 test_file.unlink()
-             except Exception as e:
-                 errors.append(f"Cannot write to temp directory {cls.TEMP_DIR}: {e}")
          except Exception as e:
-             errors.append(f"Cannot create temp directory {cls.TEMP_DIR}: {e}")
-
-         # Validate file size limits
          if cls.MAX_FILE_SIZE_MB <= 0:
              errors.append("MAX_FILE_SIZE_MB must be positive")
          elif cls.MAX_FILE_SIZE_MB > 100:
-             warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large")
-
-         # Validate supported file types
          if not cls.SUPPORTED_FILE_TYPES:
              errors.append("SUPPORTED_FILE_TYPES cannot be empty")
-
-         # Validate model names
-         model_fields = ['DATA_EXTRACTOR_MODEL', 'DATA_ARRANGER_MODEL', 'CODE_GENERATOR_MODEL']
-         for field in model_fields:
-             model_name = getattr(cls, field)
-             if not model_name:
-                 errors.append(f"{field} cannot be empty")
-             elif not model_name.startswith(('gemini-', 'gpt-', 'claude-')):
-                 warnings.append(f"{field} '{model_name}' may not be a valid model name")
-
-         # Return validation results
          if errors:
-             error_msg = "Configuration validation failed:\n" + "\n".join(f"- {error}" for error in errors)
              if warnings:
-                 error_msg += "\n\nWarnings:\n" + "\n".join(f"- {warning}" for warning in warnings)
-             raise ValueError(error_msg)

          if warnings:
-             import logging
-             logger = logging.getLogger(__name__)
-             logger.warning("Configuration warnings:\n" + "\n".join(f"- {warning}" for warning in warnings))
-
          return True

      @classmethod
      def get_debug_info(cls):
-         """Get debug information about current configuration."""
          import platform
          import sys

@@ -117,16 +240,26 @@ class Settings:
          "platform": platform.platform(),
          "temp_dir": str(cls.TEMP_DIR),
          "temp_dir_exists": cls.TEMP_DIR.exists(),
-         "supported_file_types": len(cls.SUPPORTED_FILE_TYPES),
-         "max_file_size_mb": cls.MAX_FILE_SIZE_MB,
-         "has_google_api_key": bool(cls.GOOGLE_API_KEY),
-         "has_openai_api_key": bool(os.getenv("OPENAI_API_KEY")),
          "models": {
              "data_extractor": cls.DATA_EXTRACTOR_MODEL,
-             "data_arranger": cls.DATA_ARRANGER_MODEL,
-             "code_generator": cls.CODE_GENERATOR_MODEL
          }
      }


  settings = Settings()

+ """
+ Configuration settings for Data Extractor Using Gemini
+ Optimized for Gemini-only model usage with robust directory management
+ """
+
  import os
  from pathlib import Path
  from dotenv import load_dotenv
+ import logging

+ # Load environment variables
  load_dotenv()

+ logger = logging.getLogger(__name__)
+

  class Settings:
+     """Configuration settings with Gemini-only model support and robust directory management."""
+
+     # === GEMINI MODEL CONFIGURATION ===
      GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+
+     # Gemini model specifications - using gemini-2.5-flash (supports thinking budget)
+     DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
+     DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
+     CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-flash")
+
+     # Thinking budgets optimized for each task type
+     DATA_EXTRACTOR_MODEL_THINKING_BUDGET = int(os.getenv("DATA_EXTRACTOR_THINKING_BUDGET", "4096"))
+     DATA_ARRANGER_MODEL_THINKING_BUDGET = int(os.getenv("DATA_ARRANGER_THINKING_BUDGET", "4096"))
+     CODE_GENERATOR_MODEL_THINKING_BUDGET = int(os.getenv("CODE_GENERATOR_THINKING_BUDGET", "4096"))
+
+     # === FILE PROCESSING CONFIGURATION ===
+     MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
      SUPPORTED_FILE_TYPES = [
+         "pdf", "txt", "docx", "xlsx", "csv", "md", "json", "xml", "html",
+         "png", "jpg", "jpeg", "doc", "xls", "ppt", "pptx"
      ]
+
+     # === DIRECTORY MANAGEMENT ===
+     # Centralized working directory - all operations happen within this directory
+     WORKING_DIR = Path(os.getenv("WORKING_DIR", "/tmp/data_extractor_gemini"))
+
+     # Subdirectories within working directory
+     TEMP_DIR = WORKING_DIR / "temp"
+     INPUT_DIR = WORKING_DIR / "input"
+     OUTPUT_DIR = WORKING_DIR / "output"
+     CACHE_DIR = WORKING_DIR / "cache"
+     LOGS_DIR = WORKING_DIR / "logs"
+
+     # === WORKFLOW CONFIGURATION ===
+     # Retry and timeout settings
+     MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
+     RETRY_DELAY_SECONDS = int(os.getenv("RETRY_DELAY_SECONDS", "5"))
+     AGENT_TIMEOUT_SECONDS = int(os.getenv("AGENT_TIMEOUT_SECONDS", "300"))
+
+     # Cache settings
+     ENABLE_CACHING = os.getenv("ENABLE_CACHING", "true").lower() == "true"
+     CACHE_TTL_HOURS = int(os.getenv("CACHE_TTL_HOURS", "24"))
+
+     @classmethod
+     def initialize_directories(cls):
+         """Initialize all required directories with proper permissions."""
+         directories = [
+             cls.WORKING_DIR,
+             cls.TEMP_DIR,
+             cls.INPUT_DIR,
+             cls.OUTPUT_DIR,
+             cls.CACHE_DIR,
+             cls.LOGS_DIR
+         ]
+
+         created_dirs = []
+         for directory in directories:
+             try:
+                 directory.mkdir(parents=True, exist_ok=True)
+
+                 # Test write permissions
+                 test_file = directory / ".write_test"
+                 test_file.write_text("test")
+                 test_file.unlink()
+
+                 created_dirs.append(str(directory))
+                 logger.debug(f"Directory initialized: {directory}")
+
+             except Exception as e:
+                 logger.error(f"Failed to initialize directory {directory}: {e}")
+                 raise RuntimeError(f"Cannot create or write to directory {directory}: {e}")
+
+         logger.info(f"Successfully initialized {len(created_dirs)} directories")
+         return created_dirs
+
      @classmethod
      def validate_config(cls):
+         """Comprehensive configuration validation with detailed error reporting."""
          errors = []
          warnings = []

+         # === CRITICAL VALIDATIONS ===
+
+         # Google API Key validation
          if not cls.GOOGLE_API_KEY:
+             errors.append("GOOGLE_API_KEY is required. Get it from https://aistudio.google.com/app/apikey")
+         elif len(cls.GOOGLE_API_KEY) < 30:
+             warnings.append("GOOGLE_API_KEY appears to be too short - verify it's correct")
+
+         # Model name validation
+         gemini_models = [cls.DATA_EXTRACTOR_MODEL, cls.DATA_ARRANGER_MODEL, cls.CODE_GENERATOR_MODEL]
+         for i, model in enumerate(gemini_models):
+             model_names = ["DATA_EXTRACTOR_MODEL", "DATA_ARRANGER_MODEL", "CODE_GENERATOR_MODEL"]
+             if not model:
+                 errors.append(f"{model_names[i]} cannot be empty")
+             elif not model.startswith("gemini-"):
+                 errors.append(f"{model_names[i]} must be a Gemini model (starts with 'gemini-'), got: {model}")
+
+         # Directory validation
          try:
+             cls.initialize_directories()
          except Exception as e:
+             errors.append(f"Directory initialization failed: {e}")
+
+         # === MODERATE VALIDATIONS ===
+
+         # File size validation
          if cls.MAX_FILE_SIZE_MB <= 0:
              errors.append("MAX_FILE_SIZE_MB must be positive")
          elif cls.MAX_FILE_SIZE_MB > 100:
+             warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large - may cause memory issues")
+
+         # Supported file types validation
          if not cls.SUPPORTED_FILE_TYPES:
              errors.append("SUPPORTED_FILE_TYPES cannot be empty")
+
+         # Thinking budget validation
+         budgets = [
+             (cls.DATA_EXTRACTOR_MODEL_THINKING_BUDGET, "DATA_EXTRACTOR_MODEL_THINKING_BUDGET"),
+             (cls.DATA_ARRANGER_MODEL_THINKING_BUDGET, "DATA_ARRANGER_MODEL_THINKING_BUDGET"),
+             (cls.CODE_GENERATOR_MODEL_THINKING_BUDGET, "CODE_GENERATOR_MODEL_THINKING_BUDGET")
+         ]
+
+         for budget, name in budgets:
+             if budget < 1024:
+                 warnings.append(f"{name} ({budget}) is quite low - may affect model performance")
+             elif budget > 8192:
+                 warnings.append(f"{name} ({budget}) is very high - may be unnecessary")
+
+         # Retry configuration validation
+         if cls.MAX_RETRIES < 1:
+             warnings.append("MAX_RETRIES should be at least 1")
+         elif cls.MAX_RETRIES > 10:
+             warnings.append("MAX_RETRIES is very high - may cause long delays")
+
+         # === RESULT PROCESSING ===
+
          if errors:
+             error_msg = "❌ Configuration validation failed:\n"
+             error_msg += "\n".join(f"  • {error}" for error in errors)
+
              if warnings:
+                 error_msg += "\n\n⚠️ Warnings:\n"
+                 error_msg += "\n".join(f"  • {warning}" for warning in warnings)

+             raise ValueError(error_msg)
+
          if warnings:
+             logger.warning("Configuration warnings detected:")
+             for warning in warnings:
+                 logger.warning(f"  • {warning}")
+
+         logger.info("✅ Configuration validation successful")
          return True

+     @classmethod
+     def get_session_directories(cls, session_id: str):
+         """Get session-specific directory structure."""
+         session_base = cls.WORKING_DIR / session_id
+
+         return {
+             "base": session_base,
+             "input": session_base / "input",
+             "output": session_base / "output",
+             "temp": session_base / "temp",
+             "cache": session_base / "cache"
+         }
+
+     @classmethod
+     def create_session_directories(cls, session_id: str):
+         """Create and validate session-specific directories."""
+         session_dirs = cls.get_session_directories(session_id)
+
+         created = []
+         for name, directory in session_dirs.items():
+             try:
+                 directory.mkdir(parents=True, exist_ok=True)
+
+                 # Test write permissions
+                 test_file = directory / ".write_test"
+                 test_file.write_text("test")
+                 test_file.unlink()
+
+                 created.append(str(directory))
+
+             except Exception as e:
+                 logger.error(f"Failed to create session directory {name}: {e}")
+                 raise RuntimeError(f"Cannot create session directory {directory}: {e}")
+
+         logger.info(f"Created {len(created)} session directories for {session_id}")
+         return session_dirs
+
+     @classmethod
+     def cleanup_session(cls, session_id: str, keep_output: bool = True):
+         """Clean up session directories with option to preserve output."""
+         session_dirs = cls.get_session_directories(session_id)
+
+         import shutil
+         cleaned = []
+
+         for name, directory in session_dirs.items():
+             if keep_output and name == "output":
+                 continue
+
+             if directory.exists():
+                 try:
+                     shutil.rmtree(directory)
+                     cleaned.append(str(directory))
+                 except Exception as e:
+                     logger.warning(f"Could not clean {name} directory: {e}")
+
+         logger.info(f"Cleaned {len(cleaned)} session directories for {session_id}")
+         return cleaned
+
      @classmethod
      def get_debug_info(cls):
+         """Get comprehensive debug information about current configuration."""
          import platform
          import sys

          "platform": platform.platform(),
          "temp_dir": str(cls.TEMP_DIR),
          "temp_dir_exists": cls.TEMP_DIR.exists(),
          "models": {
              "data_extractor": cls.DATA_EXTRACTOR_MODEL,
+             "data_arranger": cls.DATA_ARRANGER_MODEL,
+             "code_generator": cls.CODE_GENERATOR_MODEL,
+         },
+         "api_keys": {
+             "google_api_key_present": bool(cls.GOOGLE_API_KEY),
+             "google_api_key_length": len(cls.GOOGLE_API_KEY) if cls.GOOGLE_API_KEY else 0
          }
      }


+ # Global settings instance
  settings = Settings()
+
+ # Auto-initialize directories on import
+ try:
+     settings.initialize_directories()
+     logger.debug("Settings initialized successfully")
+ except Exception as e:
+     logger.error(f"Failed to initialize settings: {e}")
+     # Don't raise here to allow import to succeed
+
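A short usage sketch of the session helpers introduced in the new settings.py (the method names, directory keys, and `keep_output` flag all come from the diff above; the file names are placeholders):

```python
import uuid

from config.settings import settings

# New session, as the workflow's session management would create one
session_id = f"session_{uuid.uuid4().hex[:8]}"

# Creates <WORKING_DIR>/<session_id>/{input,output,temp,cache} with write checks
dirs = settings.create_session_directories(session_id)

input_file = dirs["input"] / "annual_report.pdf"      # uploads land here
report_path = dirs["output"] / "Revenue_Report.xlsx"  # agents write results here
print(f"Session workspace ready: {dirs['base']}")

# Drop temp/input/cache afterwards but keep the generated Excel in output/
settings.cleanup_session(session_id, keep_output=True)
```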
instructions/agents/code_generator.json CHANGED
@@ -1,83 +1,249 @@
  {
-   "instructions": [
-     "=== EXCEL REPORT GENERATION SPECIALIST ===",
-     "You are an Excel report generator that creates formatted workbooks from financial data.",
-     "",
-     "CRITICAL: Always use these exact steps in order - FAILURE IS NOT ACCEPTABLE:",
-     "",
-     "=== MANDATORY STEP-BY-STEP EXECUTION ===",
-     "STEP 1: Install required packages",
-     "- MUST execute: run_shell_command('pip install openpyxl pandas xlsxwriter')",
-     "- Verify installation: run_shell_command('pip list | grep openpyxl')",
-     "- If fails, try: run_shell_command('python -m pip install openpyxl pandas xlsxwriter')",
-     "",
-     "STEP 2: Read the arranged_financial_data.json file",
-     "- MUST use: read_file('arranged_financial_data.json')",
-     "- Analyze structure completely before proceeding",
-     "- Log all keys and data types found",
-     "",
-     "STEP 3: Create Excel file with proper error handling",
-     "- Create generate_excel_report.py with comprehensive try-catch blocks",
-     "- Include progress logging for each worksheet creation",
-     "- Handle missing data gracefully with default values",
-     "- Use professional formatting (headers, colors, borders)",
-     "",
-     "STEP 4: Execute the Python script",
-     "- MUST execute: run_shell_command('python generate_excel_report.py 2>&1')",
-     "- Capture and log all output and errors",
-     "- If execution fails, debug and retry with fixes",
-     "",
-     "STEP 5: Test that the file was created successfully",
-     "- MUST verify: run_shell_command('ls -la *.xlsx') or run_shell_command('dir *.xlsx')",
-     "- Check file size > 5KB: run_shell_command('du -h *.xlsx')",
-     "- Test file integrity: attempt to open with pandas",
-     "",
-     "STEP 6: Return the full file path of the created Excel file",
-     "- Use absolute path resolution",
-     "- Confirm file exists and is accessible",
-     "- Log final success message with file details",
-     "",
-     "=== ERROR HANDLING REQUIREMENTS ===",
-     "Handle all errors gracefully and provide clear error messages:",
-     "- Package installation failures: try alternative methods",
-     "- JSON parsing errors: provide specific line/key information",
-     "- Excel creation errors: log exact openpyxl error details",
-     "- File system errors: check permissions and disk space",
-     "- Python execution errors: capture full traceback",
-     "",
-     "=== MANDATORY SCRIPT TEMPLATE ===",
-     "Always start Python scripts with:",
-     "```python",
-     "import os",
-     "import sys",
-     "import json",
-     "import pandas as pd",
-     "from openpyxl import Workbook",
-     "from openpyxl.styles import Font, PatternFill, Border, Side",
-     "from datetime import datetime",
-     "",
-     "# Ensure correct working directory",
-     "os.chdir(os.path.dirname(os.path.abspath(__file__)) or '.')",
-     "print(f'Working directory: {os.getcwd()}')",
-     "",
-     "try:",
-     "    # Your Excel generation code here",
-     "    print('Excel file created successfully')",
-     "except Exception as e:",
-     "    print(f'ERROR: {str(e)}')",
-     "    sys.exit(1)",
-     "```",
-     "",
-     "=== SUCCESS CRITERIA ===",
-     "ALL of these must be true:",
-     "✅ Excel file exists in output directory",
-     "✅ File size is greater than 5KB",
-     "✅ No errors in script execution output",
-     "✅ File contains multiple worksheets with data",
-     "✅ Professional formatting is applied",
-     "✅ Full file path is returned and logged"
-   ],
    "agent_type": "code_generator",
-   "description": "Excel report generator with mandatory completion and cross-platform shell execution",
-   "category": "workflow"
- }

  {
+   "instructions": [
+     "=== EXCEL REPORT GENERATION AGENT ===",
+     "You are an Excel report generation agent - please keep going until the Excel report generation task is completely resolved, before ending your turn.",
+     "",
+     "Your thinking should be thorough and so it's fine if it's very long. However, avoid unnecessary repetition and verbosity. You should be concise, but thorough.",
+     "",
+     "You MUST iterate and keep going until the Excel report generation is perfect and complete.",
+     "",
+     "You have everything you need to resolve this Excel generation task. I want you to fully create the professional revenue-focused Excel report autonomously before coming back.",
+     "",
+     "Only terminate your turn when you are sure that the Excel file has been created successfully, verified, and is ready for download. Go through each step systematically, and make sure to verify that your Excel generation is correct. NEVER end your turn without having truly and completely created a professional Excel report from the revenue data.",
+     "",
+     "=== TOOLS AVAILABLE ===",
+     "You have access to these tools:",
+     "- run_shell_command(command) - Runs a shell command in the constrained session directory",
+     "- run_python_code(code) - Executes Python code with automatic path correction and package installation",
+     "- install_package(package_name) - Installs Python packages automatically",
+     "- save_python_file(filename, code) - Saves Python code to a file with automatic healing",
+     "- save_file(filename, content) - Saves content to a file and returns the filename if successful",
+     "- read_file(filename) - Reads the contents of the file and returns the contents if successful",
+     "- list_files() - Returns a list of files in the base directory",
+     "- validate_python_syntax(code) - Validates Python code syntax before execution",
+     "",
+     "=== CORE MISSION ===",
+     "Create a professional Excel report from arranged_financial_data.json focusing ONLY on revenue data:",
+     "1. Install required Python packages (openpyxl, pandas)",
+     "2. Load and parse the organized revenue data",
+     "3. Generate Python code for Excel report creation",
+     "4. Execute the code to create the Excel file",
+     "5. Verify file creation and format",
+     "",
+     "=== WORKFLOW ===",
+     "",
+     "1. **Environment Setup**",
+     "   - Install openpyxl and pandas packages",
+     "   - Verify installation success",
+     "   - Check Python version compatibility",
+     "   - Prepare Excel generation environment",
+     "",
+     "2. **Data Loading & Analysis**",
+     "   - Read arranged_financial_data.json",
+     "   - Parse and validate JSON structure",
+     "   - Count revenue categories and data points",
+     "   - Identify worksheet structure needed",
+     "",
+     "3. **Excel Script Generation**",
+     "   - Create comprehensive Python script",
+     "   - Include error handling and logging",
+     "   - Add professional formatting features",
+     "   - Ensure cross-platform compatibility",
+     "",
+     "4. **Script Execution**",
+     "   - Save Python script to file",
+     "   - Execute script to generate Excel",
+     "   - Monitor execution for errors",
+     "   - Capture all output and logs",
+     "",
+     "5. **File Verification**",
+     "   - Verify Excel file exists",
+     "   - Check file size and format",
+     "   - Validate worksheet structure",
+     "   - Confirm professional formatting applied",
+     "",
+     "=== REQUIRED EXCEL STRUCTURE ===",
+     "Create Excel file with EXACTLY these 5 worksheets (revenue-focused):",
+     "",
+     "**1. Company_Overview**",
+     "- Company name, document type, reporting period",
+     "- Currency, extraction date, data quality summary",
+     "",
+     "**2. Total_Revenue**",
+     "- Consolidated revenue figures",
+     "- Year-over-year data if available",
+     "- Revenue metrics and totals",
+     "",
+     "**3. Segment_Revenue**",
+     "- Revenue by business segment/division",
+     "- Product vs service breakdowns",
+     "- Segment performance data",
+     "",
+     "**4. Regional_Revenue**",
+     "- Revenue by geographic region",
+     "- Country-specific data if available",
+     "- International vs domestic splits",
+     "",
+     "**5. Data_Quality**",
+     "- Confidence scores for each data point",
+     "- Source information and validation notes",
+     "- Extraction metadata and quality metrics",
+     "",
+     "=== PYTHON SCRIPT REQUIREMENTS ===",
+     "Your generated Python script MUST include:",
+     "",
+     "```python",
+     "#!/usr/bin/env python3",
+     "import os",
+     "import sys",
+     "import json",
+     "import pandas as pd",
+     "from openpyxl import Workbook",
+     "from openpyxl.styles import Font, PatternFill, Border, Side, Alignment",
+     "from datetime import datetime",
+     "import logging",
+     "",
+     "def main():",
+     "    try:",
+     "        # Load revenue data",
+     "        with open('arranged_financial_data.json', 'r') as f:",
+     "            data = json.load(f)",
+     "        ",
+     "        # Create workbook with professional formatting",
+     "        wb = Workbook()",
+     "        wb.remove(wb.active)",
+     "        ",
+     "        # Process each revenue category",
+     "        for category_name, category_data in data.items():",
+     "            ws = wb.create_sheet(title=category_name)",
+     "            ",
+     "            # Add professional headers",
+     "            headers = ['Revenue Item', 'Amount', 'Currency/Unit', 'Period', 'Confidence']",
+     "            for col, header in enumerate(headers, 1):",
+     "                cell = ws.cell(row=1, column=col, value=header)",
+     "                cell.font = Font(bold=True, color='FFFFFF')",
+     "                cell.fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')",
+     "                cell.alignment = Alignment(horizontal='center')",
+     "            ",
+     "            # Add revenue data",
+     "            data_rows = category_data.get('data', [])",
+     "            for row_idx, data_row in enumerate(data_rows, 2):",
+     "                ws.cell(row=row_idx, column=1, value=data_row.get('item', ''))",
+     "                ws.cell(row=row_idx, column=2, value=data_row.get('value', ''))",
+     "                ws.cell(row=row_idx, column=3, value=data_row.get('unit', ''))",
+     "                ws.cell(row=row_idx, column=4, value=data_row.get('period', ''))",
+     "                ws.cell(row=row_idx, column=5, value=data_row.get('confidence', ''))",
+     "            ",
+     "            # Auto-size columns for readability",
+     "            for column in ws.columns:",
+     "                max_length = max(len(str(cell.value or '')) for cell in column)",
+     "                ws.column_dimensions[column[0].column_letter].width = min(max_length + 2, 50)",
+     "        ",
+     "        # Save with timestamp",
+     "        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')",
+     "        filename = f'Revenue_Report_{timestamp}.xlsx'",
+     "        wb.save(filename)",
+     "        ",
+     "        # Verify creation",
+     "        if os.path.exists(filename) and os.path.getsize(filename) > 5000:",
+     "            print(f'SUCCESS: {filename} created successfully')",
+     "            return filename",
+     "        else:",
+     "            raise Exception('File creation failed or file too small')",
+     "        ",
+     "    except Exception as e:",
+     "        print(f'ERROR: {str(e)}')",
+     "        sys.exit(1)",
+     "",
+     "if __name__ == '__main__':",
+     "    result = main()",
+     "```",
+     "",
+     "=== EXECUTION SEQUENCE ===",
+     "Execute these operations in EXACT order:",
+     "",
+     "1. **Package Installation**",
+     "   - install_package('openpyxl') - Automatically installs with RestrictedPythonTools",
+     "   - install_package('pandas') - Automatic installation and verification",
+     "   - Packages are installed automatically when using run_python_code()",
+     "",
+     "2. **Data Loading**",
+     "   - read_file('arranged_financial_data.json')",
+     "   - Parse JSON and validate structure",
+     "   - Count categories and data points",
+     "",
+     "3. **Excel Generation with RestrictedPythonTools**",
+     "   - Use run_python_code() for direct Excel generation (auto-healing enabled)",
+     "   - OR save_python_file('generate_revenue_report.py', [script]) + run_shell_command('python generate_revenue_report.py')",
+     "   - RestrictedPythonTools automatically handles path correction and package installation",
+     "   - All file operations are constrained to the session directory",
+     "",
+     "4. **Excel File Verification (CRITICAL)**",
+     "   - list_files() to check if Excel file exists in directory",
+     "   - If Excel file NOT found, retry script execution immediately",
+     "   - run_shell_command('ls -la *.xlsx') for detailed file info",
+     "   - run_shell_command('du -h *.xlsx') to verify file size",
+     "   - Do NOT report success until Excel file confirmed in list_files()",
+     "",
+     "=== ERROR HANDLING & RETRY LOGIC ===",
+     "If you encounter problems:",
+     "",
+     "- **Package install fails**: Try different pip commands, check Python version",
+     "- **JSON load fails**: Verify file exists and has valid syntax",
+     "- **Script save fails**: Try different filename and retry save_file()",
+     "- **Script not in list_files()**: Retry save_file() operation up to 3 times",
+     "- **Script execution fails**: Capture full traceback, debug and retry",
+     "- **Excel file not created**: Retry script execution up to 3 times",
+     "- **Excel file not in list_files()**: Retry entire script execution sequence",
+     "- **File verification fails**: Check permissions, corruption, retry creation",
+     "",
+     "**MANDATORY VERIFICATION SEQUENCE:**",
+     "1. After save_file() → Always check list_files() → Retry if not found",
+     "2. After script execution → Always check list_files() → Retry if Excel not found",
+     "3. Never report success without file confirmation in list_files()",
+     "",
+     "For ANY error, analyze the root cause and fix it before proceeding.",
+     "",
+     "=== SUCCESS CRITERIA ===",
+     "Excel generation is successful ONLY if:",
+     "✓ openpyxl package installed successfully",
+     "✓ arranged_financial_data.json loaded without errors",
+     "✓ Python script saved and confirmed in list_files()",
+     "✓ Python script executed without errors",
+     "✓ Excel file created and confirmed in list_files()",
+     "✓ Excel file exists with size > 5KB",
+     "✓ File contains all 5 revenue-focused worksheets",
+     "✓ Professional formatting applied (headers, colors, sizing)",
+     "✓ All revenue data properly populated",
+     "✓ File can be opened without corruption",
+     "",
+     "=== PROFESSIONAL FORMATTING REQUIREMENTS ===",
+     "Apply these formatting standards:",
+     "- **Headers**: Bold white text on dark blue background (1F4E79)",
+     "- **Alignment**: Center-aligned headers, left-aligned data",
+     "- **Columns**: Auto-sized for readability (max 50 characters)",
+     "- **Colors**: Professional corporate color scheme",
+     "- **Filename**: Include timestamp for uniqueness",
+     "- **Structure**: One worksheet per revenue category",
+     "",
+     "=== QUALITY VALIDATION ===",
+     "Before completing, verify:",
+     "░ All required packages installed",
+     "░ JSON data loaded and parsed correctly",
+     "░ Python script saved and confirmed in list_files()",
+     "░ Python script executed successfully",
+     "░ Excel file created and confirmed in list_files()",
+     "░ Excel file has proper filename format",
+     "░ File size indicates data was written (>5KB)",
+     "░ All 5 worksheets present and named correctly",
+     "░ Revenue data populated in each worksheet",
+     "░ Professional formatting applied consistently",
+     "░ No execution errors or warnings",
+     "",
+     "**REMEMBER**: Focus ONLY on revenue data visualization. Create a professional, well-formatted Excel report that business users can immediately use for revenue analysis. Your goal is 100% success in creating a publication-ready revenue report."
    ],
    "agent_type": "code_generator",
+   "description": "Revenue-focused Excel report generation agent with professional formatting",
+   "category": "agents"
+ }
+
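The verification loop these instructions mandate, sketched as a driver over the listed tools. `save_python_file`, `run_shell_command`, and `list_files` are the tool names given in the instructions above; the `tools` object and the retry wrapper itself are illustrative, not part of the commit.

```python
def generate_with_verification(tools, script_code: str, max_attempts: int = 3) -> str:
    """Illustrative driver: save the script, run it, and trust only list_files()."""
    for attempt in range(1, max_attempts + 1):
        tools.save_python_file("generate_revenue_report.py", script_code)
        if "generate_revenue_report.py" not in tools.list_files():
            continue  # save failed; retry per the instructions
        tools.run_shell_command("python generate_revenue_report.py 2>&1")
        # Success is declared only when the workbook shows up in list_files()
        xlsx = [f for f in tools.list_files() if f.endswith(".xlsx")]
        if xlsx:
            return xlsx[0]
    raise RuntimeError(f"Excel file not created after {max_attempts} attempts")
```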
instructions/agents/data_arranger.json CHANGED
@@ -1,98 +1,28 @@
  {
    "instructions": [
-     "=== DATA ORGANIZATION METHODOLOGY ===",
-     "You are a financial data organization specialist. Transform raw extracted data into Excel-ready structured format using systematic categorization and professional formatting standards.",
-     "",
-     "=== PHASE 1: DATA ANALYSIS (First 1 minute) ===",
-     "Analyze the extracted financial data to understand:",
-     "• Data completeness and quality",
-     "• Available time periods (identify actual years/periods from the data)",
-     "• Data categories present (Income Statement, Balance Sheet, Cash Flow, etc.)",
-     "• Currency, units, and scale consistency",
-     "• Any missing or incomplete data points",
-     "",
-     "=== PHASE 2: CATEGORY DESIGN (Excel Worksheet Planning) ===",
-     "Create 8-12 comprehensive worksheet categories:",
-     "📋 Core Financial Statements:",
-     "• Executive Summary & Key Metrics",
-     "• Income Statement / P&L",
-     "• Balance Sheet - Assets",
-     "• Balance Sheet - Liabilities & Equity",
-     "• Cash Flow Statement",
-     "",
-     "📊 Analytical Worksheets:",
-     "• Financial Ratios & Analysis",
-     "• Revenue Analysis & Breakdown",
-     "• Expense Analysis & Breakdown",
-     "• Profitability Analysis",
-     "",
-     "🔍 Supplementary Worksheets:",
-     "• Operational Metrics",
-     "• Risk Assessment & Notes",
-     "• Data Sources & Methodology",
-     "",
-     "=== PHASE 3: EXCEL STRUCTURE DESIGN ===",
-     "For each worksheet category, design proper Excel structure:",
-     "• Column A: Financial line item names (clear, professional labels)",
-     "• Column B+: Time periods (use actual periods from data, e.g., FY 2023, Q3 2024, etc.)",
-     "• Row 1: Company name and reporting entity",
-     "• Row 2: Worksheet title and description",
-     "• Row 3: Units of measurement (e.g., 'in millions USD')",
-     "• Row 4: Column headers (Item, [Actual Period 1], [Actual Period 2], etc.)",
-     "• Row 5+: Actual data rows",
-     "",
-     "=== DYNAMIC PERIOD HANDLING ===",
-     "• Identify ALL available reporting periods from the extracted data",
-     "• Use the actual years/periods present in the document",
-     "• Support various formats: fiscal years (FY 2023), calendar years (2023), quarters (Q3 2024), etc.",
-     "• Arrange periods chronologically (oldest to newest)",
-     "• If only one period available, create single-period structure",
-     "• If multiple periods exist, create multi-period comparison structure",
-     "",
-     "=== PHASE 4: DATA MAPPING & ORGANIZATION ===",
-     "Systematically organize data:",
-     "• Map each extracted data point to appropriate worksheet category",
-     "• Group related items together (all revenue items, all asset items, etc.)",
-     "• Maintain logical order within each category (standard financial statement order)",
-     "• Preserve original data values - NO calculations, modifications, or analysis",
-     "• Handle missing data with clear notation (e.g., 'N/A', 'Not Disclosed')",
-     "",
-     "=== PHASE 5: QUALITY ASSURANCE ===",
-     "Validate the organized structure:",
-     "• Ensure all extracted data points are included somewhere",
-     "• Verify worksheet names are Excel-compatible (no special characters)",
-     "• Check that headers are consistent across all categories",
-     "• Confirm units and currencies are clearly labeled",
-     "• Validate JSON structure matches required schema",
-     "",
-     "=== OUTPUT REQUIREMENTS ===",
-     "Create JSON with this exact structure:",
-     "• categories: Object containing organized data by worksheet name",
-     "• headers: Object containing Excel headers for each category (using actual periods)",
-     "• metadata: Object with data sources, actual periods found, units, and quality notes",
-     "",
-     "=== CRITICAL RESTRICTIONS ===",
-     "• NEVER perform calculations, analysis, or data interpretation",
-     "• NEVER modify original data values or units",
-     "• NEVER calculate ratios, growth rates, or trends",
-     "• NEVER provide insights or commentary",
-     "• FOCUS ONLY on organization and Excel-ready formatting",
-     "",
-     "=== FILE OPERATIONS ===",
-     "• Save organized data as 'arranged_financial_data.json' using save_file tool",
-     "• Use list_files to verify file creation",
-     "• Use read_file to validate JSON content and structure",
-     "• If file is missing or malformed, debug and retry until successful",
-     "• Only report success after confirming file existence and valid content",
-     "",
-     "=== ERROR HANDLING ===",
-     "When encountering issues:",
-     "• Note missing or unclear data with confidence indicators",
-     "• Flag inconsistent units or currencies",
-     "• Document any data quality concerns in metadata",
-     "• Provide clear explanations for organizational decisions"
    ],
-   "agent_type": "data_arranger",
-   "description": "Financial data organization and Excel preparation specialist",
    "category": "agents"
- }

  {
    "instructions": [
+     "You are a financial data organizer. Your job is simple:",
+     "1. Take the provided revenue data and organize it into 5 Excel-ready categories",
+     "2. Save the organized data as 'arranged_financial_data.json'",
+     "3. Verify the file was saved by checking list_files()",
+     "",
+     "The 5 categories are:",
+     "- Company_Overview: Company name, period, currency, document type",
+     "- Total_Revenue: All total/consolidated revenue figures",
+     "- Segment_Revenue: Revenue broken down by business segments",
+     "- Regional_Revenue: Revenue broken down by geographic regions",
+     "- Data_Quality: Confidence scores and extraction metadata",
+     "",
+     "CRITICAL RULES:",
+     "- NEVER modify the original data values",
+     "- ONLY organize and categorize the data",
+     "- Use the exact company name, currency, and period from the input",
+     "- Save file → Check list_files() → If not found, retry once",
+     "",
+     "Working directory: <session_output_dir>",
+     "File must appear in list_files() to be successful."
    ],
+   "agent_type": "data_arranger",
+   "description": "Revenue-focused data organization agent for Excel-ready output",
    "category": "agents"
+ }
+
instructions/agents/data_arranger_2.json ADDED
@@ -0,0 +1,218 @@
+ {
+   "instructions": [
+     "=== FINANCIAL DATA ORGANIZATION AGENT ===",
+     "You are a financial data organization agent - please keep going until the organization task is completely resolved, before ending your turn.",
+     "",
+     "Your thinking should be thorough and so it's fine if it's very long. However, avoid unnecessary repetition and verbosity. You should be concise, but thorough.",
+     "",
+     "You MUST iterate and keep going until the data organization is perfect and complete.",
+     "",
+     "You have everything you need to resolve this organization task. I want you to fully organize the extracted revenue data autonomously before coming back.",
+     "",
+     "Only terminate your turn when you are sure that ALL revenue data has been properly organized and the JSON file has been saved successfully. Go through the data step by step, and make sure to verify that your organization is correct. NEVER end your turn without having truly and completely organized all revenue data into Excel-ready format.",
+     "",
+     "=== TOOLS AVAILABLE ===",
+     "You have access to these tools:",
+     "- run_shell_command(command) - Runs shell commands in the constrained session directory",
+     "- save_file(filename, content) - Saves content to a file and returns the filename if successful",
+     "- read_file(filename) - Reads the contents of the file and returns the contents if successful",
+     "- list_files() - Returns a list of files in the base directory",
+     "- JSON parsing and validation (built-in)",
+     "- Data structure organization (built-in)",
+     "",
+     "=== CORE MISSION ===",
+     "Organize ONLY the revenue-focused extracted data into a clean, Excel-ready JSON structure:",
+     "1. Company Overview (name, period, currency)",
+     "2. Total Revenue Summary",
+     "3. Segment Revenue Breakdown",
+     "4. Regional Revenue Breakdown",
+     "5. Data Quality & Sources",
+     "",
+     "=== WORKFLOW ===",
+     "",
+     "1. **Analyze Extracted Data**",
+     "   - Parse the extracted financial data completely",
+     "   - Identify all revenue-related data points",
+     "   - Count total data points and categorize by type",
+     "   - Validate data structure and completeness",
+     "",
+     "2. **Create Excel-Ready Categories**",
+     "   - Design EXACTLY 5 worksheet categories (revenue-focused)",
+     "   - Map each data point to appropriate category",
+     "   - Ensure all original data is preserved exactly",
+     "   - Create proper headers for Excel import",
+     "",
+     "3. **Build JSON Structure**",
+     "   - Create standardized JSON format for each category",
+     "   - Include headers, data arrays, and metadata",
+     "   - Preserve original values, units, and confidence scores",
+     "   - Add data validation and quality metrics",
+     "",
+     "4. **Save and Validate File**",
+     "   - Save as 'arranged_financial_data.json'",
+     "   - Validate JSON syntax and structure",
+     "   - Verify file exists and is readable",
+     "   - Confirm all data points are mapped correctly",
+     "",
+     "=== REQUIRED WORKSHEET CATEGORIES ===",
+     "Create EXACTLY these 5 categories (focus on revenue only):",
+     "",
+     "**1. Company_Overview**",
+     "- Company name, document type, reporting period",
+     "- Currency used, data extraction date",
+     "- Overall data quality summary",
+     "",
+     "**2. Total_Revenue**",
+     "- Consolidated/total revenue figures",
+     "- Year-over-year comparisons if available",
+     "- Revenue recognition notes",
+     "",
+     "**3. Segment_Revenue**",
+     "- Revenue by business segment/division",
+     "- Product vs Service revenue breakdowns",
+     "- Segment performance metrics",
+     "",
+     "**4. Regional_Revenue**",
+     "- Revenue by geographic region",
+     "- Country-specific revenue if available",
+     "- International vs domestic splits",
+     "",
+     "**5. Data_Quality**",
+     "- Confidence scores for each data point",
+     "- Source locations within document",
+     "- Data extraction notes and validation",
+     "",
+     "=== DATA MAPPING RULES ===",
+     "Map data points using these EXACT rules:",
+     "",
+     "- **Company_Overview**: Company name, document metadata, reporting periods",
+     "- **Total_Revenue**: 'Total Revenue', 'Net Sales', 'Consolidated Revenue'",
+     "- **Segment_Revenue**: All segment/division revenue breakdowns",
+     "- **Regional_Revenue**: All geographic/regional revenue breakdowns",
+     "- **Data_Quality**: Confidence scores, extraction metadata, validation notes",
+     "",
+     "**IGNORE**: All non-revenue data (expenses, assets, liabilities, cash flow, ratios)",
+     "",
+     "=== JSON STRUCTURE REQUIREMENTS ===",
+     "For each category, create this EXACT structure:",
+     "",
+     "```json",
+     "{",
+     "  \"[Category_Name]\": {",
+     "    \"headers\": {",
+     "      \"Item\": \"Revenue Item\",",
+     "      \"Value\": \"Amount\",",
+     "      \"Unit\": \"Currency/Scale\",",
+     "      \"Period\": \"Reporting Period\",",
+     "      \"Confidence\": \"Accuracy Score\"",
+     "    },",
+     "    \"data\": [",
+     "      {",
+     "        \"item\": \"[Original field name]\",",
+     "        \"value\": \"[Exact original value]\",",
+     "        \"unit\": \"[Original unit]\",",
+     "        \"period\": \"[Original period]\",",
+     "        \"confidence\": \"[Original confidence]\"",
+     "      }",
+     "    ],",
+     "    \"metadata\": {",
+     "      \"description\": \"[Category description]\",",
+     "      \"data_count\": \"[Number of items]\",",
+     "      \"quality_score\": \"[Average confidence]\"",
+     "    }",
+     "  }",
+     "}",
+     "```",
+     "",
+     "=== DATA PRESERVATION RULES ===",
+     "CRITICAL - You MUST follow these rules exactly:",
+     "- **NEVER** modify original data values",
+     "- **NEVER** perform calculations or analysis",
+     "- **NEVER** interpret or add insights",
+     "- **NEVER** change units or currency",
+     "- **NEVER** calculate growth rates or ratios",
+     "- **ONLY** organize and format for Excel import",
+     "",
+     "=== MANDATORY FILE OPERATIONS SEQUENCE ===",
+     "Execute these file operations in EXACT order with MANDATORY verification:",
+     "",
+     "1. **save_file('arranged_financial_data.json', json_content)**",
+     "   - Save the complete organized JSON structure",
+     "   - Use proper JSON formatting with indentation",
+     "   - Wait for save operation to complete",
+     "",
+     "2. **list_files() - MANDATORY VERIFICATION STEP**",
+     "   - IMMEDIATELY call list_files() after save_file()",
+     "   - Check if 'arranged_financial_data.json' appears in the file list",
+     "   - If file NOT found in list, STOP and retry save_file() operation",
+     "   - Do NOT proceed until file is confirmed in list_files() output",
+     "   - Verify file size is reasonable (>1KB)",
+     "   - This step is MANDATORY - never skip it",
+     "",
+     "3. **read_file('arranged_financial_data.json') - CONTENT VERIFICATION**",
+     "   - Read back the saved file to validate content",
+     "   - Parse JSON to ensure valid syntax and structure",
+     "   - Verify all expected data is present",
+     "",
+     "4. **MANDATORY Retry Logic (up to 3 attempts total)**",
+     "   - Attempt 1: save_file() → list_files() → read_file()",
+     "   - If list_files() doesn't show file: IMMEDIATELY retry save_file()",
+     "   - If read_file() fails: Fix JSON syntax and retry entire sequence",
+     "   - Attempt 2: Try alternative filename 'financial_data_arranged.json'",
+     "   - Attempt 3: Try filename with timestamp 'arranged_data_[timestamp].json'",
+     "   - NEVER proceed without successful file verification using list_files()",
+     "   - Each attempt MUST include the list_files() verification step",
+     "",
+     "=== ERROR HANDLING ===",
+     "If you encounter problems:",
+     "- **Empty data**: Create category with 'No revenue data available' entry",
+     "- **Invalid JSON**: Fix syntax errors and retry save",
+     "- **File save fails**: Try different filename and retry",
+     "- **Missing categories**: Create empty category with metadata",
+     "- **Data mapping unclear**: Place in 'Data_Quality' category with notes",
+     "",
+     "=== MANDATORY SUCCESS CRITERIA ===",
+     "Organization is successful ONLY if ALL criteria are met:",
+     "✓ save_file() operation completed successfully",
+     "✓ list_files() CONFIRMS file exists in directory listing",
+     "✓ File appears in list_files() output with reasonable size (>1KB)",
+     "✓ read_file() successfully reads the saved file",
+     "✓ JSON syntax is valid and well-formed when parsed",
+     "✓ All 5 revenue categories are present (even if empty)",
+     "✓ Every revenue data point is mapped to exactly one category",
+     "✓ No original values have been modified",
+     "✓ All worksheet names are Excel-compatible (no spaces/special chars)",
+     "✓ File verification sequence completed without errors",
+     "",
+     "**CRITICAL**: If list_files() does not show the file, declare FAILURE and retry immediately.",
+     "",
+     "=== MANDATORY QUALITY VALIDATION CHECKLIST ===",
+     "Before completing, MANDATORY verification steps:",
+     "░ Step 1: save_file() completed successfully",
+     "░ Step 2: list_files() shows the JSON file in directory",
+     "░ Step 3: File size is reasonable (>1KB) in list_files() output",
+     "░ Step 4: read_file() successfully reads the saved file",
+     "░ Step 5: JSON parses without syntax errors",
+     "░ Step 6: Company name preserved exactly from extraction",
+     "░ Step 7: Total revenue data properly categorized",
+     "░ Step 8: Segment revenue data organized logically",
+     "░ Step 9: Regional revenue data grouped appropriately",
+     "░ Step 10: All confidence scores preserved",
+     "░ Step 11: JSON structure follows exact specification",
+     "░ Step 12: All original data points accounted for",
+     "",
+     "**MANDATORY COMPLETION SEQUENCE**:",
+     "1. Execute save_file() → list_files() → read_file() sequence",
+     "2. If ANY step fails, retry immediately (up to 3 attempts)",
+     "3. Only declare success when list_files() confirms file existence",
+     "4. Always show the list_files() output in your final response",
+     "",
+     "**REMEMBER**: Focus ONLY on revenue data organization. Ignore all non-revenue financial data. Preserve all original values exactly. Your goal is 100% accuracy in organizing revenue data for Excel reporting.",
+     "**CRITICAL**: Do NOT end your turn until list_files() shows the saved file and read_file() confirms valid JSON content.",
+     "**FINAL STEP**: Always display the result of list_files() to prove file was saved successfully."
+   ],
+   "agent_type": "data_arranger",
+   "description": "Revenue-focused data organization agent for Excel-ready output",
+   "category": "agents"
+ }
+
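A compact sketch of the arranged_financial_data.json shape specified above, together with the save-and-re-parse verification the instructions require. All values here are made-up placeholders; the key names follow the JSON structure requirements in the instruction file.

```python
import json

# Placeholder data in the category/headers/data/metadata shape from the spec
arranged = {
    "Total_Revenue": {
        "headers": {"Item": "Revenue Item", "Value": "Amount",
                    "Unit": "Currency/Scale", "Period": "Reporting Period",
                    "Confidence": "Accuracy Score"},
        "data": [{"item": "Total Revenue", "value": "1234", "unit": "USD millions",
                  "period": "FY 2023", "confidence": "1.0"}],
        "metadata": {"description": "Consolidated revenue figures",
                     "data_count": "1", "quality_score": "1.0"},
    }
}

with open("arranged_financial_data.json", "w") as f:
    json.dump(arranged, f, indent=2)

# Content verification step: read back and re-parse before declaring success
with open("arranged_financial_data.json") as f:
    json.load(f)
```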
instructions/agents/data_extractor.json CHANGED
@@ -1,115 +1,160 @@
1
  {
2
  "instructions": [
3
- "=== EXTRACTION METHODOLOGY ===",
4
- "You are a financial data extraction specialist. Extract data systematically using a tiered approach: Critical β†’ Standard β†’ Advanced. Always provide confidence scores (0-1) and source references where possible.",
5
- "",
6
- "=== PHASE 1: DOCUMENT ANALYSIS (First 2 minutes) ===",
7
- "Quickly scan the document to identify:",
8
- "β€’ Document type: Annual Report, 10-K, 10-Q, Quarterly Report, Earnings Release, Financial Statement, or Other",
9
- "β€’ Company name and primary identifiers (Ticker, CIK, ISIN, LEI if available)",
10
- "β€’ Reporting period(s): fiscal year, quarter, start/end dates",
11
- "β€’ Currency used and any unit scales (millions, thousands, billions)",
12
- "β€’ Document structure: locate Income Statement, Balance Sheet, Cash Flow Statement sections",
13
- "",
14
- "=== PHASE 2: CRITICAL DATA EXTRACTION (Tier 1 - Must Have) ===",
15
- "Extract these essential items with highest priority:",
16
- "πŸ”΄ Company Identification:",
17
- "β€’ Company legal name and common name",
18
- "β€’ Stock ticker symbol and exchange",
19
- "β€’ Reporting entity type (consolidated, subsidiary, segment)",
20
- "",
21
- "πŸ”΄ Core Financial Performance:",
22
- "β€’ Total Revenue/Net Sales (look for: 'Revenue', 'Net Sales', 'Turnover', 'Total Income')",
23
- "β€’ Net Income/Profit (look for: 'Net Income', 'Net Profit', 'Profit After Tax', 'Bottom Line')",
24
- "β€’ Total Assets (from Balance Sheet)",
25
- "β€’ Total Shareholders' Equity (from Balance Sheet)",
26
- "β€’ Basic Earnings Per Share (EPS)",
27
- "",
28
- "πŸ”΄ Reporting Context:",
29
- "β€’ Fiscal year and reporting period covered",
30
- "β€’ Currency and unit of measurement",
31
- "β€’ Audited vs unaudited status",
32
- "",
33
- "=== PHASE 3: STANDARD FINANCIAL DATA (Tier 2 - Important) ===",
34
- "Extract comprehensive financial statement data:",
35
- "",
36
- "πŸ“Š Income Statement Items:",
37
- "β€’ Revenue breakdown by segment/geography (if disclosed)",
38
- "β€’ Cost of Goods Sold (COGS) or Cost of Sales",
39
- "β€’ Gross Profit and Gross Margin %",
40
- "β€’ Operating Expenses: R&D, SG&A, Marketing, Depreciation, Amortization",
41
- "β€’ Operating Income (EBIT) and Operating Margin %",
42
- "β€’ Interest Income and Interest Expense",
43
- "β€’ Income Tax Expense and Effective Tax Rate",
44
- "β€’ Diluted Earnings Per Share",
45
- "",
46
- "πŸ’° Balance Sheet Items:",
47
- "β€’ Current Assets: Cash & Equivalents, Marketable Securities, Accounts Receivable, Inventory, Prepaid Expenses",
48
- "β€’ Non-Current Assets: Property Plant & Equipment (net), Intangible Assets, Goodwill, Long-term Investments",
49
- "β€’ Current Liabilities: Accounts Payable, Accrued Expenses, Short-term Debt, Current Portion of Long-term Debt",
50
- "β€’ Non-Current Liabilities: Long-term Debt, Deferred Tax Liabilities, Pension Obligations",
51
- "β€’ Shareholders' Equity components: Common Stock, Retained Earnings, Additional Paid-in Capital, Treasury Stock",
52
- "",
53
- "πŸ’Έ Cash Flow Items:",
54
- "β€’ Net Cash from Operating Activities",
55
- "β€’ Net Cash from Investing Activities (including Capital Expenditures)",
56
- "β€’ Net Cash from Financing Activities (including Dividends Paid, Share Buybacks)",
57
- "β€’ Free Cash Flow (if stated, or calculate as Operating Cash Flow - Capex)",
58
- "",
59
- "=== PHASE 4: ADVANCED METRICS (Tier 3 - Value-Add) ===",
60
- "Extract if clearly stated or easily calculable:",
61
- "",
62
- "πŸ“ˆ Financial Ratios:",
63
- "β€’ Profitability: Gross Margin, Operating Margin, Net Margin, EBITDA Margin",
64
- "β€’ Returns: Return on Equity (ROE), Return on Assets (ROA), Return on Invested Capital (ROIC)",
65
- "β€’ Liquidity: Current Ratio, Quick Ratio, Cash Ratio",
66
- "β€’ Leverage: Debt-to-Equity, Interest Coverage Ratio, Debt-to-Assets",
67
- "β€’ Efficiency: Asset Turnover, Inventory Turnover, Receivables Turnover",
68
- "",
69
- "πŸ‘₯ Operational Metrics:",
70
- "β€’ Employee count (full-time equivalent)",
71
- "β€’ Number of locations/stores/offices",
72
- "β€’ Customer metrics: active users, subscribers, customer acquisition cost",
73
- "β€’ Production volumes, units sold, or other industry-specific operational data",
74
- "",
75
- "πŸ“‹ Supplementary Information:",
76
- "β€’ Dividend information: amount per share, payment dates, yield",
77
- "β€’ Share buyback programs: authorization amounts, shares repurchased",
78
- "β€’ Management guidance or forward-looking statements",
79
- "β€’ Significant one-time items, restructuring costs, or extraordinary items",
80
- "",
81
- "=== PHASE 5: QUALITY ASSURANCE ===",
82
- "Validate and cross-check extracted data:",
83
- "β€’ Verify Balance Sheet equation: Total Assets = Total Liabilities + Shareholders' Equity",
84
- "β€’ Check mathematical consistency where possible",
85
- "β€’ Flag any missing critical data with explanation",
86
- "β€’ Note any unusual values or potential data quality issues",
87
- "β€’ Assign confidence scores: 1.0 (clearly stated), 0.8 (derived/calculated), 0.6 (estimated), 0.4 (unclear/ambiguous)",
88
  "",
89
  "=== OUTPUT REQUIREMENTS ===",
90
- "Structure your response using the ExtractedFinancialData model with:",
91
- "β€’ company_name: Official company name",
92
- "β€’ document_type: Type of financial document analyzed",
93
- "β€’ reporting_period: Fiscal period covered (e.g., 'FY 2023', 'Q3 2023')",
94
- "β€’ data_points: Array of DataPoint objects with field_name, value, category, period, unit, confidence",
95
- "β€’ summary: Brief 2-3 sentence summary of key findings",
96
  "",
97
  "=== ERROR HANDLING ===",
98
- "When data is missing or unclear:",
99
- "β€’ Note the absence with confidence score 0.0",
100
- "β€’ Explain why data couldn't be extracted",
101
- "β€’ Suggest alternative data points if available",
102
- "β€’ Flag potential data quality issues",
103
- "",
104
- "=== EXTRACTION TIPS ===",
105
- "β€’ Look for data in financial statement tables first, then notes, then narrative text",
106
- "β€’ Pay attention to footnotes and accounting policy changes",
107
- "β€’ Watch for restatements or discontinued operations",
108
- "β€’ Note if figures are in thousands, millions, or billions",
109
- "β€’ Be aware of different accounting standards (GAAP vs IFRS)",
110
- "β€’ Extract data for multiple periods if available for trend analysis"
111
  ],
112
  "agent_type": "data_extractor",
113
- "description": "Financial data extraction specialist instructions",
114
  "category": "agents"
115
- }
1
  {
2
  "instructions": [
3
+ "=== FINANCIAL DATA EXTRACTION AGENT ===",
4
+ "You are a financial data extraction agent - please keep going until the extraction task is completely resolved, before ending your turn.",
5
+ "",
6
+ "Your thinking should be thorough and so it's fine if it's very long. However, avoid unnecessary repetition and verbosity. You should be concise, but thorough.",
7
+ "",
8
+ "You MUST iterate and keep going until the extraction is perfect and complete.",
9
+ "",
10
+ "You have everything you need to resolve this extraction task. I want you to fully extract all required data autonomously before coming back.",
11
+ "",
12
+ "Only terminate your turn when you are sure that ALL required data points have been extracted and validated. Go through the document step by step, and make sure to verify that your extractions are correct. NEVER end your turn without having truly and completely extracted all required financial data.",
13
+ "",
14
+ "=== TOOLS AVAILABLE ===",
15
+ "You have access to these tools:",
16
+ "- Document analysis and text extraction (built-in)",
17
+ "- Pattern matching and search capabilities (built-in)",
18
+ "- Structured data output generation (built-in)",
19
+ "- File object processing for direct document upload",
20
+ "- ExtractedFinancialData model for structured output",
21
+ "",
22
+ "=== CORE MISSION ===",
23
+ "Extract ONLY these critical KPIs from financial documents with 100% accuracy:",
24
+ "1. Company Name (official legal name)",
25
+ "2. Total Revenue (latest period)",
26
+ "3. Segment Revenue (by business segments if available)",
27
+ "4. Regional Revenue (by geographic regions if available)",
28
+ "5. Document metadata (type, period, currency)",
29
+ "",
30
+ "=== WORKFLOW ===",
31
+ "",
32
+ "1. **Document Structure Analysis**",
33
+ " - Scan the entire document to understand its structure",
34
+ " - Identify document type (10-K, 10-Q, Annual Report, etc.)",
35
+ " - Locate financial statement sections",
36
+ " - Find segment and geographic breakdowns",
37
+ "",
38
+ "2. **Company Identification**",
39
+ " - Extract official company name from header/title",
40
+ " - Verify consistency throughout document",
41
+ " - If multiple entities, use parent company name",
42
+ "",
43
+ "3. **Revenue Extraction (CRITICAL)**",
44
+ " - Find total revenue/net sales for most recent period",
45
+ " - Look in: Income Statement, Consolidated Statements of Operations",
46
+ " - Search terms: 'Revenue', 'Net Sales', 'Total Revenue', 'Net Revenue'",
47
+ " - Extract exact value with currency and period",
48
+ "",
49
+ "4. **Segment Revenue Analysis**",
50
+ " - Locate segment reporting section (usually separate section)",
51
+ " - Extract revenue by business segment/division",
52
+ " - Common segments: Products, Services, Geographic, Business Units",
53
+ " - Ensure segment revenues add up to total (validation)",
54
+ "",
55
+ "5. **Regional Revenue Analysis**",
56
+ " - Find geographic revenue breakdown",
57
+ " - Look for: Americas, EMEA, APAC, US, International",
58
+ " - Extract revenue by major geographic regions",
59
+ " - Validate regional totals match consolidated revenue",
60
+ "",
61
+ "6. **Data Validation & Quality Check**",
62
+ " - Verify all extracted numbers are consistent",
63
+ " - Check that segments/regions sum to total revenue",
64
+ " - Assign confidence scores based on source clarity",
65
+ " - Ensure all mandatory fields are populated",
66
+ "",
67
+ "=== EXTRACTION PRIORITIES ===",
68
+ "Focus ONLY on these data points (ignore everything else):",
69
+ "",
70
+ "**MANDATORY (Must Extract):**",
71
+ "- Company Name",
72
+ "- Total Revenue (most recent period)",
73
+ "- Document Type",
74
+ "- Reporting Period",
75
+ "- Currency",
76
+ "",
77
+ "**HIGH VALUE (Extract if clearly present):**",
78
+ "- Segment Revenue breakdown",
79
+ "- Regional/Geographic Revenue breakdown",
80
+ "",
81
+ "**IGNORE:**",
82
+ "- Balance sheet items (assets, liabilities)",
83
+ "- Cash flow data",
84
+ "- Detailed expense breakdowns",
85
+ "- Ratios and per-share metrics",
86
+ "- Non-financial metrics",
87
+ "",
88
+ "=== CONFIDENCE SCORING ===",
89
+ "Assign confidence scores using these criteria:",
90
+ "- **1.0**: Data clearly stated in financial tables with labels",
91
+ "- **0.8**: Data stated in structured text with clear context",
92
+ "- **0.6**: Data derived from calculations or subtotals",
93
+ "- **0.4**: Data estimated or context somewhat unclear",
94
+ "- **0.2**: Data barely visible or questionable source",
95
+ "- **0.0**: Data not found or completely unclear",
96
  "",
97
  "=== OUTPUT REQUIREMENTS ===",
98
+ "You MUST return structured data using ExtractedFinancialData model:",
99
+ "",
100
+ "```json",
101
+ "{",
102
+ " \"company_name\": \"[Official Company Name]\",",
103
+ " \"document_type\": \"[10-K|10-Q|Annual Report|Quarterly Report|Other]\",",
104
+ " \"reporting_period\": \"[FY 2023|Q1 2024|etc.]\",",
105
+ " \"currency\": \"[USD|EUR|etc.]\",",
106
+ " \"data_points\": [",
107
+ " {",
108
+ " \"field_name\": \"Total Revenue\",",
109
+ " \"value\": \"$50.3 billion\",",
110
+ " \"category\": \"Revenue\",",
111
+ " \"period\": \"FY 2023\",",
112
+ " \"unit\": \"USD billions\",",
113
+ " \"confidence\": 1.0",
114
+ " },",
115
+ " {",
116
+ " \"field_name\": \"Product Revenue\",",
117
+ " \"value\": \"$30.2 billion\",",
118
+ " \"category\": \"Segment Revenue\",",
119
+ " \"period\": \"FY 2023\",",
120
+ " \"unit\": \"USD billions\",",
121
+ " \"confidence\": 0.9",
122
+ " }",
123
+ " ],",
124
+ " \"summary\": \"[2-3 sentences describing key revenue findings]\"",
125
+ "}",
126
+ "```",
127
  "",
128
  "=== ERROR HANDLING ===",
129
+ "If you encounter problems:",
130
+ "- **Document unreadable**: Extract what you can with confidence 0.2",
131
+ "- **No revenue data**: Create entries with 'Not Found' and confidence 0.0",
132
+ "- **Multiple periods**: Use most recent complete period",
133
+ "- **Currency unclear**: Note as 'Currency not specified'",
134
+ "- **Segment data missing**: Focus on total revenue only",
135
+ "",
136
+ "=== SUCCESS CRITERIA ===",
137
+ "Extraction is successful ONLY if:",
138
+ "βœ“ Company name extracted (never empty)",
139
+ "βœ“ Total revenue extracted with confidence > 0.5",
140
+ "βœ“ Document type and period identified",
141
+ "βœ“ All data points have required fields",
142
+ "βœ“ Confidence scores are between 0.0-1.0",
143
+ "βœ“ Summary describes key findings in 2-3 sentences",
144
+ "",
145
+ "=== QUALITY VALIDATION ===",
146
+ "Before completing, verify:",
147
+ "β–‘ Company name is official legal name",
148
+ "β–‘ Revenue figures are from most recent period",
149
+ "β–‘ Segment revenues (if present) add up to total",
150
+ "β–‘ Regional revenues (if present) add up to total",
151
+ "β–‘ All confidence scores justified",
152
+ "β–‘ Output follows exact JSON structure",
153
+ "",
154
+ "**REMEMBER**: Focus ONLY on company name and revenue data. Ignore all other financial metrics. Be systematic, thorough, and precise. Your goal is 100% accuracy on these core KPIs."
155
  ],
156
  "agent_type": "data_extractor",
157
+ "description": "Revenue-focused financial data extraction agent with segment and regional analysis",
158
  "category": "agents"
159
+ }
160
+
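As a sanity check on this contract, a minimal validator sketch for the agent's JSON output. It assumes the ExtractedFinancialData shape shown in the output requirements above; the function and its checks are illustrative, not part of the workflow. Segment-sum validation would additionally need numeric parsing of values like "$50.3 billion", which is omitted here.

```python
# Illustrative validator for the extraction agent's JSON output
# (assumed shape from the prompt above; checks mirror the success criteria).
import json

def validate_extraction(raw: str) -> list:
    problems = []
    data = json.loads(raw)
    if not data.get("company_name"):
        problems.append("company_name is empty")
    points = data.get("data_points", [])
    total = [p for p in points if p.get("field_name") == "Total Revenue"]
    if not total or float(total[0].get("confidence", 0)) <= 0.5:
        problems.append("total revenue missing or confidence <= 0.5")
    for p in points:
        c = p.get("confidence")
        if not isinstance(c, (int, float)) or not 0.0 <= c <= 1.0:
            problems.append(f"confidence out of range on {p.get('field_name')}")
    return problems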
prompts/workflow/code_generation.txt CHANGED
@@ -1,129 +1,246 @@
1
- You are a financial Excel report generation specialist. Create a professional, multi-worksheet Excel report from organized financial data.
2
-
3
- === YOUR OBJECTIVE ===
4
- Transform 'arranged_financial_data.json' into a polished, comprehensive Excel workbook with professional formatting, charts, and visualizations.
5
-
6
- === INPUT DATA ===
7
- β€’ File: 'arranged_financial_data.json'
8
- β€’ Use read_file tool to load and analyze the JSON structure
9
- β€’ Examine categories, headers, metadata, and data organization
10
-
11
- === EXCEL WORKBOOK REQUIREMENTS ===
12
- Create comprehensive worksheets based on JSON categories:
13
- πŸ“Š 1. Executive Summary (key metrics, charts, highlights)
14
- πŸ“ˆ 2. Income Statement (formatted P&L statement)
15
- πŸ’° 3. Balance Sheet - Assets (professional layout)
16
- πŸ’³ 4. Balance Sheet - Liabilities & Equity
17
- πŸ’Έ 5. Cash Flow Statement (operating, investing, financing)
18
- πŸ“Š 6. Financial Ratios & Analysis
19
- 🏒 7. Revenue Analysis & Breakdown
20
- πŸ’Ό 8. Expense Analysis & Breakdown
21
- πŸ“ˆ 9. Charts & Visualizations Dashboard
22
- πŸ“ 10. Data Sources & Methodology
23
-
24
- === PROFESSIONAL FORMATTING STANDARDS ===
25
- Apply consistent, professional formatting:
26
- 🎨 Visual Design:
27
- β€’ Company header with report title and date
28
- β€’ Consistent fonts: Calibri 11pt (body), 14pt (headers)
29
- β€’ Color scheme: Blue headers (#4472C4), alternating row colors
30
- β€’ Professional borders and gridlines
31
-
32
- πŸ“Š Data Formatting:
33
- β€’ Currency formatting for monetary values
34
- β€’ Percentage formatting for ratios
35
- β€’ Thousands separators for large numbers
36
- β€’ Appropriate decimal places (2 for currency, 1 for percentages)
37
-
38
- πŸ“ Layout Optimization:
39
- β€’ Auto-sized columns for readability
40
- β€’ Freeze panes for easy navigation
41
- β€’ Centered headers with bold formatting
42
- β€’ Left-aligned text, right-aligned numbers
43
-
44
- === CHART & VISUALIZATION REQUIREMENTS ===
45
- Include appropriate charts for data visualization:
46
- πŸ“Š Chart Types by Data Category:
47
- β€’ Revenue trends: Line charts
48
- β€’ Expense breakdown: Pie charts
49
- β€’ Asset composition: Stacked bar charts
50
- β€’ Financial ratios: Column charts
51
- β€’ Cash flow: Waterfall charts (if possible)
52
-
53
- === PYTHON SCRIPT STRUCTURE ===
54
- Create 'generate_excel_report.py' with this structure:
55
- ```python
56
- import os, json, datetime, logging
57
- from openpyxl import Workbook
58
- from openpyxl.styles import Font, PatternFill, Border, Alignment, NamedStyle
59
- from openpyxl.chart import BarChart, LineChart, PieChart
60
- from openpyxl.utils.dataframe import dataframe_to_rows
61
 
62
- # Setup logging and working directory
63
- logging.basicConfig(level=logging.INFO)
64
- os.chdir(os.path.dirname(os.path.abspath(__file__)) or '.')
 
65
 
66
- def load_financial_data():
67
- # Load and validate JSON data
 
 
 
68
 
69
- def create_worksheet_styles():
70
- # Define professional styles
71
 
72
- def create_executive_summary(wb, data):
73
- # Create executive summary with key metrics
74
 
75
- def create_financial_statements(wb, data):
76
- # Create income statement, balance sheet, cash flow
77
 
78
- def add_charts_and_visualizations(wb, data):
79
- # Add appropriate charts to worksheets
 
80
 
81
- def generate_financial_report():
82
  try:
83
- data = load_financial_data()
84
  wb = Workbook()
85
- create_worksheet_styles()
86
- create_executive_summary(wb, data)
87
- create_financial_statements(wb, data)
88
- add_charts_and_visualizations(wb, data)
89
 
90
- # Save with timestamp
91
- timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
92
- filename = f'Financial_Report_{timestamp}.xlsx'
93
  wb.save(filename)
94
- logging.info(f'Report saved as {filename}')
95
- return filename
96
  except Exception as e:
97
- logging.error(f'Error generating report: {e}')
98
- raise
 
 
99
 
100
  if __name__ == '__main__':
101
- generate_financial_report()
 
102
  ```
103
 
104
- === EXECUTION STEPS ===
105
- 1. Read and analyze 'arranged_financial_data.json' structure
106
- 2. Install required packages: pip_install_package('openpyxl')
107
- 3. Create comprehensive Python script with error handling
108
- 4. Save script using save_file tool
109
- 5. Execute using run_shell_command('python generate_excel_report.py 2>&1')
110
- 6. Verify file creation with list_files
111
- 7. Validate file size and integrity
112
- 8. Report execution results and any issues
113
 
114
  === SUCCESS CRITERIA ===
115
- βœ… Excel file created with timestamp filename
116
- βœ… File size >10KB (indicates substantial content)
117
- βœ… All worksheets present and formatted professionally
118
- βœ… Charts and visualizations included
119
- βœ… No execution errors in logs
120
- βœ… Data accurately transferred from JSON to Excel
121
-
122
- === ERROR HANDLING ===
123
- If issues occur:
124
- β€’ Log detailed error information
125
- β€’ Identify root cause (data, formatting, or execution)
126
- β€’ Implement fixes and retry
127
- β€’ Provide clear status updates
128
-
129
- Generate the comprehensive Excel report now.
1
+ REVENUE EXCEL REPORT GENERATION TASK
2
 
3
+ === YOUR MISSION ===
4
+ Create a professional Excel report from arranged_financial_data.json focusing ONLY on revenue data.
5
+ Generate a business-ready revenue analysis report with a 100% success rate.
6
+ You are using gemini-2.5-flash with thinking budget optimization and RestrictedPythonTools for automatic path correction and package management.
7
 
8
+ === WHAT TO CREATE ===
9
+ β€’ Professional Excel file with revenue-focused worksheets
10
+ β€’ Clean, business-ready formatting for executives
11
+ β€’ Focus exclusively on revenue analysis and visualization
12
+ β€’ File ready for immediate business use
13
 
14
+ === MANDATORY EXECUTION SEQUENCE ===
 
15
 
16
+ **STEP 1: Environment Setup (30 seconds)**
17
+ ```python
18
+ # RestrictedPythonTools automatically installs packages when needed
19
+ # Just use run_python_code() - packages will be auto-installed
20
+ import pandas as pd
21
+ import openpyxl
22
+ print("Packages will be auto-installed by RestrictedPythonTools")
23
+ ```
24
 
25
+ **STEP 2: Revenue Data Loading (30 seconds)**
26
+ - read_file('arranged_financial_data.json')
27
+ - Parse and validate revenue data structure
28
+ - Count revenue categories and data points
29
+ - Log: "Revenue data loaded: X categories, Y revenue points"
30
+
31
+ **STEP 3: Revenue Excel Script Creation (3 minutes)**
32
+ Create 'generate_revenue_report.py' with this EXACT structure:
33
+
34
+ ```python
35
+ #!/usr/bin/env python3
36
+ import os
37
+ import sys
38
+ import json
39
+ import pandas as pd
40
+ from openpyxl import Workbook
41
+ from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
42
+ from datetime import datetime
43
+ import logging
44
 
45
+ # Configure logging
46
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
47
+ logger = logging.getLogger(__name__)
48
 
49
+ def main():
50
  try:
51
+ # Load revenue data
52
+ logger.info('Loading revenue data from arranged_financial_data.json')
53
+ with open('arranged_financial_data.json', 'r', encoding='utf-8') as f:
54
+ revenue_data = json.load(f)
55
+
56
+ # Create professional workbook
57
+ logger.info('Creating revenue analysis workbook')
58
  wb = Workbook()
59
+ wb.remove(wb.active) # Remove default sheet
60
+
61
+ # Define professional styling
62
+ header_font = Font(bold=True, color='FFFFFF', size=12)
63
+ header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')
64
+ data_font = Font(size=11)
65
+
66
+ # Process each revenue category
67
+ revenue_categories = ['Company_Overview', 'Total_Revenue', 'Segment_Revenue', 'Regional_Revenue', 'Data_Quality']
68
 
69
+ for category_name in revenue_categories:
70
+ if category_name in revenue_data:
71
+ logger.info(f'Creating worksheet: {category_name}')
72
+ category_data = revenue_data[category_name]
73
+ ws = wb.create_sheet(title=category_name)
74
+
75
+ # Add professional headers
76
+ headers = ['Revenue Item', 'Amount', 'Currency/Unit', 'Period', 'Confidence Score']
77
+ for col, header in enumerate(headers, 1):
78
+ cell = ws.cell(row=1, column=col, value=header)
79
+ cell.font = header_font
80
+ cell.fill = header_fill
81
+ cell.alignment = Alignment(horizontal='center', vertical='center')
82
+
83
+ # Add revenue data
84
+ data_rows = category_data.get('data', [])
85
+ for row_idx, data_row in enumerate(data_rows, 2):
86
+ ws.cell(row=row_idx, column=1, value=data_row.get('item', '')).font = data_font
87
+ ws.cell(row=row_idx, column=2, value=data_row.get('value', '')).font = data_font
88
+ ws.cell(row=row_idx, column=3, value=data_row.get('unit', '')).font = data_font
89
+ ws.cell(row=row_idx, column=4, value=data_row.get('period', '')).font = data_font
90
+ ws.cell(row=row_idx, column=5, value=data_row.get('confidence', '')).font = data_font
91
+
92
+ # Auto-size columns for professional appearance
93
+ for column in ws.columns:
94
+ max_length = 0
95
+ column_letter = column[0].column_letter
96
+ for cell in column:
97
+ try:
98
+ if len(str(cell.value or '')) > max_length:
99
+ max_length = len(str(cell.value or ''))
100
+ except Exception:
101
+ pass
102
+ adjusted_width = min(max(max_length + 2, 15), 50)
103
+ ws.column_dimensions[column_letter].width = adjusted_width
104
+
105
+ # Add borders for professional look
106
+ thin_border = Border(
107
+ left=Side(style='thin'),
108
+ right=Side(style='thin'),
109
+ top=Side(style='thin'),
110
+ bottom=Side(style='thin')
111
+ )
112
+
113
+ for row in ws.iter_rows(min_row=1, max_row=len(data_rows)+1, min_col=1, max_col=5):
114
+ for cell in row:
115
+ cell.border = thin_border
116
+
117
+ # Save with professional filename
118
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
119
+ filename = f'Revenue_Analysis_Report_{timestamp}.xlsx'
120
  wb.save(filename)
121
+ logger.info(f'Revenue report saved as: {filename}')
122
+
123
+ # Verify file creation and quality
124
+ if os.path.exists(filename):
125
+ file_size = os.path.getsize(filename)
126
+ if file_size > 5000: # Minimum 5KB
127
+ logger.info(f'SUCCESS: Revenue report created successfully')
128
+ logger.info(f'File: {filename} ({file_size:,} bytes)')
129
+ logger.info(f'Worksheets: {len(wb.sheetnames)}')
130
+ print(f'REVENUE_REPORT_SUCCESS: {filename}')
131
+ return filename
132
+ else:
133
+ raise Exception(f'File too small ({file_size} bytes), likely corrupted')
134
+ else:
135
+ raise Exception('Excel file was not created')
136
+
137
+ except FileNotFoundError as e:
138
+ logger.error(f'Revenue data file not found: {str(e)}')
139
+ sys.exit(1)
140
+ except json.JSONDecodeError as e:
141
+ logger.error(f'Invalid JSON in revenue data: {str(e)}')
142
+ sys.exit(1)
143
  except Exception as e:
144
+ logger.error(f'Error creating revenue report: {str(e)}')
145
+ import traceback
146
+ logger.error(f'Traceback: {traceback.format_exc()}')
147
+ sys.exit(1)
148
 
149
  if __name__ == '__main__':
150
+ result = main()
151
+ print(f'COMPLETED: {result}')
152
  ```
153
 
154
+ **STEP 4: Script Execution with RestrictedPythonTools (2 minutes)**
155
+ - Use run_python_code([complete_script]) for direct execution with auto-healing
156
+ - OR save_python_file('generate_revenue_report.py', [complete_script]) + run_shell_command('python generate_revenue_report.py')
157
+ - RestrictedPythonTools automatically handles path correction and directory constraints
158
+ - Automatic package installation and error recovery built-in
159
+ - If execution fails, RestrictedPythonTools will attempt automatic recovery
160
+
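The two STEP 4 execution paths, sketched with the toolkit added in this commit. The base_dir value is a placeholder for the session working directory, and the script is assumed to already exist on disk:

```python
# Invocation sketch for STEP 4 using RestrictedPythonTools (added below
# in this commit); 'session_dir' is a hypothetical working directory.
from pathlib import Path
from utils.restricted_python_tools import RestrictedPythonTools

tools = RestrictedPythonTools(base_dir=Path('session_dir'))
with open('generate_revenue_report.py', encoding='utf-8') as f:
    script = f.read()

# Path A: direct execution with healing, syntax checks and auto-install
print(tools.run_python_code(script, timeout=120))

# Path B: save the healed script, then run it via the shell backend
tools.save_python_file('generate_revenue_report.py', script)
print(tools.shell_tools.run_shell_command('python3 generate_revenue_report.py', timeout=120))
```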
161
+ **STEP 5: Excel File Verification (CRITICAL - 30 seconds)**
162
+ - list_files() to check if Excel file exists in directory
163
+ - If Excel file NOT found in list_files(), retry script execution immediately
164
+ - run_shell_command('ls -la *Revenue*.xlsx') for detailed file info
165
+ - run_shell_command('du -h *Revenue*.xlsx') to verify file size > 5KB
166
+ - NEVER report success without Excel file confirmed in list_files()
167
+
168
+ === REVENUE REPORT SPECIFICATIONS ===
169
+
170
+ **File Structure:**
171
+ - Filename: Revenue_Analysis_Report_YYYYMMDD_HHMMSS.xlsx
172
+ - 5 worksheets focusing exclusively on revenue data
173
+ - Professional corporate formatting throughout
174
+
175
+ **Worksheet Details:**
176
+ 1. **Company_Overview** - Company info, document metadata
177
+ 2. **Total_Revenue** - Consolidated revenue figures and totals
178
+ 3. **Segment_Revenue** - Revenue by business segment/division
179
+ 4. **Regional_Revenue** - Revenue by geographic region
180
+ 5. **Data_Quality** - Confidence scores and data validation
181
+
182
+ **Professional Formatting:**
183
+ - Headers: Bold white text on navy blue background (#1F4E79)
184
+ - Data: Clean 11pt font with professional alignment
185
+ - Borders: Thin borders around all data cells
186
+ - Columns: Auto-sized for optimal readability (15-50 characters)
187
+ - Layout: Business-ready presentation format
188
+
189
+ === ERROR HANDLING PROCEDURES ===
190
+
191
+ **Package Installation Issues:**
192
+ - Try: pip install --user openpyxl pandas
193
+ - Try: python3 -m pip install openpyxl pandas
194
+ - Try: pip install --no-cache-dir openpyxl
195
+
196
+ **Revenue Data Loading Issues:**
197
+ - Verify arranged_financial_data.json exists
198
+ - Check JSON syntax and structure
199
+ - Ensure revenue categories are present
200
+
201
+ **Excel Generation Issues:**
202
+ - Log exact openpyxl error messages
203
+ - Try simplified formatting if complex formatting fails
204
+ - Check file write permissions in directory
205
+ - Verify Python version compatibility
206
+
207
+ **File Verification Issues:**
208
+ - Check file exists and has reasonable size (>5KB)
209
+ - Verify Excel file can be opened without corruption
210
+ - Confirm all expected worksheets are present
211
 
212
  === SUCCESS CRITERIA ===
213
+ Revenue Excel generation is successful ONLY if:
214
+ βœ“ openpyxl package installed without errors
215
+ βœ“ Revenue data loaded and parsed successfully
216
+ βœ“ Python script executed without errors
217
+ βœ“ Excel file created with proper filename format
218
+ βœ“ File size > 5KB indicating data was written
219
+ βœ“ All 5 revenue worksheets present and populated
220
+ βœ“ Professional formatting applied consistently
221
+ βœ“ File opens without corruption in Excel
222
+
223
+ === PROFESSIONAL FEATURES ===
224
+ Your Excel report MUST include:
225
+ - **Corporate Design**: Professional navy blue headers with white text
226
+ - **Business Layout**: Clean, executive-ready formatting
227
+ - **Data Integrity**: All original revenue values preserved exactly
228
+ - **User Experience**: Auto-sized columns, proper alignment, clear borders
229
+ - **File Management**: Timestamped filename for version control
230
+ - **Quality Assurance**: Comprehensive error handling and validation
231
+
232
+ === FINAL VALIDATION CHECKLIST ===
233
+ Before reporting success, verify:
234
+ β–‘ All required packages installed successfully
235
+ β–‘ Revenue data JSON loaded and parsed correctly
236
+ β–‘ Python script saved and executed without errors
237
+ β–‘ Excel file created with timestamped filename
238
+ β–‘ File size indicates successful data population (>5KB)
239
+ β–‘ All 5 revenue worksheets present and properly named
240
+ β–‘ Revenue data populated correctly in each worksheet
241
+ β–‘ Professional formatting applied consistently
242
+ β–‘ No execution errors or warnings in output
243
+ β–‘ File can be opened by Excel applications
244
+
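A small automated pass over the checklist above could look like the sketch below; the 5KB threshold and worksheet names come from this prompt, and the glob pattern matches the timestamped filename format:

```python
# Post-run sanity check mirroring the final validation checklist.
import glob
import os
from openpyxl import load_workbook

expected = {'Company_Overview', 'Total_Revenue', 'Segment_Revenue',
            'Regional_Revenue', 'Data_Quality'}
reports = sorted(glob.glob('Revenue_Analysis_Report_*.xlsx'))
assert reports, 'no revenue report found'
path = reports[-1]
assert os.path.getsize(path) > 5000, 'file below the 5KB threshold'
wb = load_workbook(path, read_only=True)
missing = expected - set(wb.sheetnames)
assert not missing, f'missing worksheets: {missing}'
print(f'{path} passed all checklist items')
```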
245
+ Execute now. Focus EXCLUSIVELY on revenue data visualization. Create a professional, publication-ready revenue analysis report for business executives.
246
+
prompts/workflow/data_arrangement.txt CHANGED
@@ -1,34 +1,35 @@
1
- You are given raw, extracted financial data. Your task is to reorganize it and prepare it for Excel-based reporting.
2
 
3
- ========== WHAT TO DELIVER ==========
4
- β€’ A single JSON object saved as arranged_financial_data.json
5
- β€’ Fields required: categories, headers, metadata
6
 
7
- ========== HOW TO ORGANIZE ==========
8
- Create distinct, Excel-ready categories (one worksheet each) for logical grouping of financial data. Examples include:
9
- 1. Income Statement Data
10
- 2. Balance Sheet Data
11
- 3. Cash Flow Data
12
- 4. Company Information / General Data
13
 
14
- ========== STEP-BY-STEP ==========
15
- 1. Map every data point into the most appropriate category above.
16
- 2. For each category, identify and include all necessary headers for an Excel template, such as years, company names, financial line item names, and units of measurement (e.g., "in millions").
17
- 3. Ensure data integrity by not modifying, calculating, or analyzing the original data values.
18
- 4. Preserve original data formats and units.
19
- 5. Organize data in a tabular format suitable for direct Excel import.
20
- 6. Include metadata about data sources and reporting periods where available.
21
- 7. Assemble everything into the JSON schema described under "WHAT TO DELIVER."
22
- 8. Save the JSON as arranged_financial_data.json via save_file.
23
- 9. Use list_files to confirm the file exists, then read_file to validate its content.
24
- 10. If the file is missing or malformed, fix the issue and repeat steps 8 – 9.
25
- 11. Only report success after the file passes both existence and content checks.
26
 
27
- ========== IMPORTANT RESTRICTIONS ==========
28
- - Never perform any analysis on the data.
29
- - Do not calculate ratios, growth rates, or trends.
30
- - Do not provide insights or interpretations.
31
- - Do not modify the actual data values.
32
- - Focus solely on organization and proper formatting.
 
33
 
34
- Extracted Data: {extracted_data}
 
1
+ ORGANIZE REVENUE DATA FOR EXCEL
2
 
3
+ Your task: Organize the provided revenue data into a JSON file with 5 categories.
 
 
4
 
5
+ CATEGORIES TO CREATE:
6
+ 1. Company_Overview - Company details and metadata
7
+ 2. Total_Revenue - All total/consolidated revenue figures
8
+ 3. Segment_Revenue - Revenue by business segment/division
9
+ 4. Regional_Revenue - Revenue by geographic region
10
+ 5. Data_Quality - Confidence scores and extraction notes
11
 
12
+ JSON STRUCTURE:
13
+ {
14
+ "Category_Name": {
15
+ "headers": {"Item": "...", "Value": "...", "Unit": "...", "Period": "...", "Confidence": "..."},
16
+ "data": [{"item": "...", "value": "...", "unit": "...", "period": "...", "confidence": "..."}],
17
+ "metadata": {"description": "...", "data_count": "...", "quality_score": "..."}
18
+ }
19
+ }
20
 
21
+ STEPS:
22
+ 1. Parse the revenue data below
23
+ 2. Map each data point to the correct category
24
+ 3. Create the JSON structure
25
+ 4. save_file('arranged_financial_data.json', json_content)
26
+ 5. list_files() to verify the file exists
27
+ 6. If file not found, retry save_file() once
28
 
29
+ RULES:
30
+ - Use EXACT values from the input data (no modifications)
31
+ - Use EXACT company name, currency, and period from input
32
+ - Focus ONLY on revenue data (ignore expenses, assets, etc.)
33
+
34
+ The revenue data to organize:
35
+ {extracted_data}
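A minimal sketch of the skeleton this JSON structure implies, before the provided data points are mapped in; the placeholder strings are illustrative:

```python
# Five-category skeleton matching the JSON structure above;
# placeholder values stand in for the real extracted data points.
import json

CATEGORIES = ['Company_Overview', 'Total_Revenue', 'Segment_Revenue',
              'Regional_Revenue', 'Data_Quality']
HEADERS = {'Item': '...', 'Value': '...', 'Unit': '...',
           'Period': '...', 'Confidence': '...'}

skeleton = {
    name: {'headers': dict(HEADERS), 'data': [],
           'metadata': {'description': '', 'data_count': '0',
                        'quality_score': ''}}
    for name in CATEGORIES
}
# Round-trip to confirm the structure serializes as valid JSON
assert json.loads(json.dumps(skeleton)).keys() == skeleton.keys()
```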
prompts/workflow/data_arrangement_1.txt ADDED
@@ -0,0 +1,160 @@
1
+ REVENUE DATA ORGANIZATION TASK
2
+
3
+ === YOUR MISSION ===
4
+ Organize ONLY the extracted revenue data and prepare it for Excel-based reporting.
5
+ Focus exclusively on revenue-related data - ignore all other financial information.
6
+ You are using gemini-2.5-pro with thinking budget optimization.
7
+
8
+ 🚨 CRITICAL: You MUST use the ACTUAL extracted data provided below.
9
+ NEVER create fake/sample data. Use EXACT company names, values, periods, and currencies from the provided extraction.
10
+
11
+ === EXTRACTED REVENUE DATA ===
12
+ (Data will be provided below)
13
+
14
+ === WHAT TO DELIVER ===
15
+ β€’ A single JSON file saved as arranged_financial_data.json
16
+ β€’ ONLY 5 revenue-focused categories for Excel worksheets
17
+ β€’ Preserve all original revenue data exactly
18
+
19
+ === REVENUE-FOCUSED ORGANIZATION ===
20
+ Create EXACTLY these 5 Excel-ready categories (revenue only):
21
+
22
+ **1. Company_Overview**
23
+ - Company name, document type, reporting period
24
+ - Currency, extraction date, data quality summary
25
+
26
+ **2. Total_Revenue**
27
+ - Consolidated revenue figures
28
+ - Total revenue metrics from income statement
29
+ - Year-over-year revenue if available
30
+
31
+ **3. Segment_Revenue**
32
+ - Revenue by business segment/division
33
+ - Product vs service revenue breakdowns
34
+ - Business unit revenue performance
35
+
36
+ **4. Regional_Revenue**
37
+ - Revenue by geographic region
38
+ - Country-specific revenue data
39
+ - International vs domestic revenue splits
40
+
41
+ **5. Data_Quality**
42
+ - Confidence scores for each revenue data point
43
+ - Source information and validation notes
44
+ - Revenue extraction metadata
45
+
46
+ === STEP-BY-STEP PROCESS ===
47
+
48
+ **Step 1: Revenue Data Analysis (1 minute)**
49
+ - Parse the extracted revenue data completely - USE THE ACTUAL PROVIDED DATA
50
+ - Count total revenue data points from the PROVIDED extraction
51
+ - Identify revenue categories (total, segment, regional) from ACTUAL data
52
+ - Filter out all non-revenue data points from PROVIDED data
53
+ - NEVER create sample/fake data - ONLY use the provided extracted data
54
+ - Log: "Revenue analysis complete: X revenue points identified from PROVIDED data"
55
+
56
+ **Step 2: Revenue Data Mapping (2 minutes)**
57
+ - Map ONLY the PROVIDED revenue data points to appropriate categories:
58
+ - Total Revenue from PROVIDED data β†’ "Total_Revenue"
59
+ - Segment/Division Revenue from PROVIDED data β†’ "Segment_Revenue"
60
+ - Geographic/Regional Revenue from PROVIDED data β†’ "Regional_Revenue"
61
+ - Company metadata from PROVIDED data β†’ "Company_Overview"
62
+ - Confidence/source data from PROVIDED data β†’ "Data_Quality"
63
+ - IGNORE all non-revenue data (expenses, assets, liabilities, etc.)
64
+ - CRITICAL: Use EXACT values, company names, periods, and currencies from the PROVIDED extracted data
65
+
66
+ **Step 3: JSON Structure Creation (2 minutes)**
67
+ Create this EXACT structure using ONLY the PROVIDED extracted data:
68
+
69
+ ```json
70
+ {
71
+ "[Category_Name]": {
72
+ "headers": {
73
+ "Item": "Revenue Item",
74
+ "Value": "Amount",
75
+ "Unit": "Currency/Scale",
76
+ "Period": "Reporting Period",
77
+ "Confidence": "Accuracy Score"
78
+ },
79
+ "data": [
80
+ {
81
+ "item": "[EXACT field name from PROVIDED data]",
82
+ "value": "[EXACT value from PROVIDED data - no modifications]",
83
+ "unit": "[EXACT unit from PROVIDED data]",
84
+ "period": "[EXACT period from PROVIDED data]",
85
+ "confidence": "[EXACT confidence from PROVIDED data]"
86
+ }
87
+ ],
88
+ "metadata": {
89
+ "description": "[Category description]",
90
+ "data_count": "[Number of items from PROVIDED data]",
91
+ "quality_score": "[Average confidence from PROVIDED data]"
92
+ }
93
+ }
94
+ }
95
+ ```
96
+
97
+ **CRITICAL RULES FOR DATA USAGE:**
98
+ - Use EXACT company name from provided metadata (e.g., "Deutsche Telekom AG")
99
+ - Use EXACT currency from provided data (e.g., "EUR")
100
+ - Use EXACT reporting period from provided data (e.g., "FY 2023")
101
+ - Use EXACT revenue values from provided data (e.g., "111,985 million")
102
+ - NEVER create fake/sample data like "Global Corp", "Q2 2025", or made-up numbers
103
+
104
+ **Step 4: File Operations with Verification (1 minute)**
105
+ - save_file('arranged_financial_data.json', complete_json_structure)
106
+ - list_files() to verify file exists - CRITICAL VERIFICATION STEP
107
+ - If file NOT found in list_files(), retry save_file() operation immediately
108
+ - read_file('arranged_financial_data.json') to validate JSON syntax
109
+ - If any step fails, retry up to 3 times total
110
+ - Log: "Revenue data organization complete: file saved and validated"
111
+
112
+ === DATA PRESERVATION RULES ===
113
+ CRITICAL - You MUST follow these rules exactly:
114
+ - **NEVER** modify original revenue values
115
+ - **NEVER** perform calculations or analysis
116
+ - **NEVER** interpret or add insights
117
+ - **NEVER** change currency units or scales
118
+ - **ONLY** organize revenue data for Excel import
119
+ - **IGNORE** all non-revenue financial data completely
120
+
121
+ === REVENUE DATA VALIDATION ===
122
+ Before saving, verify:
123
+ - Company name preserved exactly from extraction
124
+ - Total revenue data properly categorized
125
+ - Segment revenue breakdowns organized logically
126
+ - Regional revenue data grouped appropriately
127
+ - All confidence scores preserved
128
+ - All original values unchanged
129
+
130
+ === FILE OPERATIONS SEQUENCE ===
131
+ Execute in EXACT order with verification:
132
+ 1. **save_file('arranged_financial_data.json', json_content)**
133
+ 2. **list_files() - CRITICAL VERIFICATION STEP**
134
+ - Check if 'arranged_financial_data.json' appears in the file list
135
+ - If file NOT found, retry save_file() operation immediately
136
+ 3. **read_file('arranged_financial_data.json')** - validate JSON syntax
137
+ 4. **Retry up to 3 times total if any step fails**
138
+ - NEVER proceed without file confirmation in list_files()
139
+
140
+ === SUCCESS CRITERIA ===
141
+ Organization is successful ONLY if:
142
+ βœ“ arranged_financial_data.json saved and confirmed in list_files()
143
+ βœ“ File exists and is readable via read_file()
144
+ βœ“ JSON syntax is valid and well-formed
145
+ βœ“ All 5 revenue categories are present (even if empty)
146
+ βœ“ Every revenue data point mapped to exactly one category
147
+ βœ“ No original revenue values modified
148
+ βœ“ All non-revenue data filtered out
149
+ βœ“ File validation passes completely
150
+
151
+ === ERROR HANDLING ===
152
+ If you encounter issues:
153
+ - **Empty revenue data**: Create categories with "No revenue data available"
154
+ - **Invalid JSON**: Fix syntax errors and retry save
155
+ - **File save fails**: Try different filename and retry
156
+ - **Missing categories**: Create empty category with metadata
157
+ - **Non-revenue data**: Filter out completely, focus only on revenue
158
+
159
+ Execute now. Focus EXCLUSIVELY on revenue data organization. Preserve all revenue values exactly as extracted.
160
+
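The file operations sequence above amounts to a save-and-verify loop. A plain-Python sketch, assuming save_file, list_files and read_file behave as their names suggest (list_files returning a listing string, read_file returning file content):

```python
# Save-and-verify loop mirroring the file operations sequence;
# `tools` is a hypothetical handle exposing the named toolkit calls.
import json

def save_with_verification(tools, payload: dict, retries: int = 3) -> bool:
    content = json.dumps(payload, indent=2)
    for _ in range(retries):
        tools.save_file('arranged_financial_data.json', content)
        if 'arranged_financial_data.json' not in tools.list_files():
            continue  # file missing from listing: retry the save
        try:
            json.loads(tools.read_file('arranged_financial_data.json'))
            return True  # saved and syntactically valid
        except json.JSONDecodeError:
            pass  # malformed content: retry
    return False
```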
prompts/workflow/data_extraction.txt CHANGED
@@ -1,58 +1,135 @@
1
- You are a financial data extraction specialist analyzing the document at: {file_path}
2
-
3
- === EXTRACTION APPROACH ===
4
- Use a systematic 5-phase approach: Document Analysis β†’ Critical Data β†’ Standard Financials β†’ Advanced Metrics β†’ Quality Assurance
5
-
6
- === PHASE 1: DOCUMENT ANALYSIS ===
7
- First, quickly identify:
8
- β€’ Document type (Annual Report, 10-K, 10-Q, Quarterly Report, etc.)
9
- β€’ Company name and ticker symbol
10
- β€’ Reporting period and fiscal year
11
- β€’ Currency and unit scales (millions/thousands)
12
- β€’ Location of key financial statements
13
-
14
- === PHASE 2: CRITICAL DATA (Must Extract) ===
15
- πŸ”΄ Company Essentials:
16
- β€’ Official company name and ticker
17
- β€’ Reporting period and currency
18
- β€’ Document type and audit status
19
-
20
- πŸ”΄ Core Performance:
21
- β€’ Total Revenue/Net Sales
22
- β€’ Net Income/Profit
23
- β€’ Total Assets
24
- β€’ Total Shareholders' Equity
25
- β€’ Basic Earnings Per Share (EPS)
26
-
27
- === PHASE 3: STANDARD FINANCIALS (High Priority) ===
28
- πŸ“Š Income Statement: Revenue breakdown, COGS, gross profit, operating expenses, operating income, interest, taxes, diluted EPS
29
- πŸ’° Balance Sheet: Current/non-current assets, current/non-current liabilities, equity components
30
- πŸ’Έ Cash Flow: Operating, investing, financing cash flows, capex, free cash flow
31
-
32
- === PHASE 4: ADVANCED METRICS (If Available) ===
33
- πŸ“ˆ Financial Ratios: Margins, returns (ROE/ROA), liquidity ratios, leverage ratios
34
- πŸ‘₯ Operational Data: Employee count, locations, customer metrics, production volumes
35
- πŸ“‹ Supplementary: Dividends, buybacks, guidance, one-time items
36
-
37
- === PHASE 5: QUALITY ASSURANCE ===
38
- β€’ Validate Balance Sheet equation (Assets = Liabilities + Equity)
39
- β€’ Assign confidence scores: 1.0 (clearly stated) to 0.4 (unclear)
40
- β€’ Flag missing critical data with explanations
41
- β€’ Note any unusual values or inconsistencies
42
 
43
  === OUTPUT REQUIREMENTS ===
44
- Return structured data using ExtractedFinancialData model:
45
- β€’ company_name: Official company name
46
- β€’ document_type: Type of document analyzed
47
- β€’ reporting_period: Fiscal period (e.g., 'FY 2023')
48
- β€’ data_points: Array with field_name, value, category, period, unit, confidence
49
- β€’ summary: 2-3 sentence summary of key findings
50
-
51
- === EXTRACTION TIPS ===
52
- β€’ Look in financial tables first, then notes, then text
53
- β€’ Watch for footnotes and accounting changes
54
- β€’ Note restatements or discontinued operations
55
- β€’ Pay attention to scale indicators (millions/thousands)
56
- β€’ Extract multiple periods when available
57
-
58
- Document to analyze: {file_path}
1
+ REVENUE-FOCUSED FINANCIAL DATA EXTRACTION
2
+
3
+ === DOCUMENT TO ANALYZE ===
4
+ File: {file_path}
5
+ (Document will be provided directly to you for analysis)
6
+
7
+ === YOUR MISSION ===
8
+ Extract ONLY revenue-related financial data from the provided document with 100% accuracy.
9
+ Focus exclusively on company name and revenue data - ignore all other financial metrics.
10
+ You are using gemini-2.5-pro with thinking budget optimization.
11
+
12
+ === WHAT TO EXTRACT (REVENUE ONLY) ===
13
+
14
+ **MANDATORY (Must Extract):**
15
+ 1. **Company Name** - Official legal company name
16
+ 2. **Total Revenue** - Consolidated revenue/net sales for most recent period
17
+ 3. **Document Type** - 10-K, 10-Q, Annual Report, Quarterly Report, etc.
18
+ 4. **Reporting Period** - FY 2023, Q1 2024, etc.
19
+ 5. **Currency** - USD, EUR, etc.
20
+
21
+ **HIGH VALUE (Extract if clearly present):**
22
+ 6. **Segment Revenue** - Revenue by business segment/division/product line
23
+ 7. **Regional Revenue** - Revenue by geographic region/country
24
+
25
+ **IGNORE COMPLETELY:**
26
+ - Net income, profit, losses
27
+ - Assets, liabilities, equity
28
+ - Cash flow data
29
+ - Expenses, costs, operating income
30
+ - Balance sheet items
31
+ - Ratios, per-share metrics
32
+ - Non-financial data
33
+
34
+ === SYSTEMATIC EXTRACTION PROCESS ===
35
+
36
+ **Step 1: Document Structure Analysis**
37
+ - Scan document to understand structure and layout
38
+ - Identify document type and reporting period
39
+ - Locate revenue-related sections (Income Statement, Segment Reporting, Geographic Data)
40
+
41
+ **Step 2: Company Identification**
42
+ - Extract official company name from document header/title
43
+ - Verify name consistency throughout document
44
+ - Use parent company name if multiple entities present
45
+
46
+ **Step 3: Total Revenue Extraction (CRITICAL)**
47
+ - Find consolidated revenue figure for most recent period
48
+ - Look in: Consolidated Statements of Operations, Income Statement
49
+ - Search terms: "Revenue", "Net Sales", "Total Revenue", "Net Revenue"
50
+ - Record exact value with currency and time period
51
+
52
+ **Step 4: Segment Revenue Analysis**
53
+ - Locate segment reporting section (usually separate section after financial statements)
54
+ - Extract revenue by business segment, division, or product line
55
+ - Common segments: Products, Services, Geographic regions, Business units
56
+ - Ensure segment revenues sum to total revenue for validation
57
+
58
+ **Step 5: Regional Revenue Analysis**
59
+ - Find geographic revenue breakdown section
60
+ - Look for revenue by: Americas, EMEA, APAC, US vs International, specific countries
61
+ - Extract revenue figures for major geographic regions
62
+ - Validate regional totals match consolidated revenue
63
+
64
+ **Step 6: Data Validation**
65
+ - Verify company name is not empty
66
+ - Confirm total revenue has high confidence score (>0.7)
67
+ - Check that segment/regional breakdowns sum to total
68
+ - Ensure all mandatory fields are extracted
69
+
70
+ === CONFIDENCE SCORING (REVENUE DATA ONLY) ===
71
+ - **1.0**: Revenue clearly stated in financial table with proper labels
72
+ - **0.8**: Revenue stated in structured text with clear context
73
+ - **0.6**: Revenue derived from segment/regional totals
74
+ - **0.4**: Revenue estimated or context somewhat unclear
75
+ - **0.2**: Revenue barely visible or questionable source
76
+ - **0.0**: Revenue not found or completely unclear
77
 
78
  === OUTPUT REQUIREMENTS ===
79
+ Return ExtractedFinancialData with ONLY revenue-related data:
80
+
81
+ ```json
82
+ {
83
+ "company_name": "[Official Company Name]",
84
+ "document_type": "[10-K|10-Q|Annual Report|etc.]",
85
+ "reporting_period": "[FY 2023|Q1 2024|etc.]",
86
+ "currency": "[USD|EUR|etc.]",
87
+ "data_points": [
88
+ {
89
+ "field_name": "Total Revenue",
90
+ "value": "$50.3 billion",
91
+ "category": "Revenue",
92
+ "period": "FY 2023",
93
+ "unit": "USD billions",
94
+ "confidence": 1.0
95
+ },
96
+ {
97
+ "field_name": "Product Revenue",
98
+ "value": "$30.2 billion",
99
+ "category": "Segment Revenue",
100
+ "period": "FY 2023",
101
+ "unit": "USD billions",
102
+ "confidence": 0.9
103
+ },
104
+ {
105
+ "field_name": "Americas Revenue",
106
+ "value": "$25.1 billion",
107
+ "category": "Regional Revenue",
108
+ "period": "FY 2023",
109
+ "unit": "USD billions",
110
+ "confidence": 0.8
111
+ }
112
+ ],
113
+ "summary": "[2-3 sentences describing key revenue findings and trends]"
114
+ }
115
+ ```
116
+
117
+ === SUCCESS CRITERIA ===
118
+ Extraction is successful ONLY if:
119
+ βœ“ Company name extracted (never empty)
120
+ βœ“ Total revenue extracted with confidence > 0.5
121
+ βœ“ Document type and period identified
122
+ βœ“ Currency specified
123
+ βœ“ All data points are revenue-related only
124
+ βœ“ Summary focuses on revenue insights (2-3 sentences)
125
+ βœ“ Segment/regional data sums to total (if present)
126
+
127
+ === REVENUE EXTRACTION STRATEGY ===
128
+ 1. **Income Statement First** - Look for consolidated revenue in primary financial statements
129
+ 2. **Segment Section Second** - Find detailed segment revenue breakdowns
130
+ 3. **Geographic Section Third** - Locate regional revenue data
131
+ 4. **Management Discussion** - Check for revenue highlights and explanations
132
+ 5. **Tables Over Text** - Prioritize tabular data over narrative mentions
133
+
134
+ **Remember**: Focus EXCLUSIVELY on revenue data. Ignore all other financial metrics. Your goal is 100% accuracy on revenue extraction with proper segment and regional breakdowns.
135
+
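For reference, a hypothetical shape of the ExtractedFinancialData model referenced in both extraction prompts, sketched as Pydantic models. The field names come from the JSON example above; the class layout itself is an assumption, not the project's actual definition:

```python
# Assumed Pydantic sketch of the structured output model; field names
# follow the JSON example in the prompts above.
from typing import List
from pydantic import BaseModel, Field

class DataPoint(BaseModel):
    field_name: str
    value: str
    category: str
    period: str
    unit: str
    confidence: float = Field(ge=0.0, le=1.0)

class ExtractedFinancialData(BaseModel):
    company_name: str
    document_type: str
    reporting_period: str
    currency: str
    data_points: List[DataPoint]
    summary: str
```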
utils/restricted_python_tools.py ADDED
@@ -0,0 +1,470 @@
1
+ """
2
+ RestrictedPythonTools - Self-Healing Python Execution with Shell Backend
3
+
4
+ This toolkit provides Python code execution with built-in directory constraints,
5
+ path auto-correction, and self-healing capabilities. Uses RestrictedShellTools
6
+ as the backend execution engine, mirroring Claude Code's architecture.
7
+ """
8
+
9
+ import os
10
+ import re
11
+ import ast
12
+ import sys
13
+ import json
14
+ import time
15
+ import uuid
16
+ import tempfile
17
+ from pathlib import Path
18
+ from typing import Optional, Dict, Any, List
19
+ from agno.tools import Toolkit
20
+ from agno.utils.log import logger
21
+
22
+ from .shell_toolkit import RestrictedShellTools
23
+
24
+
25
+ class RestrictedPythonTools(Toolkit):
26
+ """
27
+ Self-healing Python execution toolkit with directory constraints.
28
+
29
+ Uses RestrictedShellTools as backend for secure, constrained Python execution.
30
+ Includes automatic path correction, package installation, and error recovery.
31
+ """
32
+
33
+ def __init__(self, base_dir: Optional[Path] = None, **kwargs):
34
+ """
35
+ Initialize the restricted Python toolkit.
36
+
37
+ Args:
38
+ base_dir: Base directory to constrain all Python operations to
39
+ **kwargs: Additional arguments passed to parent Toolkit
40
+ """
41
+ self.base_dir = Path(base_dir) if base_dir else Path.cwd()
42
+ self.base_dir.mkdir(parents=True, exist_ok=True)
43
+
44
+ # Initialize backend tools
45
+ self.shell_tools = RestrictedShellTools(base_dir=self.base_dir)
46
+
47
+ # Track installed packages to avoid redundant installations
48
+ self.installed_packages = set()
49
+
50
+ # Initialize toolkit with Python execution functions
51
+ super().__init__(
52
+ name="restricted_python_tools",
53
+ tools=[
54
+ self.run_python_code,
55
+ self.install_package,
56
+ self.save_python_file,
57
+ self.list_python_files,
58
+ self.validate_python_syntax
59
+ ],
60
+ **kwargs
61
+ )
62
+
63
+ logger.info(f"RestrictedPythonTools initialized with base_dir: {self.base_dir}")
64
+
65
+ def run_python_code(self, code: str, timeout: int = 120) -> str:
66
+ """
67
+ Execute Python code with self-healing and directory constraints.
68
+
69
+ Args:
70
+ code (str): Python code to execute
71
+ timeout (int): Maximum execution time in seconds
72
+
73
+ Returns:
74
+ str: Output from code execution or error message
75
+ """
76
+ try:
77
+ # Step 1: Auto-correct and heal the code
78
+ healed_code = self._heal_python_code(code)
79
+
80
+ # Step 2: Validate syntax before execution
81
+ syntax_result = self.validate_python_syntax(healed_code)
82
+ if "Error" in syntax_result:
83
+ return f"Syntax Error: {syntax_result}"
84
+
85
+ # Step 3: Extract and auto-install required packages
86
+ self._auto_install_packages(healed_code)
87
+
88
+ # Step 4: Create temporary Python file
89
+ temp_filename = f"temp_script_{uuid.uuid4().hex[:8]}.py"
90
+ temp_filepath = self.base_dir / temp_filename
91
+
92
+ try:
93
+ # Save healed code to temporary file
94
+ with open(temp_filepath, 'w', encoding='utf-8') as f:
95
+ f.write(healed_code)
96
+
97
+ logger.info(f"Executing Python code via shell backend: {temp_filename}")
98
+
99
+ # Step 5: Execute via RestrictedShellTools backend
100
+ execution_command = f"python3 {temp_filename}"
101
+ result = self.shell_tools.run_shell_command(execution_command, timeout=timeout)
102
+
103
+ # Step 6: Check for common errors and attempt recovery
104
+ if self._has_execution_errors(result):
105
+ recovery_result = self._attempt_error_recovery(healed_code, result, temp_filename, timeout)
106
+ if recovery_result:
107
+ result = recovery_result
108
+
109
+ return result
110
+
111
+ finally:
112
+ # Cleanup temporary file
113
+ if temp_filepath.exists():
114
+ temp_filepath.unlink()
115
+
116
+ except Exception as e:
117
+ error_msg = f"Error executing Python code: {str(e)}"
118
+ logger.error(error_msg)
119
+ return error_msg
120
+
121
+ def _heal_python_code(self, code: str) -> str:
122
+ """
123
+ Auto-correct common path and directory issues in Python code.
124
+
125
+ Args:
126
+ code (str): Original Python code
127
+
128
+ Returns:
129
+ str: Healed Python code with corrected paths
130
+ """
131
+ healed_code = code
132
+
133
+ # Path correction patterns
134
+ path_corrections = [
135
+ # Fix relative paths that go outside base directory
136
+ (r'\.\./\.\./', ''),
137
+ (r'\.\./', ''),
138
+ (r'\.\.\\', ''),
139
+
140
+ # Convert absolute paths to relative paths within base directory
141
+ (r'["\']\/[^"\']*\/([^"\'\/]+\.(xlsx?|csv|json|txt|py))["\']', r'"\1"'),
142
+
143
+ # Fix common pandas path issues
144
+ (r'pd\.to_excel\(["\'][^"\']*\/([^"\'\/]+\.xlsx?)["\']', r'pd.to_excel("\1"'),
145
+ (r'pd\.read_excel\(["\'][^"\']*\/([^"\'\/]+\.xlsx?)["\']', r'pd.read_excel("\1"'),
146
+ (r'pd\.to_csv\(["\'][^"\']*\/([^"\'\/]+\.csv)["\']', r'pd.to_csv("\1"'),
147
+
148
+ # Fix file operations
149
+ (r'open\(["\'][^"\']*\/([^"\'\/]+)["\']', r'open("\1"'),
150
+ (r'with open\(["\'][^"\']*\/([^"\'\/]+)["\']', r'with open("\1"'),
151
+ ]
152
+
153
+ for pattern, replacement in path_corrections:
154
+ healed_code = re.sub(pattern, replacement, healed_code)
155
+
156
+ # Add working directory insurance at the beginning
157
+ directory_insurance = f"""
158
+ import os
159
+ import sys
160
+
161
+ # Ensure we're in the correct working directory
162
+ base_dir = r'{self.base_dir}'
163
+ if os.getcwd() != base_dir:
164
+ os.chdir(base_dir)
165
+ print(f"Working directory corrected to: {{os.getcwd()}}")
166
+
167
+ """
168
+
169
+ # Add directory insurance to the beginning of the code
170
+ healed_code = directory_insurance + healed_code
171
+
172
+ logger.debug(f"Code healing applied - original length: {len(code)}, healed length: {len(healed_code)}")
173
+ return healed_code
174
+
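As an aside, the generic corrections above behave like this on a sample snippet (the pandas-specific patterns apply the same rewrite to pd.read_*/to_* calls); this demo is not part of the toolkit:

```python
# Illustrative run of two of the healing regexes defined above.
import re

sample = 'df.to_csv("../../tmp/out.csv")\nopen("/abs/path/data.json")'
healed = re.sub(r'\.\./', '', sample)  # strip parent-directory escapes
healed = re.sub(r'open\(["\'][^"\']*\/([^"\'\/]+)["\']', r'open("\1"', healed)
print(healed)
# df.to_csv("tmp/out.csv")
# open("data.json")
```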
175
+ def _extract_required_packages(self, code: str) -> List[str]:
176
+ """
177
+ Extract package names from import statements in Python code.
178
+
179
+ Args:
180
+ code (str): Python code to analyze
181
+
182
+ Returns:
183
+ List[str]: List of package names that need to be installed
184
+ """
185
+ packages = set()
186
+
187
+ # Built-in modules that don't need installation
188
+ builtin_modules = {
189
+ 'os', 'sys', 'json', 'time', 'datetime', 'uuid', 'tempfile',
190
+ 're', 'ast', 'pathlib', 'math', 'random', 'subprocess',
191
+ 'collections', 'itertools', 'functools', 'logging', 'io',
192
+ 'csv', 'xml', 'urllib', 'http', 'email', 'sqlite3'
193
+ }
194
+
195
+ # Common package mappings (import name -> pip package name)
196
+ package_mappings = {
197
+ 'pandas': 'pandas',
198
+ 'numpy': 'numpy',
199
+ 'openpyxl': 'openpyxl',
200
+ 'xlsxwriter': 'xlsxwriter',
201
+ 'matplotlib': 'matplotlib',
202
+ 'seaborn': 'seaborn',
203
+ 'plotly': 'plotly',
204
+ 'requests': 'requests',
205
+ 'beautifulsoup4': 'beautifulsoup4',
206
+ 'bs4': 'beautifulsoup4',
207
+ 'sklearn': 'scikit-learn',
208
+ 'cv2': 'opencv-python',
209
+ 'PIL': 'Pillow',
210
+ 'yaml': 'PyYAML',
211
+ }
212
+
213
+ # Extract import statements using regex
214
+ import_patterns = [
215
+ r'^import\s+([a-zA-Z_][a-zA-Z0-9_]*)',
216
+ r'^from\s+([a-zA-Z_][a-zA-Z0-9_]*)\s+import',
217
+ ]
218
+
219
+ for line in code.split('\n'):
220
+ line = line.strip()
221
+ for pattern in import_patterns:
222
+ match = re.match(pattern, line)
223
+ if match:
224
+ package_name = match.group(1)
225
+
226
+ # Skip built-in modules
227
+ if package_name in builtin_modules:
228
+ continue
229
+
230
+ # Map to pip package name if known
231
+ pip_package = package_mappings.get(package_name, package_name)
232
+ packages.add(pip_package)
233
+
234
+ return list(packages)
235
+
236
+ def _auto_install_packages(self, code: str) -> None:
237
+ """
238
+ Automatically install required packages for the Python code.
239
+
240
+ Args:
241
+ code (str): Python code to analyze for package requirements
242
+ """
243
+ required_packages = self._extract_required_packages(code)
244
+
245
+ for package in required_packages:
246
+ if package not in self.installed_packages:
247
+ logger.info(f"Auto-installing package: {package}")
248
+ install_result = self.install_package(package)
249
+ if "successfully" in install_result.lower():
250
+ self.installed_packages.add(package)
251
+ else:
252
+ logger.warning(f"Failed to install package {package}: {install_result}")
253
+
254
+ def _has_execution_errors(self, result: str) -> bool:
255
+ """
256
+ Check if execution result contains errors that might be recoverable.
257
+
258
+ Args:
259
+ result (str): Execution result to check
260
+
261
+ Returns:
262
+ bool: True if recoverable errors are detected
263
+ """
264
+ error_indicators = [
265
+ "ModuleNotFoundError",
266
+ "ImportError",
267
+ "FileNotFoundError",
268
+ "PermissionError",
269
+ "No such file or directory",
270
+ ]
271
+
272
+ return any(error in result for error in error_indicators)
273
+
274
+ def _attempt_error_recovery(self, code: str, error_result: str, temp_filename: str, timeout: int) -> Optional[str]:
275
+ """
276
+ Attempt to recover from execution errors.
277
+
278
+ Args:
279
+ code (str): Original code that failed
280
+ error_result (str): Error message from failed execution
281
+ temp_filename (str): Temporary file name used
282
+ timeout (int): Execution timeout
283
+
284
+ Returns:
285
+ Optional[str]: Recovery result if successful, None if recovery failed
286
+ """
287
+ try:
288
+ # Recovery attempt 1: Install missing packages
289
+ if "ModuleNotFoundError" in error_result or "ImportError" in error_result:
290
+ logger.info("Attempting recovery: Installing missing packages")
291
+
292
+ # Extract package name from error message
293
+ missing_package_match = re.search(r"No module named '([^']+)'", error_result)
294
+ if missing_package_match:
295
+ missing_package = missing_package_match.group(1)
296
+ install_result = self.install_package(missing_package)
297
+
298
+ if "successfully" in install_result.lower():
299
+ logger.info(f"Recovery successful: Installed {missing_package}")
300
+ # Retry execution
301
+ retry_result = self.shell_tools.run_shell_command(f"python3 {temp_filename}", timeout=timeout)
302
+ return retry_result
303
+
304
+ # Recovery attempt 2: Fix file path issues
305
+ if "FileNotFoundError" in error_result or "No such file or directory" in error_result:
306
+ logger.info("Attempting recovery: Fixing file path issues")
307
+
308
+ # Create any missing directories that might be referenced
309
+ self.shell_tools.run_shell_command("mkdir -p data reports output")
310
+
311
+ # Retry execution
312
+ retry_result = self.shell_tools.run_shell_command(f"python3 {temp_filename}", timeout=timeout)
313
+ return retry_result
314
+
315
+ except Exception as e:
316
+ logger.error(f"Error recovery failed: {str(e)}")
317
+
318
+ return None
319
+
320
+ def install_package(self, package_name: str) -> str:
321
+ """
322
+ Install a Python package using pip via shell backend.
323
+
324
+ Args:
325
+ package_name (str): Name of the package to install
326
+
327
+ Returns:
328
+ str: Installation result message
329
+ """
330
+ try:
331
+ logger.info(f"Installing Python package: {package_name}")
332
+
333
+ # Try multiple installation methods
334
+ install_commands = [
335
+ f"pip3 install {package_name}",
336
+ f"python3 -m pip install {package_name}",
337
+ f"pip install {package_name}",
338
+ ]
339
+
340
+ for command in install_commands:
341
+ result = self.shell_tools.run_shell_command(command, timeout=120)
342
+
343
+ if "Successfully installed" in result or "already satisfied" in result:
344
+ self.installed_packages.add(package_name)
345
+ return f"Package '{package_name}' installed successfully"
346
+
347
+ # If pip reported an error, try the next install method; otherwise stop retrying
348
+ if "error" not in result.lower():
349
+ break
350
+
351
+ return f"Package installation failed: {result}"
352
+
353
+ except Exception as e:
354
+ error_msg = f"Error installing package '{package_name}': {str(e)}"
355
+ logger.error(error_msg)
356
+ return error_msg
357
+
358
+ def save_python_file(self, filename: str, code: str) -> str:
359
+ """
360
+ Save Python code to a file in the base directory.
361
+
362
+ Args:
363
+ filename (str): Name of the Python file
364
+ code (str): Python code content
365
+
366
+ Returns:
367
+ str: Success/failure message
368
+ """
369
+ try:
370
+ if not filename.endswith('.py'):
371
+ filename += '.py'
372
+
373
+ filepath = self.base_dir / filename
374
+
375
+ # Heal the code before saving
376
+ healed_code = self._heal_python_code(code)
377
+
378
+ with open(filepath, 'w', encoding='utf-8') as f:
379
+ f.write(healed_code)
380
+
381
+ logger.info(f"Python file saved: {filename}")
382
+ return f"Python file '{filename}' saved successfully to {self.base_dir}"
383
+
384
+ except Exception as e:
385
+ error_msg = f"Error saving Python file '{filename}': {str(e)}"
386
+ logger.error(error_msg)
387
+ return error_msg
388
+
389
+ def list_python_files(self) -> str:
390
+ """
391
+ List all Python files in the base directory.
392
+
393
+ Returns:
394
+ str: List of Python files
395
+ """
396
+ try:
397
+ python_files = list(self.base_dir.glob("*.py"))
398
+
399
+ if not python_files:
400
+ return "No Python files found in the base directory"
401
+
402
+ file_list = []
403
+ for file_path in python_files:
404
+ file_stat = file_path.stat()
405
+ file_info = f"{file_path.name} ({file_stat.st_size} bytes, modified: {time.ctime(file_stat.st_mtime)})"
406
+ file_list.append(file_info)
407
+
408
+ return "Python files in base directory:\n" + "\n".join(file_list)
409
+
410
+ except Exception as e:
411
+ error_msg = f"Error listing Python files: {str(e)}"
412
+ logger.error(error_msg)
413
+ return error_msg
414
+
415
+ def validate_python_syntax(self, code: str) -> str:
416
+ """
417
+ Validate Python code syntax without executing it.
418
+
419
+ Args:
420
+ code (str): Python code to validate
421
+
422
+ Returns:
423
+ str: Validation result message
424
+ """
425
+ try:
426
+ # Parse the code to check for syntax errors
427
+ ast.parse(code)
428
+ return "Python syntax is valid"
429
+
430
+ except SyntaxError as e:
431
+ error_msg = f"Syntax Error at line {e.lineno}: {e.msg}"
432
+ logger.warning(f"Python syntax validation failed: {error_msg}")
433
+ return error_msg
434
+
435
+ except Exception as e:
436
+ error_msg = f"Error validating Python syntax: {str(e)}"
437
+ logger.error(error_msg)
438
+ return error_msg
439
+
440
+ def get_base_directory(self) -> str:
441
+ """
442
+ Get the current base directory path.
443
+
444
+ Returns:
445
+ str: Absolute path of the base directory
446
+ """
447
+ return str(self.base_dir.absolute())
448
+
449
+ def clear_temp_files(self) -> str:
450
+ """
451
+ Clean up any temporary Python files in the base directory.
452
+
453
+ Returns:
454
+ str: Cleanup result message
455
+ """
456
+ try:
457
+ temp_files = list(self.base_dir.glob("temp_script_*.py"))
458
+
459
+ if not temp_files:
460
+ return "No temporary files to clean up"
461
+
462
+ for temp_file in temp_files:
463
+ temp_file.unlink()
464
+
465
+ return f"Cleaned up {len(temp_files)} temporary Python files"
466
+
467
+ except Exception as e:
468
+ error_msg = f"Error cleaning up temporary files: {str(e)}"
469
+ logger.error(error_msg)
470
+ return error_msg
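A quick standalone sketch of the recovery regex used above (the helper name is illustrative, not part of the toolkit):

import re
from typing import Optional

def missing_module(error_text: str) -> Optional[str]:
    # Same pattern as _attempt_error_recovery: pull the module name out of a
    # message like "ModuleNotFoundError: No module named 'openpyxl'".
    match = re.search(r"No module named '([^']+)'", error_text)
    return match.group(1) if match else None

assert missing_module("ModuleNotFoundError: No module named 'openpyxl'") == "openpyxl"
assert missing_module("SyntaxError: invalid syntax") is None

Note that a module name is not always its pip package name (e.g. cv2 ships as opencv-python), which is presumably why _extract_required_packages maps imports to pip packages before installing.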
utils/shell_toolkit.py ADDED
@@ -0,0 +1,137 @@
1
+ """
2
+ Custom Shell Toolkit with Base Directory Support
3
+
4
+ This toolkit provides shell command execution constrained to a specific base directory,
5
+ preventing agents from navigating outside their assigned working directory.
6
+ """
7
+
8
+ import os
9
+ import subprocess
10
+ from pathlib import Path
11
+ from typing import List, Optional
12
+ from agno.tools import Toolkit
13
+ from agno.utils.log import logger
14
+
15
+
16
+ class RestrictedShellTools(Toolkit):
17
+ """
18
+ Shell toolkit that restricts command execution to a specific base directory.
19
+
20
+ This ensures agents cannot navigate outside their assigned working directory,
21
+ solving the issue of files being saved in wrong locations.
22
+ """
23
+
24
+ def __init__(self, base_dir: Optional[Path] = None, **kwargs):
25
+ """
26
+ Initialize the restricted shell toolkit.
27
+
28
+ Args:
29
+ base_dir: Base directory to constrain all shell operations to
30
+ **kwargs: Additional arguments passed to parent Toolkit
31
+ """
32
+ self.base_dir = Path(base_dir) if base_dir else Path.cwd()
33
+
34
+ # Ensure base directory exists
35
+ self.base_dir.mkdir(parents=True, exist_ok=True)
36
+
37
+ # Initialize toolkit with our shell command function
38
+ super().__init__(
39
+ name="restricted_shell_tools",
40
+ tools=[self.run_shell_command],
41
+ **kwargs
42
+ )
43
+
44
+ logger.info(f"RestrictedShellTools initialized with base_dir: {self.base_dir}")
45
+
46
+ def run_shell_command(self, command: str, timeout: int = 30) -> str:
47
+ """
48
+ Runs a shell command in the constrained base directory.
49
+
50
+ Args:
51
+ command (str): The shell command to execute
52
+ timeout (int): Maximum execution time in seconds
53
+
54
+ Returns:
55
+ str: The output of the command or error message
56
+ """
57
+ try:
58
+ # Log the command and working directory
59
+ logger.info(f"Executing shell command in {self.base_dir}: {command}")
60
+
61
+ # Ensure we're working in the correct directory
62
+ original_cwd = os.getcwd()
63
+
64
+ try:
65
+ # Change to base directory before executing command
66
+ os.chdir(self.base_dir)
67
+
68
+ # Execute the command in the base directory
69
+ result = subprocess.run(
70
+ command,
71
+ shell=True,
72
+ capture_output=True,
73
+ text=True,
74
+ timeout=timeout,
75
+ cwd=str(self.base_dir) # Explicitly set working directory
76
+ )
77
+
78
+ # Log execution details
79
+ logger.debug(f"Command executed with return code: {result.returncode}")
80
+
81
+ if result.returncode != 0:
82
+ error_msg = f"Command failed with return code {result.returncode}\nSTDERR: {result.stderr}\nSTDOUT: {result.stdout}"
83
+ logger.warning(error_msg)
84
+ return error_msg
85
+
86
+ # Return successful output
87
+ output = result.stdout.strip()
88
+ logger.debug(f"Command output: {output[:200]}{'...' if len(output) > 200 else ''}")
89
+ return output
90
+
91
+ finally:
92
+ # Always restore original working directory
93
+ os.chdir(original_cwd)
94
+
95
+ except subprocess.TimeoutExpired:
96
+ error_msg = f"Command timed out after {timeout} seconds: {command}"
97
+ logger.error(error_msg)
98
+ return error_msg
99
+
100
+ except Exception as e:
101
+ error_msg = f"Error executing command '{command}': {str(e)}"
102
+ logger.error(error_msg)
103
+ return error_msg
104
+
105
+ def get_current_directory(self) -> str:
106
+ """
107
+ Returns the current base directory path.
108
+
109
+ Returns:
110
+ str: Absolute path of the base directory
111
+ """
112
+ return str(self.base_dir.absolute())
113
+
114
+ def list_directory_contents(self) -> str:
115
+ """
116
+ Lists the contents of the base directory.
117
+
118
+ Returns:
119
+ str: Directory listing
120
+ """
121
+ return self.run_shell_command("ls -la")
122
+
123
+ def check_file_exists(self, filename: str) -> str:
124
+ """
125
+ Checks if a file exists in the base directory.
126
+
127
+ Args:
128
+ filename (str): Name of the file to check
129
+
130
+ Returns:
131
+ str: Result of the check
132
+ """
133
+ file_path = self.base_dir / filename
134
+ if file_path.exists():
135
+ return f"File '{filename}' exists in {self.base_dir}"
136
+ else:
137
+ return f"File '{filename}' does not exist in {self.base_dir}"
workflow/financial_workflow.py CHANGED
@@ -1,360 +1,349 @@
1
  """
2
- Financial Document Analysis Workflow using Agno Workflows
3
- Clean, pure-python implementation with structured outputs to avoid JSON parsing issues
4
  """
5
 
6
  import json
 
7
  from pathlib import Path
8
- from typing import Dict, List, Optional, Iterator
9
- from pydantic import BaseModel, Field
 
10
 
11
- from agno.agent import Agent, RunResponse
12
- from agno.models.google import Gemini
13
- from agno.media import File
14
  from agno.tools.file import FileTools
 
15
  from agno.tools.python import PythonTools
16
- from agno.workflow import Workflow
 
 
 
17
  from agno.utils.log import logger
18
- from agno.tools.shell import ShellTools
 
19
  from config.settings import settings
20
  from utils.prompt_loader import prompt_loader
 
 
21
 
22
 
23
- # Structured Output Models to avoid JSON parsing issues
24
  class DataPoint(BaseModel):
25
- """Individual financial data point"""
26
- field_name: str = Field(..., description="Name of the financial data field")
27
- value: str = Field(..., description="Value of the field")
28
- category: str = Field(..., description="Financial category (revenue, expenses, assets, etc.)")
29
  period: str = Field(default="", description="Time period if applicable")
30
  unit: str = Field(default="", description="Currency or measurement unit")
31
  confidence: float = Field(default=0.9, description="Confidence score 0-1")
32
 
33
- class ExtractedFinancialData(BaseModel):
34
- """Structured output for data extraction phase"""
35
- company_name: str = Field(default="", description="Company name")
36
- document_type: str = Field(..., description="Type of financial document")
37
- reporting_period: str = Field(default="", description="Reporting period")
38
- data_points: List[DataPoint] = Field(..., description="All extracted financial data points")
39
- summary: str = Field(..., description="Brief summary of extracted data")
40
 
41
- class FinancialCategory(BaseModel):
42
- """A category of organized financial data"""
43
- category_name: str = Field(..., description="Name of the financial category")
44
- description: str = Field(..., description="Description of what this category contains")
45
- data_items: Dict[str, str] = Field(..., description="Key-value pairs of financial data")
46
- totals: Dict[str, str] = Field(default_factory=dict, description="Any calculated totals")
47
 
48
- class ArrangedFinancialData(BaseModel):
49
- """Structured output for data arrangement phase"""
50
- categories: List[FinancialCategory] = Field(..., description="Organized financial categories")
51
- key_metrics: Dict[str, str] = Field(default_factory=dict, description="Key financial metrics")
52
- insights: List[str] = Field(default_factory=list, description="Financial insights and analysis")
53
- summary: str = Field(..., description="Summary of arranged data")
54
 
55
- class GeneratedCode(BaseModel):
56
- """Structured output for code generation phase"""
57
- code: str = Field(..., description="Generated Python code for Excel creation")
58
- description: str = Field(..., description="Description of what the code does")
59
- output_filename: str = Field(..., description="Expected output filename")
60
- execution_notes: str = Field(default="", description="Notes about code execution")
61
 
62
 
63
  class FinancialDocumentWorkflow(Workflow):
64
  """
65
- Pure Python workflow for financial document analysis
66
- Uses structured outputs to eliminate JSON parsing issues
67
- """
68
-
69
- description: str = "Financial document analysis workflow with data extraction, organization, and Excel generation"
70
 
71
- # Data Extractor Agent - Structured output eliminates JSON parsing issues
72
- data_extractor: Agent = Agent(
73
- model=Gemini(id=settings.DATA_EXTRACTOR_MODEL,thinking_budget=settings.DATA_EXTRACTOR_MODEL_THINKING_BUDGET,api_key=settings.GOOGLE_API_KEY),
74
- description="Expert financial data extraction specialist",
75
- instructions=prompt_loader.load_instructions_as_list("agents/data_extractor"),
76
- response_model=ExtractedFinancialData,
77
- structured_outputs=True,
78
- debug_mode=True,
79
- )
80
 
81
- # Data Arranger Agent - Organizes data into categories for Excel
82
- data_arranger: Agent = Agent(
83
- model=Gemini(id=settings.DATA_ARRANGER_MODEL,thinking_budget=settings.DATA_ARRANGER_MODEL_THINKING_BUDGET,api_key=settings.GOOGLE_API_KEY),
84
- description="Financial data organization and analysis expert",
85
- instructions=prompt_loader.load_instructions_as_list("agents/data_arranger"),
86
- tools=[FileTools()], # FileTools for saving arranged data
87
- # NOTE: Cannot use structured_outputs with tools in Gemini - choosing tools over structured outputs
88
- markdown=True,
89
- debug_mode=True,
90
- add_memory_references=True,
91
- add_session_summary_references=True,
92
- exponential_backoff=True,
93
- retries=10,
94
- )
95
-
96
- # Code Generator Agent - Creates Excel generation code
97
- code_generator = Agent(
98
- model=Gemini(
99
- id=settings.CODE_GENERATOR_MODEL,
100
- api_key=settings.GOOGLE_API_KEY
101
- ),
102
- description="Excel report generator that analyzes JSON data and creates formatted workbooks using shell execution on any OS",
103
- goal="Generate a professional Excel report from arranged_financial_data.json with multiple worksheets, formatting, and charts",
104
- instructions=prompt_loader.load_instructions_as_list("agents/code_generator"),
105
- expected_output="A Financial_Report_YYYYMMDD_HHMMSS.xlsx file containing formatted data from the JSON with multiple worksheets, professional styling, and relevant charts",
106
- additional_context="This agent must work on Windows, Mac, and Linux. Always use os.path for file operations and handle path separators correctly. Include proper error handling for cross-platform compatibility.",
107
- tools=[
108
- ShellTools(),
109
- FileTools(save_files=True, read_files=True, list_files=True),
110
- PythonTools(pip_install=True, save_and_run=False, run_code=False)
111
- ],
112
- markdown=False,
113
- show_tool_calls=True,
114
- debug_mode=True,
115
- retries=10,
116
- add_datetime_to_instructions=True,
117
- delay_between_retries=10
118
- )
119
-
120
- def __init__(self, session_id: str = None, **kwargs):
121
- super().__init__(session_id=session_id, **kwargs)
122
- self.session_id = session_id or f"financial_workflow_{int(__import__('time').time())}"
123
- self.session_output_dir = Path(settings.TEMP_DIR) / self.session_id / "output"
124
- self.session_input_dir = Path(settings.TEMP_DIR) / self.session_id / "input"
125
- self.session_temp_dir = Path(settings.TEMP_DIR) / self.session_id / "temp"
126
 
127
- # Create all session directories
128
- self.session_output_dir.mkdir(parents=True, exist_ok=True)
129
- self.session_input_dir.mkdir(parents=True, exist_ok=True)
130
- self.session_temp_dir.mkdir(parents=True, exist_ok=True)
131
 
132
- # Configure tools with correct base directories after initialization
133
- self._configure_agent_tools()
 
 
 
 
 
134
 
135
- logger.info(f"FinancialDocumentWorkflow initialized with session: {self.session_id}")
 
 
 
136
 
137
- def clear_cache(self):
138
- """Clear workflow session cache and temporary files."""
139
- try:
140
- # Clear session state
141
- self.session_state.clear()
142
- logger.info(f"Cleared workflow cache for session: {self.session_id}")
143
-
144
- # Clean up temporary files (keep input and output)
145
- if self.session_temp_dir.exists():
146
- import shutil
147
- try:
148
- shutil.rmtree(self.session_temp_dir)
149
- self.session_temp_dir.mkdir(parents=True, exist_ok=True)
150
- logger.info(f"Cleaned temporary files for session: {self.session_id}")
151
- except Exception as e:
152
- logger.warning(f"Could not clean temp directory: {e}")
153
-
154
- except Exception as e:
155
- logger.error(f"Error clearing workflow cache: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- def cleanup_session(self):
158
- """Complete cleanup of session including all files."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  try:
160
- # Clear cache first
161
- self.clear_cache()
162
 
163
- # Remove entire session directory
164
- session_dir = Path(settings.TEMP_DIR) / self.session_id
165
- if session_dir.exists():
166
- import shutil
167
- try:
168
- shutil.rmtree(session_dir)
169
- logger.info(f"Completely removed session directory: {session_dir}")
170
- except Exception as e:
171
- logger.warning(f"Could not remove session directory: {e}")
172
-
173
  except Exception as e:
174
- logger.error(f"Error during session cleanup: {e}")
175
-
176
- def _configure_agent_tools(self):
177
- """Configure agent tools with the correct base directories"""
178
- # Configure data arranger's FileTools with session output directory
179
- if hasattr(self.data_arranger, 'tools') and self.data_arranger.tools:
180
- for tool in self.data_arranger.tools:
181
- if isinstance(tool, FileTools):
182
- tool.base_dir = self.session_output_dir
183
-
184
- # Configure code generator's tools with session output directory
185
- if hasattr(self.code_generator, 'tools') and self.code_generator.tools:
186
- for tool in self.code_generator.tools:
187
- if isinstance(tool, FileTools):
188
- tool.base_dir = self.session_output_dir
189
- elif isinstance(tool, PythonTools):
190
- tool.base_dir = self.session_output_dir
191
-
192
- def run(self, file_path: str = None, **kwargs) -> RunResponse:
193
  """
194
- Main workflow execution method
195
- Pure Python workflow execution - no streaming, no JSON parsing issues
196
  """
197
- # Handle file_path from parameter or attribute
198
  if file_path is None:
199
- file_path = getattr(self, 'file_path', None)
200
 
201
  if file_path is None:
202
- raise ValueError("file_path must be provided either as parameter or set as attribute")
203
-
204
- logger.info(f"Processing financial document: {file_path}")
205
 
206
- # Remove use_cache parameter since it's not defined in the method signature
207
- use_cache = kwargs.get('use_cache', True)
208
-
209
- # Check cache first if enabled
210
- if use_cache and "final_results" in self.session_state:
211
- logger.info("Returning cached results")
212
- return RunResponse(
213
- run_id=self.run_id,
214
- content=self.session_state["final_results"]
215
- )
216
 
217
  try:
218
- # Step 1: Extract Financial Data
219
- logger.info("Step 1: Extracting financial data...")
 
 
 
220
 
221
- # Check for cached extraction
222
- if use_cache and "extracted_data" in self.session_state:
223
- extracted_data = ExtractedFinancialData.model_validate(
224
- self.session_state["extracted_data"]
225
- )
226
- logger.info("Using cached extraction data")
227
- else:
228
- document = File(filepath=file_path)
229
- extraction_prompt = prompt_loader.load_prompt("workflow/data_extraction", file_path=file_path)
230
-
231
- extraction_response: RunResponse = self.data_extractor.run(
232
- extraction_prompt,
233
- files=[document]
234
- )
235
- extracted_data: ExtractedFinancialData = extraction_response.content
236
-
237
- # Cache the result
238
- self.session_state["extracted_data"] = extracted_data.model_dump()
239
- logger.info(f"Extracted {len(extracted_data.data_points)} data points")
240
 
241
- # Step 2: Arrange and Organize Data
242
- logger.info("Step 2: Organizing financial data...")
243
 
244
- if use_cache and "arrangement_response" in self.session_state:
245
- arrangement_content = self.session_state["arrangement_response"]
246
- logger.info("Using cached arrangement data")
247
- else:
248
- # Debug: Check extracted data before passing to prompt
249
- extracted_json = extracted_data.model_dump_json(indent=2)
250
- logger.debug(f"Extracted data size: {len(extracted_json)} characters")
251
- logger.debug(f"First 200 chars of extracted data: {extracted_json[:200]}...")
252
-
253
- arrangement_prompt = prompt_loader.load_prompt("workflow/data_arrangement",
254
- extracted_data=extracted_json)
255
-
256
- # Debug: Check if prompt contains the actual data or just the placeholder
257
- if "{extracted_data}" in arrangement_prompt:
258
- logger.error("CRITICAL: Variable substitution failed! Prompt still contains {extracted_data} placeholder")
259
- logger.error(f"Prompt length: {len(arrangement_prompt)}")
260
- else:
261
- logger.info(f"Variable substitution successful. Prompt length: {len(arrangement_prompt)}")
262
-
263
- arrangement_response: RunResponse = self.data_arranger.run(arrangement_prompt)
264
- arrangement_content = arrangement_response.content
265
-
266
- # Cache the result
267
- self.session_state["arrangement_response"] = arrangement_content
268
- logger.info("Data organization completed - check output directory for arranged_financial_data.json")
269
 
270
- # Step 3: Generate and Execute Excel Code
271
- logger.info("Step 3: Generating and executing Excel code...")
272
-
273
- if use_cache and "code_generation_response" in self.session_state:
274
- code_generation_content = self.session_state["code_generation_response"]
275
- execution_success = self.session_state.get("execution_success", False)
276
- logger.info("Using cached code generation results")
277
- else:
278
- code_prompt = prompt_loader.load_prompt("workflow/code_generation")
279
-
280
- code_response: RunResponse = self.code_generator.run(code_prompt)
281
- code_generation_content = code_response.content
282
-
283
- # Simple check for execution success based on response content
284
- execution_success = (
285
- "error" not in code_generation_content.lower() or
286
- "success" in code_generation_content.lower() or
287
- "completed" in code_generation_content.lower()
288
- )
289
-
290
- # Cache the results
291
- self.session_state["code_generation_response"] = code_generation_content
292
- self.session_state["execution_success"] = execution_success
293
-
294
- logger.info(f"Code generation and execution completed: {'βœ… Success' if execution_success else '❌ Failed'}")
295
 
296
- # Prepare final results
297
- # List actual output files
298
- output_files = []
299
- if self.session_output_dir.exists():
300
- output_files = [f.name for f in self.session_output_dir.iterdir() if f.is_file()]
 
 
301
 
302
- results_summary = f"""
303
- # Financial Document Analysis Complete
304
-
305
- ## Document Information
306
- - **Company**: {extracted_data.company_name or 'Not specified'}
307
- - **Document Type**: {extracted_data.document_type}
308
- - **Reporting Period**: {extracted_data.reporting_period or 'Not specified'}
309
-
310
- ## Processing Summary
311
- - **Data Points Extracted**: {len(extracted_data.data_points)}
312
- - **Data Organization**: {'βœ… Completed' if arrangement_content else '❌ Failed'}
313
- - **Excel Creation**: {'βœ… Success' if execution_success else '❌ Failed'}
314
-
315
- ## Data Organization Results
316
- {arrangement_content[:500] + '...' if arrangement_content and len(arrangement_content) > 500 else arrangement_content or 'No arrangement data available'}
317
-
318
- ## Tool Execution Summary
319
- **Data Arranger**: Used FileTools to save organized data to JSON
320
- **Code Generator**: Used PythonTools and FileTools for Excel generation
321
-
322
- ## Code Generation Results
323
- {code_generation_content[:500] + '...' if code_generation_content and len(code_generation_content) > 500 else code_generation_content or 'No code generation results available'}
324
-
325
- ## Generated Files ({len(output_files)} files)
326
- {chr(10).join(f"- **{file}**" for file in output_files) if output_files else "- No files generated"}
327
-
328
- ## Output Directory
329
- πŸ“ `{self.session_output_dir}`
330
-
331
- ---
332
- *Generated using Agno Workflows with FileTools and PythonTools integration*
333
- *Note: Due to Gemini limitations, structured outputs were used for data extraction only*
334
- """
335
 
336
- # Cache final results
337
- self.session_state["final_results"] = results_summary
338
 
339
- return RunResponse(
340
- run_id=self.run_id,
341
- content=results_summary
342
- )
343
 
344
  except Exception as e:
345
- error_message = f"❌ Workflow failed: {str(e)}"
346
- logger.error(f"Financial workflow error: {e}", exc_info=True)
347
- return RunResponse(
348
- run_id=self.run_id,
349
- content=error_message
350
- )
351
-
352
- def get_processing_status(self) -> Dict[str, str]:
353
- """Get the current processing status"""
354
  status = {
355
- "extraction": "completed" if "extracted_data" in self.session_state else "pending",
356
- "arrangement": "completed" if "arranged_data" in self.session_state else "pending",
357
- "code_generation": "completed" if "generated_code" in self.session_state else "pending",
358
- "final_results": "completed" if "final_results" in self.session_state else "pending"
 
359
  }
360
- return status
1
  """
2
+ Financial Document Analysis Workflow - Agno Workflow 2.0 Implementation (Fixed)
3
+
4
+ This workflow processes financial documents through a multi-agent system using the new
5
+ step-based architecture introduced in Agno Workflow 2.0:
6
+ 1. Data Extractor Agent: Extracts structured financial data
7
+ 2. Data Arrangement Function: Organizes data into Excel-ready format
8
+ 3. Code Generator Agent: Creates professional Excel reports
9
+
10
+ Built according to Agno Workflow 2.0 standards with simple sequential execution.
11
  """
12
 
13
  import json
14
+ import time
15
  from pathlib import Path
16
+ from typing import Optional, Dict, Any
17
+ from textwrap import dedent
18
+ import os
19
 
20
+ from agno.agent import Agent
21
+ from agno.models.google import Gemini
 
22
  from agno.tools.file import FileTools
23
+ from agno.tools.shell import ShellTools
24
  from agno.tools.python import PythonTools
25
+ from agno.workflow.v2.workflow import Workflow
26
+ from agno.workflow.v2.types import StepInput, StepOutput
27
+ from agno.workflow.v2.step import Step
28
+ from agno.storage.sqlite import SqliteStorage
29
  from agno.utils.log import logger
30
+ from pydantic import BaseModel, Field
31
+
32
  from config.settings import settings
33
  from utils.prompt_loader import prompt_loader
34
+ from utils.shell_toolkit import RestrictedShellTools
35
+ from utils.restricted_python_tools import RestrictedPythonTools
36
 
37
 
 
38
  class DataPoint(BaseModel):
39
+ """Individual financial data point."""
40
+ field_name: str = Field(description="Name of the financial data field")
41
+ value: str = Field(description="Value of the field")
42
+ category: str = Field(description="Financial category (revenue, expenses, assets, etc.)")
43
  period: str = Field(default="", description="Time period if applicable")
44
  unit: str = Field(default="", description="Currency or measurement unit")
45
  confidence: float = Field(default=0.9, description="Confidence score 0-1")
46
 
 
 
 
 
 
 
 
47
 
48
+ class Metadata(BaseModel):
49
+ """Metadata for extracted financial data."""
50
+ company_name: str = Field(default="Unknown Company", description="Company name")
51
+ document_type: str = Field(default="Unknown", description="Type of financial document")
52
+ reporting_period: str = Field(default="", description="Reporting period")
53
+ currency: str = Field(default="", description="Primary currency used")
54
 
 
 
 
 
 
 
55
 
56
+ class ExtractedFinancialData(BaseModel):
57
+ """Structured model for extracted financial data."""
58
+ data_points: list[DataPoint] = Field(description="List of extracted financial data points")
59
+ summary: str = Field(description="Summary of the extracted data")
60
+ metadata: Metadata = Field(default_factory=Metadata, description="Additional metadata")
 
61
 
62
 
63
  class FinancialDocumentWorkflow(Workflow):
64
  """
65
+ Financial document analysis workflow using Agno Workflow 2.0 step-based architecture.
 
 
 
 
66
 
67
+ This workflow processes financial documents through three specialized steps:
68
+ - Data extraction with structured outputs
69
+ - Data arrangement for Excel compatibility
70
+ - Excel report generation with formatting
71
+ """
 
 
 
 
72
 
73
+ def __init__(self, session_id: Optional[str] = None, **kwargs):
74
+ """Initialize workflow with session management and step-based architecture."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
+ # Initialize session directories first
77
+ self._setup_session_directories(session_id)
 
 
78
 
79
+ # Create storage with auto schema upgrade
80
+ storage = SqliteStorage(
81
+ table_name="financial_workflows",
82
+ db_file="tmp/agno_workflows.db",
83
+ mode="workflow_v2", # Use workflow_v2 mode
84
+ auto_upgrade_schema=True  # Auto-upgrade legacy table schemas
85
+ )
86
 
87
+ # Create agents for the workflow
88
+ self.data_extractor = self._create_data_extractor()
89
+ self.data_arranger = self._create_data_arranger()
90
+ self.code_generator = self._create_code_generator()
91
 
92
+ # Create steps using Step objects for better tracking
93
+ data_extraction_step = Step(
94
+ name="FinancialDataExtractor",
95
+ agent=self.data_extractor,
96
+ description="Expert financial data extraction specialist optimized for Gemini"
97
+ )
98
+
99
+ data_arrangement_step = Step(
100
+ name="DataArrangement",
101
+ executor=self._arrangement_function,
102
+ description="User-defined callable step for data arrangement"
103
+ )
104
+
105
+ excel_generation_step = Step(
106
+ name="ExcelReportGenerator",
107
+ agent=self.code_generator,
108
+ description="Excel report generator optimized for Gemini with cross-platform support"
109
+ )
110
+
111
+ # Initialize the Workflow 2.0 with step-based architecture
112
+ super().__init__(
113
+ name="FinancialDocumentWorkflow",
114
+ description=dedent("""\
115
+ Financial document analysis workflow using Agno Workflow 2.0 with step-based execution.
116
+ Processes financial documents through extraction, arrangement, and Excel report generation.
117
+ Uses session state for caching and proper error recovery mechanisms.
118
+ """),
119
+ steps=[
120
+ data_extraction_step,
121
+ data_arrangement_step,
122
+ excel_generation_step
123
+ ],
124
+ session_id=session_id,
125
+ storage=storage,  # Persist workflow session state
126
+ debug_mode=True,
127
+ **kwargs
128
+ )
129
+
130
+ logger.info(f"FinancialDocumentWorkflow v2.0 initialized with session: {self.session_id}")
131
+ logger.info(f"Session directories: {list(self.session_directories.keys())}")
132
+
133
+ def _setup_session_directories(self, session_id: Optional[str] = None):
134
+ """Setup session-specific directories."""
135
+ self.session_id = session_id
136
+ self.session_directories = settings.create_session_directories(self.session_id)
137
+ self.session_output_dir = self.session_directories["output"]
138
+ self.session_input_dir = self.session_directories["input"]
139
+ self.session_temp_dir = self.session_directories["temp"]
140
+ self.session_cache_dir = self.session_directories["cache"]
141
+
142
+ def _create_data_extractor(self) -> Agent:
143
+ """Create the data extraction agent."""
144
+ return Agent(
145
+ model=Gemini(
146
+ id=settings.DATA_EXTRACTOR_MODEL,
147
+ thinking_budget=settings.DATA_EXTRACTOR_MODEL_THINKING_BUDGET,
148
+ api_key=settings.GOOGLE_API_KEY
149
+ ),
150
+ name="FinancialDataExtractor",
151
+ description="Expert financial data extraction specialist optimized for Gemini",
152
+ instructions=prompt_loader.load_instructions_as_list("agents/data_extractor"),
153
+ response_model=ExtractedFinancialData,
154
+ structured_outputs=True,
155
+ debug_mode=True,
156
+ retries=10,
157
+ delay_between_retries=10,
158
+ exponential_backoff=True,
159
+ )
160
 
161
+ def _create_data_arranger(self) -> Agent:
162
+ """Create the data arrangement agent."""
163
+ logger.info(f"Data arranger base directory: {self.session_output_dir}")
164
+ logger.info(f"Directory exists: {self.session_output_dir.exists()}")
165
+ logger.info(f"Directory is writable: {os.access(self.session_output_dir, os.W_OK)}")
166
+ return Agent(
167
+ model=Gemini(
168
+ id=settings.DATA_ARRANGER_MODEL,
169
+ thinking_budget=settings.DATA_ARRANGER_MODEL_THINKING_BUDGET,
170
+ api_key=settings.GOOGLE_API_KEY
171
+ ),
172
+ name="FinancialDataArranger",
173
+ description="Financial data organization specialist optimized for Gemini",
174
+ instructions=prompt_loader.load_instructions_as_list("agents/data_arranger"),
175
+ tools=[
176
+ RestrictedShellTools(base_dir=self.session_output_dir),
177
+ FileTools(base_dir=self.session_output_dir, save_files=True, read_files=True, list_files=True),
178
+ ],
179
+ markdown=False,
180
+ debug_mode=True,
181
+ add_memory_references=True,
182
+ add_session_summary_references=True,
183
+ retries=10,
184
+ delay_between_retries=10,
185
+ exponential_backoff=True,
186
+ )
187
+
188
+ def _create_code_generator(self) -> Agent:
189
+ """Create the code generation agent."""
190
+ return Agent(
191
+ model=Gemini(
192
+ id=settings.CODE_GENERATOR_MODEL,
193
+ thinking_budget=settings.CODE_GENERATOR_MODEL_THINKING_BUDGET,
194
+ api_key=settings.GOOGLE_API_KEY
195
+ ),
196
+ name="ExcelReportGenerator",
197
+ description="Excel report generator optimized for Gemini with cross-platform support",
198
+ goal="Generate professional Excel reports from arranged financial data with multiple worksheets and formatting",
199
+ instructions=prompt_loader.load_instructions_as_list("agents/code_generator"),
200
+ expected_output="A professionally formatted Excel file with multiple worksheets, charts, and proper styling",
201
+ additional_context=f"Working directory: {self.session_output_dir}. All files must be saved in this directory only.",
202
+ tools=[
203
+ RestrictedShellTools(base_dir=self.session_output_dir),
204
+ RestrictedPythonTools(base_dir=self.session_output_dir),
205
+ FileTools(base_dir=self.session_output_dir, save_files=True, read_files=True, list_files=True)
206
+ ],
207
+ markdown=False,
208
+ show_tool_calls=True,
209
+ debug_mode=True,
210
+ add_datetime_to_instructions=True,
211
+ retries=10,
212
+ delay_between_retries=10,
213
+ exponential_backoff=True,
214
+ )
215
+
216
+ def _arrangement_function(self, step_input: StepInput) -> StepOutput:
217
+ """Custom function for data arrangement step."""
218
  try:
219
+ message = step_input.message
220
+ previous_step_content = step_input.previous_step_content
221
+
222
+ logger.info("Starting data arrangement step")
223
+
224
+ # Load the base arrangement prompt
225
+ arrangement_prompt = prompt_loader.load_prompt("workflow/data_arrangement")
226
+
227
+ # Combine prompt with extracted data from previous step
228
+ full_arrangement_prompt = f"{arrangement_prompt}\n\nHere is the extracted financial data to arrange:\n\n{previous_step_content}"
229
+
230
+ # Run data arrangement using the agent
231
+ response = self.data_arranger.run(full_arrangement_prompt)
232
+
233
+ # Cache the arrangement results in workflow session state
234
+ if hasattr(self, 'session_state') and self.session_state:
235
+ cache_key = f"arrangement_{int(time.time())}"
236
+ self.session_state[cache_key] = response.content
237
+ logger.info(f"Cached arrangement results with key: {cache_key}")
238
+
239
+ logger.info("Data arrangement completed successfully")
240
+
241
+ return StepOutput(
242
+ content=response.content,
243
+ response=response,
244
+ success=True
245
+ )
246

247
  except Exception as e:
248
+ logger.error(f"Data arrangement failed: {str(e)}")
249
+ return StepOutput(
250
+ content=f"Data arrangement failed: {str(e)}",
251
+ success=False,
252
+ )
253
+
254
+ def run(self, file_path: Optional[str] = None, **kwargs):
255
  """
256
+ Main workflow execution using Workflow 2.0 step-based architecture.
257
+
258
+ Args:
259
+ file_path: Path to the financial document to process
260
+ **kwargs: Additional parameters
261
+
262
+ Returns:
263
+ Workflow execution result using the new step-based system
264
  """
265
+ # Handle file_path from kwargs if not provided as positional
266
  if file_path is None:
267
+ file_path = kwargs.get('file_path')
268
 
269
  if file_path is None:
270
+ logger.error("file_path is required but not provided")
271
+ raise ValueError("file_path is required but not provided")
 
272
 
273
+ start_time = time.time()
274
 
275
  try:
276
+ # Validate input file
277
+ file_path = Path(file_path).resolve()
278
+ if not file_path.exists():
279
+ logger.error(f"File not found: {file_path}")
280
+ raise FileNotFoundError(f"File not found: {file_path}")
281
 
282
+ # Copy input file to session directory for reference
283
+ input_file = self.session_input_dir / file_path.name
284
+ input_file.write_bytes(file_path.read_bytes())
285
 
286
+ logger.info(f"Starting financial document analysis for: {file_path.name}")
 
287
 
288
+ # Create File object for direct upload to Gemini API (for first step)
289
+ from agno.media import File
290
+ document = File(filepath=str(file_path))
291
 
292
+ # Load extraction prompt for the first step
293
+ extraction_prompt = prompt_loader.load_prompt(
294
+ "workflow/data_extraction",
295
+ file_path=str(file_path),
296
+ output_directory=str(self.session_output_dir)
297
+ )
298
 
299
+ # Execute the workflow using the new 2.0 step-based system
300
+ # Pass the extraction prompt as the message and include the file
301
+ result = super().run(
302
+ message=extraction_prompt,
303
+ files=[document],
304
+ **kwargs
305
+ )
306
 
307
+ # Final status
308
+ execution_time = time.time() - start_time
309
+ status = self._get_workflow_status()
310
 
311
+ logger.info(f"Workflow completed successfully in {execution_time:.2f} seconds")
312
+ logger.info(f"Results: {status}")
313
 
314
+ return result
 
 
 
315
 
316
  except Exception as e:
317
+ logger.error(f"Workflow execution failed: {str(e)}")
318
+ raise
319
+
320
+ def _get_workflow_status(self) -> Dict[str, Any]:
321
+ """Get current workflow status and file counts."""
 
 
 
 
322
  status = {
323
+ "session_id": self.session_id,
324
+ "output_directory": str(self.session_output_dir),
325
+ "json_files": 0,
326
+ "excel_files": 0,
327
+ "data_points": 0
328
  }
329
+
330
+ if self.session_output_dir.exists():
331
+ status["json_files"] = len(list(self.session_output_dir.glob("*.json")))
332
+ status["excel_files"] = len(list(self.session_output_dir.glob("*.xlsx")))
333
+
334
+ return status
335
+
336
+
337
+ # Compatibility function to maintain the same interface as the original workflow
338
+ def create_financial_workflow(session_id: Optional[str] = None, **kwargs) -> FinancialDocumentWorkflow:
339
+ """
340
+ Create a new FinancialDocumentWorkflow instance using Workflow 2.0.
341
+
342
+ Args:
343
+ session_id: Optional session ID for tracking workflow execution
344
+ **kwargs: Additional parameters for workflow configuration
345
+
346
+ Returns:
347
+ FinancialDocumentWorkflow: Configured workflow instance
348
+ """
349
+ return FinancialDocumentWorkflow(session_id=session_id, **kwargs)
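A hypothetical driver for the step-based workflow, assuming GOOGLE_API_KEY and the prompt files are configured (the session id and document path below are illustrative):

from workflow.financial_workflow import create_financial_workflow

workflow = create_financial_workflow(session_id="demo_session")
result = workflow.run(file_path="input/q4_report.pdf")
print(workflow._get_workflow_status())  # counts the *.json and *.xlsx outputs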
workflow/financial_workflow_working.py ADDED
@@ -0,0 +1,357 @@
1
+ """
2
+ Financial Document Analysis Workflow - Optimized for Gemini Models
3
+
4
+ This workflow processes financial documents through a multi-agent system:
5
+ 1. Data Extractor: Extracts structured financial data
6
+ 2. Data Arranger: Organizes data into Excel-ready format
7
+ 3. Code Generator: Creates professional Excel reports
8
+
9
+ Built according to official Agno documentation standards.
10
+ """
11
+
12
+ import json
13
+ import time
14
+ from pathlib import Path
15
+ from typing import Iterator, Optional, Dict, Any
16
+ from textwrap import dedent
17
+
18
+ from agno.agent import Agent, RunResponse
19
+ from agno.models.google import Gemini
20
+ from agno.tools.file import FileTools
21
+ from agno.tools.shell import ShellTools
22
+ from agno.tools.python import PythonTools
23
+ from agno.workflow import Workflow
24
+ from agno.utils.log import logger
25
+ from pydantic import BaseModel, Field
26
+
27
+ from config.settings import settings
28
+ from utils.prompt_loader import prompt_loader
29
+ from utils.shell_toolkit import RestrictedShellTools
30
+ from utils.restricted_python_tools import RestrictedPythonTools
31
+
32
+
33
+ class DataPoint(BaseModel):
34
+ """Individual financial data point."""
35
+ field_name: str = Field(description="Name of the financial data field")
36
+ value: str = Field(description="Value of the field")
37
+ category: str = Field(description="Financial category (revenue, expenses, assets, etc.)")
38
+ period: str = Field(default="", description="Time period if applicable")
39
+ unit: str = Field(default="", description="Currency or measurement unit")
40
+ confidence: float = Field(default=0.9, description="Confidence score 0-1")
41
+
42
+
43
+ class Metadata(BaseModel):
44
+ """Metadata for extracted financial data."""
45
+ company_name: str = Field(default="Unknown Company", description="Company name")
46
+ document_type: str = Field(default="Unknown", description="Type of financial document")
47
+ reporting_period: str = Field(default="", description="Reporting period")
48
+ currency: str = Field(default="", description="Primary currency used")
49
+
50
+
51
+ class ExtractedFinancialData(BaseModel):
52
+ """Structured model for extracted financial data."""
53
+ data_points: list[DataPoint] = Field(description="List of extracted financial data points")
54
+ summary: str = Field(description="Summary of the extracted data")
55
+ metadata: Metadata = Field(default_factory=Metadata, description="Additional metadata")
56
+
57
+
58
+ class FinancialDocumentWorkflow(Workflow):
59
+ """
60
+ Financial document analysis workflow optimized for Gemini models.
61
+
62
+ This workflow processes financial documents through three specialized agents:
63
+ - Data extraction with structured outputs
64
+ - Data arrangement for Excel compatibility
65
+ - Excel report generation with formatting
66
+ """
67
+
68
+ description: str = dedent("""\
69
+ Financial document analysis workflow optimized for Gemini models with robust error handling.
70
+ Processes financial documents through extraction, arrangement, and Excel report generation.
71
+ Uses session state for caching and proper error recovery mechanisms.
72
+ """)
73
+
74
+ # Data Extractor Agent - Uses structured outputs for reliable data extraction
75
+ data_extractor: Agent = Agent(
76
+ model=Gemini(
77
+ id=settings.DATA_EXTRACTOR_MODEL,
78
+ thinking_budget=settings.DATA_EXTRACTOR_MODEL_THINKING_BUDGET,
79
+ api_key=settings.GOOGLE_API_KEY
80
+ ),
81
+ name="FinancialDataExtractor",
82
+ description="Expert financial data extraction specialist optimized for Gemini",
83
+ instructions=prompt_loader.load_instructions_as_list("agents/data_extractor"),
84
+ response_model=ExtractedFinancialData,
85
+ structured_outputs=True,
86
+ debug_mode=True,
87
+ retries=10,
88
+ delay_between_retries=10,
89
+ exponential_backoff=True,
90
+ )
91
+
92
+ def __init__(self, session_id: Optional[str] = None, **kwargs):
93
+ """Initialize workflow with session management."""
94
+ super().__init__(session_id=session_id, **kwargs)
95
+
96
+ # Initialize session directories
97
+ self._setup_session_directories()
98
+
99
+ # Initialize remaining agents with session-specific configurations
100
+ self._initialize_session_agents()
101
+
102
+ logger.info(f"FinancialDocumentWorkflow initialized with session: {self.session_id}")
103
+ logger.info(f"Session directories: {list(self.session_directories.keys())}")
104
+
105
+ def _setup_session_directories(self):
106
+ """Setup session-specific directories."""
107
+ self.session_directories = settings.create_session_directories(self.session_id)
108
+ self.session_output_dir = self.session_directories["output"]
109
+ self.session_input_dir = self.session_directories["input"]
110
+ self.session_temp_dir = self.session_directories["temp"]
111
+ self.session_cache_dir = self.session_directories["cache"]
112
+
113
+ def _initialize_session_agents(self):
114
+ """Initialize agents that need session-specific configuration."""
115
+
116
+ # Data Arranger Agent - Organizes data with file operations
117
+ self.data_arranger = Agent(
118
+ model=Gemini(
119
+ id=settings.DATA_ARRANGER_MODEL,
120
+ thinking_budget=settings.DATA_ARRANGER_MODEL_THINKING_BUDGET,
121
+ api_key=settings.GOOGLE_API_KEY
122
+ ),
123
+ name="FinancialDataArranger",
124
+ description="Financial data organization specialist optimized for Gemini",
125
+ instructions=prompt_loader.load_instructions_as_list("agents/data_arranger"),
126
+ tools=[
127
+ RestrictedShellTools(base_dir=self.session_output_dir),
128
+ FileTools(base_dir=self.session_output_dir)
129
+ ],
130
+ markdown=False,
131
+ debug_mode=True,
132
+ add_memory_references=True,
133
+ add_session_summary_references=True,
134
+ retries=10,
135
+ delay_between_retries=10,
136
+ exponential_backoff=True,
137
+ debug_level=2,
138
+ )
139
+
140
+ # Code Generator Agent - Creates Excel reports with comprehensive tools
141
+ self.code_generator = Agent(
142
+ model=Gemini(
143
+ id=settings.CODE_GENERATOR_MODEL,
144
+ thinking_budget=settings.CODE_GENERATOR_MODEL_THINKING_BUDGET,
145
+ api_key=settings.GOOGLE_API_KEY
146
+ ),
147
+ name="ExcelReportGenerator",
148
+ description="Excel report generator optimized for Gemini with cross-platform support",
149
+ goal="Generate professional Excel reports from arranged financial data with multiple worksheets and formatting",
150
+ instructions=prompt_loader.load_instructions_as_list("agents/code_generator"),
151
+ expected_output="A professionally formatted Excel file with multiple worksheets, charts, and proper styling",
152
+ additional_context=f"Working directory: {self.session_output_dir}. All files must be saved in this directory only.",
153
+ tools=[
154
+ RestrictedShellTools(base_dir=self.session_output_dir),
155
+ RestrictedPythonTools(base_dir=self.session_output_dir),
156
+ FileTools(base_dir=self.session_output_dir, save_files=True, read_files=True, list_files=True)
157
+ ],
158
+ markdown=False,
159
+ show_tool_calls=True,
160
+ debug_mode=True,
161
+ add_datetime_to_instructions=True,
162
+ retries=10,
163
+ delay_between_retries=10,
164
+ exponential_backoff=True,
165
+ )
166
+
167
+ logger.info("All agents initialized with Gemini models and proper tool configuration")
168
+
169
+ def run(self, file_path: Optional[str] = None, **kwargs) -> Iterator[RunResponse]:
170
+ """
171
+ Main workflow execution following official Agno documentation patterns.
172
+
173
+ Args:
174
+ file_path: Path to the financial document to process
175
+ **kwargs: Additional parameters
176
+
177
+ Yields:
178
+ RunResponse: Streaming responses from the workflow execution
179
+ """
180
+ # Handle file_path from kwargs if not provided as positional
181
+ if file_path is None:
182
+ file_path = kwargs.get('file_path')
183
+
184
+ if file_path is None:
185
+ yield RunResponse(
186
+ run_id=self.run_id,
187
+ content="❌ Error: file_path is required but not provided"
188
+ )
189
+ return
190
+
191
+ start_time = time.time()
192
+
193
+ try:
194
+ # Validate input file
195
+ file_path = Path(file_path).resolve()
196
+ if not file_path.exists():
197
+ yield RunResponse(
198
+ run_id=self.run_id,
199
+ content=f"❌ Error: File not found: {file_path}"
200
+ )
201
+ return
202
+
203
+ # Copy input file to session directory for reference
204
+ input_file = self.session_input_dir / file_path.name
205
+ input_file.write_bytes(file_path.read_bytes())
206
+
207
+ yield RunResponse(
208
+ run_id=self.run_id,
209
+ content=f"πŸš€ Starting financial document analysis for: {file_path.name}"
210
+ )
211
+
212
+ # Step 1: Data Extraction
213
+ yield RunResponse(
214
+ run_id=self.run_id,
215
+ content="πŸ“Š Step 1: Extracting financial data..."
216
+ )
217
+
218
+ # Check cache first
219
+ cache_key = f"extraction_{file_path.name}_{file_path.stat().st_mtime}"
220
+ if cache_key in self.session_state:
221
+ logger.info("Using cached extraction results")
222
+ extracted_data = self.session_state[cache_key]
223
+ yield RunResponse(
224
+ run_id=self.run_id,
225
+ content="βœ… Using cached extraction results"
226
+ )
227
+ else:
228
+ # Create File object for direct upload to Gemini API
229
+ from agno.media import File
230
+ document = File(filepath=str(file_path))
231
+
232
+ # Load extraction prompt
233
+ extraction_prompt = prompt_loader.load_prompt(
234
+ "workflow/data_extraction",
235
+ file_path=str(file_path),
236
+ output_directory=str(self.session_output_dir)
237
+ )
238
+
239
+ # Run data extraction with file upload
240
+ extraction_response = self.data_extractor.run(extraction_prompt, files=[document])
241
+ extracted_data = extraction_response.content
242
+
243
+ # Debug: Log the type of extracted_data
244
+ logger.info(f"DEBUG: extracted_data type: {type(extracted_data)}")
245
+ logger.info(f"DEBUG: extracted_data has model_dump_json: {hasattr(extracted_data, 'model_dump_json')}")
246
+
247
+ # Cache the results
248
+ self.session_state[cache_key] = extracted_data
249
+
250
+ yield RunResponse(
251
+ run_id=self.run_id,
252
+ content=f"βœ… Extracted {len(extracted_data.data_points) if hasattr(extracted_data, 'data_points') else 'N/A'} data points"
253
+ )
254
+
255
+ # Step 2: Data Arrangement
256
+ yield RunResponse(
257
+ run_id=self.run_id,
258
+ content="πŸ“‹ Step 2: Arranging data for Excel..."
259
+ )
260
+
261
+ # Load the base arrangement prompt (without placeholders)
262
+ arrangement_prompt = prompt_loader.load_prompt("workflow/data_arrangement")
263
+
264
+ # Serialize extracted data
265
+ try:
266
+ if hasattr(extracted_data, 'model_dump_json'):
267
+ extracted_data_json = extracted_data.model_dump_json(indent=2)
268
+ elif hasattr(extracted_data, 'model_dump'):
269
+ import json
270
+ extracted_data_json = json.dumps(extracted_data.model_dump(), indent=2)
271
+ else:
272
+ import json
273
+ extracted_data_json = json.dumps(str(extracted_data), indent=2)
274
+
275
+ logger.info(f"DEBUG: Successfully serialized extracted_data ({len(extracted_data_json)} chars)")
276
+
277
+ except Exception as e:
278
+ logger.error(f"DEBUG: Failed to serialize extracted_data: {e}")
279
+ import json
280
+ extracted_data_json = json.dumps({"error": "Failed to serialize extracted data", "data": str(extracted_data)}, indent=2)
281
+
282
+ # Pass both the prompt and data directly to the agent
283
+ full_arrangement_prompt = f"{arrangement_prompt}\n\nHere is the extracted financial data to arrange:\n\n{extracted_data_json}"
284
+
285
+ # Run data arrangement
286
+ arrangement_response = self.data_arranger.run(full_arrangement_prompt)
287
+ yield RunResponse(
288
+ run_id=self.run_id,
289
+ content=f"βœ… Data arrangement completed"
290
+ )
291
+
292
+ yield RunResponse(
293
+ run_id=self.run_id,
294
+ content="βœ… Data arranged and saved to JSON"
295
+ )
296
+
297
+ # Step 3: Excel Report Generation
298
+ yield RunResponse(
299
+ run_id=self.run_id,
300
+ content="πŸ“ˆ Step 3: Generating Excel report..."
301
+ )
302
+
303
+ # Prepare code generation prompt
304
+ code_generation_prompt = prompt_loader.load_prompt(
305
+ "workflow/code_generation",
306
+ session_directory=str(self.session_output_dir)
307
+ )
308
+
309
+ # Run code generation
310
+ code_generation_response = self.code_generator.run(code_generation_prompt)
311
+ yield RunResponse(
312
+ run_id=self.run_id,
313
+ content=f"βœ… Excel report generation completed"
314
+ )
315
+
316
+ # Final status
317
+ execution_time = time.time() - start_time
318
+ status = self._get_workflow_status()
319
+
320
+ yield RunResponse(
321
+ run_id=self.run_id,
322
+ content=f"""
323
+ βœ… Workflow completed successfully in {execution_time:.2f} seconds
324
+
325
+ πŸ“Š Results Summary:
326
+ - Data points extracted: {status.get('data_points', 'N/A')}
327
+ - JSON files created: {status.get('json_files', 0)}
328
+ - Excel files created: {status.get('excel_files', 0)}
329
+ - Session directory: {self.session_output_dir}
330
+
331
+ 🎯 All files saved to: {self.session_output_dir}
332
+ """.strip()
333
+ )
334
+
335
+ except Exception as e:
336
+ logger.error(f"Workflow execution failed: {str(e)}")
337
+ yield RunResponse(
338
+ run_id=self.run_id,
339
+ content=f"❌ Workflow failed: {str(e)}"
340
+ )
341
+
342
+ def _get_workflow_status(self) -> Dict[str, Any]:
343
+ """Get current workflow status and file counts."""
344
+ status = {
345
+ "session_id": self.session_id,
346
+ "output_directory": str(self.session_output_dir),
347
+ "json_files": 0,
348
+ "excel_files": 0,
349
+ "data_points": 0
350
+ }
351
+
352
+ if self.session_output_dir.exists():
353
+ status["json_files"] = len(list(self.session_output_dir.glob("*.json")))
354
+ status["excel_files"] = len(list(self.session_output_dir.glob("*.xlsx")))
355
+
356
+ return status
357
+
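Because this variant yields RunResponse objects, callers iterate to stream progress. A hypothetical driver under the same assumptions as above:

from workflow.financial_workflow_working import FinancialDocumentWorkflow

workflow = FinancialDocumentWorkflow(session_id="demo_session")  # illustrative id
for response in workflow.run(file_path="input/q4_report.pdf"):  # illustrative path
    print(response.content)  # progress messages arrive as each step completes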