feat: Implement revenue data organization workflow with JSON output
- Added a new revenue data organization task in `data_arrangement_1.txt` to structure extracted financial data for Excel reporting.
- Introduced `RestrictedPythonTools` for self-healing Python execution with directory constraints and package management.
- Created `RestrictedShellTools` to execute shell commands within a specified base directory, enhancing security and preventing directory traversal (sketched below).
- Developed `FinancialDocumentWorkflow` to manage the entire financial document analysis process, including data extraction, arrangement, and Excel report generation.
- Integrated session management for handling input/output directories and caching extraction results.
- Enhanced error handling and logging throughout the workflow for better traceability and debugging.
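
The directory-constraint idea behind `RestrictedShellTools` and `RestrictedPythonTools` boils down to resolving every path against a pinned base directory and rejecting anything that escapes it. A minimal sketch of that pattern follows; the `ConstrainedShell` name and method signatures are illustrative assumptions, not the actual API in `utils/shell_toolkit.py`:

```python
# Minimal sketch of directory-constrained shell execution; class and method
# names are illustrative assumptions, not the actual utils/shell_toolkit.py API.
import subprocess
from pathlib import Path


class ConstrainedShell:
    """Run shell commands with the working directory pinned to a base path."""

    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir).resolve()
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def resolve(self, relative: str) -> Path:
        """Reject any path that escapes the base directory (traversal guard)."""
        candidate = (self.base_dir / relative).resolve()
        if not candidate.is_relative_to(self.base_dir):
            raise PermissionError(f"Path escapes base directory: {relative}")
        return candidate

    def run(self, command: str, timeout: int = 60) -> str:
        """Execute a command with cwd locked to the base directory."""
        result = subprocess.run(
            command, shell=True, cwd=self.base_dir,
            capture_output=True, text=True, timeout=timeout,
        )
        return result.stdout if result.returncode == 0 else result.stderr
```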
- .claude/settings.local.json +13 -0
- .opencode/opencode.db +0 -0
- .opencode/opencode.db-shm +0 -0
- .opencode/opencode.db-wal +0 -0
- app.py +5 -4
- config/settings.py +218 -85
- instructions/agents/code_generator.json +247 -81
- instructions/agents/data_arranger.json +24 -94
- instructions/agents/data_arranger_2.json +218 -0
- instructions/agents/data_extractor.json +151 -106
- prompts/workflow/code_generation.txt +228 -111
- prompts/workflow/data_arrangement.txt +30 -29
- prompts/workflow/data_arrangement_1.txt +160 -0
- prompts/workflow/data_extraction.txt +133 -56
- utils/restricted_python_tools.py +470 -0
- utils/shell_toolkit.py +137 -0
- workflow/financial_workflow.py +294 -305
- workflow/financial_workflow_working.py +357 -0
.claude/settings.local.json
@@ -0,0 +1,13 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(python3:*)",
+      "Bash(mkdir:*)",
+      "Bash(ls:*)",
+      "Bash(find:*)",
+      "Bash(grep:*)",
+      "Bash(python test_prompt_loading.py:*)"
+    ],
+    "deny": []
+  }
+}
.opencode/opencode.db: binary file (4.1 kB)
.opencode/opencode.db-shm: binary file (32.8 kB)
.opencode/opencode.db-wal: binary file (78.3 kB)
app.py
@@ -8,7 +8,7 @@ os.environ.setdefault("MPLCONFIGDIR", "/tmp/mpl_cache")
 import logging
 from pathlib import Path
 import uuid
-from workflow.
+from workflow.financial_workflow_working import FinancialDocumentWorkflow
 from agno.storage.sqlite import SqliteStorage
 from utils.file_handler import FileHandler
 from config.settings import settings
@@ -1892,7 +1892,7 @@ def create_gradio_app():
         logger.info("Backend: Starting Step 1 - Data Extraction")

         # Run the workflow and track progress
-        result = ui.workflow.
+        result = list(ui.workflow.run(file_path=ui.workflow.file_path))
         progress_state['result'][0] = result

         # Signal completion
@@ -1941,8 +1941,9 @@ def create_gradio_app():
         if progress_state['error'][0]:
             raise progress_state['error'][0]

-
-
+        workflow_responses = progress_state['result'][0]
+        # Extract content from all responses and join them
+        workflow_results = "\n".join([response.content for response in workflow_responses])

         # The workflow has completed all steps - just display the results
         logger.info("Displaying workflow results")
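
The `list(...)` wrapper in the hunk above matters because the workflow's `run()` yields responses one step at a time; materializing the generator lets the progress thread store every response so the UI can join their `content` fields afterwards. A generic illustration of the pattern, with stand-in types rather than the actual Agno classes:

```python
# Generic illustration of materializing a step-yielding workflow; Response and
# run_steps are stand-ins, not the actual Agno workflow API.
from typing import Iterator, NamedTuple


class Response(NamedTuple):
    content: str


def run_steps() -> Iterator[Response]:
    """Stand-in for a workflow that yields one response per completed step."""
    for step in ("data extraction", "data arrangement", "report generation"):
        yield Response(content=f"Completed {step}")


responses = list(run_steps())                       # consume the generator once
results = "\n".join(r.content for r in responses)   # reuse the responses freely
print(results)
```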
config/settings.py
@@ -1,114 +1,237 @@
+"""
+Configuration settings for Data Extractor Using Gemini
+Optimized for Gemini-only model usage with robust directory management
+"""
+
 import os
 from pathlib import Path
 from dotenv import load_dotenv
+import logging

+# Load environment variables
 load_dotenv()

+logger = logging.getLogger(__name__)
+

 class Settings:
+    """Configuration settings with Gemini-only model support and robust directory management."""
+
+    # === GEMINI MODEL CONFIGURATION ===
     GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+
+    # Gemini model specifications - using gemini-2.5-flash (supports thinking budget)
+    DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
+    DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
+    CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-flash")
+
+    # Thinking budgets optimized for each task type
+    DATA_EXTRACTOR_MODEL_THINKING_BUDGET = int(os.getenv("DATA_EXTRACTOR_THINKING_BUDGET", "4096"))
+    DATA_ARRANGER_MODEL_THINKING_BUDGET = int(os.getenv("DATA_ARRANGER_THINKING_BUDGET", "4096"))
+    CODE_GENERATOR_MODEL_THINKING_BUDGET = int(os.getenv("CODE_GENERATOR_THINKING_BUDGET", "4096"))
+
+    # === FILE PROCESSING CONFIGURATION ===
+    MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
     SUPPORTED_FILE_TYPES = [
-        "pdf",
-        "
-        "png",
-        "jpg",
-        "jpeg",
-        "docx",
-        "xlsx",
-        "csv",
-        "md",
-        "json",
-        "xml",
-        "html",
-        "py",
-        "js",
-        "ts",
-        "doc",
-        "xls",
-        "ppt",
-        "pptx",
+        "pdf", "txt", "docx", "xlsx", "csv", "md", "json", "xml", "html",
+        "png", "jpg", "jpeg", "doc", "xls", "ppt", "pptx"
     ]
+
+    # === DIRECTORY MANAGEMENT ===
+    # Centralized working directory - all operations happen within this directory
+    WORKING_DIR = Path(os.getenv("WORKING_DIR", "/tmp/data_extractor_gemini"))
+
+    # Subdirectories within working directory
+    TEMP_DIR = WORKING_DIR / "temp"
+    INPUT_DIR = WORKING_DIR / "input"
+    OUTPUT_DIR = WORKING_DIR / "output"
+    CACHE_DIR = WORKING_DIR / "cache"
+    LOGS_DIR = WORKING_DIR / "logs"
+
+    # === WORKFLOW CONFIGURATION ===
+    # Retry and timeout settings
+    MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
+    RETRY_DELAY_SECONDS = int(os.getenv("RETRY_DELAY_SECONDS", "5"))
+    AGENT_TIMEOUT_SECONDS = int(os.getenv("AGENT_TIMEOUT_SECONDS", "300"))
+
+    # Cache settings
+    ENABLE_CACHING = os.getenv("ENABLE_CACHING", "true").lower() == "true"
+    CACHE_TTL_HOURS = int(os.getenv("CACHE_TTL_HOURS", "24"))
+
+    @classmethod
+    def initialize_directories(cls):
+        """Initialize all required directories with proper permissions."""
+        directories = [
+            cls.WORKING_DIR,
+            cls.TEMP_DIR,
+            cls.INPUT_DIR,
+            cls.OUTPUT_DIR,
+            cls.CACHE_DIR,
+            cls.LOGS_DIR
+        ]
+
+        created_dirs = []
+        for directory in directories:
+            try:
+                directory.mkdir(parents=True, exist_ok=True)
+
+                # Test write permissions
+                test_file = directory / ".write_test"
+                test_file.write_text("test")
+                test_file.unlink()
+
+                created_dirs.append(str(directory))
+                logger.debug(f"Directory initialized: {directory}")
+
+            except Exception as e:
+                logger.error(f"Failed to initialize directory {directory}: {e}")
+                raise RuntimeError(f"Cannot create or write to directory {directory}: {e}")
+
+        logger.info(f"Successfully initialized {len(created_dirs)} directories")
+        return created_dirs
+
     @classmethod
     def validate_config(cls):
-        """
+        """Comprehensive configuration validation with detailed error reporting."""
         errors = []
         warnings = []

-        #
+        # === CRITICAL VALIDATIONS ===
+
+        # Google API Key validation
         if not cls.GOOGLE_API_KEY:
-            errors.append("GOOGLE_API_KEY is required
+            errors.append("GOOGLE_API_KEY is required. Get it from https://aistudio.google.com/app/apikey")
+        elif len(cls.GOOGLE_API_KEY) < 30:
+            warnings.append("GOOGLE_API_KEY appears to be too short - verify it's correct")
+
+        # Model name validation
+        gemini_models = [cls.DATA_EXTRACTOR_MODEL, cls.DATA_ARRANGER_MODEL, cls.CODE_GENERATOR_MODEL]
+        for i, model in enumerate(gemini_models):
+            model_names = ["DATA_EXTRACTOR_MODEL", "DATA_ARRANGER_MODEL", "CODE_GENERATOR_MODEL"]
+            if not model:
+                errors.append(f"{model_names[i]} cannot be empty")
+            elif not model.startswith("gemini-"):
+                errors.append(f"{model_names[i]} must be a Gemini model (starts with 'gemini-'), got: {model}")
+
+        # Directory validation
         try:
-            cls.
-            # Test write permissions
-            test_file = cls.TEMP_DIR / ".write_test"
-            try:
-                test_file.write_text("test")
-                test_file.unlink()
-            except Exception as e:
-                errors.append(f"Cannot write to temp directory {cls.TEMP_DIR}: {e}")
+            cls.initialize_directories()
         except Exception as e:
-            errors.append(f"
+            errors.append(f"Directory initialization failed: {e}")

-        #
+        # === MODERATE VALIDATIONS ===
+
+        # File size validation
         if cls.MAX_FILE_SIZE_MB <= 0:
             errors.append("MAX_FILE_SIZE_MB must be positive")
         elif cls.MAX_FILE_SIZE_MB > 100:
-            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large")
+            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large - may cause memory issues")

-        #
+        # Supported file types validation
         if not cls.SUPPORTED_FILE_TYPES:
             errors.append("SUPPORTED_FILE_TYPES cannot be empty")

-        #
+        # Thinking budget validation
+        budgets = [
+            (cls.DATA_EXTRACTOR_MODEL_THINKING_BUDGET, "DATA_EXTRACTOR_MODEL_THINKING_BUDGET"),
+            (cls.DATA_ARRANGER_MODEL_THINKING_BUDGET, "DATA_ARRANGER_MODEL_THINKING_BUDGET"),
+            (cls.CODE_GENERATOR_MODEL_THINKING_BUDGET, "CODE_GENERATOR_MODEL_THINKING_BUDGET")
+        ]
+
+        for budget, name in budgets:
+            if budget < 1024:
+                warnings.append(f"{name} ({budget}) is quite low - may affect model performance")
+            elif budget > 8192:
+                warnings.append(f"{name} ({budget}) is very high - may be unnecessary")
+
+        # Retry configuration validation
+        if cls.MAX_RETRIES < 1:
+            warnings.append("MAX_RETRIES should be at least 1")
+        elif cls.MAX_RETRIES > 10:
+            warnings.append("MAX_RETRIES is very high - may cause long delays")
+
+        # === RESULT PROCESSING ===
+
         if errors:
-            error_msg = "Configuration validation failed:\n"
+            error_msg = "❌ Configuration validation failed:\n"
+            error_msg += "\n".join(f"  • {error}" for error in errors)
+
             if warnings:
-                error_msg += "\n\
+                error_msg += "\n\n⚠️ Warnings:\n"
+                error_msg += "\n".join(f"  • {warning}" for warning in warnings)

+            raise ValueError(error_msg)
+
         if warnings:
+            logger.warning("Configuration warnings detected:")
+            for warning in warnings:
+                logger.warning(f"  • {warning}")
+
+        logger.info("✅ Configuration validation successful")
         return True

+    @classmethod
+    def get_session_directories(cls, session_id: str):
+        """Get session-specific directory structure."""
+        session_base = cls.WORKING_DIR / session_id
+
+        return {
+            "base": session_base,
+            "input": session_base / "input",
+            "output": session_base / "output",
+            "temp": session_base / "temp",
+            "cache": session_base / "cache"
+        }
+
+    @classmethod
+    def create_session_directories(cls, session_id: str):
+        """Create and validate session-specific directories."""
+        session_dirs = cls.get_session_directories(session_id)
+
+        created = []
+        for name, directory in session_dirs.items():
+            try:
+                directory.mkdir(parents=True, exist_ok=True)
+
+                # Test write permissions
+                test_file = directory / ".write_test"
+                test_file.write_text("test")
+                test_file.unlink()
+
+                created.append(str(directory))
+
+            except Exception as e:
+                logger.error(f"Failed to create session directory {name}: {e}")
+                raise RuntimeError(f"Cannot create session directory {directory}: {e}")
+
+        logger.info(f"Created {len(created)} session directories for {session_id}")
+        return session_dirs
+
+    @classmethod
+    def cleanup_session(cls, session_id: str, keep_output: bool = True):
+        """Clean up session directories with option to preserve output."""
+        session_dirs = cls.get_session_directories(session_id)
+
+        import shutil
+        cleaned = []
+
+        for name, directory in session_dirs.items():
+            if keep_output and name == "output":
+                continue
+
+            if directory.exists():
+                try:
+                    shutil.rmtree(directory)
+                    cleaned.append(str(directory))
+                except Exception as e:
+                    logger.warning(f"Could not clean {name} directory: {e}")
+
+        logger.info(f"Cleaned {len(cleaned)} session directories for {session_id}")
+        return cleaned
+
     @classmethod
     def get_debug_info(cls):
-        """Get debug information about current configuration."""
+        """Get comprehensive debug information about current configuration."""
         import platform
         import sys

@@ -117,16 +240,26 @@ class Settings:
             "platform": platform.platform(),
             "temp_dir": str(cls.TEMP_DIR),
             "temp_dir_exists": cls.TEMP_DIR.exists(),
-            "supported_file_types": len(cls.SUPPORTED_FILE_TYPES),
-            "max_file_size_mb": cls.MAX_FILE_SIZE_MB,
-            "has_google_api_key": bool(cls.GOOGLE_API_KEY),
-            "has_openai_api_key": bool(os.getenv("OPENAI_API_KEY")),
             "models": {
                 "data_extractor": cls.DATA_EXTRACTOR_MODEL,
-                "data_arranger": cls.DATA_ARRANGER_MODEL,
-                "code_generator": cls.CODE_GENERATOR_MODEL
+                "data_arranger": cls.DATA_ARRANGER_MODEL,
+                "code_generator": cls.CODE_GENERATOR_MODEL,
+            },
+            "api_keys": {
+                "google_api_key_present": bool(cls.GOOGLE_API_KEY),
+                "google_api_key_length": len(cls.GOOGLE_API_KEY) if cls.GOOGLE_API_KEY else 0
             }
         }


+# Global settings instance
 settings = Settings()
+
+# Auto-initialize directories on import
+try:
+    settings.initialize_directories()
+    logger.debug("Settings initialized successfully")
+except Exception as e:
+    logger.error(f"Failed to initialize settings: {e}")
+    # Don't raise here to allow import to succeed
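
A short usage sketch of the session-directory lifecycle added above; the session id and file name are made-up examples:

```python
# Usage sketch for the session helpers in config/settings.py; the session id
# and file name are arbitrary examples.
from config.settings import settings

session_id = "demo-session-123"

# Creates base/input/output/temp/cache under WORKING_DIR/<session_id>,
# write-testing each directory as it goes.
dirs = settings.create_session_directories(session_id)
(dirs["input"] / "notes.txt").write_text("placeholder input")

# ... run the workflow against dirs["input"], writing to dirs["output"] ...

# Removes everything except the output directory.
settings.cleanup_session(session_id, keep_output=True)
```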
instructions/agents/code_generator.json
@@ -1,83 +1,249 @@
 {
+  "instructions": [
+    "=== EXCEL REPORT GENERATION AGENT ===",
+    "You are an Excel report generation agent - please keep going until the Excel report generation task is completely resolved, before ending your turn.",
+    "",
+    "Your thinking should be thorough and so it's fine if it's very long. However, avoid unnecessary repetition and verbosity. You should be concise, but thorough.",
+    "",
+    "You MUST iterate and keep going until the Excel report generation is perfect and complete.",
+    "",
+    "You have everything you need to resolve this Excel generation task. I want you to fully create the professional revenue-focused Excel report autonomously before coming back.",
+    "",
+    "Only terminate your turn when you are sure that the Excel file has been created successfully, verified, and is ready for download. Go through each step systematically, and make sure to verify that your Excel generation is correct. NEVER end your turn without having truly and completely created a professional Excel report from the revenue data.",
+    "",
+    "=== TOOLS AVAILABLE ===",
+    "You have access to these tools:",
+    "- run_shell_command(command) - Runs a shell command in the constrained session directory",
+    "- run_python_code(code) - Executes Python code with automatic path correction and package installation",
+    "- install_package(package_name) - Installs Python packages automatically",
+    "- save_python_file(filename, code) - Saves Python code to a file with automatic healing",
+    "- save_file(filename, content) - Saves content to a file and returns the filename if successful",
+    "- read_file(filename) - Reads the contents of the file and returns the contents if successful",
+    "- list_files() - Returns a list of files in the base directory",
+    "- validate_python_syntax(code) - Validates Python code syntax before execution",
+    "",
+    "=== CORE MISSION ===",
+    "Create a professional Excel report from arranged_financial_data.json focusing ONLY on revenue data:",
+    "1. Install required Python packages (openpyxl, pandas)",
+    "2. Load and parse the organized revenue data",
+    "3. Generate Python code for Excel report creation",
+    "4. Execute the code to create the Excel file",
+    "5. Verify file creation and format",
+    "",
+    "=== WORKFLOW ===",
+    "",
+    "1. **Environment Setup**",
+    "   - Install openpyxl and pandas packages",
+    "   - Verify installation success",
+    "   - Check Python version compatibility",
+    "   - Prepare Excel generation environment",
+    "",
+    "2. **Data Loading & Analysis**",
+    "   - Read arranged_financial_data.json",
+    "   - Parse and validate JSON structure",
+    "   - Count revenue categories and data points",
+    "   - Identify worksheet structure needed",
+    "",
+    "3. **Excel Script Generation**",
+    "   - Create comprehensive Python script",
+    "   - Include error handling and logging",
+    "   - Add professional formatting features",
+    "   - Ensure cross-platform compatibility",
+    "",
+    "4. **Script Execution**",
+    "   - Save Python script to file",
+    "   - Execute script to generate Excel",
+    "   - Monitor execution for errors",
+    "   - Capture all output and logs",
+    "",
+    "5. **File Verification**",
+    "   - Verify Excel file exists",
+    "   - Check file size and format",
+    "   - Validate worksheet structure",
+    "   - Confirm professional formatting applied",
+    "",
+    "=== REQUIRED EXCEL STRUCTURE ===",
+    "Create Excel file with EXACTLY these 5 worksheets (revenue-focused):",
+    "",
+    "**1. Company_Overview**",
+    "- Company name, document type, reporting period",
+    "- Currency, extraction date, data quality summary",
+    "",
+    "**2. Total_Revenue**",
+    "- Consolidated revenue figures",
+    "- Year-over-year data if available",
+    "- Revenue metrics and totals",
+    "",
+    "**3. Segment_Revenue**",
+    "- Revenue by business segment/division",
+    "- Product vs service breakdowns",
+    "- Segment performance data",
+    "",
+    "**4. Regional_Revenue**",
+    "- Revenue by geographic region",
+    "- Country-specific data if available",
+    "- International vs domestic splits",
+    "",
+    "**5. Data_Quality**",
+    "- Confidence scores for each data point",
+    "- Source information and validation notes",
+    "- Extraction metadata and quality metrics",
+    "",
+    "=== PYTHON SCRIPT REQUIREMENTS ===",
+    "Your generated Python script MUST include:",
+    "",
+    "```python",
+    "#!/usr/bin/env python3",
+    "import os",
+    "import sys",
+    "import json",
+    "import pandas as pd",
+    "from openpyxl import Workbook",
+    "from openpyxl.styles import Font, PatternFill, Border, Side, Alignment",
+    "from datetime import datetime",
+    "import logging",
+    "",
+    "def main():",
+    "    try:",
+    "        # Load revenue data",
+    "        with open('arranged_financial_data.json', 'r') as f:",
+    "            data = json.load(f)",
+    "        ",
+    "        # Create workbook with professional formatting",
+    "        wb = Workbook()",
+    "        wb.remove(wb.active)",
+    "        ",
+    "        # Process each revenue category",
+    "        for category_name, category_data in data.items():",
+    "            ws = wb.create_sheet(title=category_name)",
+    "            ",
+    "            # Add professional headers",
+    "            headers = ['Revenue Item', 'Amount', 'Currency/Unit', 'Period', 'Confidence']",
+    "            for col, header in enumerate(headers, 1):",
+    "                cell = ws.cell(row=1, column=col, value=header)",
+    "                cell.font = Font(bold=True, color='FFFFFF')",
+    "                cell.fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')",
+    "                cell.alignment = Alignment(horizontal='center')",
+    "            ",
+    "            # Add revenue data",
+    "            data_rows = category_data.get('data', [])",
+    "            for row_idx, data_row in enumerate(data_rows, 2):",
+    "                ws.cell(row=row_idx, column=1, value=data_row.get('item', ''))",
+    "                ws.cell(row=row_idx, column=2, value=data_row.get('value', ''))",
+    "                ws.cell(row=row_idx, column=3, value=data_row.get('unit', ''))",
+    "                ws.cell(row=row_idx, column=4, value=data_row.get('period', ''))",
+    "                ws.cell(row=row_idx, column=5, value=data_row.get('confidence', ''))",
+    "            ",
+    "            # Auto-size columns for readability",
+    "            for column in ws.columns:",
+    "                max_length = max(len(str(cell.value or '')) for cell in column)",
+    "                ws.column_dimensions[column[0].column_letter].width = min(max_length + 2, 50)",
+    "        ",
+    "        # Save with timestamp",
+    "        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')",
+    "        filename = f'Revenue_Report_{timestamp}.xlsx'",
+    "        wb.save(filename)",
+    "        ",
+    "        # Verify creation",
+    "        if os.path.exists(filename) and os.path.getsize(filename) > 5000:",
+    "            print(f'SUCCESS: {filename} created successfully')",
+    "            return filename",
+    "        else:",
+    "            raise Exception('File creation failed or file too small')",
+    "        ",
+    "    except Exception as e:",
+    "        print(f'ERROR: {str(e)}')",
+    "        sys.exit(1)",
+    "",
+    "if __name__ == '__main__':",
+    "    result = main()",
+    "```",
+    "",
+    "=== EXECUTION SEQUENCE ===",
+    "Execute these operations in EXACT order:",
+    "",
+    "1. **Package Installation**",
+    "   - install_package('openpyxl') - Automatically installs with RestrictedPythonTools",
+    "   - install_package('pandas') - Automatic installation and verification",
+    "   - Packages are installed automatically when using run_python_code()",
+    "",
+    "2. **Data Loading**",
+    "   - read_file('arranged_financial_data.json')",
+    "   - Parse JSON and validate structure",
+    "   - Count categories and data points",
+    "",
+    "3. **Excel Generation with RestrictedPythonTools**",
+    "   - Use run_python_code() for direct Excel generation (auto-healing enabled)",
+    "   - OR save_python_file('generate_revenue_report.py', [script]) + run_shell_command('python generate_revenue_report.py')",
+    "   - RestrictedPythonTools automatically handles path correction and package installation",
+    "   - All file operations are constrained to the session directory",
+    "",
+    "4. **Excel File Verification (CRITICAL)**",
+    "   - list_files() to check if Excel file exists in directory",
+    "   - If Excel file NOT found, retry script execution immediately",
+    "   - run_shell_command('ls -la *.xlsx') for detailed file info",
+    "   - run_shell_command('du -h *.xlsx') to verify file size",
+    "   - Do NOT report success until Excel file confirmed in list_files()",
+    "",
+    "=== ERROR HANDLING & RETRY LOGIC ===",
+    "If you encounter problems:",
+    "",
+    "- **Package install fails**: Try different pip commands, check Python version",
+    "- **JSON load fails**: Verify file exists and has valid syntax",
+    "- **Script save fails**: Try different filename and retry save_file()",
+    "- **Script not in list_files()**: Retry save_file() operation up to 3 times",
+    "- **Script execution fails**: Capture full traceback, debug and retry",
+    "- **Excel file not created**: Retry script execution up to 3 times",
+    "- **Excel file not in list_files()**: Retry entire script execution sequence",
+    "- **File verification fails**: Check permissions, corruption, retry creation",
+    "",
+    "**MANDATORY VERIFICATION SEQUENCE:**",
+    "1. After save_file() → Always check list_files() → Retry if not found",
+    "2. After script execution → Always check list_files() → Retry if Excel not found",
+    "3. Never report success without file confirmation in list_files()",
+    "",
+    "For ANY error, analyze the root cause and fix it before proceeding.",
+    "",
+    "=== SUCCESS CRITERIA ===",
+    "Excel generation is successful ONLY if:",
+    "✅ openpyxl package installed successfully",
+    "✅ arranged_financial_data.json loaded without errors",
+    "✅ Python script saved and confirmed in list_files()",
+    "✅ Python script executed without errors",
+    "✅ Excel file created and confirmed in list_files()",
+    "✅ Excel file exists with size > 5KB",
+    "✅ File contains all 5 revenue-focused worksheets",
+    "✅ Professional formatting applied (headers, colors, sizing)",
+    "✅ All revenue data properly populated",
+    "✅ File can be opened without corruption",
+    "",
+    "=== PROFESSIONAL FORMATTING REQUIREMENTS ===",
+    "Apply these formatting standards:",
+    "- **Headers**: Bold white text on dark blue background (1F4E79)",
+    "- **Alignment**: Center-aligned headers, left-aligned data",
+    "- **Columns**: Auto-sized for readability (max 50 characters)",
+    "- **Colors**: Professional corporate color scheme",
+    "- **Filename**: Include timestamp for uniqueness",
+    "- **Structure**: One worksheet per revenue category",
+    "",
+    "=== QUALITY VALIDATION ===",
+    "Before completing, verify:",
+    "☐ All required packages installed",
+    "☐ JSON data loaded and parsed correctly",
+    "☐ Python script saved and confirmed in list_files()",
+    "☐ Python script executed successfully",
+    "☐ Excel file created and confirmed in list_files()",
+    "☐ Excel file has proper filename format",
+    "☐ File size indicates data was written (>5KB)",
+    "☐ All 5 worksheets present and named correctly",
+    "☐ Revenue data populated in each worksheet",
+    "☐ Professional formatting applied consistently",
+    "☐ No execution errors or warnings",
+    "",
+    "**REMEMBER**: Focus ONLY on revenue data visualization. Create a professional, well-formatted Excel report that business users can immediately use for revenue analysis. Your goal is 100% success in creating a publication-ready revenue report."
+  ],
   "agent_type": "code_generator",
-  "description": "Excel report
-  "category": "
-}
+  "description": "Revenue-focused Excel report generation agent with professional formatting",
+  "category": "agents"
+}
instructions/agents/data_arranger.json
@@ -1,98 +1,28 @@
 {
   "instructions": [
-    "Analytical Worksheets:",
-    "• Financial Ratios & Analysis",
-    "• Revenue Analysis & Breakdown",
-    "• Expense Analysis & Breakdown",
-    "• Profitability Analysis",
-    "",
-    "Supplementary Worksheets:",
-    "• Operational Metrics",
-    "• Risk Assessment & Notes",
-    "• Data Sources & Methodology",
-    "",
-    "=== PHASE 3: EXCEL STRUCTURE DESIGN ===",
-    "For each worksheet category, design proper Excel structure:",
-    "• Column A: Financial line item names (clear, professional labels)",
-    "• Column B+: Time periods (use actual periods from data, e.g., FY 2023, Q3 2024, etc.)",
-    "• Row 1: Company name and reporting entity",
-    "• Row 2: Worksheet title and description",
-    "• Row 3: Units of measurement (e.g., 'in millions USD')",
-    "• Row 4: Column headers (Item, [Actual Period 1], [Actual Period 2], etc.)",
-    "• Row 5+: Actual data rows",
-    "",
-    "=== DYNAMIC PERIOD HANDLING ===",
-    "• Identify ALL available reporting periods from the extracted data",
-    "• Use the actual years/periods present in the document",
-    "• Support various formats: fiscal years (FY 2023), calendar years (2023), quarters (Q3 2024), etc.",
-    "• Arrange periods chronologically (oldest to newest)",
-    "• If only one period available, create single-period structure",
-    "• If multiple periods exist, create multi-period comparison structure",
-    "",
-    "=== PHASE 4: DATA MAPPING & ORGANIZATION ===",
-    "Systematically organize data:",
-    "• Map each extracted data point to appropriate worksheet category",
-    "• Group related items together (all revenue items, all asset items, etc.)",
-    "• Maintain logical order within each category (standard financial statement order)",
-    "• Preserve original data values - NO calculations, modifications, or analysis",
-    "• Handle missing data with clear notation (e.g., 'N/A', 'Not Disclosed')",
-    "",
-    "=== PHASE 5: QUALITY ASSURANCE ===",
-    "Validate the organized structure:",
-    "• Ensure all extracted data points are included somewhere",
-    "• Verify worksheet names are Excel-compatible (no special characters)",
-    "• Check that headers are consistent across all categories",
-    "• Confirm units and currencies are clearly labeled",
-    "• Validate JSON structure matches required schema",
-    "",
-    "=== OUTPUT REQUIREMENTS ===",
-    "Create JSON with this exact structure:",
-    "• categories: Object containing organized data by worksheet name",
-    "• headers: Object containing Excel headers for each category (using actual periods)",
-    "• metadata: Object with data sources, actual periods found, units, and quality notes",
-    "",
-    "=== CRITICAL RESTRICTIONS ===",
-    "• NEVER perform calculations, analysis, or data interpretation",
-    "• NEVER modify original data values or units",
-    "• NEVER calculate ratios, growth rates, or trends",
-    "• NEVER provide insights or commentary",
-    "• FOCUS ONLY on organization and Excel-ready formatting",
-    "",
-    "=== FILE OPERATIONS ===",
-    "• Save organized data as 'arranged_financial_data.json' using save_file tool",
-    "• Use list_files to verify file creation",
-    "• Use read_file to validate JSON content and structure",
-    "• If file is missing or malformed, debug and retry until successful",
-    "• Only report success after confirming file existence and valid content",
-    "",
-    "=== ERROR HANDLING ===",
-    "When encountering issues:",
-    "• Note missing or unclear data with confidence indicators",
-    "• Flag inconsistent units or currencies",
-    "• Document any data quality concerns in metadata",
-    "• Provide clear explanations for organizational decisions"
+    "You are a financial data organizer. Your job is simple:",
+    "1. Take the provided revenue data and organize it into 5 Excel-ready categories",
+    "2. Save the organized data as 'arranged_financial_data.json'",
+    "3. Verify the file was saved by checking list_files()",
+    "",
+    "The 5 categories are:",
+    "- Company_Overview: Company name, period, currency, document type",
+    "- Total_Revenue: All total/consolidated revenue figures",
+    "- Segment_Revenue: Revenue broken down by business segments",
+    "- Regional_Revenue: Revenue broken down by geographic regions",
+    "- Data_Quality: Confidence scores and extraction metadata",
+    "",
+    "CRITICAL RULES:",
+    "- NEVER modify the original data values",
+    "- ONLY organize and categorize the data",
+    "- Use the exact company name, currency, and period from the input",
+    "- Save file → Check list_files() → If not found, retry once",
+    "",
+    "Working directory: <session_output_dir>",
+    "File must appear in list_files() to be successful."
   ],
-  "agent_type": "data_arranger",
-  "description": "
+  "agent_type": "data_arranger",
+  "description": "Revenue-focused data organization agent for Excel-ready output",
   "category": "agents"
-}
+}
instructions/agents/data_arranger_2.json
@@ -0,0 +1,218 @@
+{
+  "instructions": [
+    "=== FINANCIAL DATA ORGANIZATION AGENT ===",
+    "You are a financial data organization agent - please keep going until the organization task is completely resolved, before ending your turn.",
+    "",
+    "Your thinking should be thorough and so it's fine if it's very long. However, avoid unnecessary repetition and verbosity. You should be concise, but thorough.",
+    "",
+    "You MUST iterate and keep going until the data organization is perfect and complete.",
+    "",
+    "You have everything you need to resolve this organization task. I want you to fully organize the extracted revenue data autonomously before coming back.",
+    "",
+    "Only terminate your turn when you are sure that ALL revenue data has been properly organized and the JSON file has been saved successfully. Go through the data step by step, and make sure to verify that your organization is correct. NEVER end your turn without having truly and completely organized all revenue data into Excel-ready format.",
+    "",
+    "=== TOOLS AVAILABLE ===",
+    "You have access to these tools:",
+    "- run_shell_command(command) - Runs shell commands in the constrained session directory",
+    "- save_file(filename, content) - Saves content to a file and returns the filename if successful",
+    "- read_file(filename) - Reads the contents of the file and returns the contents if successful",
+    "- list_files() - Returns a list of files in the base directory",
+    "- JSON parsing and validation (built-in)",
+    "- Data structure organization (built-in)",
+    "",
+    "=== CORE MISSION ===",
+    "Organize ONLY the revenue-focused extracted data into a clean, Excel-ready JSON structure:",
+    "1. Company Overview (name, period, currency)",
+    "2. Total Revenue Summary",
+    "3. Segment Revenue Breakdown",
+    "4. Regional Revenue Breakdown",
+    "5. Data Quality & Sources",
+    "",
+    "=== WORKFLOW ===",
+    "",
+    "1. **Analyze Extracted Data**",
+    "   - Parse the extracted financial data completely",
+    "   - Identify all revenue-related data points",
+    "   - Count total data points and categorize by type",
+    "   - Validate data structure and completeness",
+    "",
+    "2. **Create Excel-Ready Categories**",
+    "   - Design EXACTLY 5 worksheet categories (revenue-focused)",
+    "   - Map each data point to appropriate category",
+    "   - Ensure all original data is preserved exactly",
+    "   - Create proper headers for Excel import",
+    "",
+    "3. **Build JSON Structure**",
+    "   - Create standardized JSON format for each category",
+    "   - Include headers, data arrays, and metadata",
+    "   - Preserve original values, units, and confidence scores",
+    "   - Add data validation and quality metrics",
+    "",
+    "4. **Save and Validate File**",
+    "   - Save as 'arranged_financial_data.json'",
+    "   - Validate JSON syntax and structure",
+    "   - Verify file exists and is readable",
+    "   - Confirm all data points are mapped correctly",
+    "",
+    "=== REQUIRED WORKSHEET CATEGORIES ===",
+    "Create EXACTLY these 5 categories (focus on revenue only):",
+    "",
+    "**1. Company_Overview**",
+    "- Company name, document type, reporting period",
+    "- Currency used, data extraction date",
+    "- Overall data quality summary",
+    "",
+    "**2. Total_Revenue**",
+    "- Consolidated/total revenue figures",
+    "- Year-over-year comparisons if available",
+    "- Revenue recognition notes",
+    "",
+    "**3. Segment_Revenue**",
+    "- Revenue by business segment/division",
+    "- Product vs Service revenue breakdowns",
+    "- Segment performance metrics",
+    "",
+    "**4. Regional_Revenue**",
+    "- Revenue by geographic region",
+    "- Country-specific revenue if available",
+    "- International vs domestic splits",
+    "",
+    "**5. Data_Quality**",
+    "- Confidence scores for each data point",
+    "- Source locations within document",
+    "- Data extraction notes and validation",
+    "",
+    "=== DATA MAPPING RULES ===",
+    "Map data points using these EXACT rules:",
+    "",
+    "- **Company_Overview**: Company name, document metadata, reporting periods",
+    "- **Total_Revenue**: 'Total Revenue', 'Net Sales', 'Consolidated Revenue'",
+    "- **Segment_Revenue**: All segment/division revenue breakdowns",
+    "- **Regional_Revenue**: All geographic/regional revenue breakdowns",
+    "- **Data_Quality**: Confidence scores, extraction metadata, validation notes",
+    "",
+    "**IGNORE**: All non-revenue data (expenses, assets, liabilities, cash flow, ratios)",
+    "",
+    "=== JSON STRUCTURE REQUIREMENTS ===",
+    "For each category, create this EXACT structure:",
+    "",
+    "```json",
+    "{",
+    "  \"[Category_Name]\": {",
+    "    \"headers\": {",
+    "      \"Item\": \"Revenue Item\",",
+    "      \"Value\": \"Amount\",",
+    "      \"Unit\": \"Currency/Scale\",",
+    "      \"Period\": \"Reporting Period\",",
+    "      \"Confidence\": \"Accuracy Score\"",
+    "    },",
+    "    \"data\": [",
+    "      {",
+    "        \"item\": \"[Original field name]\",",
+    "        \"value\": \"[Exact original value]\",",
+    "        \"unit\": \"[Original unit]\",",
+    "        \"period\": \"[Original period]\",",
+    "        \"confidence\": \"[Original confidence]\"",
+    "      }",
+    "    ],",
+    "    \"metadata\": {",
+    "      \"description\": \"[Category description]\",",
+    "      \"data_count\": \"[Number of items]\",",
+    "      \"quality_score\": \"[Average confidence]\"",
+    "    }",
+    "  }",
+    "}",
+    "```",
+    "",
+    "=== DATA PRESERVATION RULES ===",
+    "CRITICAL - You MUST follow these rules exactly:",
+    "- **NEVER** modify original data values",
+    "- **NEVER** perform calculations or analysis",
+    "- **NEVER** interpret or add insights",
+    "- **NEVER** change units or currency",
+    "- **NEVER** calculate growth rates or ratios",
+    "- **ONLY** organize and format for Excel import",
+    "",
+    "=== MANDATORY FILE OPERATIONS SEQUENCE ===",
+    "Execute these file operations in EXACT order with MANDATORY verification:",
+    "",
+    "1. **save_file('arranged_financial_data.json', json_content)**",
+    "   - Save the complete organized JSON structure",
+    "   - Use proper JSON formatting with indentation",
+    "   - Wait for save operation to complete",
+    "",
+    "2. **list_files() - MANDATORY VERIFICATION STEP**",
+    "   - IMMEDIATELY call list_files() after save_file()",
+    "   - Check if 'arranged_financial_data.json' appears in the file list",
+    "   - If file NOT found in list, STOP and retry save_file() operation",
+    "   - Do NOT proceed until file is confirmed in list_files() output",
+    "   - Verify file size is reasonable (>1KB)",
+    "   - This step is MANDATORY - never skip it",
+    "",
+    "3. **read_file('arranged_financial_data.json') - CONTENT VERIFICATION**",
+    "   - Read back the saved file to validate content",
+    "   - Parse JSON to ensure valid syntax and structure",
+    "   - Verify all expected data is present",
+    "",
+    "4. **MANDATORY Retry Logic (up to 3 attempts total)**",
+    "   - Attempt 1: save_file() → list_files() → read_file()",
+    "   - If list_files() doesn't show file: IMMEDIATELY retry save_file()",
+    "   - If read_file() fails: Fix JSON syntax and retry entire sequence",
+    "   - Attempt 2: Try alternative filename 'financial_data_arranged.json'",
+    "   - Attempt 3: Try filename with timestamp 'arranged_data_[timestamp].json'",
+    "   - NEVER proceed without successful file verification using list_files()",
+    "   - Each attempt MUST include the list_files() verification step",
+    "",
+    "=== ERROR HANDLING ===",
+    "If you encounter problems:",
+    "- **Empty data**: Create category with 'No revenue data available' entry",
+    "- **Invalid JSON**: Fix syntax errors and retry save",
+    "- **File save fails**: Try different filename and retry",
+    "- **Missing categories**: Create empty category with metadata",
+    "- **Data mapping unclear**: Place in 'Data_Quality' category with notes",
+    "",
+    "=== MANDATORY SUCCESS CRITERIA ===",
+    "Organization is successful ONLY if ALL criteria are met:",
+    "✅ save_file() operation completed successfully",
+    "✅ list_files() CONFIRMS file exists in directory listing",
+    "✅ File appears in list_files() output with reasonable size (>1KB)",
+    "✅ read_file() successfully reads the saved file",
+    "✅ JSON syntax is valid and well-formed when parsed",
+    "✅ All 5 revenue categories are present (even if empty)",
+    "✅ Every revenue data point is mapped to exactly one category",
+    "✅ No original values have been modified",
+    "✅ All worksheet names are Excel-compatible (no spaces/special chars)",
+    "✅ File verification sequence completed without errors",
+    "",
+    "**CRITICAL**: If list_files() does not show the file, declare FAILURE and retry immediately.",
+    "",
+    "=== MANDATORY QUALITY VALIDATION CHECKLIST ===",
+    "Before completing, MANDATORY verification steps:",
+    "☐ Step 1: save_file() completed successfully",
+    "☐ Step 2: list_files() shows the JSON file in directory",
+    "☐ Step 3: File size is reasonable (>1KB) in list_files() output",
+    "☐ Step 4: read_file() successfully reads the saved file",
+    "☐ Step 5: JSON parses without syntax errors",
+    "☐ Step 6: Company name preserved exactly from extraction",
+    "☐ Step 7: Total revenue data properly categorized",
+    "☐ Step 8: Segment revenue data organized logically",
+    "☐ Step 9: Regional revenue data grouped appropriately",
+    "☐ Step 10: All confidence scores preserved",
+    "☐ Step 11: JSON structure follows exact specification",
+    "☐ Step 12: All original data points accounted for",
+    "",
+    "**MANDATORY COMPLETION SEQUENCE**:",
+    "1. Execute save_file() → list_files() → read_file() sequence",
+    "2. If ANY step fails, retry immediately (up to 3 attempts)",
+    "3. Only declare success when list_files() confirms file existence",
+    "4. Always show the list_files() output in your final response",
+    "",
+    "**REMEMBER**: Focus ONLY on revenue data organization. Ignore all non-revenue financial data. Preserve all original values exactly. Your goal is 100% accuracy in organizing revenue data for Excel reporting.",
+    "**CRITICAL**: Do NOT end your turn until list_files() shows the saved file and read_file() confirms valid JSON content.",
+    "**FINAL STEP**: Always display the result of list_files() to prove file was saved successfully."
+  ],
+  "agent_type": "data_arranger",
+  "description": "Revenue-focused data organization agent for Excel-ready output",
+  "category": "agents"
+}
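
The save-then-verify loop these instructions mandate can be expressed compactly; a sketch using plain file I/O in place of the agent's save_file/list_files/read_file tools:

```python
# Sketch of the mandated save -> list -> read-back loop, with plain file I/O
# standing in for the agent's save_file/list_files/read_file tools.
import json
import os

REQUIRED_CATEGORIES = [
    "Company_Overview", "Total_Revenue", "Segment_Revenue",
    "Regional_Revenue", "Data_Quality",
]


def save_and_verify(data: dict, filename: str = "arranged_financial_data.json",
                    attempts: int = 3) -> str:
    for _ in range(attempts):
        with open(filename, "w") as f:
            json.dump(data, f, indent=2)

        # Verify: file listed, non-trivial size, parses back, all categories present.
        if filename in os.listdir(".") and os.path.getsize(filename) > 1024:
            with open(filename) as f:
                parsed = json.load(f)
            if all(cat in parsed for cat in REQUIRED_CATEGORIES):
                return filename
    raise RuntimeError(f"could not verify {filename} after {attempts} attempts")
```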
@@ -1,115 +1,160 @@
|
|
1 |
{
|
2 |
"instructions": [
|
3 |
-
"=== EXTRACTION
|
4 |
-
"You are a financial data extraction
|
5 |
-
"",
|
6 |
-
"
|
7 |
-
"
|
8 |
-
"
|
9 |
-
"
|
10 |
-
"
|
11 |
-
"
|
12 |
-
"
|
13 |
-
"",
|
14 |
-
"===
|
15 |
-
"
|
16 |
-
"
|
17 |
-
"
|
18 |
-
"
|
19 |
-
"
|
20 |
-
"",
|
21 |
-
"
|
22 |
-
"
|
23 |
-
"
|
24 |
-
"
|
25 |
-
"
|
26 |
-
"
|
27 |
-
"",
|
28 |
-
"
|
29 |
-
"
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"",
|
33 |
-
"
|
34 |
-
"
|
35 |
-
"",
|
36 |
-
"
|
37 |
-
"
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"
|
41 |
-
"
|
42 |
-
"
|
43 |
-
"
|
44 |
-
"
|
45 |
-
"",
|
46 |
-
"
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"
|
50 |
-
"
|
51 |
-
"
|
52 |
-
"",
|
53 |
-
"
|
54 |
-
"
|
55 |
-
"
|
56 |
-
"
|
57 |
-
"
|
58 |
-
"",
|
59 |
-
"
|
60 |
-
"
|
61 |
-
"",
|
62 |
-
"
|
63 |
-
"
|
64 |
-
"
|
65 |
-
"
|
66 |
-
"
|
67 |
-
"
|
68 |
-
"",
|
69 |
-
"
|
70 |
-
"
|
71 |
-
"
|
72 |
-
"
|
73 |
-
"
|
74 |
-
"",
|
75 |
-
"
|
76 |
-
"
|
77 |
-
"
|
78 |
-
"
|
79 |
-
"
|
80 |
-
"",
|
81 |
-
"
|
82 |
-
"
|
83 |
-
"
|
84 |
-
"
|
85 |
-
"
|
86 |
-
"
|
87 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
"",
|
89 |
"=== OUTPUT REQUIREMENTS ===",
|
90 |
-
"
|
91 |
-
"
|
92 |
-
"
|
93 |
-
"
|
94 |
-
"
|
95 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
"",
|
97 |
"=== ERROR HANDLING ===",
|
98 |
-
"
|
99 |
-
"
|
100 |
-
"
|
101 |
-
"
|
102 |
-
"
|
103 |
-
"",
|
104 |
-
"
|
105 |
-
"
|
106 |
-
"
|
107 |
-
"
|
108 |
-
"
|
109 |
-
"
|
110 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
],
|
112 |
"agent_type": "data_extractor",
|
113 |
-
"description": "
|
114 |
"category": "agents"
|
115 |
-
}
|
|
|
|
1 |
{
|
2 |
"instructions": [
|
3 |
+
"=== FINANCIAL DATA EXTRACTION AGENT ===",
|
4 |
+
"You are a financial data extraction agent - please keep going until the extraction task is completely resolved, before ending your turn.",
|
5 |
+
"",
|
6 |
+
"Your thinking should be thorough and so it's fine if it's very long. However, avoid unnecessary repetition and verbosity. You should be concise, but thorough.",
|
7 |
+
"",
|
8 |
+
"You MUST iterate and keep going until the extraction is perfect and complete.",
|
9 |
+
"",
|
10 |
+
"You have everything you need to resolve this extraction task. I want you to fully extract all required data autonomously before coming back.",
|
11 |
+
"",
|
12 |
+
"Only terminate your turn when you are sure that ALL required data points have been extracted and validated. Go through the document step by step, and make sure to verify that your extractions are correct. NEVER end your turn without having truly and completely extracted all required financial data.",
|
13 |
+
"",
|
14 |
+
"=== TOOLS AVAILABLE ===",
|
15 |
+
"You have access to these tools:",
|
16 |
+
"- Document analysis and text extraction (built-in)",
|
17 |
+
"- Pattern matching and search capabilities (built-in)",
|
18 |
+
"- Structured data output generation (built-in)",
|
19 |
+
"- File object processing for direct document upload",
|
20 |
+
"- ExtractedFinancialData model for structured output",
|
21 |
+
"",
|
22 |
+
"=== CORE MISSION ===",
|
23 |
+
"Extract ONLY these critical KPIs from financial documents with 100% accuracy:",
|
24 |
+
"1. Company Name (official legal name)",
|
25 |
+
"2. Total Revenue (latest period)",
|
26 |
+
"3. Segment Revenue (by business segments if available)",
|
27 |
+
"4. Regional Revenue (by geographic regions if available)",
|
28 |
+
"5. Document metadata (type, period, currency)",
|
29 |
+
"",
|
30 |
+
"=== WORKFLOW ===",
|
31 |
+
"",
|
32 |
+
"1. **Document Structure Analysis**",
|
33 |
+
" - Scan the entire document to understand its structure",
|
34 |
+
" - Identify document type (10-K, 10-Q, Annual Report, etc.)",
|
35 |
+
" - Locate financial statement sections",
|
36 |
+
" - Find segment and geographic breakdowns",
|
37 |
+
"",
|
38 |
+
"2. **Company Identification**",
|
39 |
+
" - Extract official company name from header/title",
|
40 |
+
" - Verify consistency throughout document",
|
41 |
+
" - If multiple entities, use parent company name",
|
42 |
+
"",
|
43 |
+
"3. **Revenue Extraction (CRITICAL)**",
|
44 |
+
" - Find total revenue/net sales for most recent period",
|
45 |
+
" - Look in: Income Statement, Consolidated Statements of Operations",
|
46 |
+
" - Search terms: 'Revenue', 'Net Sales', 'Total Revenue', 'Net Revenue'",
|
47 |
+
" - Extract exact value with currency and period",
|
48 |
+
"",
|
49 |
+
"4. **Segment Revenue Analysis**",
|
50 |
+
" - Locate segment reporting section (usually separate section)",
|
51 |
+
" - Extract revenue by business segment/division",
|
52 |
+
" - Common segments: Products, Services, Geographic, Business Units",
|
53 |
+
" - Ensure segment revenues add up to total (validation)",
|
54 |
+
"",
|
55 |
+
"5. **Regional Revenue Analysis**",
|
56 |
+
" - Find geographic revenue breakdown",
|
57 |
+
" - Look for: Americas, EMEA, APAC, US, International",
|
58 |
+
" - Extract revenue by major geographic regions",
|
59 |
+
" - Validate regional totals match consolidated revenue",
|
60 |
+
"",
|
61 |
+
"6. **Data Validation & Quality Check**",
|
62 |
+
" - Verify all extracted numbers are consistent",
|
63 |
+
" - Check that segments/regions sum to total revenue",
|
64 |
+
" - Assign confidence scores based on source clarity",
|
65 |
+
" - Ensure all mandatory fields are populated",
|
66 |
+
"",
|
67 |
+
"=== EXTRACTION PRIORITIES ===",
|
68 |
+
"Focus ONLY on these data points (ignore everything else):",
|
69 |
+
"",
|
70 |
+
"**MANDATORY (Must Extract):**",
|
71 |
+
"- Company Name",
|
72 |
+
"- Total Revenue (most recent period)",
|
73 |
+
"- Document Type",
|
74 |
+
"- Reporting Period",
|
75 |
+
"- Currency",
|
76 |
+
"",
|
77 |
+
"**HIGH VALUE (Extract if clearly present):**",
|
78 |
+
"- Segment Revenue breakdown",
|
79 |
+
"- Regional/Geographic Revenue breakdown",
|
80 |
+
"",
|
81 |
+
"**IGNORE:**",
|
82 |
+
"- Balance sheet items (assets, liabilities)",
|
83 |
+
"- Cash flow data",
|
84 |
+
"- Detailed expense breakdowns",
|
85 |
+
"- Ratios and per-share metrics",
|
86 |
+
"- Non-financial metrics",
|
87 |
+
"",
|
88 |
+
"=== CONFIDENCE SCORING ===",
|
89 |
+
"Assign confidence scores using these criteria:",
|
90 |
+
"- **1.0**: Data clearly stated in financial tables with labels",
|
91 |
+
"- **0.8**: Data stated in structured text with clear context",
|
92 |
+
"- **0.6**: Data derived from calculations or subtotals",
|
93 |
+
"- **0.4**: Data estimated or context somewhat unclear",
|
94 |
+
"- **0.2**: Data barely visible or questionable source",
|
95 |
+
"- **0.0**: Data not found or completely unclear",
|
96 |
"",
|
97 |
"=== OUTPUT REQUIREMENTS ===",
|
98 |
+
"You MUST return structured data using ExtractedFinancialData model:",
|
99 |
+
"",
|
100 |
+
"```json",
|
101 |
+
"{",
|
102 |
+
" \"company_name\": \"[Official Company Name]\",",
|
103 |
+
" \"document_type\": \"[10-K|10-Q|Annual Report|Quarterly Report|Other]\",",
|
104 |
+
" \"reporting_period\": \"[FY 2023|Q1 2024|etc.]\",",
|
105 |
+
" \"currency\": \"[USD|EUR|etc.]\",",
|
106 |
+
" \"data_points\": [",
|
107 |
+
" {",
|
108 |
+
" \"field_name\": \"Total Revenue\",",
|
109 |
+
" \"value\": \"$50.3 billion\",",
|
110 |
+
" \"category\": \"Revenue\",",
|
111 |
+
" \"period\": \"FY 2023\",",
|
112 |
+
" \"unit\": \"USD billions\",",
|
113 |
+
" \"confidence\": 1.0",
|
114 |
+
" },",
|
115 |
+
" {",
|
116 |
+
" \"field_name\": \"Product Revenue\",",
|
117 |
+
" \"value\": \"$30.2 billion\",",
|
118 |
+
" \"category\": \"Segment Revenue\",",
|
119 |
+
" \"period\": \"FY 2023\",",
|
120 |
+
" \"unit\": \"USD billions\",",
|
121 |
+
" \"confidence\": 0.9",
|
122 |
+
" }",
|
123 |
+
" ],",
|
124 |
+
" \"summary\": \"[2-3 sentences describing key revenue findings]\"",
|
125 |
+
"}",
|
126 |
+
"```",
|
127 |
"",
|
128 |
"=== ERROR HANDLING ===",
|
129 |
+
"If you encounter problems:",
|
130 |
+
"- **Document unreadable**: Extract what you can with confidence 0.2",
|
131 |
+
"- **No revenue data**: Create entries with 'Not Found' and confidence 0.0",
|
132 |
+
"- **Multiple periods**: Use most recent complete period",
|
133 |
+
"- **Currency unclear**: Note as 'Currency not specified'",
|
134 |
+
"- **Segment data missing**: Focus on total revenue only",
|
135 |
+
"",
|
136 |
+
"=== SUCCESS CRITERIA ===",
|
137 |
+
"Extraction is successful ONLY if:",
|
138 |
+
"β Company name extracted (never empty)",
|
139 |
+
"β Total revenue extracted with confidence > 0.5",
|
140 |
+
"β Document type and period identified",
|
141 |
+
"β All data points have required fields",
|
142 |
+
"β Confidence scores are between 0.0-1.0",
|
143 |
+
"β Summary describes key findings in 2-3 sentences",
|
144 |
+
"",
|
145 |
+
"=== QUALITY VALIDATION ===",
|
146 |
+
"Before completing, verify:",
|
147 |
+
"β‘ Company name is official legal name",
|
148 |
+
"β‘ Revenue figures are from most recent period",
|
149 |
+
"β‘ Segment revenues (if present) add up to total",
|
150 |
+
"β‘ Regional revenues (if present) add up to total",
|
151 |
+
"β‘ All confidence scores justified",
|
152 |
+
"β‘ Output follows exact JSON structure",
|
153 |
+
"",
|
154 |
+
"**REMEMBER**: Focus ONLY on company name and revenue data. Ignore all other financial metrics. Be systematic, thorough, and precise. Your goal is 100% accuracy on these core KPIs."
|
155 |
],
|
156 |
"agent_type": "data_extractor",
|
157 |
+
"description": "Revenue-focused financial data extraction agent with segment and regional analysis",
|
158 |
"category": "agents"
|
159 |
+
}
|
160 |
+
|
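The instructions above repeatedly reference an ExtractedFinancialData model without defining it in this hunk. A minimal Pydantic-style sketch consistent with the JSON example in the instructions may help; the field names mirror that example, but the class layout itself is an assumption, not the repository's actual definition:

```python
# Hypothetical sketch of the ExtractedFinancialData model the instructions refer to.
# Field names come from the JSON example above; the Pydantic layout is assumed.
from typing import List
from pydantic import BaseModel, Field

class FinancialDataPoint(BaseModel):
    field_name: str   # e.g. "Total Revenue"
    value: str        # e.g. "$50.3 billion"
    category: str     # "Revenue" | "Segment Revenue" | "Regional Revenue"
    period: str       # e.g. "FY 2023"
    unit: str         # e.g. "USD billions"
    confidence: float = Field(ge=0.0, le=1.0)  # scored per the rubric above

class ExtractedFinancialData(BaseModel):
    company_name: str
    document_type: str
    reporting_period: str
    currency: str
    data_points: List[FinancialDataPoint]
    summary: str      # 2-3 sentence revenue summary
```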
@@ -1,129 +1,246 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
=== YOUR OBJECTIVE ===
|
4 |
-
Transform 'arranged_financial_data.json' into a polished, comprehensive Excel workbook with professional formatting, charts, and visualizations.
|
5 |
-
|
6 |
-
=== INPUT DATA ===
|
7 |
-
• File: 'arranged_financial_data.json'
|
8 |
-
• Use read_file tool to load and analyze the JSON structure
|
9 |
-
• Examine categories, headers, metadata, and data organization
|
10 |
-
|
11 |
-
=== EXCEL WORKBOOK REQUIREMENTS ===
|
12 |
-
Create comprehensive worksheets based on JSON categories:
|
13 |
-
1. Executive Summary (key metrics, charts, highlights)
|
14 |
-
2. Income Statement (formatted P&L statement)
|
15 |
-
3. Balance Sheet - Assets (professional layout)
|
16 |
-
4. Balance Sheet - Liabilities & Equity
|
17 |
-
5. Cash Flow Statement (operating, investing, financing)
|
18 |
-
6. Financial Ratios & Analysis
|
19 |
-
7. Revenue Analysis & Breakdown
|
20 |
-
8. Expense Analysis & Breakdown
|
21 |
-
9. Charts & Visualizations Dashboard
|
22 |
-
10. Data Sources & Methodology
|
23 |
-
|
24 |
-
=== PROFESSIONAL FORMATTING STANDARDS ===
|
25 |
-
Apply consistent, professional formatting:
|
26 |
-
Visual Design:
|
27 |
-
• Company header with report title and date
|
28 |
-
• Consistent fonts: Calibri 11pt (body), 14pt (headers)
|
29 |
-
• Color scheme: Blue headers (#4472C4), alternating row colors
|
30 |
-
• Professional borders and gridlines
|
31 |
-
|
32 |
-
Data Formatting:
|
33 |
-
• Currency formatting for monetary values
|
34 |
-
• Percentage formatting for ratios
|
35 |
-
• Thousands separators for large numbers
|
36 |
-
• Appropriate decimal places (2 for currency, 1 for percentages)
|
37 |
-
|
38 |
-
Layout Optimization:
|
39 |
-
• Auto-sized columns for readability
|
40 |
-
• Freeze panes for easy navigation
|
41 |
-
• Centered headers with bold formatting
|
42 |
-
• Left-aligned text, right-aligned numbers
|
43 |
-
|
44 |
-
=== CHART & VISUALIZATION REQUIREMENTS ===
|
45 |
-
Include appropriate charts for data visualization:
|
46 |
-
Chart Types by Data Category:
|
47 |
-
• Revenue trends: Line charts
|
48 |
-
• Expense breakdown: Pie charts
|
49 |
-
• Asset composition: Stacked bar charts
|
50 |
-
• Financial ratios: Column charts
|
51 |
-
• Cash flow: Waterfall charts (if possible)
|
52 |
-
|
53 |
-
=== PYTHON SCRIPT STRUCTURE ===
|
54 |
-
Create 'generate_excel_report.py' with this structure:
|
55 |
-
```python
|
56 |
-
import os, json, datetime, logging
|
57 |
-
from openpyxl import Workbook
|
58 |
-
from openpyxl.styles import Font, PatternFill, Border, Alignment, NamedStyle
|
59 |
-
from openpyxl.chart import BarChart, LineChart, PieChart
|
60 |
-
from openpyxl.utils.dataframe import dataframe_to_rows
|
61 |
|
70 |
-
# Define professional styles
|
81 |
-
def
|
82 |
try:
|
84 |
wb = Workbook()
|
93 |
wb.save(filename)
|
96 |
except Exception as e:
|
100 |
if __name__ == '__main__':
|
102 |
```
|
114 |
=== SUCCESS CRITERIA ===
|
1 |
+
REVENUE EXCEL REPORT GENERATION TASK
|
2 |
|
3 |
+
=== YOUR MISSION ===
|
4 |
+
Create a professional Excel report from arranged_financial_data.json focusing ONLY on revenue data.
|
5 |
+
Generate a business-ready revenue analysis report with a 100% success rate.
|
6 |
+
You are using gemini-2.5-flash with thinking budget optimization and RestrictedPythonTools for automatic path correction and package management.
|
7 |
|
8 |
+
=== WHAT TO CREATE ===
|
9 |
+
• Professional Excel file with revenue-focused worksheets
|
10 |
+
• Clean, business-ready formatting for executives
|
11 |
+
• Focus exclusively on revenue analysis and visualization
|
12 |
+
• File ready for immediate business use
|
13 |
|
14 |
+
=== MANDATORY EXECUTION SEQUENCE ===
|
15 |
|
16 |
+
**STEP 1: Environment Setup (30 seconds)**
|
17 |
+
```python
|
18 |
+
# RestrictedPythonTools automatically installs packages when needed
|
19 |
+
# Just use run_python_code() - packages will be auto-installed
|
20 |
+
import pandas as pd
|
21 |
+
import openpyxl
|
22 |
+
print("Packages will be auto-installed by RestrictedPythonTools")
|
23 |
+
```
|
24 |
|
25 |
+
**STEP 2: Revenue Data Loading (30 seconds)**
|
26 |
+
- read_file('arranged_financial_data.json')
|
27 |
+
- Parse and validate revenue data structure
|
28 |
+
- Count revenue categories and data points
|
29 |
+
- Log: "Revenue data loaded: X categories, Y revenue points"
|
30 |
+
|
31 |
+
**STEP 3: Revenue Excel Script Creation (3 minutes)**
|
32 |
+
Create 'generate_revenue_report.py' with this EXACT structure:
|
33 |
+
|
34 |
+
```python
|
35 |
+
#!/usr/bin/env python3
|
36 |
+
import os
|
37 |
+
import sys
|
38 |
+
import json
|
39 |
+
import pandas as pd
|
40 |
+
from openpyxl import Workbook
|
41 |
+
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
|
42 |
+
from datetime import datetime
|
43 |
+
import logging
|
44 |
|
45 |
+
# Configure logging
|
46 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
47 |
+
logger = logging.getLogger(__name__)
|
48 |
|
49 |
+
def main():
|
50 |
try:
|
51 |
+
# Load revenue data
|
52 |
+
logger.info('Loading revenue data from arranged_financial_data.json')
|
53 |
+
with open('arranged_financial_data.json', 'r', encoding='utf-8') as f:
|
54 |
+
revenue_data = json.load(f)
|
55 |
+
|
56 |
+
# Create professional workbook
|
57 |
+
logger.info('Creating revenue analysis workbook')
|
58 |
wb = Workbook()
|
59 |
+
wb.remove(wb.active) # Remove default sheet
|
60 |
+
|
61 |
+
# Define professional styling
|
62 |
+
header_font = Font(bold=True, color='FFFFFF', size=12)
|
63 |
+
header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')
|
64 |
+
data_font = Font(size=11)
|
65 |
+
|
66 |
+
# Process each revenue category
|
67 |
+
revenue_categories = ['Company_Overview', 'Total_Revenue', 'Segment_Revenue', 'Regional_Revenue', 'Data_Quality']
|
68 |
|
69 |
+
for category_name in revenue_categories:
|
70 |
+
if category_name in revenue_data:
|
71 |
+
logger.info(f'Creating worksheet: {category_name}')
|
72 |
+
category_data = revenue_data[category_name]
|
73 |
+
ws = wb.create_sheet(title=category_name)
|
74 |
+
|
75 |
+
# Add professional headers
|
76 |
+
headers = ['Revenue Item', 'Amount', 'Currency/Unit', 'Period', 'Confidence Score']
|
77 |
+
for col, header in enumerate(headers, 1):
|
78 |
+
cell = ws.cell(row=1, column=col, value=header)
|
79 |
+
cell.font = header_font
|
80 |
+
cell.fill = header_fill
|
81 |
+
cell.alignment = Alignment(horizontal='center', vertical='center')
|
82 |
+
|
83 |
+
# Add revenue data
|
84 |
+
data_rows = category_data.get('data', [])
|
85 |
+
for row_idx, data_row in enumerate(data_rows, 2):
|
86 |
+
ws.cell(row=row_idx, column=1, value=data_row.get('item', '')).font = data_font
|
87 |
+
ws.cell(row=row_idx, column=2, value=data_row.get('value', '')).font = data_font
|
88 |
+
ws.cell(row=row_idx, column=3, value=data_row.get('unit', '')).font = data_font
|
89 |
+
ws.cell(row=row_idx, column=4, value=data_row.get('period', '')).font = data_font
|
90 |
+
ws.cell(row=row_idx, column=5, value=data_row.get('confidence', '')).font = data_font
|
91 |
+
|
92 |
+
# Auto-size columns for professional appearance
|
93 |
+
for column in ws.columns:
|
94 |
+
max_length = 0
|
95 |
+
column_letter = column[0].column_letter
|
96 |
+
for cell in column:
|
97 |
+
try:
|
98 |
+
if len(str(cell.value or '')) > max_length:
|
99 |
+
max_length = len(str(cell.value or ''))
|
100 |
+
except:
|
101 |
+
pass
|
102 |
+
adjusted_width = min(max(max_length + 2, 15), 50)
|
103 |
+
ws.column_dimensions[column_letter].width = adjusted_width
|
104 |
+
|
105 |
+
# Add borders for professional look
|
106 |
+
thin_border = Border(
|
107 |
+
left=Side(style='thin'),
|
108 |
+
right=Side(style='thin'),
|
109 |
+
top=Side(style='thin'),
|
110 |
+
bottom=Side(style='thin')
|
111 |
+
)
|
112 |
+
|
113 |
+
for row in ws.iter_rows(min_row=1, max_row=len(data_rows)+1, min_col=1, max_col=5):
|
114 |
+
for cell in row:
|
115 |
+
cell.border = thin_border
|
116 |
+
|
117 |
+
# Save with professional filename
|
118 |
+
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
119 |
+
filename = f'Revenue_Analysis_Report_{timestamp}.xlsx'
|
120 |
wb.save(filename)
|
121 |
+
logger.info(f'Revenue report saved as: {filename}')
|
122 |
+
|
123 |
+
# Verify file creation and quality
|
124 |
+
if os.path.exists(filename):
|
125 |
+
file_size = os.path.getsize(filename)
|
126 |
+
if file_size > 5000: # Minimum 5KB
|
127 |
+
logger.info(f'SUCCESS: Revenue report created successfully')
|
128 |
+
logger.info(f'File: {filename} ({file_size:,} bytes)')
|
129 |
+
logger.info(f'Worksheets: {len(wb.sheetnames)}')
|
130 |
+
print(f'REVENUE_REPORT_SUCCESS: {filename}')
|
131 |
+
return filename
|
132 |
+
else:
|
133 |
+
raise Exception(f'File too small ({file_size} bytes), likely corrupted')
|
134 |
+
else:
|
135 |
+
raise Exception('Excel file was not created')
|
136 |
+
|
137 |
+
except FileNotFoundError as e:
|
138 |
+
logger.error(f'Revenue data file not found: {str(e)}')
|
139 |
+
sys.exit(1)
|
140 |
+
except json.JSONDecodeError as e:
|
141 |
+
logger.error(f'Invalid JSON in revenue data: {str(e)}')
|
142 |
+
sys.exit(1)
|
143 |
except Exception as e:
|
144 |
+
logger.error(f'Error creating revenue report: {str(e)}')
|
145 |
+
import traceback
|
146 |
+
logger.error(f'Traceback: {traceback.format_exc()}')
|
147 |
+
sys.exit(1)
|
148 |
|
149 |
if __name__ == '__main__':
|
150 |
+
result = main()
|
151 |
+
print(f'COMPLETED: {result}')
|
152 |
```
|
153 |
|
154 |
+
**STEP 4: Script Execution with RestrictedPythonTools (2 minutes)**
|
155 |
+
- Use run_python_code([complete_script]) for direct execution with auto-healing
|
156 |
+
- OR save_python_file('generate_revenue_report.py', [complete_script]) + run_shell_command('python generate_revenue_report.py')
|
157 |
+
- RestrictedPythonTools automatically handles path correction and directory constraints
|
158 |
+
- Automatic package installation and error recovery built-in
|
159 |
+
- If execution fails, RestrictedPythonTools will attempt automatic recovery
|
160 |
+
|
161 |
+
**STEP 5: Excel File Verification (CRITICAL - 30 seconds)**
|
162 |
+
- list_files() to check if Excel file exists in directory
|
163 |
+
- If Excel file NOT found in list_files(), retry script execution immediately
|
164 |
+
- run_shell_command('ls -la *Revenue*.xlsx') for detailed file info
|
165 |
+
- run_shell_command('du -h *Revenue*.xlsx') to verify file size > 5KB
|
166 |
+
- NEVER report success without Excel file confirmed in list_files()
|
167 |
+
|
168 |
+
=== REVENUE REPORT SPECIFICATIONS ===
|
169 |
+
|
170 |
+
**File Structure:**
|
171 |
+
- Filename: Revenue_Analysis_Report_YYYYMMDD_HHMMSS.xlsx
|
172 |
+
- 5 worksheets focusing exclusively on revenue data
|
173 |
+
- Professional corporate formatting throughout
|
174 |
+
|
175 |
+
**Worksheet Details:**
|
176 |
+
1. **Company_Overview** - Company info, document metadata
|
177 |
+
2. **Total_Revenue** - Consolidated revenue figures and totals
|
178 |
+
3. **Segment_Revenue** - Revenue by business segment/division
|
179 |
+
4. **Regional_Revenue** - Revenue by geographic region
|
180 |
+
5. **Data_Quality** - Confidence scores and data validation
|
181 |
+
|
182 |
+
**Professional Formatting:**
|
183 |
+
- Headers: Bold white text on navy blue background (#1F4E79)
|
184 |
+
- Data: Clean 11pt font with professional alignment
|
185 |
+
- Borders: Thin borders around all data cells
|
186 |
+
- Columns: Auto-sized for optimal readability (15-50 characters)
|
187 |
+
- Layout: Business-ready presentation format
|
188 |
+
|
189 |
+
=== ERROR HANDLING PROCEDURES ===
|
190 |
+
|
191 |
+
**Package Installation Issues:**
|
192 |
+
- Try: pip install --user openpyxl pandas
|
193 |
+
- Try: python3 -m pip install openpyxl pandas
|
194 |
+
- Try: pip install --no-cache-dir openpyxl
|
195 |
+
|
196 |
+
**Revenue Data Loading Issues:**
|
197 |
+
- Verify arranged_financial_data.json exists
|
198 |
+
- Check JSON syntax and structure
|
199 |
+
- Ensure revenue categories are present
|
200 |
+
|
201 |
+
**Excel Generation Issues:**
|
202 |
+
- Log exact openpyxl error messages
|
203 |
+
- Try simplified formatting if complex formatting fails
|
204 |
+
- Check file write permissions in directory
|
205 |
+
- Verify Python version compatibility
|
206 |
+
|
207 |
+
**File Verification Issues:**
|
208 |
+
- Check file exists and has reasonable size (>5KB)
|
209 |
+
- Verify Excel file can be opened without corruption
|
210 |
+
- Confirm all expected worksheets are present
|
211 |
|
212 |
=== SUCCESS CRITERIA ===
|
213 |
+
Revenue Excel generation is successful ONLY if:
|
214 |
+
✓ openpyxl package installed without errors
|
215 |
+
✓ Revenue data loaded and parsed successfully
|
216 |
+
✓ Python script executed without errors
|
217 |
+
✓ Excel file created with proper filename format
|
218 |
+
✓ File size > 5KB indicating data was written
|
219 |
+
✓ All 5 revenue worksheets present and populated
|
220 |
+
✓ Professional formatting applied consistently
|
221 |
+
✓ File opens without corruption in Excel
|
222 |
+
|
223 |
+
=== PROFESSIONAL FEATURES ===
|
224 |
+
Your Excel report MUST include:
|
225 |
+
- **Corporate Design**: Professional navy blue headers with white text
|
226 |
+
- **Business Layout**: Clean, executive-ready formatting
|
227 |
+
- **Data Integrity**: All original revenue values preserved exactly
|
228 |
+
- **User Experience**: Auto-sized columns, proper alignment, clear borders
|
229 |
+
- **File Management**: Timestamped filename for version control
|
230 |
+
- **Quality Assurance**: Comprehensive error handling and validation
|
231 |
+
|
232 |
+
=== FINAL VALIDATION CHECKLIST ===
|
233 |
+
Before reporting success, verify:
|
234 |
+
☐ All required packages installed successfully
|
235 |
+
☐ Revenue data JSON loaded and parsed correctly
|
236 |
+
☐ Python script saved and executed without errors
|
237 |
+
☐ Excel file created with timestamped filename
|
238 |
+
☐ File size indicates successful data population (>5KB)
|
239 |
+
☐ All 5 revenue worksheets present and properly named
|
240 |
+
☐ Revenue data populated correctly in each worksheet
|
241 |
+
☐ Professional formatting applied consistently
|
242 |
+
☐ No execution errors or warnings in output
|
243 |
+
☐ File can be opened by Excel applications
|
244 |
+
|
245 |
+
Execute now. Focus EXCLUSIVELY on revenue data visualization. Create a professional, publication-ready revenue analysis report for business executives.
|
246 |
+
|
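STEP 5 above insists on verifying the workbook before reporting success. A standalone sketch of that check using only openpyxl and the standard library; the glob pattern, the five worksheet names, and the 5 KB threshold come from the prompt itself, while wrapping them in a helper function is an assumption:

```python
# Sketch of the STEP 5 verification: confirm the report exists, is above the
# 5 KB threshold, and opens cleanly with all five expected worksheets.
import glob
import os
from openpyxl import load_workbook

EXPECTED_SHEETS = {"Company_Overview", "Total_Revenue", "Segment_Revenue",
                   "Regional_Revenue", "Data_Quality"}

def verify_revenue_report() -> str:
    candidates = sorted(glob.glob("Revenue_Analysis_Report_*.xlsx"))
    if not candidates:
        raise FileNotFoundError("No Revenue_Analysis_Report_*.xlsx found - retry the script")
    filename = candidates[-1]  # the newest timestamped file sorts last
    if os.path.getsize(filename) <= 5000:
        raise ValueError(f"{filename} is under 5 KB, likely corrupted")
    wb = load_workbook(filename, read_only=True)  # raises if the file is corrupt
    missing = EXPECTED_SHEETS - set(wb.sheetnames)
    if missing:
        raise ValueError(f"Missing worksheets: {sorted(missing)}")
    return filename
```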
@@ -1,34 +1,35 @@
|
|
1 |
-
|
2 |
|
3 |
-
|
4 |
-
• A single JSON object saved as arranged_financial_data.json
|
5 |
-
• Fields required: categories, headers, metadata
|
6 |
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
8. Save the JSON as arranged_financial_data.json via save_file.
|
23 |
-
9. Use list_files to confirm the file exists, then read_file to validate its content.
|
24 |
-
10. If the file is missing or malformed, fix the issue and repeat steps 8–9.
|
25 |
-
11. Only report success after the file passes both existence and content checks.
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
|
34 |
-
|
1 |
+
ORGANIZE REVENUE DATA FOR EXCEL
|
2 |
|
3 |
+
Your task: Organize the provided revenue data into a JSON file with 5 categories.
|
4 |
|
5 |
+
CATEGORIES TO CREATE:
|
6 |
+
1. Company_Overview - Company details and metadata
|
7 |
+
2. Total_Revenue - All total/consolidated revenue figures
|
8 |
+
3. Segment_Revenue - Revenue by business segment/division
|
9 |
+
4. Regional_Revenue - Revenue by geographic region
|
10 |
+
5. Data_Quality - Confidence scores and extraction notes
|
11 |
|
12 |
+
JSON STRUCTURE:
|
13 |
+
{
|
14 |
+
"Category_Name": {
|
15 |
+
"headers": {"Item": "...", "Value": "...", "Unit": "...", "Period": "...", "Confidence": "..."},
|
16 |
+
"data": [{"item": "...", "value": "...", "unit": "...", "period": "...", "confidence": "..."}],
|
17 |
+
"metadata": {"description": "...", "data_count": "...", "quality_score": "..."}
|
18 |
+
}
|
19 |
+
}
|
20 |
|
21 |
+
STEPS:
|
22 |
+
1. Parse the revenue data below
|
23 |
+
2. Map each data point to the correct category
|
24 |
+
3. Create the JSON structure
|
25 |
+
4. save_file('arranged_financial_data.json', json_content)
|
26 |
+
5. list_files() to verify the file exists
|
27 |
+
6. If file not found, retry save_file() once
|
28 |
|
29 |
+
RULES:
|
30 |
+
- Use EXACT values from the input data (no modifications)
|
31 |
+
- Use EXACT company name, currency, and period from input
|
32 |
+
- Focus ONLY on revenue data (ignore expenses, assets, etc.)
|
33 |
+
|
34 |
+
The revenue data to organize:
|
35 |
+
{extracted_data}
|
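The prompt above only sketches the target JSON shape, so a small validator of that shape may be useful. A minimal sketch assuming the category and key names shown in the prompt; the helper name is hypothetical:

```python
# Minimal validator for the arranged_financial_data.json shape described above;
# category and key names come from the prompt, the helper itself is hypothetical.
import json

REQUIRED_CATEGORIES = ["Company_Overview", "Total_Revenue", "Segment_Revenue",
                       "Regional_Revenue", "Data_Quality"]
ROW_KEYS = {"item", "value", "unit", "period", "confidence"}

def validate_arranged_file(path: str = "arranged_financial_data.json") -> None:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)                      # fails fast on invalid JSON
    for category in REQUIRED_CATEGORIES:
        block = data[category]                   # KeyError -> category missing
        assert {"headers", "data", "metadata"} <= set(block), category
        for row in block["data"]:
            assert ROW_KEYS <= set(row), (category, row)
```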
@@ -0,0 +1,160 @@
|
1 |
+
REVENUE DATA ORGANIZATION TASK
|
2 |
+
|
3 |
+
=== YOUR MISSION ===
|
4 |
+
Organize ONLY the extracted revenue data and prepare it for Excel-based reporting.
|
5 |
+
Focus exclusively on revenue-related data - ignore all other financial information.
|
6 |
+
You are using gemini-2.5-pro with thinking budget optimization.
|
7 |
+
|
8 |
+
CRITICAL: You MUST use the ACTUAL extracted data provided below.
|
9 |
+
NEVER create fake/sample data. Use EXACT company names, values, periods, and currencies from the provided extraction.
|
10 |
+
|
11 |
+
=== EXTRACTED REVENUE DATA ===
|
12 |
+
(Data will be provided below)
|
13 |
+
|
14 |
+
=== WHAT TO DELIVER ===
|
15 |
+
• A single JSON file saved as arranged_financial_data.json
|
16 |
+
• ONLY 5 revenue-focused categories for Excel worksheets
|
17 |
+
• Preserve all original revenue data exactly
|
18 |
+
|
19 |
+
=== REVENUE-FOCUSED ORGANIZATION ===
|
20 |
+
Create EXACTLY these 5 Excel-ready categories (revenue only):
|
21 |
+
|
22 |
+
**1. Company_Overview**
|
23 |
+
- Company name, document type, reporting period
|
24 |
+
- Currency, extraction date, data quality summary
|
25 |
+
|
26 |
+
**2. Total_Revenue**
|
27 |
+
- Consolidated revenue figures
|
28 |
+
- Total revenue metrics from income statement
|
29 |
+
- Year-over-year revenue if available
|
30 |
+
|
31 |
+
**3. Segment_Revenue**
|
32 |
+
- Revenue by business segment/division
|
33 |
+
- Product vs service revenue breakdowns
|
34 |
+
- Business unit revenue performance
|
35 |
+
|
36 |
+
**4. Regional_Revenue**
|
37 |
+
- Revenue by geographic region
|
38 |
+
- Country-specific revenue data
|
39 |
+
- International vs domestic revenue splits
|
40 |
+
|
41 |
+
**5. Data_Quality**
|
42 |
+
- Confidence scores for each revenue data point
|
43 |
+
- Source information and validation notes
|
44 |
+
- Revenue extraction metadata
|
45 |
+
|
46 |
+
=== STEP-BY-STEP PROCESS ===
|
47 |
+
|
48 |
+
**Step 1: Revenue Data Analysis (1 minute)**
|
49 |
+
- Parse the extracted revenue data completely - USE THE ACTUAL PROVIDED DATA
|
50 |
+
- Count total revenue data points from the PROVIDED extraction
|
51 |
+
- Identify revenue categories (total, segment, regional) from ACTUAL data
|
52 |
+
- Filter out all non-revenue data points from PROVIDED data
|
53 |
+
- NEVER create sample/fake data - ONLY use the provided extracted data
|
54 |
+
- Log: "Revenue analysis complete: X revenue points identified from PROVIDED data"
|
55 |
+
|
56 |
+
**Step 2: Revenue Data Mapping (2 minutes)**
|
57 |
+
- Map ONLY the PROVIDED revenue data points to appropriate categories:
|
58 |
+
- Total Revenue from PROVIDED data β "Total_Revenue"
|
59 |
+
- Segment/Division Revenue from PROVIDED data β "Segment_Revenue"
|
60 |
+
- Geographic/Regional Revenue from PROVIDED data β "Regional_Revenue"
|
61 |
+
- Company metadata from PROVIDED data β "Company_Overview"
|
62 |
+
- Confidence/source data from PROVIDED data β "Data_Quality"
|
63 |
+
- IGNORE all non-revenue data (expenses, assets, liabilities, etc.)
|
64 |
+
- CRITICAL: Use EXACT values, company names, periods, and currencies from the PROVIDED extracted data
|
65 |
+
|
66 |
+
**Step 3: JSON Structure Creation (2 minutes)**
|
67 |
+
Create this EXACT structure using ONLY the PROVIDED extracted data:
|
68 |
+
|
69 |
+
```json
|
70 |
+
{
|
71 |
+
"[Category_Name]": {
|
72 |
+
"headers": {
|
73 |
+
"Item": "Revenue Item",
|
74 |
+
"Value": "Amount",
|
75 |
+
"Unit": "Currency/Scale",
|
76 |
+
"Period": "Reporting Period",
|
77 |
+
"Confidence": "Accuracy Score"
|
78 |
+
},
|
79 |
+
"data": [
|
80 |
+
{
|
81 |
+
"item": "[EXACT field name from PROVIDED data]",
|
82 |
+
"value": "[EXACT value from PROVIDED data - no modifications]",
|
83 |
+
"unit": "[EXACT unit from PROVIDED data]",
|
84 |
+
"period": "[EXACT period from PROVIDED data]",
|
85 |
+
"confidence": "[EXACT confidence from PROVIDED data]"
|
86 |
+
}
|
87 |
+
],
|
88 |
+
"metadata": {
|
89 |
+
"description": "[Category description]",
|
90 |
+
"data_count": "[Number of items from PROVIDED data]",
|
91 |
+
"quality_score": "[Average confidence from PROVIDED data]"
|
92 |
+
}
|
93 |
+
}
|
94 |
+
}
|
95 |
+
```
|
96 |
+
|
97 |
+
**CRITICAL RULES FOR DATA USAGE:**
|
98 |
+
- Use EXACT company name from provided metadata (e.g., "Deutsche Telekom AG")
|
99 |
+
- Use EXACT currency from provided data (e.g., "EUR")
|
100 |
+
- Use EXACT reporting period from provided data (e.g., "FY 2023")
|
101 |
+
- Use EXACT revenue values from provided data (e.g., "111,985 million")
|
102 |
+
- NEVER create fake/sample data like "Global Corp", "Q2 2025", or made-up numbers
|
103 |
+
|
104 |
+
**Step 4: File Operations with Verification (1 minute)**
|
105 |
+
- save_file('arranged_financial_data.json', complete_json_structure)
|
106 |
+
- list_files() to verify file exists - CRITICAL VERIFICATION STEP
|
107 |
+
- If file NOT found in list_files(), retry save_file() operation immediately
|
108 |
+
- read_file('arranged_financial_data.json') to validate JSON syntax
|
109 |
+
- If any step fails, retry up to 3 times total
|
110 |
+
- Log: "Revenue data organization complete: file saved and validated"
|
111 |
+
|
112 |
+
=== DATA PRESERVATION RULES ===
|
113 |
+
CRITICAL - You MUST follow these rules exactly:
|
114 |
+
- **NEVER** modify original revenue values
|
115 |
+
- **NEVER** perform calculations or analysis
|
116 |
+
- **NEVER** interpret or add insights
|
117 |
+
- **NEVER** change currency units or scales
|
118 |
+
- **ONLY** organize revenue data for Excel import
|
119 |
+
- **IGNORE** all non-revenue financial data completely
|
120 |
+
|
121 |
+
=== REVENUE DATA VALIDATION ===
|
122 |
+
Before saving, verify:
|
123 |
+
- Company name preserved exactly from extraction
|
124 |
+
- Total revenue data properly categorized
|
125 |
+
- Segment revenue breakdowns organized logically
|
126 |
+
- Regional revenue data grouped appropriately
|
127 |
+
- All confidence scores preserved
|
128 |
+
- All original values unchanged
|
129 |
+
|
130 |
+
=== FILE OPERATIONS SEQUENCE ===
|
131 |
+
Execute in EXACT order with verification:
|
132 |
+
1. **save_file('arranged_financial_data.json', json_content)**
|
133 |
+
2. **list_files() - CRITICAL VERIFICATION STEP**
|
134 |
+
- Check if 'arranged_financial_data.json' appears in the file list
|
135 |
+
- If file NOT found, retry save_file() operation immediately
|
136 |
+
3. **read_file('arranged_financial_data.json')** - validate JSON syntax
|
137 |
+
4. **Retry up to 3 times total if any step fails**
|
138 |
+
- NEVER proceed without file confirmation in list_files()
|
139 |
+
|
140 |
+
=== SUCCESS CRITERIA ===
|
141 |
+
Organization is successful ONLY if:
|
142 |
+
✓ arranged_financial_data.json saved and confirmed in list_files()
|
143 |
+
✓ File exists and is readable via read_file()
|
144 |
+
✓ JSON syntax is valid and well-formed
|
145 |
+
✓ All 5 revenue categories are present (even if empty)
|
146 |
+
✓ Every revenue data point mapped to exactly one category
|
147 |
+
✓ No original revenue values modified
|
148 |
+
✓ All non-revenue data filtered out
|
149 |
+
✓ File validation passes completely
|
150 |
+
|
151 |
+
=== ERROR HANDLING ===
|
152 |
+
If you encounter issues:
|
153 |
+
- **Empty revenue data**: Create categories with "No revenue data available"
|
154 |
+
- **Invalid JSON**: Fix syntax errors and retry save
|
155 |
+
- **File save fails**: Try different filename and retry
|
156 |
+
- **Missing categories**: Create empty category with metadata
|
157 |
+
- **Non-revenue data**: Filter out completely, focus only on revenue
|
158 |
+
|
159 |
+
Execute now. Focus EXCLUSIVELY on revenue data organization. Preserve all revenue values exactly as extracted.
|
160 |
+
|
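To make the target concrete, one filled-in category built from the illustrative values the prompt itself cites (Deutsche Telekom AG / EUR / FY 2023 / "111,985 million") might look like this; the example is hypothetical and only demonstrates the shape:

```python
# Hypothetical worked example of one category, using the illustrative values
# cited in the prompt above; real output must come from the provided extraction.
import json

arranged = {
    "Total_Revenue": {
        "headers": {"Item": "Revenue Item", "Value": "Amount", "Unit": "Currency/Scale",
                    "Period": "Reporting Period", "Confidence": "Accuracy Score"},
        "data": [
            {"item": "Total Revenue", "value": "111,985 million", "unit": "EUR",
             "period": "FY 2023", "confidence": "1.0"},
        ],
        "metadata": {"description": "Consolidated revenue figures",
                     "data_count": "1", "quality_score": "1.0"},
    },
}
print(json.dumps(arranged, indent=2))
```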
@@ -1,58 +1,135 @@
|
|
|
42 |
|
43 |
=== OUTPUT REQUIREMENTS ===
|
44 |
-
Return
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
REVENUE-FOCUSED FINANCIAL DATA EXTRACTION
|
2 |
+
|
3 |
+
=== DOCUMENT TO ANALYZE ===
|
4 |
+
File: {file_path}
|
5 |
+
(Document will be provided directly to you for analysis)
|
6 |
+
|
7 |
+
=== YOUR MISSION ===
|
8 |
+
Extract ONLY revenue-related financial data from the provided document with 100% accuracy.
|
9 |
+
Focus exclusively on company name and revenue data - ignore all other financial metrics.
|
10 |
+
You are using gemini-2.5-pro with thinking budget optimization.
|
11 |
+
|
12 |
+
=== WHAT TO EXTRACT (REVENUE ONLY) ===
|
13 |
+
|
14 |
+
**MANDATORY (Must Extract):**
|
15 |
+
1. **Company Name** - Official legal company name
|
16 |
+
2. **Total Revenue** - Consolidated revenue/net sales for most recent period
|
17 |
+
3. **Document Type** - 10-K, 10-Q, Annual Report, Quarterly Report, etc.
|
18 |
+
4. **Reporting Period** - FY 2023, Q1 2024, etc.
|
19 |
+
5. **Currency** - USD, EUR, etc.
|
20 |
+
|
21 |
+
**HIGH VALUE (Extract if clearly present):**
|
22 |
+
6. **Segment Revenue** - Revenue by business segment/division/product line
|
23 |
+
7. **Regional Revenue** - Revenue by geographic region/country
|
24 |
+
|
25 |
+
**IGNORE COMPLETELY:**
|
26 |
+
- Net income, profit, losses
|
27 |
+
- Assets, liabilities, equity
|
28 |
+
- Cash flow data
|
29 |
+
- Expenses, costs, operating income
|
30 |
+
- Balance sheet items
|
31 |
+
- Ratios, per-share metrics
|
32 |
+
- Non-financial data
|
33 |
+
|
34 |
+
=== SYSTEMATIC EXTRACTION PROCESS ===
|
35 |
+
|
36 |
+
**Step 1: Document Structure Analysis**
|
37 |
+
- Scan document to understand structure and layout
|
38 |
+
- Identify document type and reporting period
|
39 |
+
- Locate revenue-related sections (Income Statement, Segment Reporting, Geographic Data)
|
40 |
+
|
41 |
+
**Step 2: Company Identification**
|
42 |
+
- Extract official company name from document header/title
|
43 |
+
- Verify name consistency throughout document
|
44 |
+
- Use parent company name if multiple entities present
|
45 |
+
|
46 |
+
**Step 3: Total Revenue Extraction (CRITICAL)**
|
47 |
+
- Find consolidated revenue figure for most recent period
|
48 |
+
- Look in: Consolidated Statements of Operations, Income Statement
|
49 |
+
- Search terms: "Revenue", "Net Sales", "Total Revenue", "Net Revenue"
|
50 |
+
- Record exact value with currency and time period
|
51 |
+
|
52 |
+
**Step 4: Segment Revenue Analysis**
|
53 |
+
- Locate segment reporting section (usually separate section after financial statements)
|
54 |
+
- Extract revenue by business segment, division, or product line
|
55 |
+
- Common segments: Products, Services, Geographic regions, Business units
|
56 |
+
- Ensure segment revenues sum to total revenue for validation
|
57 |
+
|
58 |
+
**Step 5: Regional Revenue Analysis**
|
59 |
+
- Find geographic revenue breakdown section
|
60 |
+
- Look for revenue by: Americas, EMEA, APAC, US vs International, specific countries
|
61 |
+
- Extract revenue figures for major geographic regions
|
62 |
+
- Validate regional totals match consolidated revenue
|
63 |
+
|
64 |
+
**Step 6: Data Validation**
|
65 |
+
- Verify company name is not empty
|
66 |
+
- Confirm total revenue has high confidence score (>0.7)
|
67 |
+
- Check that segment/regional breakdowns sum to total
|
68 |
+
- Ensure all mandatory fields are extracted
|
69 |
+
|
70 |
+
=== CONFIDENCE SCORING (REVENUE DATA ONLY) ===
|
71 |
+
- **1.0**: Revenue clearly stated in financial table with proper labels
|
72 |
+
- **0.8**: Revenue stated in structured text with clear context
|
73 |
+
- **0.6**: Revenue derived from segment/regional totals
|
74 |
+
- **0.4**: Revenue estimated or context somewhat unclear
|
75 |
+
- **0.2**: Revenue barely visible or questionable source
|
76 |
+
- **0.0**: Revenue not found or completely unclear
|
77 |
|
78 |
=== OUTPUT REQUIREMENTS ===
|
79 |
+
Return ExtractedFinancialData with ONLY revenue-related data:
|
80 |
+
|
81 |
+
```json
|
82 |
+
{
|
83 |
+
"company_name": "[Official Company Name]",
|
84 |
+
"document_type": "[10-K|10-Q|Annual Report|etc.]",
|
85 |
+
"reporting_period": "[FY 2023|Q1 2024|etc.]",
|
86 |
+
"currency": "[USD|EUR|etc.]",
|
87 |
+
"data_points": [
|
88 |
+
{
|
89 |
+
"field_name": "Total Revenue",
|
90 |
+
"value": "$50.3 billion",
|
91 |
+
"category": "Revenue",
|
92 |
+
"period": "FY 2023",
|
93 |
+
"unit": "USD billions",
|
94 |
+
"confidence": 1.0
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"field_name": "Product Revenue",
|
98 |
+
"value": "$30.2 billion",
|
99 |
+
"category": "Segment Revenue",
|
100 |
+
"period": "FY 2023",
|
101 |
+
"unit": "USD billions",
|
102 |
+
"confidence": 0.9
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"field_name": "Americas Revenue",
|
106 |
+
"value": "$25.1 billion",
|
107 |
+
"category": "Regional Revenue",
|
108 |
+
"period": "FY 2023",
|
109 |
+
"unit": "USD billions",
|
110 |
+
"confidence": 0.8
|
111 |
+
}
|
112 |
+
],
|
113 |
+
"summary": "[2-3 sentences describing key revenue findings and trends]"
|
114 |
+
}
|
115 |
+
```
|
116 |
+
|
117 |
+
=== SUCCESS CRITERIA ===
|
118 |
+
Extraction is successful ONLY if:
|
119 |
+
✓ Company name extracted (never empty)
|
120 |
+
✓ Total revenue extracted with confidence > 0.5
|
121 |
+
✓ Document type and period identified
|
122 |
+
✓ Currency specified
|
123 |
+
✓ All data points are revenue-related only
|
124 |
+
✓ Summary focuses on revenue insights (2-3 sentences)
|
125 |
+
✓ Segment/regional data sums to total (if present)
|
126 |
+
|
127 |
+
=== REVENUE EXTRACTION STRATEGY ===
|
128 |
+
1. **Income Statement First** - Look for consolidated revenue in primary financial statements
|
129 |
+
2. **Segment Section Second** - Find detailed segment revenue breakdowns
|
130 |
+
3. **Geographic Section Third** - Locate regional revenue data
|
131 |
+
4. **Management Discussion** - Check for revenue highlights and explanations
|
132 |
+
5. **Tables Over Text** - Prioritize tabular data over narrative mentions
|
133 |
+
|
134 |
+
**Remember**: Focus EXCLUSIVELY on revenue data. Ignore all other financial metrics. Your goal is 100% accuracy on revenue extraction with proper segment and regional breakdowns.
|
135 |
+
|
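Step 6 of this prompt asks the agent to check that segment and regional breakdowns reconcile with the consolidated figure. A rough numeric sketch of that check; the string parsing is deliberately simplified and the helper names are assumptions:

```python
# Rough sketch of the Step 6 reconciliation check: do segment values sum to the
# consolidated total within a small tolerance? String parsing here is simplified.
import re

def to_number(value: str) -> float:
    """Pull the numeric part out of a formatted figure like '$50.3 billion'."""
    match = re.search(r"[-+]?[\d,]*\.?\d+", value)
    if match is None:
        raise ValueError(f"No number in {value!r}")
    return float(match.group().replace(",", ""))

def segments_reconcile(total: str, segments: list[str], tolerance: float = 0.01) -> bool:
    total_value = to_number(total)
    segment_sum = sum(to_number(s) for s in segments)
    return abs(segment_sum - total_value) <= tolerance * total_value

# e.g. segments_reconcile("$50.3 billion", ["$30.2 billion", "$20.1 billion"]) -> True
```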
@@ -0,0 +1,470 @@
|
1 |
+
"""
|
2 |
+
RestrictedPythonTools - Self-Healing Python Execution with Shell Backend
|
3 |
+
|
4 |
+
This toolkit provides Python code execution with built-in directory constraints,
|
5 |
+
path auto-correction, and self-healing capabilities. Uses RestrictedShellTools
|
6 |
+
as the backend execution engine, mirroring Claude Code's architecture.
|
7 |
+
"""
|
8 |
+
|
9 |
+
import os
|
10 |
+
import re
|
11 |
+
import ast
|
12 |
+
import sys
|
13 |
+
import json
|
14 |
+
import time
|
15 |
+
import uuid
|
16 |
+
import tempfile
|
17 |
+
from pathlib import Path
|
18 |
+
from typing import Optional, Dict, Any, List
|
19 |
+
from agno.tools import Toolkit
|
20 |
+
from agno.utils.log import logger
|
21 |
+
|
22 |
+
from .shell_toolkit import RestrictedShellTools
|
23 |
+
|
24 |
+
|
25 |
+
class RestrictedPythonTools(Toolkit):
|
26 |
+
"""
|
27 |
+
Self-healing Python execution toolkit with directory constraints.
|
28 |
+
|
29 |
+
Uses RestrictedShellTools as backend for secure, constrained Python execution.
|
30 |
+
Includes automatic path correction, package installation, and error recovery.
|
31 |
+
"""
|
32 |
+
|
33 |
+
def __init__(self, base_dir: Optional[Path] = None, **kwargs):
|
34 |
+
"""
|
35 |
+
Initialize the restricted Python toolkit.
|
36 |
+
|
37 |
+
Args:
|
38 |
+
base_dir: Base directory to constrain all Python operations to
|
39 |
+
**kwargs: Additional arguments passed to parent Toolkit
|
40 |
+
"""
|
41 |
+
self.base_dir = Path(base_dir) if base_dir else Path.cwd()
|
42 |
+
self.base_dir.mkdir(parents=True, exist_ok=True)
|
43 |
+
|
44 |
+
# Initialize backend tools
|
45 |
+
self.shell_tools = RestrictedShellTools(base_dir=self.base_dir)
|
46 |
+
|
47 |
+
# Track installed packages to avoid redundant installations
|
48 |
+
self.installed_packages = set()
|
49 |
+
|
50 |
+
# Initialize toolkit with Python execution functions
|
51 |
+
super().__init__(
|
52 |
+
name="restricted_python_tools",
|
53 |
+
tools=[
|
54 |
+
self.run_python_code,
|
55 |
+
self.install_package,
|
56 |
+
self.save_python_file,
|
57 |
+
self.list_python_files,
|
58 |
+
self.validate_python_syntax
|
59 |
+
],
|
60 |
+
**kwargs
|
61 |
+
)
|
62 |
+
|
63 |
+
logger.info(f"RestrictedPythonTools initialized with base_dir: {self.base_dir}")
|
64 |
+
|
65 |
+
def run_python_code(self, code: str, timeout: int = 120) -> str:
|
66 |
+
"""
|
67 |
+
Execute Python code with self-healing and directory constraints.
|
68 |
+
|
69 |
+
Args:
|
70 |
+
code (str): Python code to execute
|
71 |
+
timeout (int): Maximum execution time in seconds
|
72 |
+
|
73 |
+
Returns:
|
74 |
+
str: Output from code execution or error message
|
75 |
+
"""
|
76 |
+
try:
|
77 |
+
# Step 1: Auto-correct and heal the code
|
78 |
+
healed_code = self._heal_python_code(code)
|
79 |
+
|
80 |
+
# Step 2: Validate syntax before execution
|
81 |
+
syntax_result = self.validate_python_syntax(healed_code)
|
82 |
+
if "Error" in syntax_result:
|
83 |
+
return f"Syntax Error: {syntax_result}"
|
84 |
+
|
85 |
+
# Step 3: Extract and auto-install required packages
|
86 |
+
self._auto_install_packages(healed_code)
|
87 |
+
|
88 |
+
# Step 4: Create temporary Python file
|
89 |
+
temp_filename = f"temp_script_{uuid.uuid4().hex[:8]}.py"
|
90 |
+
temp_filepath = self.base_dir / temp_filename
|
91 |
+
|
92 |
+
try:
|
93 |
+
# Save healed code to temporary file
|
94 |
+
with open(temp_filepath, 'w', encoding='utf-8') as f:
|
95 |
+
f.write(healed_code)
|
96 |
+
|
97 |
+
logger.info(f"Executing Python code via shell backend: {temp_filename}")
|
98 |
+
|
99 |
+
# Step 5: Execute via RestrictedShellTools backend
|
100 |
+
execution_command = f"python3 {temp_filename}"
|
101 |
+
result = self.shell_tools.run_shell_command(execution_command, timeout=timeout)
|
102 |
+
|
103 |
+
# Step 6: Check for common errors and attempt recovery
|
104 |
+
if self._has_execution_errors(result):
|
105 |
+
recovery_result = self._attempt_error_recovery(healed_code, result, temp_filename, timeout)
|
106 |
+
if recovery_result:
|
107 |
+
result = recovery_result
|
108 |
+
|
109 |
+
return result
|
110 |
+
|
111 |
+
finally:
|
112 |
+
# Cleanup temporary file
|
113 |
+
if temp_filepath.exists():
|
114 |
+
temp_filepath.unlink()
|
115 |
+
|
116 |
+
except Exception as e:
|
117 |
+
error_msg = f"Error executing Python code: {str(e)}"
|
118 |
+
logger.error(error_msg)
|
119 |
+
return error_msg
|
120 |
+
|
121 |
+
def _heal_python_code(self, code: str) -> str:
|
122 |
+
"""
|
123 |
+
Auto-correct common path and directory issues in Python code.
|
124 |
+
|
125 |
+
Args:
|
126 |
+
code (str): Original Python code
|
127 |
+
|
128 |
+
Returns:
|
129 |
+
str: Healed Python code with corrected paths
|
130 |
+
"""
|
131 |
+
healed_code = code
|
132 |
+
|
133 |
+
# Path correction patterns
|
134 |
+
path_corrections = [
|
135 |
+
# Fix relative paths that go outside base directory
|
136 |
+
(r'\.\./', ''),
|
137 |
+
(r'\.\./\.\./', ''),
|
138 |
+
(r'\.\.\\', ''),
|
139 |
+
|
140 |
+
# Convert absolute paths to relative paths within base directory
|
141 |
+
(r'["\']\/[^"\']*\/([^"\'\/]+\.(xlsx?|csv|json|txt|py))["\']', r'"\1"'),
|
142 |
+
|
143 |
+
# Fix common pandas path issues
|
144 |
+
(r'pd\.to_excel\(["\'][^"\']*\/([^"\'\/]+\.xlsx?)["\']', r'pd.to_excel("\1"'),
|
145 |
+
(r'pd\.read_excel\(["\'][^"\']*\/([^"\'\/]+\.xlsx?)["\']', r'pd.read_excel("\1"'),
|
146 |
+
(r'pd\.to_csv\(["\'][^"\']*\/([^"\'\/]+\.csv)["\']', r'pd.to_csv("\1"'),
|
147 |
+
|
148 |
+
# Fix file operations
|
149 |
+
(r'open\(["\'][^"\']*\/([^"\'\/]+)["\']', r'open("\1"'),
|
150 |
+
(r'with open\(["\'][^"\']*\/([^"\'\/]+)["\']', r'with open("\1"'),
|
151 |
+
]
|
152 |
+
|
153 |
+
for pattern, replacement in path_corrections:
|
154 |
+
healed_code = re.sub(pattern, replacement, healed_code)
|
155 |
+
|
156 |
+
# Add working directory insurance at the beginning
|
157 |
+
directory_insurance = f"""
|
158 |
+
import os
|
159 |
+
import sys
|
160 |
+
|
161 |
+
# Ensure we're in the correct working directory
|
162 |
+
base_dir = r'{self.base_dir}'
|
163 |
+
if os.getcwd() != base_dir:
|
164 |
+
os.chdir(base_dir)
|
165 |
+
print(f"Working directory corrected to: {{os.getcwd()}}")
|
166 |
+
|
167 |
+
"""
|
168 |
+
|
169 |
+
# Add directory insurance to the beginning of the code
|
170 |
+
healed_code = directory_insurance + healed_code
|
171 |
+
|
172 |
+
logger.debug(f"Code healing applied - original length: {len(code)}, healed length: {len(healed_code)}")
|
173 |
+
return healed_code
|
174 |
+
|
175 |
+
def _extract_required_packages(self, code: str) -> List[str]:
|
176 |
+
"""
|
177 |
+
Extract package names from import statements in Python code.
|
178 |
+
|
179 |
+
Args:
|
180 |
+
code (str): Python code to analyze
|
181 |
+
|
182 |
+
Returns:
|
183 |
+
List[str]: List of package names that need to be installed
|
184 |
+
"""
|
185 |
+
packages = set()
|
186 |
+
|
187 |
+
# Built-in modules that don't need installation
|
188 |
+
builtin_modules = {
|
189 |
+
'os', 'sys', 'json', 'time', 'datetime', 'uuid', 'tempfile',
|
190 |
+
're', 'ast', 'pathlib', 'math', 'random', 'subprocess',
|
191 |
+
'collections', 'itertools', 'functools', 'logging', 'io',
|
192 |
+
'csv', 'xml', 'urllib', 'http', 'email', 'sqlite3'
|
193 |
+
}
|
194 |
+
|
195 |
+
# Common package mappings (import name -> pip package name)
|
196 |
+
package_mappings = {
|
197 |
+
'pandas': 'pandas',
|
198 |
+
'numpy': 'numpy',
|
199 |
+
'openpyxl': 'openpyxl',
|
200 |
+
'xlsxwriter': 'xlsxwriter',
|
201 |
+
'matplotlib': 'matplotlib',
|
202 |
+
'seaborn': 'seaborn',
|
203 |
+
'plotly': 'plotly',
|
204 |
+
'requests': 'requests',
|
205 |
+
'beautifulsoup4': 'beautifulsoup4',
|
206 |
+
'bs4': 'beautifulsoup4',
|
207 |
+
'sklearn': 'scikit-learn',
|
208 |
+
'cv2': 'opencv-python',
|
209 |
+
'PIL': 'Pillow',
|
210 |
+
'yaml': 'PyYAML',
|
211 |
+
}
|
212 |
+
|
213 |
+
# Extract import statements using regex
|
214 |
+
import_patterns = [
|
215 |
+
r'^import\s+([a-zA-Z_][a-zA-Z0-9_]*)',
|
216 |
+
r'^from\s+([a-zA-Z_][a-zA-Z0-9_]*)\s+import',
|
217 |
+
]
|
218 |
+
|
219 |
+
for line in code.split('\n'):
|
220 |
+
line = line.strip()
|
221 |
+
for pattern in import_patterns:
|
222 |
+
match = re.match(pattern, line)
|
223 |
+
if match:
|
224 |
+
package_name = match.group(1)
|
225 |
+
|
226 |
+
# Skip built-in modules
|
227 |
+
if package_name in builtin_modules:
|
228 |
+
continue
|
229 |
+
|
230 |
+
# Map to pip package name if known
|
231 |
+
pip_package = package_mappings.get(package_name, package_name)
|
232 |
+
packages.add(pip_package)
|
233 |
+
|
234 |
+
return list(packages)
|
235 |
+
|
236 |
+
def _auto_install_packages(self, code: str) -> None:
|
237 |
+
"""
|
238 |
+
Automatically install required packages for the Python code.
|
239 |
+
|
240 |
+
Args:
|
241 |
+
code (str): Python code to analyze for package requirements
|
242 |
+
"""
|
243 |
+
required_packages = self._extract_required_packages(code)
|
244 |
+
|
245 |
+
for package in required_packages:
|
246 |
+
if package not in self.installed_packages:
|
247 |
+
logger.info(f"Auto-installing package: {package}")
|
248 |
+
install_result = self.install_package(package)
|
249 |
+
if "successfully" in install_result.lower():
|
250 |
+
self.installed_packages.add(package)
|
251 |
+
else:
|
252 |
+
logger.warning(f"Failed to install package {package}: {install_result}")
|
253 |
+
|
254 |
+
def _has_execution_errors(self, result: str) -> bool:
|
255 |
+
"""
|
256 |
+
Check if execution result contains errors that might be recoverable.
|
257 |
+
|
258 |
+
Args:
|
259 |
+
result (str): Execution result to check
|
260 |
+
|
261 |
+
Returns:
|
262 |
+
bool: True if recoverable errors are detected
|
263 |
+
"""
|
264 |
+
error_indicators = [
|
265 |
+
"ModuleNotFoundError",
|
266 |
+
"ImportError",
|
267 |
+
"FileNotFoundError",
|
268 |
+
"PermissionError",
|
269 |
+
"No such file or directory",
|
270 |
+
]
|
271 |
+
|
272 |
+
return any(error in result for error in error_indicators)
|
273 |
+
|
274 |
+
def _attempt_error_recovery(self, code: str, error_result: str, temp_filename: str, timeout: int) -> Optional[str]:
|
275 |
+
"""
|
276 |
+
Attempt to recover from execution errors.
|
277 |
+
|
278 |
+
Args:
|
279 |
+
code (str): Original code that failed
|
280 |
+
error_result (str): Error message from failed execution
|
281 |
+
temp_filename (str): Temporary file name used
|
282 |
+
timeout (int): Execution timeout
|
283 |
+
|
284 |
+
Returns:
|
285 |
+
Optional[str]: Recovery result if successful, None if recovery failed
|
286 |
+
"""
|
287 |
+
try:
|
288 |
+
# Recovery attempt 1: Install missing packages
|
289 |
+
if "ModuleNotFoundError" in error_result or "ImportError" in error_result:
|
290 |
+
logger.info("Attempting recovery: Installing missing packages")
|
291 |
+
|
292 |
+
# Extract package name from error message
|
293 |
+
missing_package_match = re.search(r"No module named '([^']+)'", error_result)
|
294 |
+
if missing_package_match:
|
295 |
+
missing_package = missing_package_match.group(1)
|
296 |
+
install_result = self.install_package(missing_package)
|
297 |
+
|
298 |
+
if "successfully" in install_result.lower():
|
299 |
+
logger.info(f"Recovery successful: Installed {missing_package}")
|
300 |
+
# Retry execution
|
301 |
+
retry_result = self.shell_tools.run_shell_command(f"python3 {temp_filename}", timeout=timeout)
|
302 |
+
return retry_result
|
303 |
+
|
304 |
+
# Recovery attempt 2: Fix file path issues
|
305 |
+
if "FileNotFoundError" in error_result or "No such file or directory" in error_result:
|
306 |
+
logger.info("Attempting recovery: Fixing file path issues")
|
307 |
+
|
308 |
+
# Create any missing directories that might be referenced
|
309 |
+
self.shell_tools.run_shell_command("mkdir -p data reports output")
|
310 |
+
|
311 |
+
# Retry execution
|
312 |
+
retry_result = self.shell_tools.run_shell_command(f"python3 {temp_filename}", timeout=timeout)
|
313 |
+
return retry_result
|
314 |
+
|
315 |
+
except Exception as e:
|
316 |
+
logger.error(f"Error recovery failed: {str(e)}")
|
317 |
+
|
318 |
+
return None
|
319 |
+
|
320 |
+
def install_package(self, package_name: str) -> str:
|
321 |
+
"""
|
322 |
+
Install a Python package using pip via shell backend.
|
323 |
+
|
324 |
+
Args:
|
325 |
+
package_name (str): Name of the package to install
|
326 |
+
|
327 |
+
Returns:
|
328 |
+
str: Installation result message
|
329 |
+
"""
|
330 |
+
try:
|
331 |
+
logger.info(f"Installing Python package: {package_name}")
|
332 |
+
|
333 |
+
# Try multiple installation methods
|
334 |
+
install_commands = [
|
335 |
+
f"pip3 install {package_name}",
|
336 |
+
f"python3 -m pip install {package_name}",
|
337 |
+
f"pip install {package_name}",
|
338 |
+
]
|
339 |
+
|
340 |
+
for command in install_commands:
|
341 |
+
result = self.shell_tools.run_shell_command(command, timeout=120)
|
342 |
+
|
343 |
+
if "Successfully installed" in result or "already satisfied" in result:
|
344 |
+
self.installed_packages.add(package_name)
|
345 |
+
return f"Package '{package_name}' installed successfully"
|
346 |
+
|
347 |
+
# Stop trying alternative installers once a command runs without an explicit error
|
348 |
+
if "error" not in result.lower():
|
349 |
+
break
|
350 |
+
|
351 |
+
return f"Package installation failed: {result}"
|
352 |
+
|
353 |
+
except Exception as e:
|
354 |
+
error_msg = f"Error installing package '{package_name}': {str(e)}"
|
355 |
+
logger.error(error_msg)
|
356 |
+
return error_msg
|
357 |
+
|
358 |
+
def save_python_file(self, filename: str, code: str) -> str:
|
359 |
+
"""
|
360 |
+
Save Python code to a file in the base directory.
|
361 |
+
|
362 |
+
Args:
|
363 |
+
filename (str): Name of the Python file
|
364 |
+
code (str): Python code content
|
365 |
+
|
366 |
+
Returns:
|
367 |
+
str: Success/failure message
|
368 |
+
"""
|
369 |
+
try:
|
370 |
+
if not filename.endswith('.py'):
|
371 |
+
filename += '.py'
|
372 |
+
|
373 |
+
filepath = self.base_dir / filename
|
374 |
+
|
375 |
+
# Heal the code before saving
|
376 |
+
healed_code = self._heal_python_code(code)
|
377 |
+
|
378 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
379 |
+
f.write(healed_code)
|
380 |
+
|
381 |
+
logger.info(f"Python file saved: {filename}")
|
382 |
+
return f"Python file '{filename}' saved successfully to {self.base_dir}"
|
383 |
+
|
384 |
+
except Exception as e:
|
385 |
+
error_msg = f"Error saving Python file '{filename}': {str(e)}"
|
386 |
+
logger.error(error_msg)
|
387 |
+
return error_msg
|
388 |
+
|
389 |
+
def list_python_files(self) -> str:
|
390 |
+
"""
|
391 |
+
List all Python files in the base directory.
|
392 |
+
|
393 |
+
Returns:
|
394 |
+
str: List of Python files
|
395 |
+
"""
|
396 |
+
try:
|
397 |
+
python_files = list(self.base_dir.glob("*.py"))
|
398 |
+
|
399 |
+
if not python_files:
|
400 |
+
return "No Python files found in the base directory"
|
401 |
+
|
402 |
+
file_list = []
|
403 |
+
for file_path in python_files:
|
404 |
+
file_stat = file_path.stat()
|
405 |
+
file_info = f"{file_path.name} ({file_stat.st_size} bytes, modified: {time.ctime(file_stat.st_mtime)})"
|
406 |
+
file_list.append(file_info)
|
407 |
+
|
408 |
+
return "Python files in base directory:\n" + "\n".join(file_list)
|
409 |
+
|
410 |
+
except Exception as e:
|
411 |
+
error_msg = f"Error listing Python files: {str(e)}"
|
412 |
+
logger.error(error_msg)
|
413 |
+
return error_msg
|
414 |
+
|
415 |
+
def validate_python_syntax(self, code: str) -> str:
|
416 |
+
"""
|
417 |
+
Validate Python code syntax without executing it.
|
418 |
+
|
419 |
+
Args:
|
420 |
+
code (str): Python code to validate
|
421 |
+
|
422 |
+
Returns:
|
423 |
+
str: Validation result message
|
424 |
+
"""
|
425 |
+
try:
|
426 |
+
# Parse the code to check for syntax errors
|
427 |
+
ast.parse(code)
|
428 |
+
return "Python syntax is valid"
|
429 |
+
|
430 |
+
except SyntaxError as e:
|
431 |
+
error_msg = f"Syntax Error at line {e.lineno}: {e.msg}"
|
432 |
+
logger.warning(f"Python syntax validation failed: {error_msg}")
|
433 |
+
return error_msg
|
434 |
+
|
435 |
+
except Exception as e:
|
436 |
+
error_msg = f"Error validating Python syntax: {str(e)}"
|
437 |
+
logger.error(error_msg)
|
438 |
+
return error_msg
|
439 |
+
|
440 |
+
def get_base_directory(self) -> str:
|
441 |
+
"""
|
442 |
+
Get the current base directory path.
|
443 |
+
|
444 |
+
Returns:
|
445 |
+
str: Absolute path of the base directory
|
446 |
+
"""
|
447 |
+
return str(self.base_dir.absolute())
|
448 |
+
|
449 |
+
def clear_temp_files(self) -> str:
|
450 |
+
"""
|
451 |
+
Clean up any temporary Python files in the base directory.
|
452 |
+
|
453 |
+
Returns:
|
454 |
+
str: Cleanup result message
|
455 |
+
"""
|
456 |
+
try:
|
457 |
+
temp_files = list(self.base_dir.glob("temp_script_*.py"))
|
458 |
+
|
459 |
+
if not temp_files:
|
460 |
+
return "No temporary files to clean up"
|
461 |
+
|
462 |
+
for temp_file in temp_files:
|
463 |
+
temp_file.unlink()
|
464 |
+
|
465 |
+
return f"Cleaned up {len(temp_files)} temporary Python files"
|
466 |
+
|
467 |
+
except Exception as e:
|
468 |
+
error_msg = f"Error cleaning up temporary files: {str(e)}"
|
469 |
+
logger.error(error_msg)
|
470 |
+
return error_msg
|
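A short usage sketch of the toolkit defined above, calling only methods it actually exposes (run_python_code and list_python_files); the session directory path is a placeholder:

```python
# Usage sketch for RestrictedPythonTools; every call below is a method defined
# above. The base directory path is a placeholder, not a real session path.
from pathlib import Path
from utils.restricted_python_tools import RestrictedPythonTools

tools = RestrictedPythonTools(base_dir=Path("/tmp/session_workdir"))

# Syntax is validated and pandas is auto-installed before execution;
# path healing keeps the CSV inside the base directory.
print(tools.run_python_code(
    "import pandas as pd\n"
    "pd.DataFrame({'revenue': [1, 2]}).to_csv('out.csv', index=False)\n"
    "print('saved out.csv')\n"
))
print(tools.list_python_files())  # temp scripts are cleaned up after each run
```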
@@ -0,0 +1,137 @@
|
"""
Custom Shell Toolkit with Base Directory Support

This toolkit provides shell command execution constrained to a specific base directory,
preventing agents from navigating outside their assigned working directory.
"""

import os
import subprocess
from pathlib import Path
from typing import List, Optional

from agno.tools import Toolkit
from agno.utils.log import logger


class RestrictedShellTools(Toolkit):
    """
    Shell toolkit that restricts command execution to a specific base directory.

    This ensures agents cannot navigate outside their assigned working directory,
    solving the issue of files being saved in the wrong locations.
    """

    def __init__(self, base_dir: Optional[Path] = None, **kwargs):
        """
        Initialize the restricted shell toolkit.

        Args:
            base_dir: Base directory to constrain all shell operations to
            **kwargs: Additional arguments passed to the parent Toolkit
        """
        self.base_dir = Path(base_dir) if base_dir else Path.cwd()

        # Ensure the base directory exists
        self.base_dir.mkdir(parents=True, exist_ok=True)

        # Register the shell command function with the toolkit
        super().__init__(
            name="restricted_shell_tools",
            tools=[self.run_shell_command],
            **kwargs
        )

        logger.info(f"RestrictedShellTools initialized with base_dir: {self.base_dir}")

    def run_shell_command(self, command: str, timeout: int = 30) -> str:
        """
        Runs a shell command in the constrained base directory.

        Args:
            command (str): The shell command to execute
            timeout (int): Maximum execution time in seconds

        Returns:
            str: The output of the command or an error message
        """
        try:
            # Log the command and working directory
            logger.info(f"Executing shell command in {self.base_dir}: {command}")

            # Remember the caller's working directory so it can be restored
            original_cwd = os.getcwd()

            try:
                # Change to the base directory before executing the command
                os.chdir(self.base_dir)

                # Execute the command; cwd= also pins the child process to
                # base_dir, so the chdir above is belt-and-braces
                result = subprocess.run(
                    command,
                    shell=True,
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                    cwd=str(self.base_dir)  # Explicitly set working directory
                )

                # Log execution details
                logger.debug(f"Command executed with return code: {result.returncode}")

                if result.returncode != 0:
                    error_msg = f"Command failed with return code {result.returncode}\nSTDERR: {result.stderr}\nSTDOUT: {result.stdout}"
                    logger.warning(error_msg)
                    return error_msg

                # Return successful output
                output = result.stdout.strip()
                logger.debug(f"Command output: {output[:200]}{'...' if len(output) > 200 else ''}")
                return output

            finally:
                # Always restore the original working directory
                os.chdir(original_cwd)

        except subprocess.TimeoutExpired:
            error_msg = f"Command timed out after {timeout} seconds: {command}"
            logger.error(error_msg)
            return error_msg

        except Exception as e:
            error_msg = f"Error executing command '{command}': {str(e)}"
            logger.error(error_msg)
            return error_msg

    def get_current_directory(self) -> str:
        """
        Returns the current base directory path.

        Returns:
            str: Absolute path of the base directory
        """
        return str(self.base_dir.absolute())

    def list_directory_contents(self) -> str:
        """
        Lists the contents of the base directory.

        Returns:
            str: Directory listing
        """
        return self.run_shell_command("ls -la")

    def check_file_exists(self, filename: str) -> str:
        """
        Checks if a file exists in the base directory.

        Args:
            filename (str): Name of the file to check

        Returns:
            str: Result of the check
        """
        file_path = self.base_dir / filename
        if file_path.exists():
            return f"File '{filename}' exists in {self.base_dir}"
        else:
            return f"File '{filename}' does not exist in {self.base_dir}"
workflow/financial_workflow.py
@@ -1,360 +1,349 @@
[-] Removed: the previous Workflow 1.0 implementation (heavily truncated in this view).
    The deleted code comprised the old output models (an ExtractedFinancialData with
    company/document fields, FinancialCategory, ArrangedFinancialData, and a code-execution
    result model with execution_notes), class-level data_arranger and code_generator agents
    built on FileTools/ShellTools/PythonTools, session directories created directly under
    settings.TEMP_DIR, a monolithic run() that cached each step in session_state and rendered
    a markdown results summary, and a get_processing_status() helper.
    The replacement implementation follows.
"""
Financial Document Analysis Workflow - Agno Workflow 2.0 Implementation (Fixed)

This workflow processes financial documents through a multi-agent system using the new
step-based architecture introduced in Agno Workflow 2.0:
1. Data Extractor Agent: Extracts structured financial data
2. Data Arrangement Function: Organizes data into Excel-ready format
3. Code Generator Agent: Creates professional Excel reports

Built according to Agno Workflow 2.0 standards with simple sequential execution.
"""

import json
import os
import time
from pathlib import Path
from textwrap import dedent
from typing import Optional, Dict, Any

from agno.agent import Agent
from agno.models.google import Gemini
from agno.tools.file import FileTools
from agno.tools.shell import ShellTools
from agno.tools.python import PythonTools
from agno.workflow.v2.workflow import Workflow
from agno.workflow.v2.types import StepInput, StepOutput
from agno.workflow.v2.step import Step
from agno.storage.sqlite import SqliteStorage
from agno.utils.log import logger
from pydantic import BaseModel, Field

from config.settings import settings
from utils.prompt_loader import prompt_loader
from utils.shell_toolkit import RestrictedShellTools
from utils.restricted_python_tools import RestrictedPythonTools


class DataPoint(BaseModel):
    """Individual financial data point."""
    field_name: str = Field(description="Name of the financial data field")
    value: str = Field(description="Value of the field")
    category: str = Field(description="Financial category (revenue, expenses, assets, etc.)")
    period: str = Field(default="", description="Time period if applicable")
    unit: str = Field(default="", description="Currency or measurement unit")
    confidence: float = Field(default=0.9, description="Confidence score 0-1")


class Metadata(BaseModel):
    """Metadata for extracted financial data."""
    company_name: str = Field(default="Unknown Company", description="Company name")
    document_type: str = Field(default="Unknown", description="Type of financial document")
    reporting_period: str = Field(default="", description="Reporting period")
    currency: str = Field(default="", description="Primary currency used")


class ExtractedFinancialData(BaseModel):
    """Structured model for extracted financial data."""
    data_points: list[DataPoint] = Field(description="List of extracted financial data points")
    summary: str = Field(description="Summary of the extracted data")
    metadata: Metadata = Field(default_factory=Metadata, description="Additional metadata")


class FinancialDocumentWorkflow(Workflow):
    """
    Financial document analysis workflow using the Agno Workflow 2.0 step-based architecture.

    This workflow processes financial documents through three specialized steps:
    - Data extraction with structured outputs
    - Data arrangement for Excel compatibility
    - Excel report generation with formatting
    """

    def __init__(self, session_id: Optional[str] = None, **kwargs):
        """Initialize workflow with session management and step-based architecture."""
        # Initialize session directories first
        self._setup_session_directories(session_id)

        # Create storage with automatic schema upgrade
        storage = SqliteStorage(
            table_name="financial_workflows",
            db_file="tmp/agno_workflows.db",
            mode="workflow_v2",  # Use workflow_v2 mode
            auto_upgrade_schema=True  # Upgrade tables created by older schema versions
        )

        # Create agents for the workflow
        self.data_extractor = self._create_data_extractor()
        self.data_arranger = self._create_data_arranger()
        self.code_generator = self._create_code_generator()

        # Create steps using Step objects for better tracking
        data_extraction_step = Step(
            name="FinancialDataExtractor",
            agent=self.data_extractor,
            description="Expert financial data extraction specialist optimized for Gemini"
        )

        data_arrangement_step = Step(
            name="DataArrangement",
            executor=self._arrangement_function,
            description="User-defined callable step for data arrangement"
        )

        excel_generation_step = Step(
            name="ExcelReportGenerator",
            agent=self.code_generator,
            description="Excel report generator optimized for Gemini with cross-platform support"
        )

        # Initialize the Workflow 2.0 with step-based architecture
        super().__init__(
            name="FinancialDocumentWorkflow",
            description=dedent("""\
                Financial document analysis workflow using Agno Workflow 2.0 with step-based execution.
                Processes financial documents through extraction, arrangement, and Excel report generation.
                Uses session state for caching and proper error recovery mechanisms.
                """),
            steps=[
                data_extraction_step,
                data_arrangement_step,
                excel_generation_step
            ],
            session_id=session_id,
            storage=storage,
            debug_mode=True,
            **kwargs
        )

        logger.info(f"FinancialDocumentWorkflow v2.0 initialized with session: {self.session_id}")
        logger.info(f"Session directories: {list(self.session_directories.keys())}")

    def _setup_session_directories(self, session_id: Optional[str] = None):
        """Set up session-specific directories."""
        self.session_id = session_id
        self.session_directories = settings.create_session_directories(self.session_id)
        self.session_output_dir = self.session_directories["output"]
        self.session_input_dir = self.session_directories["input"]
        self.session_temp_dir = self.session_directories["temp"]
        self.session_cache_dir = self.session_directories["cache"]

    def _create_data_extractor(self) -> Agent:
        """Create the data extraction agent."""
        return Agent(
            model=Gemini(
                id=settings.DATA_EXTRACTOR_MODEL,
                thinking_budget=settings.DATA_EXTRACTOR_MODEL_THINKING_BUDGET,
                api_key=settings.GOOGLE_API_KEY
            ),
            name="FinancialDataExtractor",
            description="Expert financial data extraction specialist optimized for Gemini",
            instructions=prompt_loader.load_instructions_as_list("agents/data_extractor"),
            response_model=ExtractedFinancialData,
            structured_outputs=True,
            debug_mode=True,
            retries=10,
            delay_between_retries=10,
            exponential_backoff=True,
        )

    def _create_data_arranger(self) -> Agent:
        """Create the data arrangement agent."""
        logger.info(f"Data arranger base directory: {self.session_output_dir}")
        logger.info(f"Directory exists: {self.session_output_dir.exists()}")
        logger.info(f"Directory is writable: {os.access(self.session_output_dir, os.W_OK)}")
        return Agent(
            model=Gemini(
                id=settings.DATA_ARRANGER_MODEL,
                thinking_budget=settings.DATA_ARRANGER_MODEL_THINKING_BUDGET,
                api_key=settings.GOOGLE_API_KEY
            ),
            name="FinancialDataArranger",
            description="Financial data organization specialist optimized for Gemini",
            instructions=prompt_loader.load_instructions_as_list("agents/data_arranger"),
            tools=[
                RestrictedShellTools(base_dir=self.session_output_dir),
                FileTools(base_dir=self.session_output_dir, save_files=True, read_files=True, list_files=True),
            ],
            markdown=False,
            debug_mode=True,
            add_memory_references=True,
            add_session_summary_references=True,
            retries=10,
            delay_between_retries=10,
            exponential_backoff=True,
        )

    def _create_code_generator(self) -> Agent:
        """Create the code generation agent."""
        return Agent(
            model=Gemini(
                id=settings.CODE_GENERATOR_MODEL,
                thinking_budget=settings.CODE_GENERATOR_MODEL_THINKING_BUDGET,
                api_key=settings.GOOGLE_API_KEY
            ),
            name="ExcelReportGenerator",
            description="Excel report generator optimized for Gemini with cross-platform support",
            goal="Generate professional Excel reports from arranged financial data with multiple worksheets and formatting",
            instructions=prompt_loader.load_instructions_as_list("agents/code_generator"),
            expected_output="A professionally formatted Excel file with multiple worksheets, charts, and proper styling",
            additional_context=f"Working directory: {self.session_output_dir}. All files must be saved in this directory only.",
            tools=[
                RestrictedShellTools(base_dir=self.session_output_dir),
                RestrictedPythonTools(base_dir=self.session_output_dir),
                FileTools(base_dir=self.session_output_dir, save_files=True, read_files=True, list_files=True)
            ],
            markdown=False,
            show_tool_calls=True,
            debug_mode=True,
            add_datetime_to_instructions=True,
            retries=10,
            delay_between_retries=10,
            exponential_backoff=True,
        )

    def _arrangement_function(self, step_input: StepInput) -> StepOutput:
        """Custom function for the data arrangement step."""
        try:
            message = step_input.message  # available if the step needs the original message
            previous_step_content = step_input.previous_step_content

            logger.info("Starting data arrangement step")

            # Load the base arrangement prompt
            arrangement_prompt = prompt_loader.load_prompt("workflow/data_arrangement")

            # Combine the prompt with the extracted data from the previous step
            full_arrangement_prompt = f"{arrangement_prompt}\n\nHere is the extracted financial data to arrange:\n\n{previous_step_content}"

            # Run data arrangement using the agent
            response = self.data_arranger.run(full_arrangement_prompt)

            # Cache the arrangement results in workflow session state
            if hasattr(self, 'session_state') and self.session_state:
                cache_key = f"arrangement_{int(time.time())}"
                self.session_state[cache_key] = response.content
                logger.info(f"Cached arrangement results with key: {cache_key}")

            logger.info("Data arrangement completed successfully")

            return StepOutput(
                content=response.content,
                response=response,
                success=True
            )

        except Exception as e:
            logger.error(f"Data arrangement failed: {str(e)}")
            return StepOutput(
                content=f"Data arrangement failed: {str(e)}",
                success=False,
            )

    def run(self, file_path: str = None, **kwargs):
        """
        Main workflow execution using the Workflow 2.0 step-based architecture.

        Args:
            file_path: Path to the financial document to process
            **kwargs: Additional parameters

        Returns:
            Workflow execution result from the new step-based system
        """
        # Handle file_path from kwargs if not provided as positional
        if file_path is None:
            file_path = kwargs.get('file_path')

        if file_path is None:
            logger.error("file_path is required but not provided")
            raise ValueError("file_path is required but not provided")

        start_time = time.time()

        try:
            # Validate input file
            file_path = Path(file_path).resolve()
            if not file_path.exists():
                logger.error(f"File not found: {file_path}")
                raise FileNotFoundError(f"File not found: {file_path}")

            # Copy the input file to the session directory for reference
            input_file = self.session_input_dir / file_path.name
            input_file.write_bytes(file_path.read_bytes())

            logger.info(f"Starting financial document analysis for: {file_path.name}")

            # Create a File object for direct upload to the Gemini API (first step)
            from agno.media import File
            document = File(filepath=str(file_path))

            # Load the extraction prompt for the first step
            extraction_prompt = prompt_loader.load_prompt(
                "workflow/data_extraction",
                file_path=str(file_path),
                output_directory=str(self.session_output_dir)
            )

            # Execute the workflow using the new 2.0 step-based system;
            # the extraction prompt is the message and the document rides along
            result = super().run(
                message=extraction_prompt,
                files=[document],
                **kwargs
            )

            # Final status
            execution_time = time.time() - start_time
            status = self._get_workflow_status()

            logger.info(f"Workflow completed successfully in {execution_time:.2f} seconds")
            logger.info(f"Results: {status}")

            return result

        except Exception as e:
            logger.error(f"Workflow execution failed: {str(e)}")
            raise

    def _get_workflow_status(self) -> Dict[str, Any]:
        """Get current workflow status and file counts."""
        status = {
            "session_id": self.session_id,
            "output_directory": str(self.session_output_dir),
            "json_files": 0,
            "excel_files": 0,
            "data_points": 0
        }

        if self.session_output_dir.exists():
            status["json_files"] = len(list(self.session_output_dir.glob("*.json")))
            status["excel_files"] = len(list(self.session_output_dir.glob("*.xlsx")))

        return status


# Compatibility function to maintain the same interface as the original workflow
def create_financial_workflow(session_id: Optional[str] = None, **kwargs) -> FinancialDocumentWorkflow:
    """
    Create a new FinancialDocumentWorkflow instance using Workflow 2.0.

    Args:
        session_id: Optional session ID for tracking workflow execution
        **kwargs: Additional parameters for workflow configuration

    Returns:
        FinancialDocumentWorkflow: Configured workflow instance
    """
    return FinancialDocumentWorkflow(session_id=session_id, **kwargs)
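A minimal driver sketch for the Workflow 2.0 class above, via the compatibility helper; the session id and document path are illustrative.

# Driver sketch (session id and document path are hypothetical):
from workflow.financial_workflow import create_financial_workflow

workflow = create_financial_workflow(session_id="demo_session")
result = workflow.run(file_path="samples/annual_report.pdf")  # runs all three steps sequentially
print(result)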
workflow/financial_workflow_working.py
@@ -0,0 +1,357 @@
"""
Financial Document Analysis Workflow - Optimized for Gemini Models

This workflow processes financial documents through a multi-agent system:
1. Data Extractor: Extracts structured financial data
2. Data Arranger: Organizes data into Excel-ready format
3. Code Generator: Creates professional Excel reports

Built according to official Agno documentation standards.
"""

import json
import time
from pathlib import Path
from textwrap import dedent
from typing import Iterator, Optional, Dict, Any

from agno.agent import Agent, RunResponse
from agno.models.google import Gemini
from agno.tools.file import FileTools
from agno.tools.shell import ShellTools
from agno.tools.python import PythonTools
from agno.workflow import Workflow
from agno.utils.log import logger
from pydantic import BaseModel, Field

from config.settings import settings
from utils.prompt_loader import prompt_loader
from utils.shell_toolkit import RestrictedShellTools
from utils.restricted_python_tools import RestrictedPythonTools


class DataPoint(BaseModel):
    """Individual financial data point."""
    field_name: str = Field(description="Name of the financial data field")
    value: str = Field(description="Value of the field")
    category: str = Field(description="Financial category (revenue, expenses, assets, etc.)")
    period: str = Field(default="", description="Time period if applicable")
    unit: str = Field(default="", description="Currency or measurement unit")
    confidence: float = Field(default=0.9, description="Confidence score 0-1")


class Metadata(BaseModel):
    """Metadata for extracted financial data."""
    company_name: str = Field(default="Unknown Company", description="Company name")
    document_type: str = Field(default="Unknown", description="Type of financial document")
    reporting_period: str = Field(default="", description="Reporting period")
    currency: str = Field(default="", description="Primary currency used")


class ExtractedFinancialData(BaseModel):
    """Structured model for extracted financial data."""
    data_points: list[DataPoint] = Field(description="List of extracted financial data points")
    summary: str = Field(description="Summary of the extracted data")
    metadata: Metadata = Field(default_factory=Metadata, description="Additional metadata")


class FinancialDocumentWorkflow(Workflow):
    """
    Financial document analysis workflow optimized for Gemini models.

    This workflow processes financial documents through three specialized agents:
    - Data extraction with structured outputs
    - Data arrangement for Excel compatibility
    - Excel report generation with formatting
    """

    description: str = dedent("""\
        Financial document analysis workflow optimized for Gemini models with robust error handling.
        Processes financial documents through extraction, arrangement, and Excel report generation.
        Uses session state for caching and proper error recovery mechanisms.
        """)

    # Data Extractor Agent - Uses structured outputs for reliable data extraction
    data_extractor: Agent = Agent(
        model=Gemini(
            id=settings.DATA_EXTRACTOR_MODEL,
            thinking_budget=settings.DATA_EXTRACTOR_MODEL_THINKING_BUDGET,
            api_key=settings.GOOGLE_API_KEY
        ),
        name="FinancialDataExtractor",
        description="Expert financial data extraction specialist optimized for Gemini",
        instructions=prompt_loader.load_instructions_as_list("agents/data_extractor"),
        response_model=ExtractedFinancialData,
        structured_outputs=True,
        debug_mode=True,
        retries=10,
        delay_between_retries=10,
        exponential_backoff=True,
    )

    def __init__(self, session_id: Optional[str] = None, **kwargs):
        """Initialize workflow with session management."""
        super().__init__(session_id=session_id, **kwargs)

        # Initialize session directories
        self._setup_session_directories()

        # Initialize remaining agents with session-specific configurations
        self._initialize_session_agents()

        logger.info(f"FinancialDocumentWorkflow initialized with session: {self.session_id}")
        logger.info(f"Session directories: {list(self.session_directories.keys())}")

    def _setup_session_directories(self):
        """Set up session-specific directories."""
        self.session_directories = settings.create_session_directories(self.session_id)
        self.session_output_dir = self.session_directories["output"]
        self.session_input_dir = self.session_directories["input"]
        self.session_temp_dir = self.session_directories["temp"]
        self.session_cache_dir = self.session_directories["cache"]

    def _initialize_session_agents(self):
        """Initialize agents that need session-specific configuration."""

        # Data Arranger Agent - Organizes data with file operations
        self.data_arranger = Agent(
            model=Gemini(
                id=settings.DATA_ARRANGER_MODEL,
                thinking_budget=settings.DATA_ARRANGER_MODEL_THINKING_BUDGET,
                api_key=settings.GOOGLE_API_KEY
            ),
            name="FinancialDataArranger",
            description="Financial data organization specialist optimized for Gemini",
            instructions=prompt_loader.load_instructions_as_list("agents/data_arranger"),
            tools=[
                RestrictedShellTools(base_dir=self.session_output_dir),
                FileTools(base_dir=self.session_output_dir)
            ],
            markdown=False,
            debug_mode=True,
            add_memory_references=True,
            add_session_summary_references=True,
            retries=10,
            delay_between_retries=10,
            exponential_backoff=True,
            debug_level=2,
        )

        # Code Generator Agent - Creates Excel reports with comprehensive tools
        self.code_generator = Agent(
            model=Gemini(
                id=settings.CODE_GENERATOR_MODEL,
                thinking_budget=settings.CODE_GENERATOR_MODEL_THINKING_BUDGET,
                api_key=settings.GOOGLE_API_KEY
            ),
            name="ExcelReportGenerator",
            description="Excel report generator optimized for Gemini with cross-platform support",
            goal="Generate professional Excel reports from arranged financial data with multiple worksheets and formatting",
            instructions=prompt_loader.load_instructions_as_list("agents/code_generator"),
            expected_output="A professionally formatted Excel file with multiple worksheets, charts, and proper styling",
            additional_context=f"Working directory: {self.session_output_dir}. All files must be saved in this directory only.",
            tools=[
                RestrictedShellTools(base_dir=self.session_output_dir),
                RestrictedPythonTools(base_dir=self.session_output_dir),
                FileTools(base_dir=self.session_output_dir, save_files=True, read_files=True, list_files=True)
            ],
            markdown=False,
            show_tool_calls=True,
            debug_mode=True,
            add_datetime_to_instructions=True,
            retries=10,
            delay_between_retries=10,
            exponential_backoff=True,
        )

        logger.info("All agents initialized with Gemini models and proper tool configuration")

    def run(self, file_path: str = None, **kwargs) -> Iterator[RunResponse]:
        """
        Main workflow execution following official Agno documentation patterns.

        Args:
            file_path: Path to the financial document to process
            **kwargs: Additional parameters

        Yields:
            RunResponse: Streaming responses from the workflow execution
        """
        # Handle file_path from kwargs if not provided as positional
        if file_path is None:
            file_path = kwargs.get('file_path')

        if file_path is None:
            yield RunResponse(
                run_id=self.run_id,
                content="❌ Error: file_path is required but not provided"
            )
            return

        start_time = time.time()

        try:
            # Validate input file
            file_path = Path(file_path).resolve()
            if not file_path.exists():
                yield RunResponse(
                    run_id=self.run_id,
                    content=f"❌ Error: File not found: {file_path}"
                )
                return

            # Copy the input file to the session directory for reference
            input_file = self.session_input_dir / file_path.name
            input_file.write_bytes(file_path.read_bytes())

            yield RunResponse(
                run_id=self.run_id,
                content=f"🚀 Starting financial document analysis for: {file_path.name}"
            )

            # Step 1: Data Extraction
            yield RunResponse(
                run_id=self.run_id,
                content="📊 Step 1: Extracting financial data..."
            )

            # Check the cache first
            cache_key = f"extraction_{file_path.name}_{file_path.stat().st_mtime}"
            if cache_key in self.session_state:
                logger.info("Using cached extraction results")
                extracted_data = self.session_state[cache_key]
                yield RunResponse(
                    run_id=self.run_id,
                    content="✅ Using cached extraction results"
                )
            else:
                # Create a File object for direct upload to the Gemini API
                from agno.media import File
                document = File(filepath=str(file_path))

                # Load the extraction prompt
                extraction_prompt = prompt_loader.load_prompt(
                    "workflow/data_extraction",
                    file_path=str(file_path),
                    output_directory=str(self.session_output_dir)
                )

                # Run data extraction with the file upload
                extraction_response = self.data_extractor.run(extraction_prompt, files=[document])
                extracted_data = extraction_response.content

                # Debug: log the type of extracted_data
                logger.info(f"DEBUG: extracted_data type: {type(extracted_data)}")
                logger.info(f"DEBUG: extracted_data has model_dump_json: {hasattr(extracted_data, 'model_dump_json')}")

                # Cache the results
                self.session_state[cache_key] = extracted_data

                yield RunResponse(
                    run_id=self.run_id,
                    content=f"✅ Extracted {len(extracted_data.data_points) if hasattr(extracted_data, 'data_points') else 'N/A'} data points"
                )

            # Step 2: Data Arrangement
            yield RunResponse(
                run_id=self.run_id,
                content="📊 Step 2: Arranging data for Excel..."
            )

            # Load the base arrangement prompt (without placeholders)
            arrangement_prompt = prompt_loader.load_prompt("workflow/data_arrangement")

            # Serialize the extracted data
            try:
                if hasattr(extracted_data, 'model_dump_json'):
                    extracted_data_json = extracted_data.model_dump_json(indent=2)
                elif hasattr(extracted_data, 'model_dump'):
                    extracted_data_json = json.dumps(extracted_data.model_dump(), indent=2)
                else:
                    extracted_data_json = json.dumps(str(extracted_data), indent=2)

                logger.info(f"DEBUG: Successfully serialized extracted_data ({len(extracted_data_json)} chars)")

            except Exception as e:
                logger.error(f"DEBUG: Failed to serialize extracted_data: {e}")
                extracted_data_json = json.dumps({"error": "Failed to serialize extracted data", "data": str(extracted_data)}, indent=2)

            # Pass both the prompt and the data directly to the agent
            full_arrangement_prompt = f"{arrangement_prompt}\n\nHere is the extracted financial data to arrange:\n\n{extracted_data_json}"

            # Run data arrangement
            arrangement_response = self.data_arranger.run(full_arrangement_prompt)
            yield RunResponse(
                run_id=self.run_id,
                content="✅ Data arrangement completed"
            )

            yield RunResponse(
                run_id=self.run_id,
                content="✅ Data arranged and saved to JSON"
            )

            # Step 3: Excel Report Generation
            yield RunResponse(
                run_id=self.run_id,
                content="📊 Step 3: Generating Excel report..."
            )

            # Prepare the code generation prompt
            code_generation_prompt = prompt_loader.load_prompt(
                "workflow/code_generation",
                session_directory=str(self.session_output_dir)
            )

            # Run code generation
            code_generation_response = self.code_generator.run(code_generation_prompt)
            yield RunResponse(
                run_id=self.run_id,
                content="✅ Excel report generation completed"
            )

            # Final status
            execution_time = time.time() - start_time
            status = self._get_workflow_status()

            yield RunResponse(
                run_id=self.run_id,
                content=f"""
✅ Workflow completed successfully in {execution_time:.2f} seconds

📊 Results Summary:
- Data points extracted: {status.get('data_points', 'N/A')}
- JSON files created: {status.get('json_files', 0)}
- Excel files created: {status.get('excel_files', 0)}
- Session directory: {self.session_output_dir}

🎯 All files saved to: {self.session_output_dir}
""".strip()
            )

        except Exception as e:
            logger.error(f"Workflow execution failed: {str(e)}")
            yield RunResponse(
                run_id=self.run_id,
                content=f"❌ Workflow failed: {str(e)}"
            )

    def _get_workflow_status(self) -> Dict[str, Any]:
        """Get current workflow status and file counts."""
        status = {
            "session_id": self.session_id,
            "output_directory": str(self.session_output_dir),
            "json_files": 0,
            "excel_files": 0,
            "data_points": 0
        }

        if self.session_output_dir.exists():
            status["json_files"] = len(list(self.session_output_dir.glob("*.json")))
            status["excel_files"] = len(list(self.session_output_dir.glob("*.xlsx")))

        return status
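A minimal sketch of consuming the streaming run() above; the session id and document path are illustrative.

# Consumption sketch (session id and document path are hypothetical):
from workflow.financial_workflow_working import FinancialDocumentWorkflow

wf = FinancialDocumentWorkflow(session_id="demo_session")
for response in wf.run(file_path="samples/annual_report.pdf"):
    print(response.content)  # progress updates, then the final summary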