Spaces:
Running
Running
Commit
·
8675ade
1
Parent(s):
7d927f4
pushing api
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .env.template +1 -0
- .gitignore +8 -0
- Dockerfile +16 -0
- README.md +111 -10
- config.yaml +13 -0
- main.py +47 -0
- prepare_kb.ipynb +118 -0
- requirements.txt +201 -0
- src/api/__init__.py +5 -0
- src/api/v1/__init__.py +9 -0
- src/api/v1/build_ml_plan/__init__.py +7 -0
- src/api/v1/build_ml_plan/eda.py +25 -0
- src/api/v1/build_ml_plan/task_analysis.py +54 -0
- src/api/v1/eda_engine/__init__.py +12 -0
- src/api/v1/eda_engine/data_quality.py +58 -0
- src/api/v1/eda_engine/data_statistics.py +61 -0
- src/api/v1/eda_engine/data_understanding.py +61 -0
- src/api/v1/eda_engine/univariate_analysis.py +59 -0
- src/app/pipelines/eda/agents/agents.py +65 -0
- src/app/pipelines/eda/agents/models.py +77 -0
- src/app/pipelines/eda/agents/prompts.py +333 -0
- src/app/pipelines/eda/helper.py +89 -0
- src/app/pipelines/eda/model.py +78 -0
- src/app/pipelines/eda/pipeline.py +256 -0
- src/app/pipelines/eda/tools/analysis_tools/__init__.py +3 -0
- src/app/pipelines/eda/tools/analysis_tools/bivariate_analysis.py +1028 -0
- src/app/pipelines/eda/tools/analysis_tools/multivariate_analysis.py +1039 -0
- src/app/pipelines/eda/tools/analysis_tools/univariate_analysis.py +517 -0
- src/app/pipelines/eda/tools/data_cleaning_tools/__init__.py +2 -0
- src/app/pipelines/eda/tools/data_cleaning_tools/handle_missing_values.py +64 -0
- src/app/pipelines/eda/tools/data_cleaning_tools/handle_outliers.py +83 -0
- src/app/pipelines/eda/tools/lib.py +59 -0
- src/app/pipelines/modules/__init__.py +4 -0
- src/app/pipelines/modules/data_quality_assessment.py +1657 -0
- src/app/pipelines/modules/data_statistics.py +1270 -0
- src/app/pipelines/modules/data_understanding_context.py +332 -0
- src/app/pipelines/modules/univariate_analysis.py +1437 -0
- src/app/pipelines/task_analysis/__init__.py +2 -0
- src/app/pipelines/task_analysis/ml_analysis_workflow.py +202 -0
- src/app/pipelines/task_analysis/ml_implementation_planner_workflow.py +132 -0
- src/app/pipelines/task_analysis/model.py +162 -0
- src/app/schemas/requests/__init__.py +1 -0
- src/app/schemas/requests/eda.py +25 -0
- src/app/schemas/requests/task_analysis.py +12 -0
- src/app/schemas/responses/eda.py +31 -0
- src/core/cache/redis_cache.py +33 -0
- src/core/server.py +23 -0
- src/core/utils/__init__.py +3 -0
- src/core/utils/knowledge_base.py +81 -0
- src/core/utils/logger.py +46 -0
.env.template
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
OPENAI_API_KEY=
|
.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
*.lock
|
3 |
+
.env
|
4 |
+
dev
|
5 |
+
*.log
|
6 |
+
*.csv
|
7 |
+
raw/
|
8 |
+
vector/
|
Dockerfile
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.11.4
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
COPY . /app/
|
6 |
+
|
7 |
+
RUN apt-get update && \
|
8 |
+
apt-get install -y \
|
9 |
+
redis-server \
|
10 |
+
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
11 |
+
|
12 |
+
RUN pip install --upgrade pip && \
|
13 |
+
pip install uv && \
|
14 |
+
uv pip install --system -r requirements.txt
|
15 |
+
|
16 |
+
CMD ["python", "-m", "main"]
|
README.md
CHANGED
@@ -1,10 +1,111 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Franky EDA Engine
|
2 |
+
|
3 |
+
## Project Structure
|
4 |
+
|
5 |
+
```plaintext
|
6 |
+
FRANKY-API/
|
7 |
+
├── knowledge_base/
|
8 |
+
│ ├── raw/
|
9 |
+
│ └── vector/
|
10 |
+
├── src/
|
11 |
+
│ ├── api/
|
12 |
+
│ │ ├── v1/
|
13 |
+
│ │ │ ├── build_ml_plan/
|
14 |
+
│ │ │ │ ├── __init__.py
|
15 |
+
│ │ │ │ ├── eda.py
|
16 |
+
│ │ │ │ └── task_analysis.py
|
17 |
+
│ │ │ ├── eda_engine/
|
18 |
+
│ │ │ │ ├── __init__.py
|
19 |
+
│ │ │ │ ├── data_quality.py
|
20 |
+
│ │ │ │ ├── data_statistics.py
|
21 |
+
│ │ │ │ ├── data_understanding.py
|
22 |
+
│ │ │ │ └── univariate_analysis.py
|
23 |
+
│ │ │ └── __init__.py
|
24 |
+
│ │ └── __init__.py
|
25 |
+
│ ├── app/
|
26 |
+
│ │ ├── pipelines/
|
27 |
+
│ │ │ ├── eda/
|
28 |
+
│   │   │   │   ├── agents/
|
29 |
+
│ │ │ │ │ ├── agents.py
|
30 |
+
│ │ │ │ │ ├── models.py
|
31 |
+
│ │ │ │ │ └── prompts.py
|
32 |
+
│ │ │ │ ├── tools/
|
33 |
+
│ │ │ │ │ ├── analysis_tools/
|
34 |
+
│ │ │ │ │ │ ├── __init__.py
|
35 |
+
│ │ │ │ │ │ ├── bivariate_analysis.py
|
36 |
+
│ │ │ │ │ │ ├── multivariate_analysis.py
|
37 |
+
│ │ │ │ │ │ └── univariate_analysis.py
|
38 |
+
│ │ │ │ │ ├── data_cleaning_tools/
|
39 |
+
│ │ │ │ │ │ ├── __init__.py
|
40 |
+
│ │ │ │ │ │ ├── handle_missing_values.py
|
41 |
+
│ │ │ │ │ │ └── handle_outliers.py
|
42 |
+
│ │ │ │ │ ├── transformations_tools/
|
43 |
+
│ │ │ │ │ └── lib.py
|
44 |
+
│ │ │ │ ├── helper.py
|
45 |
+
│ │ │ │ ├── model.py
|
46 |
+
│ │ │ │ └── pipeline.py
|
47 |
+
│ │ │ ├── modules/
|
48 |
+
│ │ │ │ ├── __init__.py
|
49 |
+
│ │ │ │ ├── data_quality_assessment.py
|
50 |
+
│ │ │ │ ├── data_statistics.py
|
51 |
+
│ │ │ │ ├── data_understanding_context.py
|
52 |
+
│ │ │ │ └── univariate_analysis.py
|
53 |
+
│ │ │ └── task_analysis/
|
54 |
+
│ │ │ ├── __init__.py
|
55 |
+
│ │ │ ├── ml_analysis_workflow.py
|
56 |
+
│ │ │ ├── ml_implementation_planner_workflow.py
|
57 |
+
│ │ │ └── model.py
|
58 |
+
│ │ └── schemas/
|
59 |
+
│ │ ├── requests/
|
60 |
+
│ │ │ ├── __init__.py
|
61 |
+
│ │ │ ├── eda.py
|
62 |
+
│ │ │ └── task_analysis.py
|
63 |
+
│ │ └── responses/
|
64 |
+
│ │ └── eda.py
|
65 |
+
│ └── core/
|
66 |
+
│ ├── cache/
|
67 |
+
│ │ ├── code_generated/
|
68 |
+
│ │ ├── dataset_logs/
|
69 |
+
│ │ ├── downloads/
|
70 |
+
│ │ └── redis_cache.py
|
71 |
+
│ ├── database/
|
72 |
+
│ ├── logs/
|
73 |
+
│ ├── utils/
|
74 |
+
│ │ ├── __init__.py
|
75 |
+
│ │ ├── knowledge_base.py
|
76 |
+
│ │ ├── logger.py
|
77 |
+
│ │ ├── prompts.py
|
78 |
+
│ │ └── read_config.py
|
79 |
+
│ └── server.py
|
80 |
+
├── .env.template
|
81 |
+
├── .gitignore
|
82 |
+
├── config.yaml
|
83 |
+
├── Dockerfile
|
84 |
+
├── main.py
|
85 |
+
├── prepare_kb.ipynb
|
86 |
+
├── README.md
|
87 |
+
└── requirements.txt
|
88 |
+
```
|
89 |
+
|
90 |
+
## Getting Started
|
91 |
+
|
92 |
+
### 1. Clone the Repository
|
93 |
+
|
94 |
+
### 2. Running the Docker Container
|
95 |
+
|
96 |
+
To build the docker image, run the following cmd:
|
97 |
+
|
98 |
+
```bash
|
99 |
+
docker build -t franky-api .
|
100 |
+
```
|
101 |
+
|
102 |
+
To run the docker container, run the following cmd:
|
103 |
+
|
104 |
+
```bash
|
105 |
+
docker run -d -p 8000:8000 --name franky-api-container franky-api
|
106 |
+
```
|
107 |
+
|
108 |
+
You can access the api at:
|
109 |
+
|
110 |
+
[http://localhost:8000/docs](http://localhost:8000/docs)
|
111 |
+
|
config.yaml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
app:
|
2 |
+
verbose: True
|
3 |
+
|
4 |
+
server:
|
5 |
+
host: "127.0.0.1"
|
6 |
+
port: 8000
|
7 |
+
reload: True
|
8 |
+
workers: 1
|
9 |
+
|
10 |
+
redis_server:
|
11 |
+
host: "127.0.0.1"
|
12 |
+
port: 6379
|
13 |
+
db: 0
|
main.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Application entry point: boots a local Redis server, then serves the API."""
import uvicorn
import subprocess
import shutil
import time
import asyncio
from src.core.utils import logger, read_config

def start_redis_server(redis_config: dict):
    """Launch a local redis-server subprocess.

    Parameters
    ----------
    redis_config : dict
        Expects 'host' and 'port' keys (from config.yaml's `redis_server`).

    Returns
    -------
    subprocess.Popen
        Handle for the running redis-server process.

    Raises
    ------
    RuntimeError
        If redis-server is not on PATH, or if it exits during startup.
    """
    redis_path = shutil.which("redis-server")
    if not redis_path:
        raise RuntimeError("redis-server is not installed or not in PATH")

    process = subprocess.Popen(
        [redis_path, "--port", str(redis_config['port']), "--bind", redis_config['host']],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )

    # Give the server a moment to bind, then verify it did not die on startup
    # (e.g. port already in use) instead of logging success unconditionally.
    time.sleep(1)
    if process.poll() is not None:
        raise RuntimeError(
            f"redis-server exited immediately with code {process.returncode}"
        )
    logger.info(
        f"Redis server started successfully on {redis_config['host']}:{redis_config['port']}",
        log_type="server",
        console=True
    )
    return process

def initialize_config() -> dict:
    """Load and return the application configuration from config.yaml."""
    return read_config(config_path="config.yaml")

async def main():
    """Start Redis, run the uvicorn server, and shut Redis down on exit.

    NOTE: uvicorn.run() is a blocking, synchronous call; the coroutine
    wrapper is kept only to preserve the existing `asyncio.run(main())`
    entry point. uvicorn ignores `workers` when `reload=True`.
    """
    config = initialize_config()
    redis_process = start_redis_server(redis_config=config['redis_server'])

    try:
        uvicorn.run(
            app="src.core.server:app",
            host=config['server']['host'],
            port=config['server']['port'],
            reload=config['server']['reload'],
            workers=config['server']['workers']
        )
    finally:
        logger.info("Shutting down Redis server...", log_type="server", console=config['app']['verbose'])
        redis_process.terminate()
        # Reap the child so it does not linger as a zombie; force-kill if it
        # ignores SIGTERM.
        try:
            redis_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            redis_process.kill()

if __name__ == "__main__":
    asyncio.run(main())
|
prepare_kb.ipynb
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "f5a0d75d",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"import chromadb\n",
|
11 |
+
"from llama_index.core import StorageContext\n",
|
12 |
+
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
|
13 |
+
"# from llama_index.embeddings.fastembed import FastEmbedEmbedding\n",
|
14 |
+
"from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
|
15 |
+
"from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex\n",
|
16 |
+
"\n",
|
17 |
+
"# embed_model = FastEmbedEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
|
18 |
+
"data_dir = r\"knowledge_base\\raw\\classification\"\n",
|
19 |
+
"\n",
|
20 |
+
"documents = SimpleDirectoryReader(str(data_dir)).load_data()\n",
|
21 |
+
"data_path = r\"knowledge_base\\vector\\classification\"\n",
|
22 |
+
"db = chromadb.PersistentClient(path=data_path)"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "markdown",
|
27 |
+
"id": "b52b6ba8",
|
28 |
+
"metadata": {},
|
29 |
+
"source": [
|
30 |
+
"### Storing the data locally"
|
31 |
+
]
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"cell_type": "code",
|
35 |
+
"execution_count": null,
|
36 |
+
"id": "348df588",
|
37 |
+
"metadata": {},
|
38 |
+
"outputs": [],
|
39 |
+
"source": [
|
40 |
+
"chroma_collection = db.get_or_create_collection(\"classification_db\")\n",
|
41 |
+
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
|
42 |
+
"storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
|
43 |
+
"index = VectorStoreIndex.from_documents(\n",
|
44 |
+
" documents=documents,\n",
|
45 |
+
" storage_context=storage_context,\n",
|
46 |
+
" show_progress=True,\n",
|
47 |
+
" # embed_model=embed_model\n",
|
48 |
+
")"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "markdown",
|
53 |
+
"id": "f7411c03",
|
54 |
+
"metadata": {},
|
55 |
+
"source": [
|
56 |
+
"### Loading the locally stored vector index"
|
57 |
+
]
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"cell_type": "code",
|
61 |
+
"execution_count": 6,
|
62 |
+
"id": "4d9cbd1b",
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [],
|
65 |
+
"source": [
|
66 |
+
"import chromadb\n",
|
67 |
+
"from llama_index.core import StorageContext\n",
|
68 |
+
"from llama_index.core import VectorStoreIndex\n",
|
69 |
+
"from llama_index.core.retrievers import VectorIndexRetriever\n",
|
70 |
+
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
|
71 |
+
"# from llama_index.embeddings.fastembed import FastEmbedEmbedding\n",
|
72 |
+
"\n",
|
73 |
+
"# embed_model = FastEmbedEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
|
74 |
+
"\n",
|
75 |
+
"data_path = r\"knowledge_base\\vector\\classification\"\n",
|
76 |
+
"db = chromadb.PersistentClient(path=data_path)\n",
|
77 |
+
"chroma_collection = db.get_or_create_collection(\"classification_db\")\n",
|
78 |
+
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
|
79 |
+
"storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
|
80 |
+
"\n",
|
81 |
+
"index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)\n",
|
82 |
+
"retriever = VectorIndexRetriever(\n",
|
83 |
+
" index, \n",
|
84 |
+
" # embed_model=embed_model\n",
|
85 |
+
")"
|
86 |
+
]
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"cell_type": "code",
|
90 |
+
"execution_count": null,
|
91 |
+
"id": "05804310",
|
92 |
+
"metadata": {},
|
93 |
+
"outputs": [],
|
94 |
+
"source": []
|
95 |
+
}
|
96 |
+
],
|
97 |
+
"metadata": {
|
98 |
+
"kernelspec": {
|
99 |
+
"display_name": "dev",
|
100 |
+
"language": "python",
|
101 |
+
"name": "python3"
|
102 |
+
},
|
103 |
+
"language_info": {
|
104 |
+
"codemirror_mode": {
|
105 |
+
"name": "ipython",
|
106 |
+
"version": 3
|
107 |
+
},
|
108 |
+
"file_extension": ".py",
|
109 |
+
"mimetype": "text/x-python",
|
110 |
+
"name": "python",
|
111 |
+
"nbconvert_exporter": "python",
|
112 |
+
"pygments_lexer": "ipython3",
|
113 |
+
"version": "3.11.4"
|
114 |
+
}
|
115 |
+
},
|
116 |
+
"nbformat": 4,
|
117 |
+
"nbformat_minor": 5
|
118 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
agno==1.1.3
|
2 |
+
aiohappyeyeballs==2.6.1
|
3 |
+
aiohttp==3.11.18
|
4 |
+
aiosignal==1.3.2
|
5 |
+
annotated-types==0.7.0
|
6 |
+
anyio==4.9.0
|
7 |
+
appnope==0.1.4
|
8 |
+
asgiref==3.8.1
|
9 |
+
asttokens==3.0.0
|
10 |
+
attrs==25.3.0
|
11 |
+
backoff==2.2.1
|
12 |
+
banks==2.1.2
|
13 |
+
bcrypt==4.3.0
|
14 |
+
build==1.2.2.post1
|
15 |
+
cachetools==5.5.2
|
16 |
+
certifi==2025.1.31
|
17 |
+
charset-normalizer==3.4.1
|
18 |
+
chromadb==1.0.9
|
19 |
+
click==8.1.8
|
20 |
+
colorama==0.4.6
|
21 |
+
coloredlogs==15.0.1
|
22 |
+
comm==0.2.2
|
23 |
+
contourpy==1.3.1
|
24 |
+
cycler==0.12.1
|
25 |
+
dataclasses-json==0.6.7
|
26 |
+
debugpy==1.8.14
|
27 |
+
decorator==5.2.1
|
28 |
+
Deprecated==1.2.18
|
29 |
+
dirtyjson==1.0.8
|
30 |
+
distro==1.9.0
|
31 |
+
dnspython==2.7.0
|
32 |
+
docling-core==2.28.1
|
33 |
+
docstring_parser==0.16
|
34 |
+
duckduckgo_search==8.0.0
|
35 |
+
durationpy==0.9
|
36 |
+
email_validator==2.2.0
|
37 |
+
executing==2.2.0
|
38 |
+
faiss-cpu==1.11.0
|
39 |
+
fastapi==0.115.9
|
40 |
+
fastapi-cli==0.0.7
|
41 |
+
fastembed==0.6.1
|
42 |
+
filelock==3.18.0
|
43 |
+
filetype==1.2.0
|
44 |
+
flatbuffers==25.2.10
|
45 |
+
fonttools==4.57.0
|
46 |
+
frozenlist==1.6.0
|
47 |
+
fsspec==2025.3.2
|
48 |
+
gitdb==4.0.12
|
49 |
+
GitPython==3.1.44
|
50 |
+
google-auth==2.40.1
|
51 |
+
googleapis-common-protos==1.70.0
|
52 |
+
greenlet==3.2.1
|
53 |
+
griffe==1.7.3
|
54 |
+
groq==0.23.1
|
55 |
+
grpcio==1.71.0
|
56 |
+
h11==0.14.0
|
57 |
+
httpcore==1.0.8
|
58 |
+
httptools==0.6.4
|
59 |
+
httpx==0.28.1
|
60 |
+
huggingface-hub==0.30.2
|
61 |
+
humanfriendly==10.0
|
62 |
+
idna==3.10
|
63 |
+
importlib_metadata==8.6.1
|
64 |
+
importlib_resources==6.5.2
|
65 |
+
ipykernel==6.29.5
|
66 |
+
ipython==9.1.0
|
67 |
+
ipython_pygments_lexers==1.1.1
|
68 |
+
jedi==0.19.2
|
69 |
+
Jinja2==3.1.6
|
70 |
+
jiter==0.9.0
|
71 |
+
joblib==1.4.2
|
72 |
+
jsonref==1.1.0
|
73 |
+
jsonschema==4.23.0
|
74 |
+
jsonschema-specifications==2025.4.1
|
75 |
+
jupyter_client==8.6.3
|
76 |
+
jupyter_core==5.7.2
|
77 |
+
kiwisolver==1.4.8
|
78 |
+
kubernetes==32.0.1
|
79 |
+
latex2mathml==3.77.0
|
80 |
+
llama-index-core==0.12.34.post1
|
81 |
+
llama-index-embeddings-fastembed==0.3.1
|
82 |
+
llama-index-embeddings-openai==0.3.1
|
83 |
+
llama-index-vector-stores-chroma==0.4.1
|
84 |
+
llama-index-vector-stores-faiss==0.4.0
|
85 |
+
lmoments3==1.0.8
|
86 |
+
loguru==0.7.3
|
87 |
+
lxml==5.3.2
|
88 |
+
markdown-it-py==3.0.0
|
89 |
+
MarkupSafe==3.0.2
|
90 |
+
marshmallow==3.26.1
|
91 |
+
matplotlib==3.10.1
|
92 |
+
matplotlib-inline==0.1.7
|
93 |
+
mdurl==0.1.2
|
94 |
+
missingno==0.5.2
|
95 |
+
mmh3==5.1.0
|
96 |
+
mpmath==1.3.0
|
97 |
+
multidict==6.4.3
|
98 |
+
mypy_extensions==1.1.0
|
99 |
+
nest-asyncio==1.6.0
|
100 |
+
networkx==3.4.2
|
101 |
+
nltk==3.9.1
|
102 |
+
numpy==2.2.4
|
103 |
+
oauthlib==3.2.2
|
104 |
+
onnxruntime==1.21.1
|
105 |
+
openai==1.72.0
|
106 |
+
opentelemetry-api==1.33.1
|
107 |
+
opentelemetry-exporter-otlp-proto-common==1.33.1
|
108 |
+
opentelemetry-exporter-otlp-proto-grpc==1.33.1
|
109 |
+
opentelemetry-instrumentation==0.54b1
|
110 |
+
opentelemetry-instrumentation-asgi==0.54b1
|
111 |
+
opentelemetry-instrumentation-fastapi==0.54b1
|
112 |
+
opentelemetry-proto==1.33.1
|
113 |
+
opentelemetry-sdk==1.33.1
|
114 |
+
opentelemetry-semantic-conventions==0.54b1
|
115 |
+
opentelemetry-util-http==0.54b1
|
116 |
+
orjson==3.10.18
|
117 |
+
overrides==7.7.0
|
118 |
+
packaging==24.2
|
119 |
+
pandas==2.2.3
|
120 |
+
parso==0.8.4
|
121 |
+
patsy==1.0.1
|
122 |
+
pexpect==4.9.0
|
123 |
+
pillow==11.1.0
|
124 |
+
platformdirs==4.3.7
|
125 |
+
posthog==4.0.1
|
126 |
+
primp==0.14.0
|
127 |
+
prompt_toolkit==3.0.50
|
128 |
+
propcache==0.3.1
|
129 |
+
protobuf==5.29.4
|
130 |
+
psutil==7.0.0
|
131 |
+
ptyprocess==0.7.0
|
132 |
+
pure_eval==0.2.3
|
133 |
+
py_rust_stemmers==0.1.5
|
134 |
+
pyasn1==0.6.1
|
135 |
+
pyasn1_modules==0.4.2
|
136 |
+
pydantic==2.11.3
|
137 |
+
pydantic-settings==2.8.1
|
138 |
+
pydantic_core==2.33.1
|
139 |
+
Pygments==2.19.1
|
140 |
+
pyparsing==3.2.3
|
141 |
+
PyPika==0.48.9
|
142 |
+
pyproject_hooks==1.2.0
|
143 |
+
pyreadline3==3.5.4
|
144 |
+
python-dateutil==2.9.0.post0
|
145 |
+
python-dotenv==1.1.0
|
146 |
+
python-multipart==0.0.20
|
147 |
+
pytz==2025.2
|
148 |
+
pywin32==310
|
149 |
+
PyYAML==6.0.2
|
150 |
+
pyzmq==26.4.0
|
151 |
+
RapidFuzz==3.13.0
|
152 |
+
redis==5.2.1
|
153 |
+
redis-cli==1.0.1
|
154 |
+
referencing==0.36.2
|
155 |
+
regex==2024.11.6
|
156 |
+
requests==2.32.3
|
157 |
+
requests-oauthlib==2.0.0
|
158 |
+
rich==14.0.0
|
159 |
+
rich-toolkit==0.14.1
|
160 |
+
rpds-py==0.24.0
|
161 |
+
rsa==4.9.1
|
162 |
+
safetensors==0.5.3
|
163 |
+
scikit-learn==1.6.1
|
164 |
+
scipy==1.15.2
|
165 |
+
seaborn==0.13.2
|
166 |
+
shellingham==1.5.4
|
167 |
+
six==1.17.0
|
168 |
+
smmap==5.0.2
|
169 |
+
sniffio==1.3.1
|
170 |
+
SQLAlchemy==2.0.40
|
171 |
+
stack-data==0.6.3
|
172 |
+
starlette==0.45.3
|
173 |
+
statsmodels==0.14.4
|
174 |
+
sympy==1.13.3
|
175 |
+
tabulate==0.9.0
|
176 |
+
tenacity==9.1.2
|
177 |
+
threadpoolctl==3.6.0
|
178 |
+
tiktoken==0.9.0
|
179 |
+
tokenizers==0.21.1
|
180 |
+
tomli==2.2.1
|
181 |
+
torch==2.7.0
|
182 |
+
tornado==6.4.2
|
183 |
+
tqdm==4.67.1
|
184 |
+
traitlets==5.14.3
|
185 |
+
transformers==4.51.3
|
186 |
+
typer==0.15.2
|
187 |
+
typing-inspect==0.9.0
|
188 |
+
typing-inspection==0.4.0
|
189 |
+
typing_extensions==4.13.2
|
190 |
+
tzdata==2025.2
|
191 |
+
urllib3==2.4.0
|
192 |
+
uv==0.6.14
|
193 |
+
uvicorn==0.34.0
|
194 |
+
watchfiles==1.0.5
|
195 |
+
wcwidth==0.2.13
|
196 |
+
websocket-client==1.8.0
|
197 |
+
websockets==15.0.1
|
198 |
+
win32_setctime==1.2.0
|
199 |
+
wrapt==1.17.2
|
200 |
+
yarl==1.20.0
|
201 |
+
zipp==3.21.0
|
src/api/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter
|
2 |
+
from .v1 import v1_router
|
3 |
+
|
4 |
+
router = APIRouter()
|
5 |
+
router.include_router(v1_router, prefix='/v1')
|
src/api/v1/__init__.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter
|
2 |
+
from .build_ml_plan import build_ml_plan_router
|
3 |
+
from .eda_engine import eda_engine_router
|
4 |
+
|
5 |
+
v1_router = APIRouter()
|
6 |
+
|
7 |
+
v1_router.include_router(build_ml_plan_router, prefix='/build-ml-plan')
|
8 |
+
v1_router.include_router(eda_engine_router, prefix='/eda-engine')
|
9 |
+
|
src/api/v1/build_ml_plan/__init__.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter
|
2 |
+
from .task_analysis import analysis_router
|
3 |
+
from .eda import eda_router
|
4 |
+
|
5 |
+
build_ml_plan_router = APIRouter()
|
6 |
+
build_ml_plan_router.include_router(analysis_router, prefix="/analyze-task")
|
7 |
+
# build_ml_plan_router.include_router(eda_router, prefix="/run-eda")
|
src/api/v1/build_ml_plan/eda.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter
|
2 |
+
from src.app.schemas.requests.eda import EdaRequestSchema
|
3 |
+
from src.app.pipelines.eda.pipeline import EdaLoop
|
4 |
+
import os
|
5 |
+
|
6 |
+
eda_router = APIRouter()
|
7 |
+
|
8 |
+
def delete_dir_contents(directory: str) -> None:
    """Remove every regular file directly inside *directory*.

    Subdirectories and their contents are left untouched. A missing
    directory is treated as a no-op instead of raising FileNotFoundError
    (the cache folders may not exist on a fresh checkout).
    """
    if not os.path.isdir(directory):
        return
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)
14 |
+
@eda_router.post('/')
async def main(response: EdaRequestSchema):
    """Run the full EDA loop for the supplied request payload.

    Executes EdaLoop over the payload, then clears the per-run cache
    directories (dataset logs and downloaded files) before returning the
    accumulated execution logs.
    """
    execution_logs = EdaLoop(payload=response, verbose=True).loop(verbose=True)

    for cache_dir in ("src/core/cache/dataset_logs", "src/core/cache/downloads"):
        delete_dir_contents(directory=cache_dir)

    return {"execution_logs": execution_logs}
|
src/api/v1/build_ml_plan/task_analysis.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, File, UploadFile, Form, HTTPException, Request
|
2 |
+
from src.app.pipelines.task_analysis import MLImplementationPlannerWorkflow, MLAnalysisWorkflow
|
3 |
+
from src.app.schemas.requests.eda import EdaRequestSchema
|
4 |
+
from src.core.utils import logger
|
5 |
+
from typing import Optional
|
6 |
+
import os
|
7 |
+
import shutil
|
8 |
+
|
9 |
+
analysis_router = APIRouter()
|
10 |
+
|
11 |
+
UPLOAD_DIR = "src/core/cache/downloads"
|
12 |
+
|
13 |
+
@analysis_router.post("/")
async def main(
    request: Request,
    user_input: str = Form(...),
    file: UploadFile = File(...)
):
    """Analyze an ML task from a user prompt plus an uploaded CSV dataset.

    Stores the upload under UPLOAD_DIR, runs the ML analysis workflow and
    the implementation-planner workflow, and returns a populated
    EdaRequestSchema combining their outputs.

    Parameters
    ----------
    request : Request
        Used to read the app-level config (also cached in Redis).
    user_input : str
        Free-text description of the ML task.
    file : UploadFile
        The dataset; only .csv files are accepted (400 otherwise).

    Raises
    ------
    HTTPException
        400 for a non-CSV upload, 500 if the file cannot be saved.
    """
    config = request.app.state.config

    # Guard against a missing filename as well as a wrong extension.
    if not file.filename or not file.filename.endswith('.csv'):
        logger.error("Only CSV files are allowed", log_type="api: /analyze-task", console=config['app']['verbose'])
        raise HTTPException(status_code=400, detail="Only CSV files are allowed.")

    # basename() strips client-supplied directory components so an uploaded
    # name like "../../etc/cron.d/x.csv" cannot escape UPLOAD_DIR.
    os.makedirs(UPLOAD_DIR, exist_ok=True)
    file_path = os.path.join(UPLOAD_DIR, os.path.basename(file.filename))

    # Persist the upload into the downloads cache.
    try:
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        logger.info("File uploaded successfully!", log_type="api: /analyze-task", console=config['app']['verbose'])
    except Exception as e:
        logger.error(f"Error saving file: {str(e)}", log_type="api: /analyze-task", console=config['app']['verbose'])
        raise HTTPException(status_code=500, detail=f"Error saving file: {str(e)}")

    ml_analysis_wf = MLAnalysisWorkflow(user_prompt=user_input)
    ml_analysis_results = ml_analysis_wf.run(verbose=True)

    ml_imp_planner_wf = MLImplementationPlannerWorkflow(requirements_analysis=ml_analysis_results[0], technical_research=ml_analysis_results[1])
    ml_imp_planner_results = ml_imp_planner_wf.run(verbose=True)

    api_response = EdaRequestSchema(
        dataset_path = file_path,
        user_prompt = user_input,
        requirement_analysis = ml_analysis_results[0],
        technical_research = ml_analysis_results[1],
        implementation_plan = ml_imp_planner_results
    )

    return {
        "response": api_response
    }
|
src/api/v1/eda_engine/__init__.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter
|
2 |
+
from .data_understanding import data_understanding_router
|
3 |
+
from .data_statistics import data_statistics_router
|
4 |
+
from .data_quality import data_quality_router
|
5 |
+
from .univariate_analysis import univariate_analysis_router
|
6 |
+
|
7 |
+
eda_engine_router = APIRouter()
|
8 |
+
eda_engine_router.include_router(data_understanding_router, prefix="/get-data-understanding-context")
|
9 |
+
eda_engine_router.include_router(data_statistics_router, prefix="/get-data-statistics")
|
10 |
+
eda_engine_router.include_router(data_quality_router, prefix="/get-data-quality")
|
11 |
+
eda_engine_router.include_router(univariate_analysis_router, prefix="/get-univariate-analysis")
|
12 |
+
|
src/api/v1/eda_engine/data_quality.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from fastapi import APIRouter
|
4 |
+
from src.core.utils import logger
|
5 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, Form
|
6 |
+
from src.app.pipelines.modules import DataQualityAssessmentWorkflow
|
7 |
+
|
8 |
+
data_quality_router = APIRouter()
|
9 |
+
|
10 |
+
def delete_dir_contents(directory: str) -> None:
    """Remove every regular file directly inside *directory*.

    Subdirectories are left untouched; a nonexistent directory is a no-op
    instead of raising FileNotFoundError.
    """
    if not os.path.isdir(directory):
        return
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)
|
16 |
+
@data_quality_router.post('/')
async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
    ''' ## This endpoint accepts a CSV file upload to initiate the Data Quality Workflow.

    ### Parameters:
    -----------
    - file : CSV File for the dataset

    ### Returns:
    --------
    - dict: Markdown Report
    '''

    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are allowed.")

    # Reset the downloads cache, then persist the upload under a fixed name.
    downloads_path = "src/core/cache/downloads"
    os.makedirs(downloads_path, exist_ok=True)
    delete_dir_contents(downloads_path)
    destination_path = os.path.join(downloads_path, "dataset.csv")
    with open(destination_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    logger.info(f"CSV file saved to {destination_path}", log_type='eda-engine/data_quality', console=True)

    # Run the data-quality assessment workflow; failures are reported in the
    # response body rather than surfaced as a 500.
    try:
        workflow = DataQualityAssessmentWorkflow(
            data_source=f'{downloads_path}/dataset.csv',
            llm_choice="gpt-4o-mini",
            ml_task=ml_task,
        )
        return {
            "status": "Pipeline finished running",
            "results": workflow.run(verbose=True),
        }
    except Exception as e:
        logger.error(f"DataQualityAssessmentWorkflow failed with error: {e}", log_type='eda-engine/data_quality', console=True)
        return {
            "status": "Pipeline failed to finish",
        }
|
src/api/v1/eda_engine/data_statistics.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from fastapi import APIRouter
|
4 |
+
from src.core.utils import logger
|
5 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, Form
|
6 |
+
from src.app.pipelines.modules import DataStatisticsWorkflow
|
7 |
+
|
8 |
+
data_statistics_router = APIRouter()
|
9 |
+
|
10 |
+
def delete_dir_contents(directory: str) -> None:
    """Remove every regular file directly inside *directory*.

    Subdirectories are left untouched; a nonexistent directory is a no-op
    instead of raising FileNotFoundError.
    """
    if not os.path.isdir(directory):
        return
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)
16 |
+
@data_statistics_router.post('/')
async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
    ''' ## This endpoint accepts a CSV file upload to initiate the Data Statistics Workflow.

    ### Parameters:
    -----------
    - file : CSV File for the dataset
    \n
    - ml_task : Final machine learning task/target

    ### Returns:
    --------
    - dict: Markdown Report
    '''

    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are allowed.")

    # Reset the downloads cache, then persist the upload under a fixed name.
    downloads_path = "src/core/cache/downloads"
    os.makedirs(downloads_path, exist_ok=True)
    delete_dir_contents(downloads_path)
    destination_path = os.path.join(downloads_path, "dataset.csv")
    with open(destination_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    logger.info(f"CSV file saved to {destination_path}", log_type='eda-engine/data_statistics', console=True)

    # Run the data-statistics workflow; failures are reported in the response
    # body rather than surfaced as a 500.
    try:
        workflow = DataStatisticsWorkflow(
            data_source=f'{downloads_path}/dataset.csv',
            llm_choice="gpt-4o-mini",
            ml_task=ml_task,
        )
        return {
            "status": "Pipeline finished running",
            "results": workflow.run(verbose=True),
        }
    except Exception as e:
        logger.error(f"DataStatisticsWorkflow failed with error: {e}", log_type='eda-engine/data_statistics', console=True)
        return {
            "status": "Pipeline failed to finish",
        }
|
60 |
+
|
61 |
+
|
src/api/v1/eda_engine/data_understanding.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from typing import Optional
|
4 |
+
from src.core.utils import logger
|
5 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, Form
|
6 |
+
from src.app.pipelines.modules import DataUnderstandingContextWorkflow
|
7 |
+
|
8 |
+
data_understanding_router = APIRouter()
|
9 |
+
|
10 |
+
def delete_dir_contents(directory: str) -> None:
    """Delete every regular file directly inside ``directory``.

    Subdirectories (and anything inside them) are left untouched.
    """
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    for path in candidates:
        if not os.path.isfile(path):
            continue  # skip subdirectories and other non-file entries
        os.remove(path)
|
15 |
+
|
16 |
+
@data_understanding_router.post('/')
async def main(file: UploadFile = File(...), business_requirements: Optional[str] = Form(None)):
    ''' ## This endpoint accepts a CSV file upload & additional business requirements/context to initiate the Data Understanding Context Workflow.

    ### Parameters:
    -----------
    - file : CSV file for the dataset
    - business_requirements : additional business context information about the dataset

    ### Returns:
    --------
    - dict: status plus the workflow's markdown report (under "results")
    '''

    # None-safe and case-insensitive: the original `file.filename.endswith('.csv')`
    # raised AttributeError when no filename was supplied and rejected "DATA.CSV".
    if not file.filename or not file.filename.lower().endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are allowed.")

    # Clear the /downloads folder and store the received file as 'dataset.csv'.
    # (Was a bare-string no-op statement; now a real comment.)
    downloads_path = "src/core/cache/downloads"
    os.makedirs(downloads_path, exist_ok=True)
    delete_dir_contents(downloads_path)
    destination_path = os.path.join(downloads_path, "dataset.csv")
    with open(destination_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    logger.info(f"CSV file saved to {destination_path}", log_type='eda-engine/data_understanding', console=True)

    # Run the data understanding workflow.
    try:
        duc_wf = DataUnderstandingContextWorkflow(
            data_source=destination_path,  # same file path as written above
            llm_choice="gpt-4o-mini",
            business_context=business_requirements,
        )
        results = duc_wf.run(verbose=True)

        return {
            "status": "Pipeline finished running",
            "results": results,
        }

    except Exception as e:
        # Deliberate best-effort contract: failures are reported in the payload
        # (not as HTTP 500) so callers always get a status; details go to the log.
        logger.error(f"DataUnderstandingContextWorkflow failed with error: {e}", log_type='eda-engine/data_understanding', console=True)
        return {
            "status": "Pipeline failed to finish",
        }
|
src/api/v1/eda_engine/univariate_analysis.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from typing import Optional
|
4 |
+
from src.core.utils import logger
|
5 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, Form
|
6 |
+
from src.app.pipelines.modules import UnivariateAnalysisWorkflow
|
7 |
+
|
8 |
+
univariate_analysis_router = APIRouter()
|
9 |
+
|
10 |
+
def delete_dir_contents(directory: str)->None:
    """Remove all plain files found at the top level of ``directory``.

    Nested directories are deliberately preserved.
    """
    for entry_name in os.listdir(directory):
        full_path = os.path.join(directory, entry_name)
        is_plain_file = os.path.isfile(full_path)
        if is_plain_file:
            os.remove(full_path)
|
15 |
+
|
16 |
+
@univariate_analysis_router.post('/')
async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
    ''' ## This endpoint accepts a CSV file upload to initiate the Univariate Analysis Workflow.

    ### Parameters:
    -----------
    - file : CSV file for the dataset
    - ml_task : final machine learning task/target

    ### Returns:
    --------
    - dict: Markdown Report
    '''

    # None-safe and case-insensitive: the original `file.filename.endswith('.csv')`
    # raised AttributeError when no filename was supplied and rejected "DATA.CSV".
    if not file.filename or not file.filename.lower().endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are allowed.")

    # Clear the /downloads folder and store the received file as 'dataset.csv'.
    downloads_path = "src/core/cache/downloads"
    os.makedirs(downloads_path, exist_ok=True)
    delete_dir_contents(downloads_path)
    destination_path = os.path.join(downloads_path, "dataset.csv")
    with open(destination_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    logger.info(f"CSV file saved to {destination_path}", log_type='eda-engine/univariate_analysis', console=True)

    # Run the univariate analysis workflow.
    try:
        ua_wf = UnivariateAnalysisWorkflow(
            data_source=destination_path,  # same file path as written above
            llm_choice="gpt-4o-mini",
            ml_task=ml_task,
        )
        results = ua_wf.run(verbose=True)

        return {
            "status": "Pipeline finished running",
            "results": results,
        }

    except Exception as e:
        # log_type fixed: was 'eda-engine/dataunivariate_analysis_statistics',
        # now consistent with the success-path log above.
        logger.error(f"UnivariateAnalysisWorkflow failed with error: {e}", log_type='eda-engine/univariate_analysis', console=True)
        # Best-effort contract: report failure in the payload rather than HTTP 500.
        return {
            "status": "Pipeline failed to finish"
        }
|
src/app/pipelines/eda/agents/agents.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .prompts import intel_agent_desc, intel_agent_instructions, orchestrator_agent_desc, orchestrator_agent_instructions, analyzer_agent_desc, analyzer_agent_instructions, judging_agent_desc, judging_agent_instructions
|
2 |
+
from .models import IntelAgentResponseSchema, OrchestratorAgentResponseSchema, AnalyzerAgentResponseSchema, JudgingAgentResponseSchema
|
3 |
+
from agno.models.openai import OpenAIChat # type: ignore
|
4 |
+
from agno.agent import Agent # type: ignore
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from typing import List
|
7 |
+
import os
|
8 |
+
|
9 |
+
load_dotenv()
|
10 |
+
|
11 |
+
class AgentClass:
    """Registry that constructs and holds the EDA pipeline's LLM agents.

    On construction it builds the four default agents (intel, orchestrator,
    analyzer, judging) against a single shared chat model and exposes them in
    ``self.agents``; ``build_agent`` can (re)create any agent on demand.
    """

    def __init__(self):
        # One shared chat-model instance backs every agent.
        self.llm = OpenAIChat(id="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
        self.agents = {}

        # (name, description, instructions, response schema) for each default agent,
        # built in this fixed order.
        default_specs = (
            ("intel_agent", intel_agent_desc, intel_agent_instructions, IntelAgentResponseSchema),
            ("orchestrator_agent", orchestrator_agent_desc, orchestrator_agent_instructions, OrchestratorAgentResponseSchema),
            ("analyzer_agent", analyzer_agent_desc, analyzer_agent_instructions, AnalyzerAgentResponseSchema),
            ("judging_agent", judging_agent_desc, judging_agent_instructions, JudgingAgentResponseSchema),
        )
        for name, desc, instructions, schema in default_specs:
            self.build_agent(
                agent_name=name,
                agent_desc=desc,
                agent_instructions=instructions,
                agent_response_model=schema,
                tools=None,
            )

    def build_agent(self, agent_name: str, agent_desc: str, agent_instructions: List[str], agent_response_model, tools=None, debug_mode=False):
        '''
        Builds or re-builds an agent dynamically and registers it under
        ``agent_name`` in ``self.agents``.
        '''
        self.agents[agent_name] = Agent(
            model=self.llm,
            description=agent_desc,
            instructions=agent_instructions,
            response_model=agent_response_model,
            structured_outputs=True,
            tools=tools,
            debug_mode=debug_mode,
        )
|
62 |
+
|
63 |
+
|
64 |
+
|
65 |
+
|
src/app/pipelines/eda/agents/models.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import Field, BaseModel
|
2 |
+
from ..model import DatasetSummary
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
class IntelAgentResponseSchema(BaseModel):
    """Structured output of the intel agent: dataset description, inferred ML task, and a dataset summary.

    NOTE(review): fields are annotated as non-optional yet default to None —
    presumably intentional so the LLM may omit them; confirm against the
    structured-output contract before tightening the annotations.
    """
    # Free-text overview of the dataset, folding in business-context insights.
    dataset_description: str = Field(
        default=None,
        description = "Gives an idea about the dataset while also containing insights from the business understanding and task goal"
    )
    # Inferred ML task type (e.g. classification/regression) used to steer the EDA.
    ml_task: str = Field(
        default=None,
        description = "Contains the task type which will help in the EDA process"
    )
    # Structured per-feature summary; schema is DatasetSummary from ..model.
    dataset_summary: DatasetSummary = Field(
        default=None,
        description = "Contains the summary of the dataset including num of rows, columns, features, target variables"
    )
|
18 |
+
|
19 |
+
class OrchestratorAgentResponseSchema(BaseModel):
    """Structured output of the orchestrator agent: which tool to run next and why."""
    # Exact tool name — the orchestrator prompt requires it to match the
    # available-tools list verbatim (no "function." prefix).
    tool_name: str = Field(
        default=None,
        description="Name of the tool selected for execution"
    )
    # Free-text rationale for the selection.
    justification: str = Field(
        default=None,
        description="Reason for selecting the tool"
    )
|
28 |
+
|
29 |
+
class AnalyzerAgentResponseSchema(BaseModel):
    """Structured output of the analyzer agent: insights, issues, next steps, and a completion flag.

    NOTE(review): the analyzer prompt asks the model to emit ``task_completed``,
    but this schema names the boolean ``final_task_achieved`` — confirm the
    model reliably maps one onto the other.
    """
    # Patterns, findings, anomalies, and trends extracted from the last execution.
    key_insights: str = Field(
        default=None,
        description="Detected Patterns, Key findings, Anamolies, detected trends all must be discussed here"
    )
    # Open problems: data issues, missing steps, unmet business requirements.
    potential_issues: str = Field(
        default=None,
        description="Detected Issues, Any problems found in the data, Steps that have not yet been performed, Any unresolved problems with the dataset, **Unmet Business Requirements**"
    )
    # What still needs analysis or processing next.
    recommendations: str = Field(
        default=None,
        description="areas that still need analysis or processing"
    )
    # True once the user's objective is judged complete.
    final_task_achieved: bool = Field(
        default=None,
        description="Whether user task has been completed or not"
    )
|
46 |
+
|
47 |
+
class ExecuterAgentResponseSchema(BaseModel):
    """Structured output of the executer agent describing a single tool run."""
    # Name of the tool that was run.
    executed_tool: str = Field(
        default=None,
        description="Tool that was executed"
    )
    # Precise summary of what the executed code actually did
    # (the prompt demands specifics, e.g. "Detected 12 outliers and removed them").
    execution_details: str = Field(
        default=None,
        description="Summary of execution process"
    )
    # Populated only when execution failed.
    error_logs: Optional[str] = Field(
        default=None,
        description="Error logs if execution failed"
    )
    # Where the tool wrote its output, if it reported a path; None otherwise.
    output_file_path: Optional[str] = Field(
        default = None,
        description = "Path of output file stored"
    )
|
64 |
+
|
65 |
+
class JudgingAgentResponseSchema(BaseModel):
    """Structured output of the judging agent: execution-history summary plus the stop/continue verdict for the EDA loop."""
    # Chronological narrative of every previously executed tool and its results.
    detailed_context: str = Field(
        default=None,
        description="Contains a detailed context about all the previously executed tools/functions"
    )
    # True when the EDA loop should terminate (goal met or tools exhausted).
    stop_loop: bool = Field(
        default=None,
        description="Contains boolean value whether the eda loop must be stopped or not"
    )
    # Evidence-based reasoning behind the stop/continue decision.
    justification: str = Field(
        default=None,
        description="Contains the justification for its decision of stopping the loop or not"
    )
|
src/app/pipelines/eda/agents/prompts.py
ADDED
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
intel_agent_desc = '''
|
2 |
+
You are a Dataset Understanding and Problem Analysis Expert.
|
3 |
+
Your task is to analyze the provided dataset, interpret the user's problem statement, and extract meaningful insights from both the dataset and business context.
|
4 |
+
Your analysis will guide the Exploratory Data Analysis (EDA) and Machine Learning (ML) pipeline setup.
|
5 |
+
You will identify target variables, infer feature meanings, and ensure dataset insights are actionable and relevant for model building.
|
6 |
+
'''
|
7 |
+
|
8 |
+
intel_agent_instructions = [
|
9 |
+
'''1. Business Understanding:
|
10 |
+
- Analyze the user's goal from the user's prompt and business requirements to determine the nature of the problem.
|
11 |
+
- Identify the type of ML task (e.g., classification, regression, clustering, ranking, recommendation, etc.).
|
12 |
+
- Determine relevant target variable(s) based on the problem statement and dataset structure.
|
13 |
+
- For numerical features, interpret the statistics like mean, min., max. and std value and use this analysis to generate informative feature descriptions
|
14 |
+
''',
|
15 |
+
'''2. Dataset Analysis:
|
16 |
+
- Extract key dataset metadata, including the number of rows, columns, and data types.
|
17 |
+
- Identify the target variable(s) that align with the task goal.
|
18 |
+
- Examine a **random sample of 10 rows** to infer initial insights about categorical and numerical features.
|
19 |
+
- For numerical features, interpret the statistics like mean, min., max. and std value and use this analysis to generate informative feature descriptions
|
20 |
+
''',
|
21 |
+
'''3. Output Generation:
|
22 |
+
- Construct a structured dataset summary with:
|
23 |
+
- Feature descriptions, feature types (numerical, categorical, text), and missing value counts.
|
24 |
+
- Key statistics for numerical features (mean, min, max, std deviation) with interpretations.
|
25 |
+
- Define a clear **ML task type** that informs feature selection and EDA.
|
26 |
+
- Provide a refined list of **target variables** aligned with the business problem.
|
27 |
+
- Ensure that dataset insights are **actionable** and relevant for feature engineering and model training.
|
28 |
+
- For numerical features, interpret the statistics like mean, min., max. and std value and use this analysis to generate informative feature descriptions
|
29 |
+
''',
|
30 |
+
]
|
31 |
+
|
32 |
+
orchestrator_agent_desc = '''
|
33 |
+
You are the Adaptive Data Analysis & Processing Decision-Making Agent. You are an expert at dynamically selecting and sequencing the most relevant data analysis, preprocessing, and transformation steps based on user instructions, dataset characteristics, and analysis reports from previously executed functions.
|
34 |
+
Your primary responsibility is to create a customized analytical pathway for each dataset, ensuring that prerequisite tasks are completed in an optimal sequence while avoiding unnecessary steps.
|
35 |
+
You operate in a continuous feedback loop, evaluating each new analysis report to refine your understanding of the dataset and adjust subsequent decisions accordingly.
|
36 |
+
'''
|
37 |
+
|
38 |
+
orchestrator_agent_instructions = [
|
39 |
+
'''
|
40 |
+
1. Understand User Intent & Business Requirements:
|
41 |
+
- Extract the 'Business Objective', 'ML Task', and 'Final Task' from the prompt.
|
42 |
+
- Identify whether the user request requires simple preprocessing, in-depth analysis, feature engineering, or another transformation.
|
43 |
+
- Map the requested task to its position in a typical data science workflow to identify prerequisite and dependent steps.
|
44 |
+
- Understand the end goal to prioritize steps that directly contribute to achieving the business objective.
|
45 |
+
''',
|
46 |
+
'''
|
47 |
+
2. Analyze Dataset Characteristics & Processing History:
|
48 |
+
- Carefully examine 'Dataset Description' and 'Dataset Overview' to identify data types, distributions, outliers, and quality issues.
|
49 |
+
- Review past tool executions from 'Previous Pre-processing tools executed' chronologically to build context.
|
50 |
+
- Critically evaluate the most recent analysis reports to understand:
|
51 |
+
* What issues were detected and resolved
|
52 |
+
* What issues remain unaddressed
|
53 |
+
* How the data distribution and characteristics have changed after previous transformations
|
54 |
+
* What new patterns or relationships have emerged from previous analyses
|
55 |
+
- Detect dataset-specific challenges that might require specialized handling.
|
56 |
+
''',
|
57 |
+
'''
|
58 |
+
3. Make Data-Driven Decisions on Next Steps:
|
59 |
+
- Always start with the Univariate Analysis before executing any other analysis or pre-processing / transformation tool.
|
60 |
+
- Assess whether the requested operation has dependencies and determine if those dependencies have been satisfied based on previous analysis reports.
|
61 |
+
- Select the most appropriate tool from 'Available Tools for Execution' considering:
|
62 |
+
* Current dataset state as revealed by the most recent analysis
|
63 |
+
* Required prerequisites for the user's requested task
|
64 |
+
* Potential impact on downstream analyses and transformations
|
65 |
+
- For complex tasks like feature engineering, ensure that sufficient data understanding has been established through appropriate exploratory analyses.
|
66 |
+
- Adapt the analytical pathway based on insights from previous steps—if an analysis reveals unexpected patterns, reprioritize subsequent steps accordingly.
|
67 |
+
- Identify when a specific analysis can be skipped because previous reports have already provided the necessary information.
|
68 |
+
''',
|
69 |
+
'''
|
70 |
+
4. Provide Comprehensive Justification Grounded in Analysis Reports:
|
71 |
+
- Reference specific findings from previous analysis reports to justify your current decision.
|
72 |
+
- Explain how the chosen step builds upon or addresses issues identified in previous analyses.
|
73 |
+
- When bypassing a seemingly logical step, cite specific evidence from previous reports showing why it's unnecessary.
|
74 |
+
- Connect your decision to both immediate needs and long-term business objectives.
|
75 |
+
- When recommending prerequisite steps, clearly articulate how they enable the user's requested task.
|
76 |
+
''',
|
77 |
+
'''
|
78 |
+
5. Output Format:
|
79 |
+
- Your response must contain:
|
80 |
+
1. Step / Iteration number the entire loop is currently at
|
81 |
+
2. Function Name to Execute: Choose from the available tools and return the exact name of the tool. Don't add function. to the tool name. Be very precise and under no circumstance will you return a tool name out of the list.
|
82 |
+
3. Comprehensive Justification for Selection that references:
|
83 |
+
* Specific findings from previous analysis reports
|
84 |
+
* Current state of the dataset
|
85 |
+
* How this step advances toward the business objective and user's requested task
|
86 |
+
* Why alternative approaches were not selected
|
87 |
+
''',
|
88 |
+
'''
|
89 |
+
6. Critical Rules to Follow:
|
90 |
+
- Never proceed with feature engineering or advanced transformations without first verifying data quality through previous analysis reports.
|
91 |
+
- Continuously adapt your strategy based on new insights—previous analysis results should directly influence current decisions.
|
92 |
+
- Balance thoroughness with efficiency—skip steps only when analysis reports provide evidence they're unnecessary.
|
93 |
+
- Recognize dataset-specific characteristics that require specialized treatment rather than applying a generic approach.
|
94 |
+
- Be opportunistic in identifying when multiple objectives can be achieved with a single analytical step.
|
95 |
+
- When analysis reports reveal unexpected data characteristics, be prepared to recommend a different path than originally anticipated.
|
96 |
+
- Always validate that a user-requested step is appropriate given the current dataset state—recommend alternatives when necessary.
|
97 |
+
- Consider the computational cost of each step relative to its potential value—prioritize high-impact analyses.
|
98 |
+
''',
|
99 |
+
'''
|
100 |
+
7. Special Case: First Iteration Handling:
|
101 |
+
- If there is no history/logs of any executed tools, always start with univariate analysis.
|
102 |
+
- If the user has requested a very specific function, check if it requires any prerequisite steps.
|
103 |
+
- If prerequisites are needed, execute those first before running the requested function.
|
104 |
+
- If no specific function is requested, always execute univariate analysis as the first step.
|
105 |
+
''',
|
106 |
+
'''
|
107 |
+
8. **New Rule: Ensure Each Function is Only Executed Once:**
|
108 |
+
- If a function or type of analysis has already been executed, **do not execute it again** under any circumstance.
|
109 |
+
- Keep track of all previously executed functions using the history/logs.
|
110 |
+
- If a user requests a function that has already been run, return a justification explaining why it will not be repeated.
|
111 |
+
- Instead of re-running a function, suggest alternative actions based on current dataset needs and past analysis results.
|
112 |
+
'''
|
113 |
+
]
|
114 |
+
|
115 |
+
analyzer_agent_desc = """
|
116 |
+
You are the Analysis Interpretation Agent.
|
117 |
+
You are an expert at extracting meaningful insights from analysis results, identifying issues, and determining logical next steps in a data processing workflow.
|
118 |
+
Your task is to interpret the results of previously executed functions, provide detailed insights, and suggest relevant next steps based on the dataset characteristics, business requirements, and detected issues.
|
119 |
+
Your output will serve as input for a decision-making agent that determines which function to execute next.
|
120 |
+
Additionally, you must determine whether the user's task has been successfully completed based on the business objective and user objective.
|
121 |
+
"""
|
122 |
+
|
123 |
+
analyzer_agent_instructions = [
|
124 |
+
"""
|
125 |
+
1. Understand the Received Inputs:
|
126 |
+
- Analyze the 'Function Execution Details' (a short summary of results) to grasp key takeaways.
|
127 |
+
- Review 'Function Metadata' to understand what steps were taken to generate the results.
|
128 |
+
- Carefully examine the full structured output of the function execution to extract insights.
|
129 |
+
- Understand the 'Business Objective' and 'User Objective' to assess completeness.
|
130 |
+
""",
|
131 |
+
"""
|
132 |
+
2. Analyze & Interpret Function Results
|
133 |
+
- Examine the structure of the function output.
|
134 |
+
- Determine what insights are meaningful based on the type of analysis performed.
|
135 |
+
- Adaptively interpret key takeaways without assuming a fixed output schema.
|
136 |
+
""",
|
137 |
+
"""
|
138 |
+
3. Identify Issues & Challenges
|
139 |
+
- If the function output provides statistical information, look for:
|
140 |
+
* Patterns, anomalies, or inconsistencies.
|
141 |
+
- If it generates transformed data, check:
|
142 |
+
* Completeness, correctness, and adherence to expected formats.
|
143 |
+
- If it's a model training result, extract:
|
144 |
+
* Performance metrics, overfitting risks, and areas for improvement.
|
145 |
+
- For any other type of output, assess:
|
146 |
+
* How well the function achieved its intended goal.
|
147 |
+
""",
|
148 |
+
"""
|
149 |
+
4. Determine Logical Next Steps
|
150 |
+
- Based on the extracted insights, suggest what should logically follow:
|
151 |
+
* Further data cleaning or transformation?
|
152 |
+
* A different type of analysis?
|
153 |
+
- Ensure that recommendations align with the broader business objective.
|
154 |
+
""",
|
155 |
+
"""
|
156 |
+
5. Identify Unmet Business Requirements & Task Completion Check
|
157 |
+
- Check if there are remaining gaps in the analysis pipeline.
|
158 |
+
- Verify if the extracted insights align with the original 'Business Objective' and 'User Objective'.
|
159 |
+
- Determine if the previous function output sufficiently addresses the business goal.
|
160 |
+
- Identify any aspects of the dataset that haven’t been sufficiently analyzed yet.
|
161 |
+
- If certain transformations or feature engineering steps are necessary for the final objective, highlight them.
|
162 |
+
- Highlight any missing steps required to complete the workflow.
|
163 |
+
- **Task Completion Check:**
|
164 |
+
- If all required steps have been performed and the results meet the business objective and user goal, return `task_completed = True`.
|
165 |
+
- If there are still outstanding issues, required steps, or missing insights, return `task_completed = False`.
|
166 |
+
""",
|
167 |
+
"""
|
168 |
+
6. Output Format:
|
169 |
+
- Your response must contain a structured summary with the following components:
|
170 |
+
1. key_insights: Detected Patterns, Key findings, Anamolies, detected trends all must be discussed here.
|
171 |
+
2. potential_issues: Detected Issues, Any problems found in the data, Steps that have not yet been performed, Any unresolved problems with the dataset, **Unmet Business Requirements**.
|
172 |
+
3. recommendations: Areas that still need analysis or processing.
|
173 |
+
4. task_completed: Boolean (True/False) indicating if the user’s objective has been met.
|
174 |
+
"""
|
175 |
+
]
|
176 |
+
|
177 |
+
executer_agent_desc = '''
|
178 |
+
You are an intelligent ML workflow execution agent responsible for executing selected tools on the dataset and generating a detailed execution report. Your primary role is to apply the given tool to the dataset, monitor the process, and log the results.
|
179 |
+
|
180 |
+
At each iteration, you must:
|
181 |
+
1. Receive execution parameters, including the tool name, expected changes, and dataset description.
|
182 |
+
2. Execute the specified tool with appropriate parameters.
|
183 |
+
3. Capture the execution outcome, including success status, modifications made, and any errors encountered.
|
184 |
+
4. Generate a structured execution report.
|
185 |
+
|
186 |
+
Your goal is to ensure the successful execution of ML workflow steps, log any issues encountered, and maintain detailed execution records.
|
187 |
+
'''
|
188 |
+
|
189 |
+
executer_agent_instructions = [
|
190 |
+
'''1. Input Processing
|
191 |
+
Upon receiving an execution request, analyze the following inputs:
|
192 |
+
|
193 |
+
- Step Number: The iteration number in the pipeline.
|
194 |
+
- Tool Name: The tool to be executed.
|
195 |
+
- Expected Changes: Modifications expected after execution.
|
196 |
+
- Dataset Description: A textual overview of dataset characteristics.
|
197 |
+
- Dataset Overview: Key statistics, structure, feature types, and known issues.
|
198 |
+
|
199 |
+
Validate whether the provided tool is applicable based on the dataset state.
|
200 |
+
''',
|
201 |
+
'''2. Tool Execution
|
202 |
+
Execute the specified tool using the appropriate method and parameters. Ensure:
|
203 |
+
|
204 |
+
- Correct application of the tool based on dataset properties.
|
205 |
+
- Efficient execution without unnecessary operations.
|
206 |
+
- Handling of potential issues such as missing data, outliers, or incompatible transformations.
|
207 |
+
|
208 |
+
If execution fails, capture detailed error logs for debugging.
|
209 |
+
''',
|
210 |
+
'''3. Capture Execution Outcome
|
211 |
+
After execution, document the results:
|
212 |
+
|
213 |
+
- execution_successful: Whether the tool was executed without errors.
|
214 |
+
- execution_details: Summary of operations performed by the tool.
|
215 |
+
In this execution_details analyze the python code and summarize whats happening in the code. Don't be vague be very precise. For eg. Detected 12 outliers and removed them etc
|
216 |
+
- error_logs: If execution failed, log relevant error messages.
|
217 |
+
|
218 |
+
Ensure all outputs are structured and informative.
|
219 |
+
''',
|
220 |
+
'''4. Generate Execution Report
|
221 |
+
Construct a structured execution report with:
|
222 |
+
|
223 |
+
- step_number: Iteration number.
|
224 |
+
- executed_tool: Name of the executed tool.
|
225 |
+
- execution_successful: Boolean flag indicating success or failure.
|
226 |
+
- execution_details: Summary of what was done.
|
227 |
+
- error_logs: Error messages, if any.
|
228 |
+
|
229 |
+
This report must be clear, precise, and provide actionable insights for further processing.
|
230 |
+
''',
|
231 |
+
'''5. You must also output the path of file stored after executing the function. You must output the exact path where the file is stored after executing the function
|
232 |
+
Map this to field: 'output_file_path'
|
233 |
+
|
234 |
+
If there is no output path provided explicitly then return None
|
235 |
+
|
236 |
+
'''
|
237 |
+
]
|
238 |
+
|
239 |
+
judging_agent_desc = '''
|
240 |
+
You are an intelligent agent that analyzes preprocessing function execution history,
|
241 |
+
provides detailed context summaries, and determines when to stop machine learning loops
|
242 |
+
based on goal achievement and tool exhaustion analysis.
|
243 |
+
|
244 |
+
This agent serves two primary functions:
|
245 |
+
1. Synthesize detailed contextual summaries of executed preprocessing functions
|
246 |
+
2. Make informed decisions about stopping ML loops with clear justifications
|
247 |
+
'''
|
248 |
+
|
249 |
+
judging_agent_instructions = [
|
250 |
+
'''1. PREPROCESSING FUNCTION ANALYSIS
|
251 |
+
|
252 |
+
The agent will receive list of previously executed preprocessing functions including:
|
253 |
+
- Function names
|
254 |
+
- Function logic descriptions
|
255 |
+
- Function execution results
|
256 |
+
|
257 |
+
For each function, the agent must:
|
258 |
+
a) Understand the purpose and logic of the function
|
259 |
+
b) Analyze what the function accomplished based on its results
|
260 |
+
c) Track data transformations and their significance
|
261 |
+
d) Identify potential issues or limitations in the preprocessing steps
|
262 |
+
e) Recognize how each function contributes to the overall ML pipeline
|
263 |
+
|
264 |
+
The agent must then generate a comprehensive paragraph that:
|
265 |
+
- Chronologically describes the preprocessing journey
|
266 |
+
- Highlights key transformations and their significance
|
267 |
+
- Identifies data quality improvements
|
268 |
+
- Notes important statistical properties revealed
|
269 |
+
- Explains how each preprocessing step prepares data for modeling
|
270 |
+
- Uses technical but accessible language
|
271 |
+
- Provides quantitative details where relevant
|
272 |
+
|
273 |
+
2. ML LOOP TERMINATION DECISION
|
274 |
+
The agent will receive:
|
275 |
+
- The user's stated ML goal/task
|
276 |
+
- A list of available tools/functions in the system (provided explicitly)
|
277 |
+
- The history of previously executed tools/functions
|
278 |
+
|
279 |
+
The agent must decide whether to stop the ML loop by analyzing only the provided list and history, without speculating about unlisted tools:
|
280 |
+
a) Goal Achievement Analysis:
|
281 |
+
- Compare current state to user's goal
|
282 |
+
- Check if performance metrics meet or exceed success criteria
|
283 |
+
- Confirm if the primary objective has been satisfied
|
284 |
+
- Determine if further iterations yield meaningful improvements
|
285 |
+
|
286 |
+
b) Tool Exhaustion Analysis:
|
287 |
+
- Identify which available tools from the provided list have been executed
|
288 |
+
- Determine which remaining tools (if any) from the provided list are still unused
|
289 |
+
- If no unused tools remain, the loop must terminate
|
290 |
+
|
291 |
+
c) Diminishing Returns Analysis:
|
292 |
+
- Review recent iteration metrics for signs of plateau or minimal gains
|
293 |
+
- Evaluate cost-effectiveness of additional iterations
|
294 |
+
|
295 |
+
Decision Rules (based strictly on the provided tools list):
|
296 |
+
- STOP the loop (stop_loop = True) if ANY of the following conditions are met:
|
297 |
+
1. The user's goal has been achieved with satisfactory results
|
298 |
+
2. All tools in the provided list have been executed
|
299 |
+
3. Remaining provided tools (if any) cannot meaningfully improve results toward the goal
|
300 |
+
4. Recent iterations show diminishing returns below a meaningful threshold
|
301 |
+
- CONTINUE the loop (stop_loop = False) if ALL of the following conditions are met:
|
302 |
+
1. The user's goal is not yet achieved
|
303 |
+
2. There are unused tools in the provided list
|
304 |
+
3. There is clear evidence that applying an unused tool could improve results
|
305 |
+
|
306 |
+
Additional Constraints:
|
307 |
+
- Each available tool may be executed at most once in the ML loop
|
308 |
+
- The agent must not refer to hypothetical or non-existent tools
|
309 |
+
- Decisions must be based exclusively on the provided list of tools
|
310 |
+
|
311 |
+
Justification Requirements:
|
312 |
+
For any decision, provide a detailed justification that:
|
313 |
+
- References specific evidence from the execution history
|
314 |
+
- Cites relevant performance metrics and their values
|
315 |
+
- Explains the reasoning process step-by-step
|
316 |
+
- Acknowledges any limitations or uncertainties
|
317 |
+
- Connects the decision directly to the user's stated goal
|
318 |
+
- If continuing, recommends which unused tool to apply next and why
|
319 |
+
|
320 |
+
3. OUTPUT FORMATTING
|
321 |
+
The agent must return a JSON object matching JudgeAgentResponseSchema with the following keys:
|
322 |
+
|
323 |
+
a) detailed_context: A comprehensive paragraph describing all executed preprocessing steps
|
324 |
+
and their results, explaining the ML pipeline's current state.
|
325 |
+
|
326 |
+
b) stop_loop: A boolean value (True or False) indicating whether the ML loop should stop.
|
327 |
+
|
328 |
+
c) justification: A detailed explanation of the decision to stop or continue the loop,
|
329 |
+
based on goal achievement, tool exhaustion, and diminishing returns analyses.
|
330 |
+
|
331 |
+
NOTE: If no additional tools from the provided list remain that can further the user's goal, the agent must set stop_loop to True.
|
332 |
+
'''
|
333 |
+
]
|
src/app/pipelines/eda/helper.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from typing import Union
|
3 |
+
from .model import DatasetSummary, FeatureStatistics, Feature
|
4 |
+
from src.core.utils import logger
|
5 |
+
|
6 |
+
def get_summary(dataframe: pd.DataFrame)->DatasetSummary:
    """
    Build a structured summary of a dataset.

    For every column the summary records its name, dtype, missing/unique
    value counts and, for numeric columns only, basic descriptive
    statistics (mean, min, max, standard deviation).

    Args:
        dataframe (pd.DataFrame): The input dataset as a Pandas DataFrame.

    Returns:
        DatasetSummary: A structured summary of the dataset.
    """
    features = []

    for name in dataframe.columns:
        series = dataframe[name]
        numeric = pd.api.types.is_numeric_dtype(series)

        # Statistics are only meaningful for numeric columns; non-numeric
        # columns carry None instead.
        stats = (
            FeatureStatistics(
                mean=series.mean(),
                min_value=series.min(),
                max_value=series.max(),
                standard_deviation_value=series.std(),
            )
            if numeric
            else None
        )

        features.append(
            Feature(
                feature_name=name,
                feature_description=None,
                feature_category="Numerical" if numeric else "Non-Numerical",
                feature_data_type=str(series.dtype),
                missing_values=series.isnull().sum(),
                unique_values=series.nunique(),
                statistics=stats,
            )
        )

    return DatasetSummary(
        num_rows=dataframe.shape[0],
        num_features=dataframe.shape[1],
        features=features,
        target_features=None,
    )
|
57 |
+
|
58 |
+
def sample_dataset(dataset: pd.DataFrame):
    '''Sample up to 10 random rows from the dataset for prompts.

    Uses a fixed random_state so the sample is reproducible. Datasets with
    fewer than 10 rows are sampled in full instead of failing — the
    previous hard-coded ``n=10`` raised ValueError (and returned None)
    for any dataset smaller than 10 rows.

    Args:
        dataset (pd.DataFrame): The dataset to sample from.

    Returns:
        str | None: The sampled rows rendered without the index, or None
        if sampling fails for any remaining reason.
    '''
    try:
        # Cap the sample size at the number of available rows.
        n = min(10, len(dataset))
        sampled_data = dataset.sample(n=n, random_state=42).to_string(index=False)
        return sampled_data
    except ValueError:
        logger.error("Failed to sample from the dataset", log_type="eda", console=True)
        return None
|
66 |
+
|
67 |
+
def get_feature_summary(data: DatasetSummary):
|
68 |
+
'''Prepares a string which includes all features with their respective details'''
|
69 |
+
feature_details = []
|
70 |
+
|
71 |
+
for index, feature in enumerate(data.features):
|
72 |
+
stats = (
|
73 |
+
f"Mean: {feature.statistics.mean}, Min: {feature.statistics.min_value}, "
|
74 |
+
f"Max: {feature.statistics.max_value}, Std Dev: {feature.statistics.standard_deviation_value}"
|
75 |
+
if feature.statistics else "No statistics available"
|
76 |
+
)
|
77 |
+
|
78 |
+
feature_details.append(f"""
|
79 |
+
{index + 1}. '{feature.feature_name}'
|
80 |
+
- Description: {feature.feature_description}
|
81 |
+
- Category: {feature.feature_category}
|
82 |
+
- Data type: {feature.feature_data_type}
|
83 |
+
- Unique Values: {feature.unique_values}
|
84 |
+
- Missing Values: {feature.missing_values}
|
85 |
+
- Statistics: {stats}
|
86 |
+
""")
|
87 |
+
|
88 |
+
feature_summary = "\n".join(feature_details)
|
89 |
+
return feature_summary
|
src/app/pipelines/eda/model.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import Field, BaseModel
|
2 |
+
from typing import Optional, List
|
3 |
+
|
4 |
+
class FeatureStatistics(BaseModel):
    """
    Provides the statistics of numerical features (columns) in the dataset.

    All fields are optional: None means the statistic was not (or could
    not be) computed. The previous bare ``float`` annotations contradicted
    ``default=None`` and would reject an explicit None under validation.
    """
    mean: Optional[float] = Field(
        default=None,
        description="The mean value of this feature"
    )
    min_value: Optional[float] = Field(
        default=None,
        description="The minimum value of this feature"
    )
    max_value: Optional[float] = Field(
        default=None,
        description="The maximum value of this feature"
    )
    standard_deviation_value: Optional[float] = Field(
        default=None,
        description="The standard deviation of this feature"
    )
|
24 |
+
|
25 |
+
class Feature(BaseModel):
    """
    Represents a feature (column) in the dataset.

    Fields are annotated Optional to match their ``default=None`` values;
    the previous bare ``str``/``int`` annotations contradicted the None
    defaults and would reject an explicit None under validation.
    """
    feature_name: Optional[str] = Field(
        default=None,
        description="The name of the feature/column"
    )
    feature_description: Optional[str] = Field(
        default=None,
        description="A short description of the feature/column"
    )
    feature_category: Optional[str] = Field(
        default=None,
        description="Whether a feature is numerical or textual"
    )
    feature_data_type: Optional[str] = Field(
        default=None,
        description="The data type of the feature/column"
    )
    unique_values: Optional[int] = Field(
        default=None,
        description="The number of unique values in this particular column"
    )
    missing_values: Optional[int] = Field(
        default=None,
        description="The number of missing values in this particular column"
    )
    statistics: Optional[FeatureStatistics] = Field(
        default=None,
        description="Provides statistics for numerical feature"
    )
|
57 |
+
|
58 |
+
class DatasetSummary(BaseModel):
    """
    Provides a summary of the dataset, including its structure and target features.

    Fields are annotated Optional to match their ``default=None`` values;
    the previous bare ``int``/``List`` annotations contradicted the None
    defaults and would reject an explicit None under validation.
    """
    num_rows: Optional[int] = Field(
        default=None,
        description="The number of rows in the dataset"
    )
    num_features: Optional[int] = Field(
        default=None,
        description="The number of features in the dataset"
    )
    features: Optional[List[Feature]] = Field(
        default=None,
        description="A list of features"
    )
    target_features: Optional[List[str]] = Field(
        default=None,
        description="A list of target features relevant to the business task"
    )
|
78 |
+
|
src/app/pipelines/eda/pipeline.py
ADDED
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
from src.app.schemas.requests.eda import EdaRequestSchema
|
3 |
+
from .helper import get_summary, get_feature_summary, sample_dataset
|
4 |
+
from .agents.models import IntelAgentResponseSchema, ExecuterAgentResponseSchema
|
5 |
+
from src.app.schemas.responses.eda import IterationDetails, IterationLogs
|
6 |
+
from .agents.prompts import executer_agent_desc, executer_agent_instructions
|
7 |
+
from .agents.agents import AgentClass
|
8 |
+
from .tools.lib import tool_library
|
9 |
+
from src.core.utils import logger
|
10 |
+
from agno.agent import RunResponse # type: ignore
|
11 |
+
from typing import Union
|
12 |
+
import os
|
13 |
+
import pandas as pd
|
14 |
+
|
15 |
+
load_dotenv()
|
16 |
+
|
17 |
+
class EdaLoop:
    """Agentic EDA loop over a tabular dataset.

    Coordinates four LLM agents:

    1. *intel* — builds an initial dataset overview from a summary + sample.
    2. *orchestrator* — picks the next preprocessing tool from ``tool_library``.
    3. *executer* — runs the selected tool against the current CSV file.
    4. *analyzer* / *judge* — summarise the result and decide whether to stop.

    State threaded between iterations: the accumulated execution logs, the
    judge's ``detailed_context`` (``self.execution_summary``) and the path
    of the most recent dataset file produced by a tool.
    """

    def __init__(self, payload: EdaRequestSchema, verbose=False):
        self.ag = AgentClass()
        self.payload = payload
        # Populated once up-front; None if dataset summary preparation failed.
        self.dataset_overview = self._extract_details(verbose=verbose)

    def _extract_details(self, verbose=False)->Union[IntelAgentResponseSchema, None]:
        '''Prepares an initial dataset summary with feature descriptions.

        Reads the CSV referenced by the payload, summarises it and asks the
        intel agent to enrich that summary. Returns None on any failure so
        the caller can decide how to proceed.
        '''
        try:
            logger.info("Preparing dataset summary....", log_type="eda", console=verbose)

            dataset = pd.read_csv(self.payload.dataset_path)
            dataset_summary = get_summary(dataframe=dataset)

            feature_summary = get_feature_summary(data=dataset_summary)
            sampled_data = sample_dataset(dataset=dataset)

            prompt = (
                f'User prompt: {self.payload.user_prompt}\n'
                f'Business Understanding: {self.payload.requirement_analysis.business_understanding}\n'
                f'Ml Task identified: {self.payload.technical_research.model_response.task}\n'
                f'Dataset format: {self.payload.requirement_analysis.model_response.data_format}\n'
                f'Features:\n {feature_summary}\n'
                f'10 Sample Values from dataset:\n {sampled_data}\n'
            )

            intel_agent_response: RunResponse = self.ag.agents['intel_agent'].run(prompt, stream=False)
            return intel_agent_response.content

        except Exception as e:
            logger.error(f"Dataset summary preparation failed with error: {e}", log_type="eda", console=verbose)
            return None

    def _numbered_tool_list(self, include_metadata=True)->str:
        '''Render tool_library as a 1-based numbered list for prompts.

        Shared by the orchestrator and judge prompt builders (previously
        duplicated with a manual ``idx += 1`` counter).
        '''
        lines = []
        for idx, tool in enumerate(tool_library, start=1):
            if include_metadata:
                lines.append(f"{idx}. Tool name: {tool}\n Tool details: {tool_library[tool]['metadata']}\n")
            else:
                lines.append(f"{idx}. Tool name: {tool}\n")
        return "".join(lines)

    def _build_orchestrator_prompt(self, execution_logs=None)->str:
        '''Prompt asking the orchestrator agent to pick the next tool.'''
        available_tools_str = self._numbered_tool_list(include_metadata=True)

        # NOTE(review): the raw execution history is currently not injected
        # into the prompt (see the commented-out line below); the judge's
        # detailed_context carried in self.execution_summary is used instead.
        execution_history = '\n'.join(execution_logs) if execution_logs else ""

        # getattr guard: loop() initialises execution_summary before the
        # first iteration, but this builder should not crash if called
        # standalone before loop() has run.
        execution_summary = getattr(self, 'execution_summary', None)

        prompt = (
            f'User Prompt: {self.payload.user_prompt}\n'
            f'Task: {self.dataset_overview.ml_task}\n'
            f'Business Understanding: {self.payload.requirement_analysis.business_understanding}\n'
            f'Requirements: {self.payload.requirement_analysis.model_response.technical_requirements}\n'
            f'Constraints: {self.payload.requirement_analysis.model_response.constraints}\n\n'
            # f'Previous Pre-Processing Steps Done: \n{execution_history}\n\n'
            f'Detailed Execution Summary: \n{execution_summary}\n\n'
            f'Tool Library: \n{available_tools_str}\n'
        )

        return prompt

    def _build_judging_prompt(self, execution_logs)->str:
        '''Prompt asking the judge agent whether the loop should stop.'''
        execution_history = '\n'.join(execution_logs)
        available_tools_str = self._numbered_tool_list(include_metadata=False)

        prompt = (
            f'User Task:\n {self.payload.user_prompt}\n'
            f'Business Understanding / Goals:\n {self.payload.requirement_analysis.business_understanding}\n'
            f'Execution History:\n {execution_history}'
            f'Available Tools:\n {available_tools_str}\n'
        )

        return prompt

    def _build_analyzer_prompt(self, llm_response, tool_executed)->str:
        '''Prompt asking the analyzer agent to summarise one tool run.'''
        # NOTE(review): messages[3] assumes a fixed message layout from the
        # agno runtime (system / user / tool-call / tool-result) — confirm
        # this index if the agent configuration changes.
        prompt = (
            f"Name of the function executed: {tool_executed['name']}\n"
            f"Function Details: {tool_executed['metadata']}"
            f'Function execution details:\n{llm_response.content.execution_details}\n'
            f'Function results:\n{llm_response.messages[3].content}'
        )
        return prompt

    def loop(self, verbose=False)->IterationLogs:
        '''Run the orchestrate → execute → analyze → judge cycle until the
        judge sets stop_loop, collecting one IterationDetails per pass.

        NOTE(review): the inner retry loops are unbounded; a persistently
        failing agent call will retry forever. Consider a max-attempt cap.
        '''
        logger.info(f"Starting EDA loop with available tools: {tool_library.keys()}", log_type="eda", console=verbose)

        execution_logs = []
        iteration_logs = []
        self.execution_summary = None
        iteration_count = 1

        # Each tool may rewrite the dataset; track the latest file produced.
        recent_data_stored_path = self.payload.dataset_path

        while True:

            logger.info(f"Running Iteration {iteration_count}", log_type="eda", console=verbose)

            '''==================== ORCHESTRATOR SEGMENT ===================='''

            orchestrator_prompt = self._build_orchestrator_prompt(execution_logs=execution_logs)

            try:
                orchestrator_agent_response: RunResponse = self.ag.agents['orchestrator_agent'].run(orchestrator_prompt)
            except Exception as e:
                logger.error(f"Failed to generate response from orchestator with error: {e}", log_type='eda', console=verbose)
                continue

            # A plain-string response means the model failed to produce the
            # structured schema; restart the iteration.
            if isinstance(orchestrator_agent_response.content, str):
                logger.error("Failed to fit orchestrator response to data model", log_type='eda', console=verbose)
                continue

            try:
                selected_tool_for_executioner = tool_library[orchestrator_agent_response.content.tool_name]
            except Exception as e:
                logger.error(f"Tool allocation failed with error: {e}", log_type='eda', console=verbose)
                continue

            logger.info(f"Executing tool: {orchestrator_agent_response.content.tool_name}. Justification: {orchestrator_agent_response.content.justification}", log_type="eda", console=verbose)

            '''==================== TOOL EXECUTION SEGMENT ===================='''

            target_features = ""
            if self.dataset_overview.dataset_summary.target_features:
                target_features = '\n'.join(self.dataset_overview.dataset_summary.target_features)

            while True:
                # Rebuild the executer agent with only the selected tool so
                # the model cannot call anything else.
                self.ag.build_agent(
                    agent_name="executer_agent",
                    agent_desc=executer_agent_desc,
                    agent_instructions=executer_agent_instructions,
                    agent_response_model=ExecuterAgentResponseSchema,
                    tools=[selected_tool_for_executioner['function']]
                )

                prompt = f"Read the data from path: '{recent_data_stored_path}'. Use this file path to execute the '{selected_tool_for_executioner}' tool. Target Features/Columns: {target_features}"

                try:
                    executor_agent_response: RunResponse = self.ag.agents['executer_agent'].run(prompt, stream=False)
                    # Accessing messages[3] validates the expected message
                    # layout before the response is trusted.
                    _ = executor_agent_response.messages[3].content

                    # Tools that rewrite the dataset report where they saved
                    # it; feed that path into the next iteration.
                    if executor_agent_response.content.output_file_path:
                        recent_data_stored_path = executor_agent_response.content.output_file_path

                    logger.info(f"Tool Executed successfully", log_type='eda', console=verbose)
                    break
                except Exception as e:
                    logger.error(f"Tool execution failed with error: {e}. Trying Again....", log_type='eda', console=verbose)

            '''==================== ANALYZER SEGMENT ===================='''

            while True:
                try:
                    logger.info("Generating analysis....", log_type='eda', console=verbose)

                    analyzer_prompt = self._build_analyzer_prompt(llm_response=executor_agent_response, tool_executed=selected_tool_for_executioner)
                    analyzer_agent_response: RunResponse = self.ag.agents['analyzer_agent'].run(analyzer_prompt, stream=False)

                    # A structured (non-string) response means schema fitting
                    # succeeded.
                    if not isinstance(analyzer_agent_response.content, str):
                        break

                except Exception as e:
                    logger.error(f"Failed to generate response from analyzer with error: {e}. Trying again.....", log_type='eda', console=verbose)

            execution_details = (
                f'Iteration / Step Number: {iteration_count}\n'
                f"Name of the function executed: {selected_tool_for_executioner['name']}\n"
                f"Function Details: {selected_tool_for_executioner['metadata']}"
                f'Function execution details:\n{executor_agent_response.content.execution_details}\n'
                f'Key Details: {analyzer_agent_response.content.key_insights}\n'
            )

            execution_logs.append(execution_details)

            '''==================== JUDGING SEGMENT ===================='''

            while True:
                try:
                    logger.info("Evaluating Iteration with a Judge....", log_type='eda', console=verbose)

                    judging_prompt = self._build_judging_prompt(execution_logs=execution_logs)
                    judging_agent_response: RunResponse = self.ag.agents['judging_agent'].run(judging_prompt, stream=False)

                    if not isinstance(judging_agent_response.content, str):
                        break

                except Exception as e:
                    logger.error(f"Failed to generate response from judge with error: {e}. Trying again.....", log_type='eda', console=verbose)

            '''==================== STORING ITERATION DETAILS SEGMENT ===================='''

            iteration_logs.append(IterationDetails(
                iteration_number=iteration_count,
                orchestrator_response=orchestrator_agent_response.content,
                executer_response=executor_agent_response.content,
                analyzer_response=analyzer_agent_response.content,
                judge_response=judging_agent_response.content
            ))

            iteration_count += 1

            logger.info(f"Stop Loop = {judging_agent_response.content.stop_loop}. Justification: {judging_agent_response.content.justification}", log_type='eda', console=verbose)

            if judging_agent_response.content.stop_loop:
                break
            # Feed the judge's narrative back into the next orchestrator prompt.
            self.execution_summary = judging_agent_response.content.detailed_context

        return IterationLogs(logs=iteration_logs)
|
240 |
+
|
241 |
+
|
242 |
+
|
243 |
+
|
244 |
+
|
245 |
+
|
246 |
+
|
247 |
+
|
248 |
+
|
249 |
+
|
250 |
+
|
251 |
+
|
252 |
+
|
253 |
+
|
254 |
+
|
255 |
+
|
256 |
+
|
src/app/pipelines/eda/tools/analysis_tools/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .univariate_analysis import univariate_analysis
|
2 |
+
from .bivariate_analysis import bivariate_analysis
|
3 |
+
from .multivariate_analysis import multivariate_analysis
|
src/app/pipelines/eda/tools/analysis_tools/bivariate_analysis.py
ADDED
@@ -0,0 +1,1028 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from scipy import stats
|
4 |
+
from sklearn.metrics import mutual_info_score
|
5 |
+
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import seaborn as sns
|
8 |
+
from scipy.stats import pointbiserialr
|
9 |
+
from agno.utils.log import logger
|
10 |
+
import json
|
11 |
+
|
12 |
+
def comprehensive_bivariate_analysis(df, visualize=False, output_dir="bivariate_plots"):
    """
    Performs comprehensive bivariate analysis on all pairs of variables in the dataframe.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataframe to analyze
    visualize : bool, default=False
        Whether to generate visualizations
    output_dir : str, default="bivariate_plots"
        Directory to save visualization plots if visualize=True

    Returns:
    --------
    dict
        A dictionary containing the results of all bivariate analyses and a summary
    """
    results = {
        'numerical_vs_numerical': {},
        'categorical_vs_categorical': {},
        'numerical_vs_categorical': {},
        'summary': {
            'strongest_correlations': [],
            'significant_category_associations': [],
            'significant_group_differences': [],
            'key_insights': []
        }
    }

    # Identify variable types. Use the generic 'number' selector so all
    # numeric dtypes (int32, float32, ...) are included — the previous
    # hard-coded ['int64', 'float64'] silently dropped them.
    numerical_features = df.select_dtypes(include='number').columns.tolist()
    categorical_features = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    # Reclassify low-cardinality numerical features as categorical
    for col in numerical_features.copy():
        if df[col].nunique() <= 10:  # Threshold for low cardinality
            categorical_features.append(col)
            numerical_features.remove(col)

    # 1. Numerical vs Numerical Analysis
    if len(numerical_features) >= 2:
        print("Starting Numerical-numerical analysis")
        results['numerical_vs_numerical'] = analyze_numerical_numerical(df, numerical_features, visualize, output_dir)

    # 2. Categorical vs Categorical Analysis
    if len(categorical_features) >= 2:
        print("Starting categorical-categorical analysis")
        results['categorical_vs_categorical'] = analyze_categorical_categorical(df, categorical_features, visualize, output_dir)

    # 3. Numerical vs Categorical Analysis
    if len(numerical_features) >= 1 and len(categorical_features) >= 1:
        print("Starting numerical-categorical analysis")
        results['numerical_vs_categorical'] = analyze_numerical_categorical(df, numerical_features, categorical_features, visualize=visualize, output_dir=output_dir)

    # Generate summary of findings
    print("Generating summaries")
    results['summary'] = generate_summary(results)

    return results
|
72 |
+
|
73 |
+
def analyze_numerical_numerical(df, numerical_features, visualize=False, output_dir="bivariate_plots"):
    """Analyze pairwise relationships between numerical features.

    Computes Pearson/Spearman/Kendall correlation matrices with p-values,
    the covariance matrix, best-fit curve candidates per pair, and a
    ranked list of the significant correlations (|r| >= 0.3).

    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe.
    numerical_features : list[str]
        Columns of df to treat as numerical.
    visualize : bool, default=False
        If True, heatmaps and scatter plots are written to output_dir.
    output_dir : str, default="bivariate_plots"

    Returns
    -------
    dict
        Keys: 'correlations', 'covariance', 'best_fit_relationships',
        'significant_correlations'. Empty sub-structures when fewer than
        two numerical features are given.
    """
    results = {
        'correlations': {
            'pearson': {},
            'spearman': {},
            'kendall': {}
        },
        'covariance': {},
        'best_fit_relationships': {},
        'significant_correlations': []
    }

    # Create output directory only when plots will actually be written
    if visualize:
        import os
        os.makedirs(output_dir, exist_ok=True)

    if len(numerical_features) > 1:
        # Pearson (linear), Spearman (monotonic), Kendall (ordinal)
        pearson_corr = df[numerical_features].corr(method='pearson')
        spearman_corr = df[numerical_features].corr(method='spearman')
        kendall_corr = df[numerical_features].corr(method='kendall')

        # BUGFIX: p-value matrices are initialised with 1.0, not 0.0. A
        # pair that never gets a computed p-value (fewer than 3 clean
        # points) must read as "not significant"; the previous zeros
        # default made every untested pair look maximally significant.
        pearson_p_values = pd.DataFrame(np.ones_like(pearson_corr),
                                        index=pearson_corr.index,
                                        columns=pearson_corr.columns)
        spearman_p_values = pd.DataFrame(np.ones_like(spearman_corr),
                                         index=spearman_corr.index,
                                         columns=spearman_corr.columns)

        if visualize:
            plt.figure(figsize=(12, 10))
            sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
            plt.title('Pearson Correlation Heatmap')
            plt.tight_layout()
            plt.savefig(f"{output_dir}/pearson_correlation_heatmap.png")
            plt.close()

            plt.figure(figsize=(12, 10))
            sns.heatmap(spearman_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
            plt.title('Spearman Correlation Heatmap')
            plt.tight_layout()
            plt.savefig(f"{output_dir}/spearman_correlation_heatmap.png")
            plt.close()

        for i in range(len(numerical_features)):
            for j in range(i+1, len(numerical_features)):
                x = df[numerical_features[i]].values
                y = df[numerical_features[j]].values

                # Drop rows where either value is NaN
                mask = ~(np.isnan(x) | np.isnan(y))
                x_clean = x[mask]
                y_clean = y[mask]

                if len(x_clean) > 2:  # Need at least 3 points for correlation
                    # Pearson correlation and p-value (stored symmetrically)
                    r_pearson, p_pearson = stats.pearsonr(x_clean, y_clean)
                    pearson_p_values.loc[numerical_features[i], numerical_features[j]] = p_pearson
                    pearson_p_values.loc[numerical_features[j], numerical_features[i]] = p_pearson

                    # Spearman correlation and p-value
                    r_spearman, p_spearman = stats.spearmanr(x_clean, y_clean)
                    spearman_p_values.loc[numerical_features[i], numerical_features[j]] = p_spearman
                    spearman_p_values.loc[numerical_features[j], numerical_features[i]] = p_spearman

                    # Fit curve candidates only with enough data points
                    if len(x_clean) >= 10:
                        best_fit_info = analyze_best_fit_relationship(x_clean, y_clean)
                        results['best_fit_relationships'][f"{numerical_features[i]}_vs_{numerical_features[j]}"] = best_fit_info

                    # Only plot the more significant relationships
                    if visualize and abs(r_pearson) >= 0.3:
                        plt.figure(figsize=(10, 6))
                        sns.regplot(x=x_clean, y=y_clean, line_kws={"color":"red"})
                        plt.xlabel(numerical_features[i])
                        plt.ylabel(numerical_features[j])
                        plt.title(f'{numerical_features[i]} vs {numerical_features[j]} (r={r_pearson:.3f}, p={p_pearson:.4f})')
                        plt.savefig(f"{output_dir}/{numerical_features[i]}_vs_{numerical_features[j]}_scatter.png")
                        plt.close()

        # Store results (kept inside the len>1 guard so a single-feature
        # call cannot reference undefined correlation matrices)
        results['correlations']['pearson'] = {
            'correlation_matrix': pearson_corr.to_dict(),
            'p_values': pearson_p_values.to_dict()
        }

        results['correlations']['spearman'] = {
            'correlation_matrix': spearman_corr.to_dict(),
            'p_values': spearman_p_values.to_dict()
        }

        results['correlations']['kendall'] = {
            'correlation_matrix': kendall_corr.to_dict()
        }

        # Covariance matrix
        covariance_matrix = df[numerical_features].cov()
        results['covariance'] = covariance_matrix.to_dict()

        # Collect pairs with at least a weak correlation (|r| >= 0.3)
        significant_pairs = []
        for i in range(len(numerical_features)):
            for j in range(i+1, len(numerical_features)):
                feat_i = numerical_features[i]
                feat_j = numerical_features[j]

                abs_r = abs(pearson_corr.loc[feat_i, feat_j])
                if abs_r >= 0.3:  # Lower threshold to capture more relationships
                    p_value = pearson_p_values.loc[feat_i, feat_j]
                    correlation_type = "positive" if pearson_corr.loc[feat_i, feat_j] > 0 else "negative"

                    if abs_r >= 0.8:
                        correlation_strength = "very strong"
                    elif abs_r >= 0.6:
                        correlation_strength = "strong"
                    elif abs_r >= 0.4:
                        correlation_strength = "moderate"
                    else:
                        correlation_strength = "weak"

                    # A large Pearson/Spearman gap hints at non-linearity
                    pearson_spearman_diff = abs(pearson_corr.loc[feat_i, feat_j] - spearman_corr.loc[feat_i, feat_j])
                    relationship_type = "linear"
                    if pearson_spearman_diff > 0.1:
                        relationship_type = "potentially non-linear"

                    significant_pairs.append({
                        'feature_1': feat_i,
                        'feature_2': feat_j,
                        'pearson_correlation': pearson_corr.loc[feat_i, feat_j],
                        'spearman_correlation': spearman_corr.loc[feat_i, feat_j],
                        'p_value': p_value,
                        'significant': p_value < 0.05,
                        'correlation_type': correlation_type,
                        'correlation_strength': correlation_strength,
                        'relationship_type': relationship_type,
                        'pearson_spearman_diff': pearson_spearman_diff
                    })

        # Rank by absolute correlation, strongest first
        significant_pairs.sort(key=lambda x: abs(x['pearson_correlation']), reverse=True)
        results['significant_correlations'] = significant_pairs

    return results
|
226 |
+
|
227 |
+
def analyze_best_fit_relationship(x, y):
|
228 |
+
"""Analyze which type of relationship (linear, polynomial, log, exponential) best fits the data"""
|
229 |
+
results = {}
|
230 |
+
|
231 |
+
# Check for non-positive values that would cause issues with log/exponential fits
|
232 |
+
x_min, y_min = np.min(x), np.min(y)
|
233 |
+
|
234 |
+
# Linear fit
|
235 |
+
try:
|
236 |
+
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
|
237 |
+
results['linear'] = {
|
238 |
+
'equation': f'y = {slope:.4f}x + {intercept:.4f}',
|
239 |
+
'r_squared': r_value**2,
|
240 |
+
'p_value': p_value
|
241 |
+
}
|
242 |
+
except:
|
243 |
+
results['linear'] = {'error': 'Failed to fit linear model'}
|
244 |
+
|
245 |
+
# Polynomial fit (degree 2)
|
246 |
+
try:
|
247 |
+
coeffs = np.polyfit(x, y, 2)
|
248 |
+
p = np.poly1d(coeffs)
|
249 |
+
# Calculate R-squared
|
250 |
+
yhat = p(x)
|
251 |
+
ybar = np.mean(y)
|
252 |
+
ssreg = np.sum((yhat - ybar)**2)
|
253 |
+
sstot = np.sum((y - ybar)**2)
|
254 |
+
r_squared = ssreg / sstot
|
255 |
+
|
256 |
+
results['polynomial'] = {
|
257 |
+
'equation': f'y = {coeffs[0]:.4f}x² + {coeffs[1]:.4f}x + {coeffs[2]:.4f}',
|
258 |
+
'r_squared': r_squared
|
259 |
+
}
|
260 |
+
except:
|
261 |
+
results['polynomial'] = {'error': 'Failed to fit polynomial model'}
|
262 |
+
|
263 |
+
# Logarithmic fit (if x > 0)
|
264 |
+
if x_min > 0:
|
265 |
+
try:
|
266 |
+
coeffs = np.polyfit(np.log(x), y, 1)
|
267 |
+
# Calculate R-squared
|
268 |
+
yhat = coeffs[0] * np.log(x) + coeffs[1]
|
269 |
+
ybar = np.mean(y)
|
270 |
+
ssreg = np.sum((yhat - ybar)**2)
|
271 |
+
sstot = np.sum((y - ybar)**2)
|
272 |
+
r_squared = ssreg / sstot
|
273 |
+
|
274 |
+
results['logarithmic'] = {
|
275 |
+
'equation': f'y = {coeffs[0]:.4f}ln(x) + {coeffs[1]:.4f}',
|
276 |
+
'r_squared': r_squared
|
277 |
+
}
|
278 |
+
except:
|
279 |
+
results['logarithmic'] = {'error': 'Failed to fit logarithmic model'}
|
280 |
+
|
281 |
+
# Exponential fit (if y > 0)
|
282 |
+
if y_min > 0:
|
283 |
+
try:
|
284 |
+
coeffs = np.polyfit(x, np.log(y), 1)
|
285 |
+
# Calculate R-squared
|
286 |
+
yhat = np.exp(coeffs[1]) * np.exp(coeffs[0] * x)
|
287 |
+
ybar = np.mean(y)
|
288 |
+
ssreg = np.sum((yhat - ybar)**2)
|
289 |
+
sstot = np.sum((y - ybar)**2)
|
290 |
+
r_squared = ssreg / sstot
|
291 |
+
|
292 |
+
results['exponential'] = {
|
293 |
+
'equation': f'y = {np.exp(coeffs[1]):.4f}e^({coeffs[0]:.4f}x)',
|
294 |
+
'r_squared': r_squared
|
295 |
+
}
|
296 |
+
except:
|
297 |
+
results['exponential'] = {'error': 'Failed to fit exponential model'}
|
298 |
+
|
299 |
+
# Find best fit model
|
300 |
+
best_fit = {'model': None, 'r_squared': -1}
|
301 |
+
for model_type in results:
|
302 |
+
if 'r_squared' in results[model_type] and results[model_type]['r_squared'] > best_fit['r_squared']:
|
303 |
+
best_fit = {'model': model_type, 'r_squared': results[model_type]['r_squared']}
|
304 |
+
|
305 |
+
results['best_fit'] = best_fit
|
306 |
+
|
307 |
+
return results
|
308 |
+
|
309 |
+
def analyze_categorical_categorical(df, categorical_features, visualize=False, output_dir="bivariate_plots"):
    """Analyze pairwise relationships between categorical features.

    For every unordered pair of categorical columns, builds a contingency
    table, runs a chi-square test of independence, and computes Cramer's V
    (plus the phi coefficient for 2x2 tables and Goodman-Kruskal lambda
    coefficients where they can be computed).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    categorical_features : list of str
        Names of the categorical columns to analyze pairwise.
    visualize : bool, default=False
        If True, save heatmaps of the contingency table and row proportions
        to ``output_dir`` for pairs with a reliable, significant chi-square.
    output_dir : str, default="bivariate_plots"
        Directory where plots are written when ``visualize`` is True.

    Returns
    -------
    dict
        Keys: 'chi_square_tests', 'cramers_v', 'contingency_tables',
        'phi_coefficients', 'lambda_coefficients', and
        'significant_associations' (sorted by Cramer's V, descending).
    """
    results = {
        'chi_square_tests': {},
        'cramers_v': {},
        'contingency_tables': {},
        'phi_coefficients': {},
        'lambda_coefficients': {},
        'significant_associations': []
    }

    # Create output directory if it doesn't exist and visualization is enabled
    if visualize:
        import os
        os.makedirs(output_dir, exist_ok=True)

    # Chi-square tests & Cramer's V for every unordered feature pair
    for i in range(len(categorical_features)):
        for j in range(i+1, len(categorical_features)):
            feat_i = categorical_features[i]
            feat_j = categorical_features[j]

            # Get clean data (remove NaNs)
            data_ij = df[[feat_i, feat_j]].dropna()

            # Skip if either feature has only one category after dropping NAs
            if data_ij[feat_i].nunique() <= 1 or data_ij[feat_j].nunique() <= 1:
                continue

            # Create contingency table
            contingency_table = pd.crosstab(data_ij[feat_i], data_ij[feat_j])
            results['contingency_tables'][f"{feat_i}_vs_{feat_j}"] = contingency_table.to_dict()

            # Chi-square test of independence.
            # Rule of thumb: 80% of cells should have expected frequencies >= 5
            chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
            expected_array = np.array(expected)

            # Calculate percentage of cells with expected frequency >= 5
            valid_expected_percentage = np.sum(expected_array >= 5) / expected_array.size

            # Calculate Cramer's V (measure of association)
            n = contingency_table.sum().sum()
            min_dim = min(contingency_table.shape) - 1
            if min_dim > 0:  # Avoid division by zero
                cramers_v = np.sqrt(chi2 / (n * min_dim))
            else:
                cramers_v = np.nan

            results['chi_square_tests'][f"{feat_i}_vs_{feat_j}"] = {
                'chi2': chi2,
                'p_value': p,
                'degrees_of_freedom': dof,
                'valid_expected_percentage': valid_expected_percentage,
                'reliable_test': valid_expected_percentage >= 0.8  # Generally accepted threshold
            }

            results['cramers_v'][f"{feat_i}_vs_{feat_j}"] = cramers_v

            # For 2x2 contingency tables, calculate Phi coefficient
            if contingency_table.shape == (2, 2):
                phi_coef = np.sqrt(chi2 / n)
                results['phi_coefficients'][f"{feat_i}_vs_{feat_j}"] = phi_coef

            # Calculate Lambda coefficient (asymmetric measure of association).
            # Best-effort: any failure simply leaves the entry absent.
            try:
                # Lambda for predicting feat_j from feat_i
                lambda_ij = calculate_lambda(contingency_table)
                # Lambda for predicting feat_i from feat_j
                lambda_ji = calculate_lambda(contingency_table.T)

                results['lambda_coefficients'][f"{feat_i}_vs_{feat_j}"] = {
                    'lambda_ij': lambda_ij,  # For predicting j from i
                    'lambda_ji': lambda_ji   # For predicting i from j
                }
            except Exception:
                pass

            # Visualize contingency table as heatmap if enabled
            if visualize and valid_expected_percentage >= 0.8 and p < 0.05:
                plt.figure(figsize=(10, 8))
                sns.heatmap(contingency_table, annot=True, fmt='d', cmap='YlGnBu')
                plt.title(f'{feat_i} vs {feat_j} (Cramer\'s V={cramers_v:.3f}, p={p:.4f})')
                plt.savefig(f"{output_dir}/{feat_i}_vs_{feat_j}_contingency.png")
                plt.close()

                # Also plot normalized contingency table (row proportions).
                # Computed here (not unconditionally) since it is only used
                # for this plot.
                row_proportions = contingency_table.div(contingency_table.sum(axis=1), axis=0)
                plt.figure(figsize=(10, 8))
                sns.heatmap(row_proportions, annot=True, fmt='.2f', cmap='YlGnBu')
                plt.title(f'{feat_i} vs {feat_j} (Row Proportions)')
                plt.savefig(f"{output_dir}/{feat_i}_vs_{feat_j}_row_proportions.png")
                plt.close()

            # Record the pair only when significant AND the chi-square
            # assumptions are reasonably met.
            if p < 0.05 and valid_expected_percentage >= 0.8:
                if cramers_v >= 0.5:
                    association_strength = "strong"
                elif cramers_v >= 0.3:
                    association_strength = "moderate"
                elif cramers_v >= 0.1:
                    association_strength = "weak"
                else:
                    association_strength = "very weak"

                # Attach lambda coefficients if they were computed above
                lambda_info = {}
                if f"{feat_i}_vs_{feat_j}" in results['lambda_coefficients']:
                    lambda_info = results['lambda_coefficients'][f"{feat_i}_vs_{feat_j}"]

                # Attach phi coefficient if it was computed (2x2 tables only)
                phi_coef = None
                if f"{feat_i}_vs_{feat_j}" in results['phi_coefficients']:
                    phi_coef = results['phi_coefficients'][f"{feat_i}_vs_{feat_j}"]

                results['significant_associations'].append({
                    'feature_1': feat_i,
                    'feature_2': feat_j,
                    'chi_square': chi2,
                    'p_value': p,
                    'cramers_v': cramers_v,
                    'association_strength': association_strength,
                    'phi_coefficient': phi_coef,
                    'lambda_coefficients': lambda_info
                })

    # Sort by Cramer's V so the strongest associations come first
    results['significant_associations'].sort(key=lambda x: x['cramers_v'], reverse=True)

    return results
def calculate_lambda(contingency_table):
    """
    Calculate Goodman and Kruskal's lambda.

    Lambda is the proportional reduction in prediction error when the row
    variable is used to predict the column variable, compared with always
    guessing the overall modal column.
    """
    table = np.array(contingency_table)

    # Total number of observations in the table.
    total = np.sum(table)

    # Correct predictions when the row is known: each row's modal cell.
    within_row_modes = np.sum(np.max(table, axis=1))

    # Baseline correct predictions: always guess the overall modal column.
    column_totals = np.sum(table, axis=0)
    baseline_mode = np.max(column_totals)

    # Degenerate case: every observation already falls in the modal column,
    # so knowing the row cannot reduce the error.
    if baseline_mode == total:
        return 0

    return (within_row_modes - baseline_mode) / (total - baseline_mode)
def analyze_numerical_categorical(df, numerical_features, categorical_features, max_categories=30, sample_size=None, skip_tukey=False, visualize=False, output_dir="bivariate_plots"):
    """
    Optimized analysis of relationships between numerical and categorical features

    For every (numerical, categorical) pair: per-group descriptive statistics,
    one-way ANOVA with eta-/omega-squared effect sizes, Levene's variance test
    (with a Welch-style follow-up when variances differ), Kruskal-Wallis, and
    optional Tukey/Dunn post-hoc tests. Mutual information is computed only for
    pairs already found significant.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataframe
    numerical_features : list
        List of numerical feature names
    categorical_features : list
        List of categorical feature names
    max_categories : int, default=30
        Skip categorical features with more unique values than this
    sample_size : int or None, default=None
        If provided, analyze a random sample of this size for large datasets
    skip_tukey : bool, default=False
        Skip Tukey's test which can be computationally expensive
    visualize : bool, default=False
        Whether to generate visualizations
    output_dir : str, default="bivariate_plots"
        Directory to save visualization plots if visualize=True

    Returns:
    --------
    dict
        Results of numerical-categorical analysis under keys 'anova_tests',
        'effect_sizes', 'group_statistics', 'point_biserial_correlations',
        'significant_differences' (sorted by eta-squared, descending), and
        'mutual_information' (only when at least one score was computed).
    """
    results = {
        'anova_tests': {},
        'effect_sizes': {},
        'group_statistics': {},
        'point_biserial_correlations': {},
        'significant_differences': []
    }

    # Create output directory if it doesn't exist and visualization is enabled
    if visualize:
        import os
        os.makedirs(output_dir, exist_ok=True)

    # Performance optimization: Use a sample for large datasets
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.sample(sample_size, random_state=42)
    else:
        df_sample = df

    # Pre-compute feature cardinality to avoid repeated calculations
    cat_feature_cardinality = {feat: df_sample[feat].nunique() for feat in categorical_features}

    for num_feat in numerical_features:
        for cat_feat in categorical_features:
            # Skip high cardinality categorical features
            if cat_feature_cardinality[cat_feat] > max_categories:
                continue

            # Get clean data (remove NaNs)
            data = df_sample[[num_feat, cat_feat]].dropna()

            # Skip if categorical feature has only one category after dropping NAs
            if data[cat_feat].nunique() <= 1:
                continue

            # Calculate group statistics more efficiently using agg
            group_stats = data.groupby(cat_feat)[num_feat].agg(['count', 'mean', 'std', 'min', 'max', 'median'])

            # Calculate IQR and outlier bounds for each group
            group_stats['q1'] = data.groupby(cat_feat)[num_feat].quantile(0.25)
            group_stats['q3'] = data.groupby(cat_feat)[num_feat].quantile(0.75)
            group_stats['iqr'] = group_stats['q3'] - group_stats['q1']
            group_stats['lower_bound'] = group_stats['q1'] - 1.5 * group_stats['iqr']
            group_stats['upper_bound'] = group_stats['q3'] + 1.5 * group_stats['iqr']

            # Convert to dictionary with simplified structure to save memory
            results['group_statistics'][f"{num_feat}_by_{cat_feat}"] = {
                'counts': group_stats['count'].to_dict(),
                'means': group_stats['mean'].to_dict(),
                'stds': group_stats['std'].to_dict(),
                'medians': group_stats['median'].to_dict(),
                'q1': group_stats['q1'].to_dict(),
                'q3': group_stats['q3'].to_dict()
            }

            # Calculate point-biserial correlation for binary categorical variables
            if data[cat_feat].nunique() == 2:
                try:
                    # For binary categories, use point-biserial correlation
                    # Need to convert categorical to numeric first
                    # NOTE(review): the 0/1 mapping follows unique() order, which
                    # is data-dependent — only the magnitude of the correlation
                    # is stable, the sign depends on row order.
                    cat_values = data[cat_feat].unique()
                    binary_map = {cat_values[0]: 0, cat_values[1]: 1}
                    data_numeric = data.copy()
                    data_numeric[cat_feat] = data_numeric[cat_feat].map(binary_map)

                    # Calculate point-biserial correlation
                    # (pointbiserialr is expected to be imported at module level)
                    corr, p_value = pointbiserialr(data_numeric[cat_feat], data_numeric[num_feat])

                    results['point_biserial_correlations'][f"{num_feat}_by_{cat_feat}"] = {
                        'correlation': corr,
                        'p_value': p_value,
                        'categorical_mapping': binary_map
                    }
                except Exception as e:
                    pass

            # Get groups for ANOVA more efficiently
            groups = [data[data[cat_feat] == cat][num_feat].values
                      for cat in data[cat_feat].unique()
                      if len(data[data[cat_feat] == cat]) > 0]

            group_labels = [cat for cat in data[cat_feat].unique()
                            if len(data[data[cat_feat] == cat]) > 0]

            # Create visualizations if enabled
            if visualize:
                # Create box plot
                plt.figure(figsize=(12, 6))
                sns.boxplot(x=cat_feat, y=num_feat, data=data)
                plt.title(f'{num_feat} by {cat_feat}')
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.savefig(f"{output_dir}/{num_feat}_by_{cat_feat}_boxplot.png")
                plt.close()

                # Create violin plot for more detail on distributions
                plt.figure(figsize=(12, 6))
                sns.violinplot(x=cat_feat, y=num_feat, data=data)
                plt.title(f'{num_feat} by {cat_feat} (Distribution)')
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.savefig(f"{output_dir}/{num_feat}_by_{cat_feat}_violinplot.png")
                plt.close()

            # Run ANOVA if we have at least 2 groups
            if len(groups) >= 2:
                try:
                    # One-way ANOVA
                    f_stat, p_value = stats.f_oneway(*groups)

                    # Calculate effect size (eta-squared) more efficiently
                    grand_mean = data[num_feat].mean()

                    # Vectorized computation of SS between
                    group_means = np.array([group.mean() for group in groups])
                    group_sizes = np.array([len(group) for group in groups])
                    ss_between = np.sum(group_sizes * (group_means - grand_mean)**2)

                    # Vectorized computation of SS total
                    ss_total = np.sum((data[num_feat].values - grand_mean)**2)

                    eta_squared = ss_between / ss_total if ss_total != 0 else 0

                    # Calculate omega-squared (less biased estimate of effect size than eta-squared)
                    k = len(groups)  # Number of groups
                    n = len(data)  # Total sample size
                    df_between = k - 1
                    df_within = n - k
                    ms_between = ss_between / df_between if df_between > 0 else 0
                    ss_within = ss_total - ss_between
                    ms_within = ss_within / df_within if df_within > 0 else 0

                    omega_squared = (ss_between - (df_between * ms_within)) / (ss_total + ms_within) if (ss_total + ms_within) != 0 else 0
                    omega_squared = max(0, omega_squared)  # Ensure non-negative

                    results['anova_tests'][f"{num_feat}_by_{cat_feat}"] = {
                        'f_statistic': float(f_stat),  # Convert to native Python types to reduce memory
                        'p_value': float(p_value),
                        'significant': p_value < 0.05,
                        'degrees_of_freedom_between': df_between,
                        'degrees_of_freedom_within': df_within,
                        'ss_between': float(ss_between),
                        'ss_within': float(ss_within),
                        'ss_total': float(ss_total)
                    }

                    results['effect_sizes'][f"{num_feat}_by_{cat_feat}"] = {
                        'eta_squared': float(eta_squared),
                        'omega_squared': float(omega_squared)
                    }

                    # Calculate Levene's test for homogeneity of variances
                    try:
                        levene_stat, levene_p = stats.levene(*groups)
                        results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["levene_test"] = {
                            'statistic': float(levene_stat),
                            'p_value': float(levene_p),
                            'equal_variances': levene_p >= 0.05
                        }

                        # If variances are not equal, calculate Welch's ANOVA
                        # NOTE(review): this computes a standard OLS ANOVA via
                        # statsmodels, not a true Welch correction — confirm.
                        if levene_p < 0.05:
                            try:
                                from scipy.stats import f_oneway
                                import statsmodels.api as sm
                                from statsmodels.formula.api import ols

                                # Create a new dataframe for Welch's test
                                welch_data = pd.DataFrame({
                                    'value': data[num_feat],
                                    'group': data[cat_feat]
                                })

                                # Fit the model
                                model = ols('value ~ C(group)', data=welch_data).fit()

                                # Perform Welch's ANOVA
                                welch_table = sm.stats.anova_lm(model, typ=2)

                                # Extract statistics
                                welch_f = welch_table.loc['C(group)', 'F']
                                welch_p = welch_table.loc['C(group)', 'PR(>F)']

                                results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["welch_anova"] = {
                                    'f_statistic': float(welch_f),
                                    'p_value': float(welch_p),
                                    'significant': welch_p < 0.05
                                }
                            except:
                                # If Welch's ANOVA fails, skip it
                                pass
                    except:
                        # If Levene's test fails, skip it
                        pass

                    # If ANOVA is significant, perform post-hoc Tukey's test (optional)
                    if p_value < 0.05 and len(groups) > 2 and not skip_tukey:
                        # Prepare data for Tukey's test
                        all_data = np.concatenate(groups)
                        group_labels_for_tukey = np.repeat(group_labels, [len(group) for group in groups])

                        # Perform Tukey's test (computationally expensive)
                        try:
                            # pairwise_tukeyhsd is expected to be imported at
                            # module level (statsmodels).
                            tukey_results = pairwise_tukeyhsd(all_data, group_labels_for_tukey)
                            tukey_summary = pd.DataFrame(data=tukey_results._results_table.data[1:],
                                                         columns=tukey_results._results_table.data[0])

                            # Store only significant pairs to save memory
                            significant_pairs = tukey_summary[tukey_summary['p-adj'] < 0.05]
                            significant_pairs_dict = significant_pairs.to_dict('records') if not significant_pairs.empty else []

                            results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["tukey_posthoc"] = {
                                'significant_pairs': significant_pairs_dict
                            }

                            # Visualize Tukey's test results if enabled
                            if visualize and not significant_pairs.empty:
                                plt.figure(figsize=(12, len(significant_pairs) * 0.5 + 2))
                                significant_pairs_plot = [(f"{row['group1']} vs {row['group2']}",
                                                           row['meandiff'],
                                                           row['lower'],
                                                           row['upper'])
                                                          for _, row in significant_pairs.iterrows()]

                                # Sort by mean difference
                                significant_pairs_plot.sort(key=lambda x: x[1])

                                # Plot each pair's confidence interval and mean difference
                                for i, (pair, diff, lower, upper) in enumerate(significant_pairs_plot):
                                    plt.plot([lower, upper], [i, i], 'b-')
                                    plt.plot([diff], [i], 'bo')

                                plt.axvline(x=0, color='r', linestyle='--')
                                plt.yticks(range(len(significant_pairs_plot)), [pair for pair, _, _, _ in significant_pairs_plot])
                                plt.xlabel('Mean Difference')
                                plt.title(f'Tukey\'s HSD: Significant Differences in {num_feat} by {cat_feat}')
                                plt.tight_layout()
                                plt.savefig(f"{output_dir}/{num_feat}_by_{cat_feat}_tukey.png")
                                plt.close()
                        except Exception as e:
                            pass

                    # Calculate Kruskal-Wallis test (non-parametric alternative to ANOVA)
                    try:
                        h_stat, kw_p_value = stats.kruskal(*groups)
                        results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["kruskal_wallis"] = {
                            'h_statistic': float(h_stat),
                            'p_value': float(kw_p_value),
                            'significant': kw_p_value < 0.05
                        }

                        # If Kruskal-Wallis is significant, perform post-hoc Dunn's test
                        if kw_p_value < 0.05 and len(groups) > 2 and not skip_tukey:
                            try:
                                from scikit_posthocs import posthoc_dunn

                                # Create a new dataframe for Dunn's test
                                dunn_data = pd.DataFrame({
                                    'value': data[num_feat],
                                    'group': data[cat_feat]
                                })

                                # Perform Dunn's test
                                dunn_results = posthoc_dunn(dunn_data, val_col='value', group_col='group', p_adjust='bonferroni')

                                # Store results
                                results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["dunn_posthoc"] = {
                                    'p_values': dunn_results.to_dict()
                                }
                            except:
                                # If Dunn's test fails, skip it
                                pass
                    except:
                        # If Kruskal-Wallis test fails, skip it
                        pass

                    # Add to significant differences only if significant with reasonable effect size
                    if p_value < 0.05 and eta_squared >= 0.01:  # Filter out very small effects
                        effect_size_category = ""
                        if eta_squared >= 0.14:
                            effect_size_category = "strong"
                        elif eta_squared >= 0.06:
                            effect_size_category = "moderate"
                        else:
                            effect_size_category = "weak"

                        # Calculate group means more efficiently
                        group_means = {group: float(data[data[cat_feat] == group][num_feat].mean())
                                       for group in data[cat_feat].unique()}

                        # Get point-biserial correlation if available
                        point_biserial_info = {}
                        if f"{num_feat}_by_{cat_feat}" in results['point_biserial_correlations']:
                            point_biserial_info = results['point_biserial_correlations'][f"{num_feat}_by_{cat_feat}"]

                        results['significant_differences'].append({
                            'numerical_feature': num_feat,
                            'categorical_feature': cat_feat,
                            'f_statistic': float(f_stat),
                            'anova_p_value': float(p_value),
                            'eta_squared': float(eta_squared),
                            'omega_squared': float(omega_squared),
                            'effect_size_category': effect_size_category,
                            'group_means': group_means,
                            'point_biserial_correlation': point_biserial_info if point_biserial_info else None,
                            'equal_variances': results['anova_tests'][f"{num_feat}_by_{cat_feat}"].get("levene_test", {}).get("equal_variances", None)
                        })

                except Exception as e:
                    # If tests fail, just continue to next pair
                    continue

    # Calculate mutual information for only significant pairs found above
    # This reduces unnecessary calculations
    if results['significant_differences']:
        mutual_info = {}
        sig_pairs = [(d['numerical_feature'], d['categorical_feature']) for d in results['significant_differences']]

        for num_feat, cat_feat in sig_pairs:
            # Get clean data
            data = df_sample[[num_feat, cat_feat]].dropna()

            if data.empty or data[cat_feat].nunique() <= 1:
                continue

            # Discretize numerical feature (needed for mutual information)
            try:
                # Use quantiles to discretize into fewer bins for efficiency
                num_bins = min(5, data[num_feat].nunique())  # Reduced from 10 to 5
                if num_bins > 1:
                    data['num_binned'] = pd.qcut(data[num_feat], num_bins, duplicates='drop')

                    # Calculate mutual information
                    # (mutual_info_score is expected to be imported at module level)
                    mi = mutual_info_score(
                        data['num_binned'].astype(str).values,
                        data[cat_feat].astype(str).values
                    )
                    mutual_info[f"{num_feat}_vs_{cat_feat}"] = float(mi)
            except Exception as e:
                pass

        # Only add mutual_information if we calculated something
        if mutual_info:
            results['mutual_information'] = mutual_info

    # Sort significant differences by effect size
    results['significant_differences'].sort(key=lambda x: x['eta_squared'], reverse=True)

    return results
def generate_summary(results):
    """Generate a summary of key findings from bivariate analysis.

    Parameters
    ----------
    results : dict
        Output of the bivariate pipeline, possibly containing the sections
        'numerical_vs_numerical', 'categorical_vs_categorical', and
        'numerical_vs_categorical'. Missing sections are simply skipped.

    Returns
    -------
    dict
        Keys: 'strongest_correlations', 'significant_category_associations',
        'significant_group_differences' (each capped to the top 5), and
        'key_insights' (human-readable sentences).
    """
    summary = {
        'strongest_correlations': [],
        'significant_category_associations': [],
        'significant_group_differences': [],
        'key_insights': []
    }

    # Extract strongest numerical correlations (top 5)
    if 'numerical_vs_numerical' in results and 'significant_correlations' in results['numerical_vs_numerical']:
        for corr in results['numerical_vs_numerical']['significant_correlations'][:5]:
            summary['strongest_correlations'].append({
                'features': f"{corr['feature_1']} and {corr['feature_2']}",
                'correlation': corr['pearson_correlation'],
                'type': corr['correlation_type'],
                'strength': corr['correlation_strength'],
                'p_value': corr['p_value'],
                'relationship_type': corr.get('relationship_type', 'linear')
            })

    # Extract strongest categorical associations (top 5)
    if 'categorical_vs_categorical' in results and 'significant_associations' in results['categorical_vs_categorical']:
        for assoc in results['categorical_vs_categorical']['significant_associations'][:5]:
            summary['significant_category_associations'].append({
                'features': f"{assoc['feature_1']} and {assoc['feature_2']}",
                'cramer_v': assoc['cramers_v'],
                'strength': assoc['association_strength'],
                'p_value': assoc['p_value'],
                'phi_coefficient': assoc.get('phi_coefficient', None)
            })

    # Extract most significant group differences (top 5)
    if 'numerical_vs_categorical' in results and 'significant_differences' in results['numerical_vs_categorical']:
        for diff in results['numerical_vs_categorical']['significant_differences'][:5]:
            summary['significant_group_differences'].append({
                'numerical': diff['numerical_feature'],
                'categorical': diff['categorical_feature'],
                'eta_squared': diff['eta_squared'],
                'omega_squared': diff.get('omega_squared', None),
                'effect_size': diff['effect_size_category'],
                'anova_p_value': diff['anova_p_value'],
                'equal_variances': diff.get('equal_variances', None)
            })

    # Generate key insights
    insights = []

    # Insight from numerical correlations
    if summary['strongest_correlations']:
        top_corr = summary['strongest_correlations'][0]
        insights.append(f"The strongest numerical relationship is between {top_corr['features']} with a {top_corr['strength']} "
                        f"{top_corr['type']} correlation of {top_corr['correlation']:.3f} (p={top_corr['p_value']:.4f}).")

        # Additional insight on relationship type
        if top_corr.get('relationship_type') == 'potentially non-linear':
            insights.append(f"The relationship between {top_corr['features']} appears to be non-linear, "
                            f"as indicated by the difference between Pearson and Spearman correlations.")

    # Insight from categorical associations
    if summary['significant_category_associations']:
        top_assoc = summary['significant_category_associations'][0]
        insights.append(f"The strongest association between categorical variables is between {top_assoc['features']} "
                        f"with a {top_assoc['strength']} relationship (Cramer's V={top_assoc['cramer_v']:.3f}, p={top_assoc['p_value']:.4f}).")

    # Insight from group differences
    if summary['significant_group_differences']:
        top_diff = summary['significant_group_differences'][0]
        insights.append(f"The categorical variable {top_diff['categorical']} has a {top_diff['effect_size']} effect "
                        f"on {top_diff['numerical']} (η²={top_diff['eta_squared']:.3f}, p={top_diff['anova_p_value']:.4f}).")

        # Add insight about equal variances assumption
        if top_diff.get('equal_variances') is not None:
            if top_diff['equal_variances']:
                insights.append(f"The equal variances assumption is met for {top_diff['numerical']} across {top_diff['categorical']} groups, "
                                f"supporting the validity of the ANOVA results.")
            else:
                insights.append(f"The equal variances assumption is violated for {top_diff['numerical']} across {top_diff['categorical']} groups, "
                                f"suggesting Welch's ANOVA may be more appropriate.")

    # Check for data quality issues
    data_quality_issues = []

    # Check for potentially misleading relationships (multicollinearity)
    if 'numerical_vs_numerical' in results and 'significant_correlations' in results['numerical_vs_numerical']:
        for corr in results['numerical_vs_numerical']['significant_correlations']:
            if corr['pearson_correlation'] > 0.9 or corr['pearson_correlation'] < -0.9:
                data_quality_issues.append(f"The very strong correlation between {corr['feature_1']} and {corr['feature_2']} "
                                           f"may indicate multicollinearity issues in predictive modeling.")

    # Check for imbalanced categorical variables.
    # The loop variable is deliberately NOT named `stats` so it cannot shadow
    # the module-level scipy.stats import used elsewhere in this file.
    if 'numerical_vs_categorical' in results and 'group_statistics' in results['numerical_vs_categorical']:
        for key, grp_stats in results['numerical_vs_categorical']['group_statistics'].items():
            if 'counts' in grp_stats:
                counts = list(grp_stats['counts'].values())
                # Guard min(counts) > 0 to avoid ZeroDivisionError on empty groups.
                # Imbalanced if one group is 10x larger than the smallest.
                if counts and min(counts) > 0 and max(counts) / min(counts) > 10:
                    features = key.split('_by_')
                    if len(features) == 2:
                        data_quality_issues.append(f"The categorical variable {features[1]} has highly imbalanced groups "
                                                   f"which may affect the reliability of {features[0]} analysis.")

    # Add data quality issues as insights (limit to top 2 issues)
    for issue in data_quality_issues[:2]:
        insights.append(issue)

    # Add final observations
    if 'numerical_vs_numerical' in results:
        # Check for non-linear relationships
        non_linear_count = 0
        if 'significant_correlations' in results['numerical_vs_numerical']:
            for corr in results['numerical_vs_numerical']['significant_correlations']:
                if corr.get('relationship_type') == 'potentially non-linear':
                    non_linear_count += 1

        if non_linear_count > 0:
            insights.append(f"Found {non_linear_count} potentially non-linear relationships among numerical variables, "
                            f"suggesting that linear models may not fully capture the complexity of the data.")

    # Cross-analysis insights
    has_num_num = 'numerical_vs_numerical' in results and 'significant_correlations' in results['numerical_vs_numerical']
    has_cat_cat = 'categorical_vs_categorical' in results and 'significant_associations' in results['categorical_vs_categorical']
    has_num_cat = 'numerical_vs_categorical' in results and 'significant_differences' in results['numerical_vs_categorical']

    if has_num_num and has_num_cat:
        insights.append("Both numerical correlations and categorical group differences were detected, "
                        "suggesting a mix of continuous and segmented relationships in the data.")

    # Add mention of mutual information if present
    if 'numerical_vs_categorical' in results and 'mutual_information' in results['numerical_vs_categorical']:
        mutual_info = results['numerical_vs_categorical']['mutual_information']
        if mutual_info:
            # Find highest mutual information score
            top_mi_pair = max(mutual_info.items(), key=lambda x: x[1])
            features = top_mi_pair[0].split('_vs_')
            if len(features) == 2:
                insights.append(f"The strongest general statistical dependency (mutual information) was found between "
                                f"{features[0]} and {features[1]} with score {top_mi_pair[1]:.3f}, "
                                f"capturing both linear and non-linear relationships.")

    # Overall data structure insight
    if has_num_num and has_cat_cat and has_num_cat:
        insights.append("The data shows a complex structure with significant relationships across all variable types, "
                        "suggesting potential for both feature engineering and dimensionality reduction.")

    # Add insights to summary
    summary['key_insights'] = insights

    return summary
def bivariate_analysis(data_path: str):
    """
    Perform a comprehensive bivariate analysis on a dataset.

    Args:
        data_path: Path to a CSV file, or a pandas DataFrame to analyze directly.
    Returns:
        A JSON string summarising the analysis on success, or a dictionary
        with an "error" key on failure.
    """
    try:
        if isinstance(data_path, str):
            data = pd.read_csv(data_path)
        elif isinstance(data_path, pd.DataFrame):
            # The docstring promises DataFrame support; analyze it directly
            # instead of rejecting it as an unsupported format.
            data = data_path
        else:
            logger.error(f"Unsupported file format: {data_path}")
            return {"error": f"Unsupported file format: {data_path}"}

        if not isinstance(data, pd.DataFrame):
            logger.error("Input is not a valid pandas DataFrame")
            return {"error": "Input is not a valid pandas DataFrame"}

        # logger.warn is a deprecated alias; the rest of the module uses warning().
        logger.warning("Starting Analysis.....")

        results = comprehensive_bivariate_analysis(df=data)
        summary = generate_summary(results=results)

        # numpy scalars are not JSON-serializable; unwrap them via .item(),
        # and fall back to str() for anything else json can't handle.
        serialized_summary = json.dumps(summary, default=lambda o: o.item() if isinstance(o, np.generic) else str(o))

        return serialized_summary

    except Exception as e:
        logger.error(f"Error in bivariate_analysis: {str(e)}")
        return {"error": str(e)}
|
src/app/pipelines/eda/tools/analysis_tools/multivariate_analysis.py
ADDED
@@ -0,0 +1,1039 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from scipy import stats
|
4 |
+
import statsmodels.api as sm
|
5 |
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
6 |
+
from sklearn.preprocessing import StandardScaler
|
7 |
+
from sklearn.decomposition import PCA, FactorAnalysis
|
8 |
+
from sklearn.manifold import TSNE, MDS
|
9 |
+
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
|
10 |
+
from sklearn.mixture import GaussianMixture
|
11 |
+
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
|
12 |
+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
13 |
+
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression, f_classif, f_regression, chi2
|
14 |
+
from sklearn.feature_selection import SelectKBest, RFE
|
15 |
+
from sklearn.neighbors import LocalOutlierFactor
|
16 |
+
from sklearn.ensemble import IsolationForest
|
17 |
+
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
18 |
+
from sklearn.model_selection import cross_val_score
|
19 |
+
from sklearn.feature_selection import VarianceThreshold
|
20 |
+
from agno.utils.log import logger
|
21 |
+
import warnings
|
22 |
+
import json
|
23 |
+
|
24 |
+
warnings.filterwarnings("ignore")
|
25 |
+
|
26 |
+
def preprocess_dataframe(df, target_column=None, categorical_columns=None, verbose=True):
    """
    Preprocess a dataframe by identifying categorical columns, handling missing values,
    and separating the target variable if specified.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe to preprocess (never modified; a copy is taken)
    target_column : str, optional
        The name of the target variable column (if any)
    categorical_columns : list, optional
        List of categorical column names (will be auto-detected if None)
    verbose : bool, default=True
        Whether to print detailed messages during preprocessing

    Returns:
    --------
    dict
        A dictionary containing the preprocessed data and metadata:
        'data', 'numerical_data', 'scaled_data', 'target', the detected
        column lists, missing-value statistics, and (when numerical columns
        exist) the fitted scaler parameters under 'preprocessing'.
    """
    # Work on a copy so the caller's dataframe is never mutated.
    data = df.copy()

    # Accumulates preprocessing metadata returned to the caller.
    results = {}

    # Separate target if specified
    target = None

    if target_column is not None and target_column in data.columns:
        target = data[target_column].copy()
        data = data.drop(columns=[target_column])
        target_is_numeric = pd.api.types.is_numeric_dtype(target)
        results['target_column'] = target_column
        results['target_is_numeric'] = target_is_numeric
        if verbose:
            logger.warning(f"Target variable '{target_column}' detected as {'numeric' if target_is_numeric else 'categorical'}")

    # Identify categorical and numerical columns.
    # Note: low-cardinality columns (< 10 distinct values, NaN included)
    # are treated as categorical even when their dtype is numeric.
    if categorical_columns is None:
        categorical_columns = []
        for col in data.columns:
            # pd.api.types.is_categorical_dtype is deprecated; check the dtype directly.
            is_categorical = isinstance(data[col].dtype, pd.CategoricalDtype)
            if pd.api.types.is_object_dtype(data[col]) or is_categorical or data[col].nunique(dropna=False) < 10:
                categorical_columns.append(col)

    numerical_columns = [col for col in data.columns if col not in categorical_columns]

    results['categorical_columns'] = categorical_columns
    results['numerical_columns'] = numerical_columns

    if verbose:
        logger.warning(f"Detected {len(numerical_columns)} numerical columns and {len(categorical_columns)} categorical columns")

    # Record missing-value statistics before imputation.
    missing_data = data.isnull().sum()
    results['missing_values'] = {col: count for col, count in missing_data.items() if count > 0}
    results['missing_values_percentage'] = {col: count/len(data)*100 for col, count in missing_data.items() if count > 0}

    # Simple imputation: median for numerical, mode for categorical.
    # Assign the filled column back instead of fillna(inplace=True) on a
    # column view, which is deprecated and unreliable under pandas
    # copy-on-write semantics.
    for col in numerical_columns:
        if data[col].isnull().sum() > 0:
            data[col] = data[col].fillna(data[col].median())

    for col in categorical_columns:
        if data[col].isnull().sum() > 0:
            data[col] = data[col].fillna(data[col].mode()[0])

    # Numerical-only view used downstream for correlation analysis.
    numerical_data = data[numerical_columns].copy()

    # Standardize numerical data; only instantiate the scaler when there is
    # something to scale.
    if len(numerical_columns) > 0:
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(numerical_data)
        results['preprocessing'] = {
            'scaler_mean': scaler.mean_.tolist(),
            'scaler_scale': scaler.scale_.tolist()
        }
    else:
        scaled_data = np.array([])

    results['data'] = data
    results['numerical_data'] = numerical_data
    results['scaled_data'] = scaled_data
    results['target'] = target

    return results
|
114 |
+
|
115 |
+
def analyze_correlations(numerical_data, verbose=True):
    """
    Run a correlation study over the numeric part of a dataset.

    Computes the pairwise correlation matrix, flags every feature pair whose
    absolute correlation exceeds 0.7, and derives variance inflation factors
    to surface multicollinearity (VIF > 10).

    Parameters:
    -----------
    numerical_data : pandas.DataFrame
        DataFrame containing only numerical columns
    verbose : bool, default=True
        Whether to print detailed messages during analysis

    Returns:
    --------
    dict
        'matrix', 'high_correlation_pairs', and (when the VIF computation
        succeeds) 'vif' plus 'multicollinearity_issues'. Empty dict when
        fewer than two columns are supplied.
    """
    # Pairwise statistics need at least two columns.
    if len(numerical_data.columns) <= 1:
        if verbose:
            logger.warning("Correlation analysis requires at least 2 numerical columns")
        return {}

    findings = {}

    corr = numerical_data.corr()
    findings['matrix'] = corr

    # Flag every upper-triangle pair with |r| > 0.7.
    names = corr.columns
    strong_pairs = [
        (names[a], names[b], corr.iloc[a, b])
        for a in range(len(names))
        for b in range(a + 1, len(names))
        if abs(corr.iloc[a, b]) > 0.7
    ]
    findings['high_correlation_pairs'] = strong_pairs

    # Multicollinearity via variance inflation factors (numeric predictors only).
    if len(numerical_data.columns) > 1:
        try:
            design = sm.add_constant(numerical_data)
            vif_table = pd.DataFrame()
            vif_table["Variable"] = design.columns
            vif_table["VIF"] = [variance_inflation_factor(design.values, idx) for idx in range(design.shape[1])]
            findings['vif'] = vif_table

            # VIF above 10 is the conventional multicollinearity red flag.
            findings['multicollinearity_issues'] = vif_table[vif_table["VIF"] > 10].to_dict('records')
        except Exception as e:
            if verbose:
                logger.error(f"VIF calculation failed: {str(e)}")

    return findings
|
171 |
+
|
172 |
+
def perform_dimensionality_reduction(scaled_data, numerical_columns, data_size, verbose=True):
    """
    Perform various dimensionality reduction techniques on scaled numerical data.

    Parameters:
    -----------
    scaled_data : numpy.ndarray
        Standardized numerical data
    numerical_columns : list
        List of numerical column names
    data_size : int
        Number of rows in the dataset
    verbose : bool, default=True
        Whether to print detailed messages during analysis

    Returns:
    --------
    dict
        A dictionary containing dimensionality reduction results. Possible
        keys: 'pca', 'factor_analysis', 'tsne', 'mds'. Each entry also holds
        the fitted estimator object, so the dict is NOT JSON-serializable
        as-is.
    """
    # Reducing dimensionality only makes sense with at least two features.
    if len(numerical_columns) <= 1:
        if verbose:
            logger.warning("Dimensionality reduction requires at least 2 numerical columns")
        return {}

    dr_results = {}

    # PCA
    # NOTE: Factor Analysis, t-SNE and MDS below are nested inside this try,
    # so they only run when PCA itself succeeds.
    try:
        pca = PCA(random_state=42)
        pca_results = pca.fit_transform(scaled_data)

        # Explained variance
        explained_variance = pca.explained_variance_ratio_
        cumulative_variance = np.cumsum(explained_variance)

        # Find components that explain at least 80% of variance
        # NOTE(review): if the condition is never met, np.argmax over an
        # all-False array returns 0, so this reports 1 component -- confirm
        # that downstream consumers tolerate that edge case.
        components_80pct = np.argmax(cumulative_variance >= 0.8) + 1

        # Component loadings
        loadings = pd.DataFrame(
            pca.components_.T,
            columns=[f'PC{i+1}' for i in range(pca.n_components_)],
            index=numerical_columns
        )

        dr_results['pca'] = {
            'explained_variance_ratio': explained_variance.tolist(),
            'cumulative_variance': cumulative_variance.tolist(),
            'components_for_80_percent_variance': components_80pct,
            'component_loadings': loadings,
            'pca_object': pca
        }

        # Factor Analysis
        # Capped at 5 factors; needs at least 3 features to be meaningful.
        if len(numerical_columns) >= 3:
            try:
                factor = FactorAnalysis(n_components=min(5, len(numerical_columns)), random_state=42)
                factor.fit(scaled_data)

                # NOTE: reuses the `loadings` name from the PCA section above.
                loadings = pd.DataFrame(
                    factor.components_.T,
                    columns=[f'Factor{i+1}' for i in range(factor.n_components)],
                    index=numerical_columns
                )

                dr_results['factor_analysis'] = {
                    'loadings': loadings,
                    'factor_object': factor
                }
            except Exception as e:
                if verbose:
                    logger.error(f"Factor Analysis failed: {str(e)}")

        # t-SNE (for datasets that aren't too large)
        if data_size <= 5000:
            try:
                tsne = TSNE(n_components=2, random_state=42)
                tsne_results = tsne.fit_transform(scaled_data)

                dr_results['tsne'] = {
                    'coordinates': tsne_results,
                    'tsne_object': tsne
                }
            except Exception as e:
                if verbose:
                    logger.error(f"t-SNE failed: {str(e)}")

        # MDS (for smaller datasets)
        if data_size <= 2000:
            try:
                mds = MDS(n_components=2, random_state=42)
                mds_results = mds.fit_transform(scaled_data)

                dr_results['mds'] = {
                    'coordinates': mds_results,
                    'mds_object': mds
                }
            except Exception as e:
                if verbose:
                    logger.error(f"MDS failed: {str(e)}")

    except Exception as e:
        if verbose:
            logger.error(f"PCA failed: {str(e)}")

    return dr_results
|
279 |
+
|
280 |
+
def perform_cluster_analysis(scaled_data, numerical_data, data_size, verbose=True):
    """
    Perform various clustering techniques on scaled numerical data.

    Parameters:
    -----------
    scaled_data : numpy.ndarray
        Standardized numerical data
    numerical_data : pandas.DataFrame
        DataFrame containing numerical data (for cluster analysis)
    data_size : int
        Number of rows in the dataset
    verbose : bool, default=True
        Whether to print detailed messages during analysis

    Returns:
    --------
    dict
        A dictionary containing clustering results. Possible keys: 'kmeans',
        'dbscan', 'gmm'. Each entry also holds the fitted estimator object,
        so the dict is NOT JSON-serializable as-is.
    """
    # Clustering needs multiple features and a non-trivial number of rows.
    if len(numerical_data.columns) <= 1 or data_size <= 10:
        if verbose:
            logger.warning("Clustering requires at least 2 numerical columns and more than 10 data points")
        return {}

    clustering_results = {}

    # K-means clustering
    # NOTE: DBSCAN and GMM below are nested inside this try and depend on
    # best_k, so they only run when the K-means selection succeeds.
    try:
        # Determine optimal number of clusters using silhouette score
        sil_scores = []
        max_clusters = min(10, data_size // 10) # reasonable max number of clusters

        # NOTE(review): for 11 <= data_size <= 19, max_clusters is 1 and this
        # loop never runs; max() below then raises on the empty list, which
        # the outer except swallows -- such datasets yield no results.
        for k in range(2, max_clusters + 1):
            kmeans = KMeans(n_clusters=k, random_state=42)
            labels = kmeans.fit_predict(scaled_data)
            sil_score = silhouette_score(scaled_data, labels)
            sil_scores.append((k, sil_score))

        # Find the best k
        best_k = max(sil_scores, key=lambda x: x[1])[0]

        # Run K-means with optimal k
        kmeans = KMeans(n_clusters=best_k, random_state=42)
        labels = kmeans.fit_predict(scaled_data)

        # Calculate cluster evaluation metrics
        sil_avg = silhouette_score(scaled_data, labels)
        ch_score = calinski_harabasz_score(scaled_data, labels)
        db_score = davies_bouldin_score(scaled_data, labels)

        # Analyze cluster characteristics
        # Per-cluster size and per-feature mean/std on the ORIGINAL
        # (unscaled) numerical data for interpretability.
        cluster_analysis = {}
        for cluster in range(best_k):
            cluster_data = numerical_data.iloc[labels == cluster]
            cluster_analysis[f'Cluster_{cluster}'] = {
                'size': len(cluster_data),
                'percentage': len(cluster_data) / data_size * 100,
                'mean': cluster_data.mean().to_dict(),
                'std': cluster_data.std().to_dict()
            }

        clustering_results['kmeans'] = {
            'best_k': best_k,
            'silhouette_scores': dict(sil_scores),
            'silhouette_avg': sil_avg,
            'calinski_harabasz_score': ch_score,
            'davies_bouldin_score': db_score,
            'cluster_analysis': cluster_analysis,
            'kmeans_object': kmeans
        }

        # DBSCAN for density-based clustering
        if data_size <= 5000: # DBSCAN can be slow on large datasets
            try:
                # NOTE(review): for data_size < 100, data_size // 100 is 0, so
                # min_samples becomes 0 -- confirm DBSCAN accepts that; any
                # failure here is caught and only the 'dbscan' entry is lost.
                dbscan = DBSCAN(eps=0.5, min_samples=min(5, data_size // 100))
                dbscan_labels = dbscan.fit_predict(scaled_data)

                # Count number of clusters and noise points
                # DBSCAN labels noise points with -1, which is not a cluster.
                n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
                n_noise = list(dbscan_labels).count(-1)

                # Only calculate silhouette if we have clusters and not all points are noise
                if n_clusters > 1 and n_noise < data_size:
                    valid_data = scaled_data[dbscan_labels != -1]
                    valid_labels = dbscan_labels[dbscan_labels != -1]
                    if len(set(valid_labels)) > 1:
                        db_sil_score = silhouette_score(valid_data, valid_labels)
                    else:
                        db_sil_score = None
                else:
                    db_sil_score = None

                clustering_results['dbscan'] = {
                    'n_clusters': n_clusters,
                    'n_noise': n_noise,
                    'silhouette_score': db_sil_score,
                    'dbscan_object': dbscan
                }
            except Exception as e:
                if verbose:
                    logger.error(f"DBSCAN failed: {str(e)}")

        # Gaussian Mixture Models
        # Reuses the cluster count selected for K-means above.
        try:
            gmm = GaussianMixture(n_components=best_k, random_state=42)
            gmm_labels = gmm.fit_predict(scaled_data)

            gmm_sil_score = silhouette_score(scaled_data, gmm_labels)
            gmm_ch_score = calinski_harabasz_score(scaled_data, gmm_labels)
            gmm_db_score = davies_bouldin_score(scaled_data, gmm_labels)

            clustering_results['gmm'] = {
                'n_components': best_k,
                'silhouette_score': gmm_sil_score,
                'calinski_harabasz_score': gmm_ch_score,
                'davies_bouldin_score': gmm_db_score,
                'gmm_object': gmm
            }
        except Exception as e:
            if verbose:
                logger.error(f"GMM failed: {str(e)}")

    except Exception as e:
        if verbose:
            logger.error(f"Clustering analysis failed: {str(e)}")

    return clustering_results
|
408 |
+
|
409 |
+
def analyze_feature_importance(data, target, numerical_columns, categorical_columns, target_is_numeric, verbose=True):
    """
    Analyze feature importance using various techniques.

    Parameters:
    -----------
    data : pandas.DataFrame
        The preprocessed dataframe
    target : pandas.Series
        The target variable
    numerical_columns : list
        List of numerical column names
    categorical_columns : list
        List of categorical column names
    target_is_numeric : bool
        Whether the target variable is numeric
    verbose : bool, default=True
        Whether to print detailed messages during analysis

    Returns:
    --------
    dict
        A dictionary containing feature importance analysis results.
        Possible keys: 'mutual_info', 'f_regression' (numeric target) or
        'chi2'/'f_classif' (categorical target), 'random_forest', 'rfe'.
    """
    # Importance is defined relative to a target; nothing to do without one.
    if target is None:
        if verbose:
            logger.warning("Feature importance analysis requires a target variable")
        return {}

    feature_importance_results = {}

    # Prepare data for feature importance
    X = data.copy()
    y = target.copy()

    # Encode categorical features for feature importance
    # One-hot with drop_first=True to avoid a redundant reference level.
    X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

    # Univariate feature importance
    try:
        if target_is_numeric:
            # Numeric target: use F-test for numeric features
            selector = SelectKBest(score_func=f_regression, k='all')
            selector.fit(X_encoded, y)
            f_scores = pd.Series(selector.scores_, index=X_encoded.columns)
            p_values = pd.Series(selector.pvalues_, index=X_encoded.columns)

            # Mutual information
            # NOTE(review): this block REASSIGNS X_encoded to the
            # variance-filtered feature set (threshold 0.01), so the
            # random-forest/RFE section further down operates on the reduced
            # set -- but only when the target is numeric AND this try
            # succeeds. Confirm that asymmetry is intended.
            try:
                var_thresh = VarianceThreshold(threshold=0.01)
                X_encoded = pd.DataFrame(var_thresh.fit_transform(X_encoded), columns=X_encoded.columns[var_thresh.get_support()])
                mi_scores = mutual_info_regression(X_encoded, y)
                mi_series = pd.Series(mi_scores, index=X_encoded.columns)
                feature_importance_results['mutual_info'] = mi_series.sort_values(ascending=False).to_dict()
            except Exception as e:
                if verbose:
                    logger.error(f"Mutual information calculation failed: {str(e)}")

            # f_scores/p_values were computed on the unfiltered feature set above.
            feature_importance_results['f_regression'] = {
                'scores': f_scores.sort_values(ascending=False).to_dict(),
                'p_values': p_values.sort_values().to_dict()
            }
        else:
            # Categorical target: use chi2 for numeric features
            # Need non-negative features for chi2
            # Shift each column so its minimum is 0; scores are still indexed
            # by the original (unshifted) column names.
            X_chi = X_encoded.copy()
            for col in X_chi.columns:
                if X_chi[col].min() < 0:
                    X_chi[col] = X_chi[col] - X_chi[col].min()

            selector = SelectKBest(score_func=chi2, k='all')
            selector.fit(X_chi, y)
            chi2_scores = pd.Series(selector.scores_, index=X_encoded.columns)
            p_values = pd.Series(selector.pvalues_, index=X_encoded.columns)

            # F-test for classification
            f_selector = SelectKBest(score_func=f_classif, k='all')
            f_selector.fit(X_encoded, y)
            f_scores = pd.Series(f_selector.scores_, index=X_encoded.columns)
            f_p_values = pd.Series(f_selector.pvalues_, index=X_encoded.columns)

            # Mutual information
            try:
                mi_scores = mutual_info_classif(X_encoded, y)
                mi_series = pd.Series(mi_scores, index=X_encoded.columns)
                feature_importance_results['mutual_info'] = mi_series.sort_values(ascending=False).to_dict()
            except Exception as e:
                if verbose:
                    logger.error(f"Mutual information calculation failed: {str(e)}")

            feature_importance_results['chi2'] = {
                'scores': chi2_scores.sort_values(ascending=False).to_dict(),
                'p_values': p_values.sort_values().to_dict()
            }

            feature_importance_results['f_classif'] = {
                'scores': f_scores.sort_values(ascending=False).to_dict(),
                'p_values': f_p_values.sort_values().to_dict()
            }

        # Tree-based feature importance
        try:
            if target_is_numeric:
                model = RandomForestRegressor(n_estimators=100, random_state=42)
            else:
                model = RandomForestClassifier(n_estimators=100, random_state=42)

            model.fit(X_encoded, y)
            importances = pd.Series(model.feature_importances_, index=X_encoded.columns)

            feature_importance_results['random_forest'] = importances.sort_values(ascending=False).to_dict()

            # Recursive Feature Elimination
            # Only worthwhile with more than 5 candidate features; keeps at
            # most 10.
            if len(X_encoded.columns) > 5:
                rfe = RFE(estimator=model, n_features_to_select=min(10, len(X_encoded.columns)), step=1)
                rfe.fit(X_encoded, y)
                rfe_ranking = pd.Series(rfe.ranking_, index=X_encoded.columns)

                feature_importance_results['rfe'] = {
                    'selected_features': X_encoded.columns[rfe.support_].tolist(),
                    'feature_ranking': rfe_ranking.sort_values().to_dict()
                }
        except Exception as e:
            if verbose:
                logger.error(f"Tree-based feature importance calculation failed: {str(e)}")

    except Exception as e:
        if verbose:
            logger.error(f"Feature importance analysis failed: {str(e)}")

    return feature_importance_results
|
540 |
+
|
541 |
+
def detect_outliers(scaled_data, data_size, verbose=True):
    """
    Flag multivariate outliers in the standardized feature matrix.

    Two detectors are applied: an Isolation Forest (always), and a Local
    Outlier Factor (only when the dataset has at most 5000 rows, as LOF is
    expensive on large data). Each detector contributes its own entry to the
    returned dictionary; a detector that fails is simply skipped.

    Parameters:
    -----------
    scaled_data : numpy.ndarray
        Standardized numerical data
    data_size : int
        Number of rows in the dataset
    verbose : bool, default=True
        Whether to print detailed messages during analysis

    Returns:
    --------
    dict
        Per-detector outlier counts/percentages ('isolation_forest',
        'local_outlier_factor'); empty when the input is unusable or every
        detector fails.
    """
    # Multivariate detection needs at least two dimensions; the size check
    # must come first so shape[1] is never read on an empty 1-D array.
    if scaled_data.size == 0 or scaled_data.shape[1] <= 1:
        if verbose:
            logger.warning("Outlier detection requires at least 2 numerical columns")
        return {}

    findings = {}

    # --- Isolation Forest: global, tree-based detector ---
    try:
        forest = IsolationForest(contamination=0.05, random_state=42)
        flags = forest.fit_predict(scaled_data)
        flagged_rows = np.where(flags == -1)[0]

        findings['isolation_forest'] = {
            'num_outliers': len(flagged_rows),
            'outlier_percentage': len(flagged_rows) / data_size * 100
        }
    except Exception as e:
        if verbose:
            logger.error(f"Isolation Forest failed: {str(e)}")

    # --- Local Outlier Factor: density-based, small datasets only ---
    if data_size <= 5000:
        try:
            lof_detector = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
            lof_flags = lof_detector.fit_predict(scaled_data)
            lof_rows = np.where(lof_flags == -1)[0]

            findings['local_outlier_factor'] = {
                'num_outliers': len(lof_rows),
                'outlier_indices': lof_rows.tolist(),
                'outlier_percentage': len(lof_rows) / data_size * 100
            }
        except Exception as e:
            if verbose:
                logger.error(f"Local Outlier Factor failed: {str(e)}")

    return findings
|
597 |
+
|
598 |
+
def perform_statistical_testing(data, target, scaled_data, numerical_columns, categorical_columns, target_is_numeric, verbose=True):
    """
    Perform various statistical tests on the data.

    Parameters:
    -----------
    data : pandas.DataFrame
        The preprocessed dataframe
    target : pandas.Series
        The target variable
    scaled_data : numpy.ndarray
        Standardized numerical data
    numerical_columns : list
        List of numerical column names
    categorical_columns : list
        List of categorical column names
    target_is_numeric : bool
        Whether the target variable is numeric
    verbose : bool, default=True
        Whether to print detailed messages during analysis

    Returns:
    --------
    dict
        A dictionary containing statistical testing results. May contain
        'manova' and/or 'lda' entries; empty when no test applies or every
        attempted test fails.
    """
    if target is None:
        if verbose:
            # Use the module logger instead of print() for consistency with
            # the rest of this module.
            logger.warning("Statistical testing requires a target variable")
        return {}

    statistical_testing_results = {}

    # MANOVA (only for multivariate numeric data with categorical target)
    if not target_is_numeric and len(numerical_columns) > 1:
        try:
            unique_classes = target.unique()
            if len(unique_classes) > 1 and len(unique_classes) <= 10:
                # Prepare groups for MANOVA. Use a plain boolean array so the
                # mask selects rows positionally; .iloc with a boolean Series
                # raises in pandas.
                numerical_data = data[numerical_columns]
                groups = [numerical_data[np.asarray(target == val)].values for val in unique_classes]

                # NOTE(review): scipy.stats has no `manova` function, so this
                # call raises AttributeError and is swallowed by the except
                # below -- the 'manova' entry is never produced. Consider
                # statsmodels.multivariate.manova.MANOVA instead.
                manova_result = stats.manova(groups)

                statistical_testing_results['manova'] = {
                    'test_statistic': float(manova_result.statistic),
                    'p_value': float(manova_result.pvalue),
                    'significant': float(manova_result.pvalue) < 0.05
                }
        except Exception as e:
            if verbose:
                logger.error(f"MANOVA failed: {str(e)}")

    # Linear Discriminant Analysis (for classification problems)
    if not target_is_numeric:
        try:
            lda = LinearDiscriminantAnalysis()
            X_lda = lda.fit_transform(scaled_data, target)

            statistical_testing_results['lda'] = {
                'explained_variance_ratio': lda.explained_variance_ratio_.tolist(),
                'coordinates': X_lda.tolist(),
                'lda_object': lda
            }
        except Exception as e:
            if verbose:
                logger.error(f"LDA failed: {str(e)}")

    return statistical_testing_results
|
705 |
+
|
706 |
+
def generate_summary(results, df, verbose=True):
    """
    Generate a summary of analysis results.

    Parameters:
    -----------
    results : dict
        Dictionary containing all analysis results
    df : pandas.DataFrame
        The original dataframe
    verbose : bool, default=True
        Whether to build the human-readable report text

    Returns:
    --------
    str
        A human-readable report string (empty when ``verbose`` is False).
        Note: despite the name, this returns the formatted text, not the
        intermediate ``summary`` dict.
    """
    summary = {}

    summary['dataset'] = {
        'rows': len(df),
        'columns': len(df.columns),
        'numerical_features': len(results.get('numerical_columns', [])),
        'categorical_features': len(results.get('categorical_columns', [])),
        # Number of *columns* containing at least one missing value
        'missing_values': sum(df.isnull().sum() > 0)
    }

    # Initialize explicitly so the verbose section below never references an
    # unbound local when the corresponding analysis step was skipped.
    pca_results = None
    kmeans_results = None

    if 'correlation' in results:
        high_corr_pairs = results['correlation'].get('high_correlation_pairs', [])
        high_corr_count = len(high_corr_pairs)
        multicollinearity_issues = results['correlation'].get('multicollinearity_issues', [])

        summary['correlation'] = {
            'highly_correlated_pairs': high_corr_count,
            'correlation_threshold': 0.7,
            'top_correlations': high_corr_pairs[:5] if high_corr_pairs else [],
            'multicollinearity_issues': len(multicollinearity_issues),
            'vif_threshold': 10
        }

    # Dimensionality reduction summary
    if 'dimensionality_reduction' in results and 'pca' in results['dimensionality_reduction']:
        pca_results = results['dimensionality_reduction']['pca']

        summary['dimensionality_reduction'] = {
            'components_for_80_percent_variance': pca_results['components_for_80_percent_variance'],
            'total_components': len(pca_results['explained_variance_ratio']),
            'first_component_variance': pca_results['explained_variance_ratio'][0] * 100
        }

    # Clustering summary
    if 'clustering' in results and 'kmeans' in results['clustering']:
        kmeans_results = results['clustering']['kmeans']

        summary['clustering'] = {
            'optimal_clusters': kmeans_results['best_k'],
            'silhouette_score': kmeans_results['silhouette_avg'],
            'clearly_separable': kmeans_results['silhouette_avg'] > 0.5
        }

    # Feature importance summary
    if 'feature_importance' in results:
        top_features_with_explanations = []

        # Get top features from RF if available
        if 'random_forest' in results['feature_importance']:
            rf_importances = results['feature_importance']['random_forest']
            top_rf = sorted(rf_importances.items(), key=lambda x: x[1], reverse=True)[:5]
            for feature, importance in top_rf:
                explanation = f"{feature}: Importance {importance:.3f} (Random Forest: relative feature contribution to prediction accuracy)."
                top_features_with_explanations.append(explanation)

        # Get top features from mutual info if available
        elif 'mutual_info' in results['feature_importance']:
            mi_importances = results['feature_importance']['mutual_info']
            top_mi = sorted(mi_importances.items(), key=lambda x: x[1], reverse=True)[:5]
            for feature, importance in top_mi:
                explanation = f"{feature}: Importance {importance:.3f} (Mutual Information: degree of dependency with target)."
                top_features_with_explanations.append(explanation)

        summary['feature_importance'] = {
            'top_features': top_features_with_explanations
        }

    # Outlier summary
    if 'outlier_detection' in results and 'isolation_forest' in results['outlier_detection']:
        iso_results = results['outlier_detection']['isolation_forest']

        summary['outliers'] = {
            'percentage': iso_results['outlier_percentage'],
            'count': iso_results['num_outliers']
        }

    # Statistical testing summary
    if 'statistical_testing' in results:
        stat_tests = []

        if 'manova' in results['statistical_testing']:
            significant = results['statistical_testing']['manova']['significant']
            stat_tests.append(f"MANOVA: {'Significant' if significant else 'Not significant'}")

        if 'lda' in results['statistical_testing']:
            stat_tests.append("LDA performed")

        if 'multivariate_regression' in results['statistical_testing']:
            reg_results = results['statistical_testing']['multivariate_regression']
            r_squared = reg_results['r_squared']
            significant_features = len(reg_results['significant_features'])
            stat_tests.append(f"Regression: R² = {r_squared:.3f}, Significant features: {significant_features}")

        summary['statistical_tests'] = stat_tests

    # NOTE: the original assigned this twice in a row; once is enough.
    output_string = ""

    if verbose:
        output_string += "\n=== MULTIVARIATE ANALYSIS SUMMARY ===\n"
        output_string += f"Dataset: {summary['dataset']['rows']} rows, {summary['dataset']['columns']} columns\n"
        output_string += f"Features: {summary['dataset']['numerical_features']} numerical, {summary['dataset']['categorical_features']} categorical\n"

        if 'correlation' in summary:
            output_string += f"Correlations (Threshold: {summary['correlation']['correlation_threshold']}):\n"
            if summary['correlation']['highly_correlated_pairs'] > 0:
                for col1, col2, corr_value in summary['correlation']['top_correlations']:
                    output_string += f" - {col1} & {col2}: {corr_value:.3f}\n"
            else:
                output_string += " - No highly correlated feature pairs found.\n"

            output_string += f"Multicollinearity (VIF Threshold: {summary['correlation']['vif_threshold']}): {summary['correlation']['multicollinearity_issues']} issues\n"

        if 'dimensionality_reduction' in summary and pca_results:
            output_string += f"\nPCA: {pca_results['components_for_80_percent_variance']} components explain 80% of variance\n"
            output_string += f"First component explains {pca_results['explained_variance_ratio'][0] * 100:.2f}% of variance\n"

            output_string += "\nComponent Loadings:\n"
            output_string += str(pca_results['component_loadings']) + "\n"

            output_string += "\nComponent Interpretation:\n"
            for pc in pca_results['component_loadings'].columns:
                output_string += f" - {pc}:\n"
                top_features = pca_results['component_loadings'][pc].abs().sort_values(ascending=False).head(3)
                for feature, loading in top_features.items():
                    output_string += f" {feature}: {loading:.3f}\n"
                output_string += "-" * 10 + "\n"

            output_string += "\nPCA Meaning:\n"
            output_string += " - PCA has reduced the dimensionality of the data while retaining 80% of the variance.\n"
            output_string += " - The components represent combinations of the original variables, with loadings indicating the strength and direction of each variable's influence on the component.\n"
            output_string += " - The top loading features for each component can help in understanding what the components represent.\n"

        if 'clustering' in summary and kmeans_results:
            output_string += f"\nOptimal clusters: {kmeans_results['best_k']}\n"
            output_string += f"Cluster separation: {kmeans_results['silhouette_avg']:.3f} (silhouette score)\n"

            output_string += "\nCluster Descriptions:\n"
            for cluster, analysis in kmeans_results['cluster_analysis'].items():
                output_string += f" - {cluster}: Size={analysis['size']} ({analysis['percentage']:.2f}%)\n"
                output_string += " Mean values:\n"
                for feature, mean_val in analysis['mean'].items():
                    output_string += f" {feature}: {mean_val:.3f}\n"
                output_string += " Standard deviations:\n"
                for feature, std_val in analysis['std'].items():
                    output_string += f" {feature}: {std_val:.3f}\n"
                output_string += "-" * 20 + "\n"

            output_string += "\nCluster Differences:\n"
            if kmeans_results['best_k'] > 1:
                # Pivot per-cluster means into feature -> {cluster: mean} so each
                # feature's spread across clusters can be reported.
                feature_means = {}
                for cluster, analysis in kmeans_results['cluster_analysis'].items():
                    for feature, mean_val in analysis['mean'].items():
                        if feature not in feature_means:
                            feature_means[feature] = {}
                        feature_means[feature][cluster] = mean_val

                for feature, cluster_means in feature_means.items():
                    output_string += f" - {feature}:\n"
                    for cluster, mean_val in cluster_means.items():
                        output_string += f" {cluster}: {mean_val:.3f}\n"

                    mean_values = list(cluster_means.values())
                    if len(set(mean_values)) > 1:
                        max_diff = max(mean_values) - min(mean_values)
                        output_string += f" Max difference: {max_diff:.3f}\n"
                    output_string += "-" * 10 + "\n"

            output_string += "\nClustering Meaning:\n"
            output_string += " - The clusters represent distinct groups within the data, characterized by differences in the mean values of the numerical features.\n"
            if kmeans_results['silhouette_avg'] > 0.5:
                output_string += " - The high silhouette score indicates that the clusters are well-separated and meaningful.\n"
            else:
                output_string += " - The silhouette score suggests the clusters are reasonably separated.\n"

        if 'feature_importance' in summary:
            output_string += "Top Features:\n"
            for feature_explanation in summary['feature_importance']['top_features']:
                output_string += f" - {feature_explanation}\n"

        if 'outliers' in summary:
            output_string += f"Outliers: {summary['outliers']['count']} ({summary['outliers']['percentage']:.2f}%)\n"

        if 'statistical_tests' in summary:
            output_string += "Statistical tests:\n"
            for test in summary['statistical_tests']:
                output_string += f" - {test}\n"

    return output_string
|
914 |
+
|
915 |
+
def safe_prepare(results):
    """Serialize a heterogeneous list of analysis results into one JSON string.

    ``None`` entries are dropped; dicts are pre-serialized to embedded JSON
    strings; lists and strings pass through unchanged; anything else is
    stringified so the final payload is always JSON-safe.
    """
    def _coerce(entry):
        # dicts become embedded JSON strings; lists/strings are JSON-safe as-is
        if isinstance(entry, dict):
            return json.dumps(entry)
        if isinstance(entry, (list, str)):
            return entry
        return str(entry)

    return json.dumps([_coerce(entry) for entry in results if entry is not None])
|
927 |
+
|
928 |
+
def multivariate_analysis(data_path: str, target_column_name: str,
                          categorical_columns=None, verbose: bool = True):
    """
    Perform comprehensive multivariate analysis on a dataset from the specified file path.

    Parameters:
    -----------
    data_path : str
        Path to the data file (supports CSV, Excel, JSON, or Parquet)
    target_column_name : str
        Name of the target column / feature
    categorical_columns : list, optional
        Explicit list of categorical columns; autodetected downstream when None.
        (Previously hard-coded to None inside the function.)
    verbose : bool, default=True
        Whether to log progress messages. (Previously hard-coded to True.)

    Returns:
    --------
    str or dict
        The generated summary string on success, or ``{"error": ...}`` on failure.
    """
    target_column = target_column_name

    try:
        if verbose:
            logger.warning(f"Reading data from {data_path}...")

        df = _read_dataset(data_path)

        if verbose:
            logger.warning(f"Data loaded successfully. Shape: {df.shape}")

        results = {}

        if verbose:
            logger.warning("Preprocessing data...")

        preprocess_results = preprocess_dataframe(df, target_column, categorical_columns, verbose)
        results.update(preprocess_results)

        data = results['data']
        numerical_data = results['numerical_data']
        scaled_data = results['scaled_data']
        target = results['target']
        numerical_columns = results['numerical_columns']
        categorical_columns = results['categorical_columns']
        target_is_numeric = results.get('target_is_numeric', None)

        if verbose:
            logger.warning("Analyzing correlations...")
        results['correlation'] = analyze_correlations(numerical_data, verbose)

        if verbose:
            logger.warning("Performing dimensionality reduction...")
        results['dimensionality_reduction'] = perform_dimensionality_reduction(
            scaled_data, numerical_columns, len(data), verbose
        )

        if verbose:
            logger.warning("Performing cluster analysis...")
        results['clustering'] = perform_cluster_analysis(
            scaled_data, numerical_data, len(data), verbose
        )

        # Target-dependent analyses are only possible when a target was found.
        if target is not None:
            if verbose:
                logger.warning("Analyzing feature importance...")
            results['feature_importance'] = analyze_feature_importance(
                data, target, numerical_columns, categorical_columns, target_is_numeric, verbose
            )

        if verbose:
            logger.warning("Detecting outliers...")
        results['outlier_detection'] = detect_outliers(scaled_data, len(data), verbose)

        if target is not None:
            if verbose:
                logger.warning("Performing statistical tests...")
            results['statistical_testing'] = perform_statistical_testing(
                data, target, scaled_data, numerical_columns, categorical_columns, target_is_numeric, verbose
            )

        if verbose:
            logger.warning("Generating summary...")

        results['summary'] = generate_summary(results, df, verbose)

        if verbose:
            logger.warning("Multivariate analysis completed successfully!")

        return results['summary']

    except Exception as e:
        # Deliberate catch-all boundary: callers receive an error payload
        # instead of an exception.
        logger.error(f"Error in multivariate analysis: {str(e)}")
        return {"error": str(e)}


def _read_dataset(data_path: str):
    """Load a dataframe from CSV/Excel/JSON/Parquet based on the file extension.

    Raises:
        ValueError: if the extension is not one of the supported formats.
    """
    if data_path.endswith('.csv'):
        return pd.read_csv(data_path)
    if data_path.endswith(('.xls', '.xlsx')):
        return pd.read_excel(data_path)
    if data_path.endswith('.json'):
        return pd.read_json(data_path)
    if data_path.endswith('.parquet'):
        return pd.read_parquet(data_path)
    raise ValueError("Unsupported file format. Please provide a CSV, Excel, JSON, or Parquet file.")
|
src/app/pipelines/eda/tools/analysis_tools/univariate_analysis.py
ADDED
@@ -0,0 +1,517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from scipy import stats
|
4 |
+
import missingno as msno
|
5 |
+
from typing import Dict, List, Tuple, Optional, Union
|
6 |
+
from agno.utils.log import logger
|
7 |
+
import json
|
8 |
+
|
9 |
+
def data_overview(data: pd.DataFrame, categorical_threshold) -> Dict:
    """
    Generate a high-level overview of the dataset structure.

    Args:
        data: The dataframe to profile.
        categorical_threshold: Maximum number of distinct values for a
            numeric/object column to still be classified as categorical.

    Returns:
        A dictionary containing dataset dimensions, dtype counts, a
        per-column feature-type classification, and completeness statistics.
    """
    # logger.warn is a deprecated alias of logger.warning
    logger.warning("Analyzing data overview...")

    rows, cols = data.shape

    dtypes = data.dtypes.value_counts().to_dict()

    total_observations = rows * cols
    total_missing = int(data.isna().sum().sum())
    # Guard the empty-dataframe case so the ratio is always well-defined.
    completeness_ratio = (
        (total_observations - total_missing) / total_observations
        if total_observations > 0
        else 0.0
    )

    feature_types = {}
    for col in data.columns:
        unique_count = data[col].nunique()
        dtype = data[col].dtype

        # Check bool BEFORE numeric: pandas treats bool dtypes as numeric,
        # which previously made the 'boolean' branch unreachable.
        if pd.api.types.is_bool_dtype(dtype):
            feature_types[col] = 'boolean'
        elif pd.api.types.is_numeric_dtype(dtype):
            if unique_count <= categorical_threshold:
                feature_types[col] = 'categorical (numeric)'
            else:
                feature_types[col] = 'numerical'
        elif pd.api.types.is_string_dtype(dtype) or pd.api.types.is_object_dtype(dtype):
            if unique_count <= categorical_threshold:
                feature_types[col] = 'categorical'
            else:
                feature_types[col] = 'text'
        elif pd.api.types.is_datetime64_dtype(dtype):
            feature_types[col] = 'datetime'
        else:
            feature_types[col] = 'other'

    overview = {
        'dimensions': {'rows': rows, 'columns': cols},
        'data_types': dtypes,
        'feature_types': feature_types,
        'observations': {
            'total': total_observations,
            'missing': total_missing,
            'completeness_ratio': completeness_ratio
        }
    }

    return overview
|
60 |
+
|
61 |
+
def missing_values_analysis(data: pd.DataFrame) -> Dict:
    """
    Analyze missing values in the dataset.

    Returns:
        A dictionary with per-column missing counts/percentages, the list of
        affected columns, the overall missing percentage, and — when at least
        two columns have gaps — the correlation between their missingness masks.
    """
    logger.warn("Analyzing missing values...")

    per_column_missing = data.isnull().sum()
    per_column_pct = (per_column_missing / len(data)) * 100
    affected_columns = per_column_missing[per_column_missing > 0].index.tolist()

    report = {
        'missing_counts': per_column_missing.to_dict(),
        'missing_percentage': per_column_pct.to_dict(),
        'columns_with_missing': affected_columns,
        'total_missing_percentage': (per_column_missing.sum() / (len(data) * len(data.columns))) * 100,
    }

    # Correlated missingness across columns can point at a shared upstream cause.
    if len(affected_columns) > 1:
        report['missing_correlation'] = data[affected_columns].isna().corr().to_dict()

    return report
|
91 |
+
|
92 |
+
def analyze_features(data: pd.DataFrame, results: dict) -> Dict:
    """
    Run a type-appropriate analysis for every column in the dataframe.

    The per-column feature type is read from ``results['data_overview']``
    (as produced by ``data_overview``) and dispatched to the matching
    ``analyze_*`` helper.

    Returns:
        A dictionary mapping each column name to its analysis result dict.
    """
    logger.warn("Analyzing individual features...")

    per_feature = {}

    for col in data.columns:
        kind = results['data_overview']['feature_types'][col]

        if 'numerical' in kind:
            report = analyze_numerical(data, col)
        elif 'categorical' in kind or kind == 'boolean':
            report = analyze_categorical(data, col)
        elif kind == 'datetime':
            report = analyze_datetime(data, col)
        elif kind == 'text':
            report = analyze_text(data, col)
        else:
            report = {'type': kind, 'message': 'No specific analysis available for this type'}

        per_feature[col] = report

    return per_feature
|
118 |
+
|
119 |
+
def analyze_numerical(df: pd.DataFrame, column: str) -> Dict:
    """
    Perform comprehensive analysis for a numerical feature.

    Args:
        df: The dataframe containing the column.
        column: The name of the column to analyze.

    Returns:
        A dictionary with descriptive statistics, normality tests (when the
        sample is large enough), IQR-based outlier bounds, and the ten most
        frequent values. On failure, ``{'type': 'numerical', 'error': ...}``.
    """
    data = df[column].dropna()

    if len(data) == 0:
        return {'type': 'numerical', 'error': 'No non-null values found'}

    try:
        stats_dict = {
            'type': 'numerical',
            'count': len(data),
            # int() keeps the payload JSON-serializable (numpy int64 is not)
            'missing': int(df[column].isna().sum()),
            'min': float(data.min()),
            'max': float(data.max()),
            'range': float(data.max() - data.min()),
            'mean': float(data.mean()),
            'median': float(data.median()),
            'std': float(data.std()),
            'variance': float(data.var()),
            'quantiles': {
                '25%': float(data.quantile(0.25)),
                '50%': float(data.quantile(0.5)),
                '75%': float(data.quantile(0.75)),
                '90%': float(data.quantile(0.9)),
                '95%': float(data.quantile(0.95)),
                '99%': float(data.quantile(0.99))
            }
        }

        stats_dict['skewness'] = float(stats.skew(data))
        stats_dict['kurtosis'] = float(stats.kurtosis(data))

        if len(data) >= 8:  # Minimum required for Shapiro-Wilk test
            # Sub-sample large series: Shapiro-Wilk is slow and its p-value
            # becomes uninformative for very large n. random_state makes the
            # report reproducible across runs.
            sample_data = data if len(data) < 5000 else data.sample(5000, random_state=42)

            try:
                shapiro_test = stats.shapiro(sample_data)
                stats_dict['normality_tests'] = {
                    'shapiro_wilk': {
                        'statistic': float(shapiro_test[0]),
                        'p_value': float(shapiro_test[1]),
                        'is_normal': shapiro_test[1] > 0.05
                    }
                }

                if len(data) >= 20:  # Minimum for Anderson-Darling
                    anderson_test = stats.anderson(sample_data, dist='norm')
                    stats_dict['normality_tests']['anderson_darling'] = {
                        'statistic': float(anderson_test.statistic),
                        'critical_values': list(anderson_test.critical_values),
                        'significance_levels': list(anderson_test.significance_level),
                        # index 2 corresponds to the 5% significance level
                        'is_normal': anderson_test.statistic < anderson_test.critical_values[2]
                    }
            except Exception:
                # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
                # still propagate.
                stats_dict['normality_tests'] = {'error': 'Could not perform normality tests on this data'}

        # IQR-based outlier detection (Tukey fences at 1.5 * IQR)
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        outliers = data[(data < lower_bound) | (data > upper_bound)]

        stats_dict['outliers'] = {
            'count': len(outliers),
            'percentage': (len(outliers) / len(data)) * 100 if len(data) > 0 else 0,
            'lower_bound': float(lower_bound),
            'upper_bound': float(upper_bound)
        }

        value_counts = data.value_counts().head(10).to_dict()
        stats_dict['frequent_values'] = {str(k): int(v) for k, v in value_counts.items()}

        return stats_dict

    except Exception as e:
        logger.warn(f"Error analyzing numerical column {column}: {str(e)}")
        return {'type': 'numerical', 'error': str(e)}
|
207 |
+
|
208 |
+
def analyze_categorical(df: pd.DataFrame, column: str) -> Dict:
    """
    Profile a categorical feature: cardinality, mode, entropy, and per-category
    frequencies (top 30 shown; the remainder is grouped under 'other').

    Args:
        column: The name of the column to analyze

    Returns:
        A dictionary containing categorical statistics, or an error payload
        when no non-null values exist or the analysis fails.
    """
    values = df[column].dropna()

    if len(values) == 0:
        return {'type': 'categorical', 'error': 'No non-null values found'}

    try:
        counts = values.value_counts()
        percentages = values.value_counts(normalize=True) * 100
        modes = values.mode()

        report = {
            'type': 'categorical',
            'count': len(values),
            'missing': df[column].isna().sum(),
            'unique_values': values.nunique(),
            'mode': None if modes.empty else str(modes.iloc[0]),
            # Shannon entropy of the category distribution (0 when degenerate)
            'entropy': float(stats.entropy(counts / len(values))) if len(counts) > 1 else 0,
        }

        max_categories = 30  # cap payload size for high-cardinality columns

        shown = counts if len(counts) <= max_categories else counts.head(max_categories)
        report['categories'] = {}
        for category, count in shown.items():
            report['categories'][str(category)] = {
                'count': int(count),
                'percentage': float(percentages[category]),
            }

        if len(counts) > max_categories:
            report['categories']['other'] = {
                'count': int(counts.iloc[max_categories:].sum()),
                'percentage': float(percentages.iloc[max_categories:].sum()),
            }
            report['note'] = f"Showing top {max_categories} of {len(counts)} categories. Remaining grouped as 'other'."

        return report

    except Exception as e:
        logger.warn(f"Error analyzing categorical column {column}: {str(e)}")
        return {'type': 'categorical', 'error': str(e)}
|
272 |
+
|
273 |
+
def analyze_datetime(df: pd.DataFrame, column: str) -> Dict:
    """
    Profile a datetime feature: bounds, span in days, and (for samples of 10+)
    the distribution across years, months, weekdays, and hours.

    Args:
        column: The name of the column to analyze

    Returns:
        A dictionary containing datetime statistics, or an error payload when
        no non-null values exist or the analysis fails.
    """
    values = df[column].dropna()

    if len(values) == 0:
        return {'type': 'datetime', 'error': 'No non-null values found'}

    try:
        span = values.max() - values.min()
        report = {
            'type': 'datetime',
            'count': len(values),
            'missing': df[column].isna().sum(),
            'min_date': str(values.min()),
            'max_date': str(values.max()),
            'range_days': span.days if hasattr(span, 'days') else None,
        }

        # Temporal distributions are only meaningful with a reasonable sample.
        if len(values) >= 10:
            try:
                report['year_counts'] = values.dt.year.value_counts().sort_index().to_dict()
                report['month_counts'] = values.dt.month.value_counts().sort_index().to_dict()
                report['day_of_week_counts'] = values.dt.dayofweek.value_counts().sort_index().to_dict()
                report['hour_counts'] = values.dt.hour.value_counts().sort_index().to_dict() if hasattr(values.dt, 'hour') else None
            except:
                # Some datetime-like objects lack these accessors; skip them.
                pass

        return report

    except Exception as e:
        logger.warn(f"Error analyzing datetime column {column}: {str(e)}")
        return {'type': 'datetime', 'error': str(e)}
|
315 |
+
|
316 |
+
def analyze_text(df: pd.DataFrame, column: str) -> Dict:
    """
    Basic profiling for a free-text feature: character-length and word-count
    statistics plus a few truncated sample values.

    Args:
        column: The name of the column to analyze

    Returns:
        A dictionary containing text statistics, or an error payload when no
        non-null values exist or the analysis fails.
    """
    values = df[column].dropna().astype(str)

    if len(values) == 0:
        return {'type': 'text', 'error': 'No non-null values found'}

    try:
        lengths = values.str.len()
        words = values.str.split().str.len()

        report = {
            'type': 'text',
            'count': len(values),
            'missing': df[column].isna().sum(),
            'unique_values': values.nunique(),
            'text_length': {
                'min': int(lengths.min()),
                'max': int(lengths.max()),
                'mean': float(lengths.mean()),
                'median': float(lengths.median()),
            },
        }

        # Word-count statistics when at least one row produced a count
        if not words.isna().all():
            report['word_count'] = {
                'min': int(words.min()),
                'max': int(words.max()),
                'mean': float(words.mean()),
                'median': float(words.median()),
            }

        # Keep the payload small: at most 5 samples of 100 characters each.
        max_samples = 5
        max_length = 100
        chosen = values.tolist() if len(values) <= max_samples else values.sample(max_samples).tolist()
        report['samples'] = [
            text[:max_length] + ('...' if len(text) > max_length else '')
            for text in chosen
        ]

        return report

    except Exception as e:
        logger.warning(f"Error analyzing text column {column}: {str(e)}")
        return {'type': 'text', 'error': str(e)}
|
374 |
+
|
375 |
+
def generate_summary_report(results: dict) -> Dict:
    """
    Generate a summary of the univariate analysis results.

    Distills the raw analysis payload into: dataset dimensions and
    completeness, features grouped by type, high-missingness features
    (>10%), strongly skewed / non-normal numerical features, and
    high-cardinality or binary categorical features.

    Returns:
        A dictionary containing the summary report
    """
    logger.info("Generating summary report...")

    overview = results['data_overview']

    summary = {
        'dataset_summary': {
            'dimensions': overview['dimensions'],
            'completeness': f"{overview['observations']['completeness_ratio']*100:.2f}%"
        },
        'feature_types': {group: [] for group in ('numerical', 'categorical', 'datetime', 'text', 'other')},
        'missing_values': {
            'total_missing_percentage': f"{results['missing_values']['total_missing_percentage']:.2f}%",
            'features_with_high_missingness': []
        },
        'numerical_features': {
            'highly_skewed': [],
            'potentially_non_normal': []
        },
        'categorical_features': {
            'high_cardinality': [],
            'binary': []
        }
    }

    # Bucket every feature into one of the type groups.
    for name, kind in overview['feature_types'].items():
        if 'numerical' in kind:
            group = 'numerical'
        elif 'categorical' in kind or kind == 'boolean':
            group = 'categorical'
        elif kind == 'datetime':
            group = 'datetime'
        elif kind == 'text':
            group = 'text'
        else:
            group = 'other'
        summary['feature_types'][group].append(name)

    # Flag features missing more than 10% of their values.
    for name, pct in results['missing_values']['missing_percentage'].items():
        if pct > 10:
            summary['missing_values']['features_with_high_missingness'].append(
                {'feature': name, 'missing_percentage': f"{pct:.2f}%"}
            )

    feature_analysis = results['feature_analysis']

    # Numerical features: strong skew and evidence of non-normality.
    for name in summary['feature_types']['numerical']:
        if name not in feature_analysis:
            continue
        analysis = feature_analysis[name]

        if 'skewness' in analysis and abs(analysis['skewness']) > 1:
            summary['numerical_features']['highly_skewed'].append(
                {'feature': name, 'skewness': analysis['skewness']}
            )

        normality = analysis.get('normality_tests')
        if isinstance(normality, dict):
            for test_name, outcome in normality.items():
                if isinstance(outcome, dict) and 'is_normal' in outcome and not outcome['is_normal']:
                    # One failing test is enough to flag the feature.
                    summary['numerical_features']['potentially_non_normal'].append(
                        {'feature': name, 'test': test_name}
                    )
                    break

    # Categorical features: cardinality extremes.
    for name in summary['feature_types']['categorical']:
        if name not in feature_analysis:
            continue
        analysis = feature_analysis[name]
        if 'unique_values' not in analysis:
            continue
        if analysis['unique_values'] > 20:
            summary['categorical_features']['high_cardinality'].append(
                {'feature': name, 'unique_values': analysis['unique_values']}
            )
        elif analysis['unique_values'] == 2:
            summary['categorical_features']['binary'].append(name)

    return summary
|
467 |
+
|
468 |
+
def univariate_analysis(data_path: str, categorical_threshold: int = 10):
    """
    Perform a comprehensive univariate analysis on a dataset.

    Args:
        data_path: Path to a CSV data file or a pandas DataFrame
        categorical_threshold: Maximum unique values to consider a feature categorical
    Returns:
        A JSON string with the analysis summary on success, or a dict with
        an 'error' key on failure.
    """
    try:
        if isinstance(data_path, str):
            data = pd.read_csv(data_path)
        elif isinstance(data_path, pd.DataFrame):
            # Bug fix: the docstring promises DataFrame input but the old code
            # rejected anything that was not a string path.
            data = data_path
        else:
            logger.error(f"Unsupported file format: {data_path}")
            return {"error": f"Unsupported file format: {data_path}"}

        if not isinstance(data, pd.DataFrame):
            logger.error("Input is not a valid pandas DataFrame")
            return {"error": "Input is not a valid pandas DataFrame"}

        # logger.warn is deprecated (and this is a status message, not a
        # warning); also fixes the "Staring" typo.
        logger.info("Starting Analysis.....")

        # Work on a copy so the analysis steps cannot mutate the caller's data.
        copy_data = data.copy()

        results = {}

        results['data_overview'] = data_overview(
            data=copy_data,
            categorical_threshold=categorical_threshold
        )

        results['missing_values'] = missing_values_analysis(
            data=copy_data
        )

        results['feature_analysis'] = analyze_features(
            data=copy_data,
            results=results
        )

        analysis_summary = generate_summary_report(results)

        return json.dumps(analysis_summary)

    except Exception as e:
        logger.error(f"Error in analyze_dataset: {str(e)}")
        return {"error": str(e)}
|
516 |
+
|
517 |
+
|
src/app/pipelines/eda/tools/data_cleaning_tools/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .handle_missing_values import handle_missing_values
|
2 |
+
from .handle_outliers import handle_outliers
|
src/app/pipelines/eda/tools/data_cleaning_tools/handle_missing_values.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from agno.utils.log import logger
|
3 |
+
import json
|
4 |
+
|
5 |
+
def handle_missing_values(file_path: str) -> str:
    """
    Loads a CSV file using pandas, handles missing values using statistical methods,
    and saves the processed file to "src/core/cache/dataset_logs/".

    Numeric columns are filled with the median, categorical and text columns
    with the mode (text columns fall back to an empty string when no mode exists).

    Args:
        file_path (str): Path to the CSV file.
    Returns:
        A JSON string with the status and per-column missing counts before and
        after imputation, or the raw error message string on failure.
    """
    import os  # local import: only needed here to ensure the output directory exists

    try:
        df = pd.read_csv(file_path)
        logger.info(f"Started Missing Values Handler. CSV Loaded with shape: {df.shape} ")

        missing_before = df.isnull().sum().to_dict()

        result_df = df.copy()

        numeric_cols = result_df.select_dtypes(include=['number']).columns.tolist()
        categorical_cols = result_df.select_dtypes(include=['category']).columns.tolist()
        text_cols = result_df.select_dtypes(include=['object']).columns.tolist()

        # Median is robust to outliers, unlike the mean.
        for col in numeric_cols:
            if result_df[col].isnull().any():
                median_value = result_df[col].median()
                result_df[col] = result_df[col].fillna(median_value)
                logger.info(f"Filled {col} missing values with median: {median_value}")

        for col in categorical_cols:
            if result_df[col].isnull().any():
                if not result_df[col].mode().empty:
                    mode_value = result_df[col].mode()[0]
                    result_df[col] = result_df[col].fillna(mode_value)
                    logger.info(f"Filled {col} missing values with mode: {mode_value}")
                else:
                    logger.warning(f"Column {col} has no mode. Missing values remain.")

        for col in text_cols:
            if result_df[col].isnull().any():
                if not result_df[col].mode().empty:
                    mode_value = result_df[col].mode()[0]
                    result_df[col] = result_df[col].fillna(mode_value)
                    logger.info(f"Filled text column {col} missing values with most frequent value")
                else:
                    result_df[col] = result_df[col].fillna("")
                    logger.info(f"Filled text column {col} missing values with empty string")

        output_dir = "src/core/cache/dataset_logs"
        # Bug fix: to_csv fails if the directory does not exist yet.
        os.makedirs(output_dir, exist_ok=True)

        missing_after = result_df.isnull().sum().to_dict()
        result_df.to_csv(f"{output_dir}/handle_missing_values_output.csv", index=False)

        logger.info(f"CSV output stored with shape: {result_df.shape} ")

        # Previously missing_before/missing_after were computed and discarded;
        # report them (cast numpy ints so json.dumps does not choke).
        return json.dumps({
            'status': 'success',
            'missing_before': {k: int(v) for k, v in missing_before.items()},
            'missing_after': {k: int(v) for k, v in missing_after.items()},
        })

    except Exception as e:
        logger.warning(f"Failed to process CSV file: {e}")
        return str(e)
|
src/app/pipelines/eda/tools/data_cleaning_tools/handle_outliers.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from agno.tools import Toolkit
|
4 |
+
from agno.utils.log import logger
|
5 |
+
import json
|
6 |
+
|
7 |
+
def _detect_outliers(df: pd.DataFrame, threshold: float = 3.5, remove_outliers: bool = True) -> pd.DataFrame:
|
8 |
+
"""
|
9 |
+
Detect and handle outliers in numerical columns using the Modified Z-Score method.
|
10 |
+
|
11 |
+
Args:
|
12 |
+
df (pd.DataFrame): Input dataframe.
|
13 |
+
threshold (float): The threshold for detecting outliers (default is 3.5).
|
14 |
+
remove_outliers (bool): Whether to remove outliers (True) or just mark them (False).
|
15 |
+
|
16 |
+
Returns:
|
17 |
+
pd.DataFrame: DataFrame with outliers handled.
|
18 |
+
"""
|
19 |
+
result_df = df.copy()
|
20 |
+
numeric_cols = result_df.select_dtypes(include=[np.number]).columns.tolist()
|
21 |
+
|
22 |
+
if not numeric_cols:
|
23 |
+
logger.warning("No numerical columns found. Skipping outlier handling.")
|
24 |
+
return result_df
|
25 |
+
|
26 |
+
outlier_flags = pd.DataFrame(index=result_df.index)
|
27 |
+
|
28 |
+
for col in numeric_cols:
|
29 |
+
median = result_df[col].median()
|
30 |
+
mad = np.median(np.abs(result_df[col] - median))
|
31 |
+
|
32 |
+
if mad == 0: # Prevent division by zero
|
33 |
+
continue
|
34 |
+
|
35 |
+
# Compute Modified Z-Score
|
36 |
+
mod_z_score = 0.6745 * (result_df[col] - median) / mad
|
37 |
+
|
38 |
+
# Mark outliers
|
39 |
+
outlier_flags[col + '_outlier'] = np.abs(mod_z_score) > threshold
|
40 |
+
|
41 |
+
result_df['is_outlier'] = outlier_flags.any(axis=1)
|
42 |
+
|
43 |
+
if remove_outliers:
|
44 |
+
cleaned_df = result_df[~result_df['is_outlier']].drop(columns=['is_outlier'])
|
45 |
+
logger.info(f"Removed {result_df['is_outlier'].sum()} outliers.")
|
46 |
+
else:
|
47 |
+
cleaned_df = result_df
|
48 |
+
|
49 |
+
return cleaned_df
|
50 |
+
|
51 |
+
def handle_outliers(file_path: str, threshold: float = 3.5, remove_outliers: bool = True) -> dict:
    """
    Loads a CSV file, detects/removes outliers using the Modified Z-Score method,
    and saves the cleaned data to "src/core/cache/dataset_logs/".

    Args:
        file_path (str): Path to the CSV file.
        threshold (float): The threshold for detecting outliers.
        remove_outliers (bool): Whether to remove outliers or just mark them.

    Returns:
        A JSON string with process status, row counts before/after outlier
        handling and the output file path, or a dict with the error message
        on failure.
    """
    import os  # local import: only needed here to ensure the output directory exists

    try:
        df = pd.read_csv(file_path)
        # logger.warn is deprecated and this is a status message, not a warning.
        logger.info(f"Started Outlier Detection. CSV Loaded with shape: {df.shape}")

        # Bug fix: the old code reported isnull() counts as "outliers", which
        # says nothing about outliers at all. Report row counts instead so the
        # summary reflects how many rows were actually removed.
        rows_before = len(df)
        df = _detect_outliers(df, threshold, remove_outliers)
        rows_after = len(df)

        output_dir = "src/core/cache/dataset_logs"
        # Bug fix: to_csv fails if the directory does not exist yet.
        os.makedirs(output_dir, exist_ok=True)

        output_path = f"{output_dir}/outlier_detection_output.csv"
        df.to_csv(output_path, index=False)

        logger.info(f"CSV output stored at {output_path} with shape: {df.shape}")

        return json.dumps(
            f"Rows before outlier handling: {rows_before}. Rows after: {rows_after}. "
            f"The threshold used for dealing with outliers is {threshold}. "
            f"The file is stored in path: '{output_path}' "
        )

    except Exception as e:
        logger.warning(f"Failed to process CSV file: {e}")
        return {"status": "Error", "message": str(e)}
|
src/app/pipelines/eda/tools/lib.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .analysis_tools import univariate_analysis, bivariate_analysis, multivariate_analysis
|
2 |
+
from .data_cleaning_tools import handle_outliers, handle_missing_values
|
3 |
+
|
4 |
+
# Registry of EDA tools exposed to the agent pipeline.
# Each entry maps a tool id to:
#   "name"     - human-readable display name,
#   "function" - the callable implementing the tool,
#   "metadata" - a natural-language capability summary the LLM reads when
#                deciding which tool to invoke for a given request.
tool_library = {
    # "HandleMissingValues": {
    #     "name": "Missing Values Handler",
    #     "function": handle_missing_values,
    #     "metadata": '''
    #     1. Fills missing values with the median of each column.
    #     2. Fills missing values with the mode, if available; otherwise, logs a warning.
    #     3. Fills missing values with the most frequent value or an empty string if mode is unavailable.
    #     ''',
    # },
    "handle_outliers": {
        "name": "Outlier Handler",
        "function": handle_outliers,
        "metadata": '''
        1. Uses median and MAD (Median Absolute Deviation) to detect outliers.
        2. Identifies extreme values based on a set threshold and either excludes them from the dataset or keeps them marked for reference.
        ''',
    },
    'univariate_analysis': {
        "name": "Univariate Analysis",
        "function": univariate_analysis,
        "metadata": '''
        1. Provides a high-level summary of dataset structure, data types, and missing value statistics.
        2. Analyzes missing values, their distribution, and correlation between missing columns.
        3. Performs feature-specific analysis based on detected data types
        4. Computes descriptive statistics, normality tests, and outlier detection for numerical columns.
        5. Analyzes categorical distributions, entropy, and category frequencies with top values.
        6. (Truncated but likely) extracts patterns, ranges, and trends from datetime columns.
        ''',
    },
    'bivariate_analysis': {
        "name": "Bivariate Analysis",
        "function": bivariate_analysis,
        "metadata": '''
        1. Uses Pearson, Spearman, and Kendall correlations for numerical variables, chi-square/Cramér’s V for categorical associations, and statistical tests like ANOVA for numerical vs. categorical analysis. Identifies best-fit relationships (linear, polynomial, etc.) for numerical pairs.
        2. Provides a detailed bivariate analysis of all variable pairs in a dataframe, summarizing key correlations, associations, and insights. Optionally generates and saves visualizations like scatterplots and heatmaps.
        3. Uses Chi-square tests and Cramer's V to assess categorical feature associations, calculates Phi coefficient for 2x2 tables, and computes Goodman & Kruskal’s Lambda for predictive strength.
        4. Identifies statistically significant relationships between categorical variables, ranks them by strength, and optionally visualizes contingency tables as heatmaps.
        5. The function performs ANOVA (One-Way & Welch’s ANOVA), Point-Biserial Correlation (for binary categories), and Levene’s test to analyze relationships between numerical and categorical features, calculating effect sizes (eta-squared, omega-squared) for significance testing.
        ''',
    },
    'multivariate_analysis': {
        "name": "Multivariate Analysis",
        "function": multivariate_analysis,
        "metadata": '''
        1. Calculates the pairwise correlation coefficients between all numerical columns in a given DataFrame, generating a correlation matrix.
        2. It identifies pairs of numerical features with absolute correlation values exceeding a threshold of 0.7, indicating strong linear relationships.
        3. Calculates the Variance Inflation Factor (VIF) for each numerical feature to detect multicollinearity, flagging features with VIF values greater than 10 as potential issues.
        4. Uses PCA, Factor Analysis, t-SNE, and MDS. Identifies principal components or latent factors, aiming for 80% variance retention in PCA.
        5. Finds optimal clusters using silhouette score, evaluates cluster quality. Density-based clustering for smaller datasets (<=5000 rows), identifies noise. Fits Gaussian mixture models, evaluates model fit.
        6. Statistical tests and mutual information to rank individual feature relevance. Random Forest models to determine feature contribution to prediction. Iterative feature removal to select top features (max 10).
        7. Detects outliers by isolating them in random partitions, using a contamination rate of 5%. Identifies local density deviations for smaller datasets (<= 5000 rows), also using a 5percent contamination rate and 20 neighbors. Provides the number and percentage of detected outliers for each method.
        8. MANOVA: Tests mean differences across categorical target groups for multiple numerical features. LDA: Dimensionality reduction and classification for categorical targets.
        '''
    }
}
|
src/app/pipelines/modules/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .data_statistics import DataStatisticsWorkflow
|
2 |
+
from .data_quality_assessment import DataQualityAssessmentWorkflow
|
3 |
+
from .data_understanding_context import DataUnderstandingContextWorkflow
|
4 |
+
from .univariate_analysis import UnivariateAnalysisWorkflow
|
src/app/pipelines/modules/data_quality_assessment.py
ADDED
@@ -0,0 +1,1657 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import os
|
3 |
+
import ast
|
4 |
+
import json
|
5 |
+
import difflib
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
from scipy import stats
|
9 |
+
from typing import Union
|
10 |
+
from collections import Counter
|
11 |
+
from src.core.utils import logger
|
12 |
+
from scipy.spatial.distance import pdist
|
13 |
+
from scipy.stats import chi2_contingency
|
14 |
+
from agno.agent import Agent, RunResponse
|
15 |
+
from agno.models.openai import OpenAIChat
|
16 |
+
from src.core.utils import KnowledgeBaseClass
|
17 |
+
from scipy.cluster.hierarchy import linkage, fcluster
|
18 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
19 |
+
from typing import Dict, List, Union, Tuple, Any, Optional
|
20 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
21 |
+
|
22 |
+
class DataQualityAssessmentWorkflow:
|
23 |
+
    def __init__(
        self, data_source: str,
        llm_choice: str,
        ml_task: str
    ) -> None:
        """Initialize the workflow: configure the LLM report writer and eagerly load the dataset.

        Args:
            data_source: Path to the CSV file to analyze.
            llm_choice: OpenAI model id passed to OpenAIChat (API key is read
                from the OPENAI_API_KEY environment variable).
            ml_task: Description of the downstream ML task.
        """
        self.data = None  # populated by load_data() below; stays None on load failure
        self.data_source = data_source
        self.llm_choice = llm_choice
        self.ml_task = ml_task
        self.llm = OpenAIChat(id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))
        # Agent that turns the structured analysis outputs into a markdown report.
        self.writer: Agent = Agent(
            model=self.llm,
            instructions=[
                "You will be provided with lots of structured outputs. Your work is to display this"
                "in a nicely formatted manner. You must analayze the results and output a comprehensive and insightful report"
            ],
            markdown=True,
        )
        # Load eagerly; load_data logs on failure instead of raising, so
        # construction always succeeds even for an unreadable data source.
        _ = self.load_data(data_source=data_source)
|
43 |
+
|
44 |
+
    def load_data(self, data_source: str) -> Union[None, bool]:
        '''Load the CSV at *data_source* into self.data.

        Returns True on success, False on failure. Errors are logged rather
        than raised so the constructor can proceed gracefully.
        '''
        try:
            self.data = pd.read_csv(data_source)
            return True
        except Exception as e:
            # Best-effort contract: swallow the error, log it, and signal
            # failure via the return value instead of raising.
            logger.error(
                f"Failed to read the file from the data source with error: {e}", log_type="data_quality_assessment", console=True)
            return False
|
53 |
+
|
54 |
+
    def analyze_missing_data(self, verbose=False) -> Dict:
        """Comprehensive analysis of missing data patterns in a CSV dataset.

        Runs each sub-analysis (missing rates, Little's MCAR test, MAR
        correlations, mutual missingness, segment analysis) independently:
        a failing step records its error and the remaining steps still run.

        Args:
            verbose: When True, sub-step errors are also echoed to the console.

        Returns:
            A dict with one slot per sub-analysis, a "summary" entry, and an
            "errors" list collecting any step failures.
        """
        # Result skeleton: each step fills its slot; None means "not run/failed".
        results = {
            "status": "success",
            "data_loaded": False,
            "data_shape": None,
            "missing_rates": None,
            "little_mcar_test": None,
            "mar_correlations": None,
            "mutual_missingness": None,
            "segment_analysis": None,
            "summary": None,
            "errors": []
        }

        try:
            df = self.data
            results["data_loaded"] = True
            results["data_shape"] = df.shape  # raises if self.data is None (load failed)
        except Exception as e:
            # Without data nothing else can run; bail out early.
            results["status"] = "failed"
            results["errors"].append(f"Failed to load data: {str(e)}")
            return results

        try:
            results["missing_rates"] = self.analyze_missing_rates(df, verbose=verbose)
        except Exception as e:
            results["errors"].append(f"Failed to analyze missing rates: {str(e)}")
            logger.error(f"Failed to analyze missing rates: {str(e)}", log_type='data_quality_assessment', console=verbose)

        try:
            results["little_mcar_test"] = self.littles_mcar_test(df, verbose=verbose)
        except Exception as e:
            results["errors"].append(f"Failed to perform Little's MCAR test: {str(e)}")
            logger.error(f"Failed to perform Little's MCAR test: {str(e)}", log_type='data_quality_assessment', console=verbose)

        try:
            results["mar_correlations"] = self.check_mar_correlations(df, verbose=verbose)
        except Exception as e:
            results["errors"].append(f"Failed to perform MAR correlation check: {str(e)}")
            logger.error(f"Failed to perform MAR correlation check: {str(e)}", log_type='data_quality_assessment', console=verbose)

        try:
            results["mutual_missingness"] = self.analyze_mutual_missingness(df, verbose=verbose)
        except Exception as e:
            results["errors"].append(f"Failed to analyze mutual missingness: {str(e)}")
            logger.error(f"Failed to analyze mutual missingness: {str(e)}", log_type='data_quality_assessment', console=verbose)

        try:
            results["segment_analysis"] = self.segment_based_analysis(df, verbose=verbose)
        except Exception as e:
            results["errors"].append(f"Failed to perform segment-based analysis: {str(e)}")
            logger.error(f"Failed to perform segment-based analysis: {str(e)}", log_type='data_quality_assessment', console=verbose)

        # Summarize whatever subset of steps succeeded.
        results["summary"] = self.generate_summary(results, verbose=verbose)

        return results
|
111 |
+
|
112 |
+
    def analyze_missing_rates(self, df: pd.DataFrame, verbose=False) -> Dict:
        """Calculate per-variable/column missing rates.

        Args:
            df: DataFrame to inspect.
            verbose: When True, errors are also echoed to the console.

        Returns:
            A dict with 'per_column' (records sorted by missing percentage,
            descending) and 'overall' totals, or {'error': ...} on failure.
        """
        try:
            missing_count = df.isnull().sum()
            missing_percentage = (missing_count / len(df)) * 100

            missing_info = pd.DataFrame({
                'Column': missing_count.index,
                'Missing Count': missing_count.values,
                'Missing Percentage': missing_percentage.values
            })

            # Most-affected columns first.
            missing_info = missing_info.sort_values('Missing Percentage', ascending=False)

            # Dataset-wide missingness across every cell.
            total_cells = df.shape[0] * df.shape[1]
            total_missing = df.isnull().sum().sum()
            overall_missing_percentage = (total_missing / total_cells) * 100

            result = {
                'per_column': missing_info.to_dict('records'),
                'overall': {
                    'total_cells': total_cells,
                    'total_missing': total_missing,
                    'overall_missing_percentage': overall_missing_percentage
                }
            }

            return result
        except Exception as e:
            logger.error(f"Error in analyze_missing_rates: {str(e)}", log_type="data_quality_assessment", console=verbose)
            return {'error': str(e)}
|
143 |
+
|
144 |
+
def littles_mcar_test(self, df: pd.DataFrame, verbose=False) -> Dict:
    """Approximate Little's MCAR test via pairwise Welch t-tests.

    For every numeric column with partial missingness, compares the means of
    each other numeric column between rows where the reference column is
    missing vs. present. Under MCAR those group means should not differ.

    Returns:
        Dict with 'status' plus, when completed, the MCAR verdict, test
        counts, the proportion of significant tests, an evidence label and
        the first 10 test details. On failure: {'status': 'error', ...}.
    """
    try:
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.empty:
            return {
                'status': 'skipped',
                'reason': 'No numeric columns available for Little\'s MCAR test'
            }

        if numeric_df.shape[0] < 3 or numeric_df.shape[1] < 2:
            return {
                'status': 'skipped',
                'reason': 'Not enough data for meaningful Little\'s MCAR test'
            }

        results = []

        for col in numeric_df.columns:
            try:
                missing_indicator = numeric_df[col].isnull()

                # A column that is fully observed or fully missing gives no
                # missing-vs-present split to test.
                if missing_indicator.sum() == 0 or missing_indicator.sum() == len(missing_indicator):
                    continue

                for other_col in numeric_df.columns:
                    if col == other_col:
                        continue

                    if numeric_df[other_col].isnull().sum() == len(numeric_df):
                        continue

                    not_missing_values = numeric_df.loc[~missing_indicator, other_col].dropna()
                    missing_values = numeric_df.loc[missing_indicator, other_col].dropna()

                    # Require a minimal group size for a meaningful t-test.
                    if len(not_missing_values) > 5 and len(missing_values) > 5:
                        try:
                            not_missing_mean = not_missing_values.mean()
                            missing_mean = missing_values.mean()

                            t_stat, p_value = stats.ttest_ind(
                                not_missing_values,
                                missing_values,
                                equal_var=False,  # Welch's t-test: no equal-variance assumption
                                nan_policy='omit'
                            )

                            results.append({
                                'reference_col': col,
                                'test_col': other_col,
                                'not_missing_mean': not_missing_mean,
                                'missing_mean': missing_mean,
                                'difference': abs(not_missing_mean - missing_mean),
                                't_statistic': t_stat,
                                'p_value': p_value,
                                'significant': p_value < 0.05
                            })
                        except Exception:
                            pass
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt.
                continue

        significant_tests = [test for test in results if test.get('significant', False)]

        if not results:
            is_mcar = None
            evidence_strength = None
            prop_significant = None
        else:
            prop_significant = len(significant_tests) / len(results)

            is_mcar = prop_significant <= 0.05

            if prop_significant == 0:
                evidence_strength = "Strong evidence for MCAR"
            elif prop_significant <= 0.05:
                evidence_strength = "Moderate evidence for MCAR"
            elif prop_significant <= 0.10:
                evidence_strength = "Weak evidence against MCAR"
            else:
                evidence_strength = "Strong evidence against MCAR"

        return {
            'status': 'completed',
            'is_mcar': is_mcar,
            'tests_conducted': len(results),
            'significant_tests': len(significant_tests),
            # Reuse the proportion computed above instead of recomputing it.
            'proportion_significant': prop_significant,
            'evidence_strength': evidence_strength,
            'test_details': results[:10]
        }
    except Exception as e:
        logger.error(f"Error in littles_mcar_test: {str(e)}", log_type="data_quality_assessment", console=verbose)
        return {'status': 'error', 'error': str(e)}
|
236 |
+
|
237 |
+
def check_mar_correlations(self, df: pd.DataFrame, verbose=False) -> Dict:
    """Check if missingness in one variable depends on observed values in others (MAR).

    Numeric predictors use Welch's t-test on values grouped by the target
    column's missingness; categorical predictors use a chi-square test on the
    missingness/category contingency table (rare categories pooled into
    'Other' first). Only significant relationships (p < 0.05) are recorded.

    Returns:
        Dict with 'is_mar', per-column 'details', and
        'columns_with_mar_evidence'; {'error': ...} on failure.
    """
    try:
        results = {}

        for col in df.columns:
            try:
                missing_count = df[col].isnull().sum()
                # Need both missing and observed rows to form two groups.
                if missing_count == 0 or missing_count == len(df):
                    continue

                col_results = {}

                for other_col in df.columns:
                    if col == other_col or df[other_col].isnull().sum() == len(df):
                        continue

                    if pd.api.types.is_numeric_dtype(df[other_col]):
                        try:
                            present_values = df.loc[~df[col].isnull(), other_col].dropna()
                            missing_values = df.loc[df[col].isnull(), other_col].dropna()

                            if len(present_values) > 5 and len(missing_values) > 5:
                                present_mean = present_values.mean()
                                missing_mean = missing_values.mean()

                                t_stat, p_value = stats.ttest_ind(
                                    present_values,
                                    missing_values,
                                    equal_var=False,
                                    nan_policy='omit'
                                )

                                if p_value < 0.05:
                                    col_results[other_col] = {
                                        'test': 't-test',
                                        'present_mean': present_mean,
                                        'missing_mean': missing_mean,
                                        'mean_difference': missing_mean - present_mean,
                                        't_statistic': t_stat,
                                        'p_value': p_value,
                                        'significant': True
                                    }
                        except Exception:
                            pass

                    elif pd.api.types.is_object_dtype(df[other_col]) or pd.api.types.is_categorical_dtype(df[other_col]):
                        try:
                            value_counts = df[other_col].value_counts()
                            # Pool rare categories so expected cell counts stay usable.
                            threshold = max(5, len(df) * 0.01)

                            temp_df = df.copy()

                            low_freq_cats = value_counts[value_counts < threshold].index.tolist()
                            if low_freq_cats:
                                temp_df[other_col] = temp_df[other_col].apply(
                                    lambda x: 'Other' if x in low_freq_cats else x
                                )

                            contingency = pd.crosstab(
                                temp_df[col].isnull(),
                                temp_df[other_col].fillna('Missing')
                            )

                            # Chi-square is only trustworthy when at most ~20%
                            # of cells are sparse (< 5).
                            if (contingency.shape[0] > 1 and contingency.shape[1] > 1 and
                                (contingency < 5).sum().sum() <= contingency.size * 0.2):

                                chi2, p, dof, expected = chi2_contingency(contingency)

                                if p < 0.05:
                                    col_results[other_col] = {
                                        'test': 'chi-square',
                                        'chi2': chi2,
                                        'p_value': p,
                                        'dof': dof,
                                        'significant': True
                                    }
                        except Exception:
                            pass

                if col_results:
                    results[col] = col_results
            except Exception:
                # BUG FIX: was a bare `except:`; also removed the unused local
                # `missingness` the original computed and never read.
                continue

        is_mar = len(results) > 0

        return {
            'is_mar': is_mar,
            'details': results,
            'columns_with_mar_evidence': list(results.keys())
        }
    except Exception as e:
        logger.error(f"Error in check_mar_correlations: {str(e)}", log_type="data_quality_assessment", console=verbose)
        return {'error': str(e)}
|
334 |
+
|
335 |
+
def analyze_mutual_missingness(self, df: pd.DataFrame, verbose=False) -> Dict:
    """Check for mutual missingness - analyze if certain features tend to be missing together.

    Three views are produced: (1) pairs whose missingness indicators are
    strongly correlated (|r| > 0.5), significance-checked with Fisher's exact
    test for sparse tables or chi-square otherwise; (2) co-occurrence counts
    with overlap and Jaccard ratios; (3) hierarchical clusters of columns
    with similar missingness patterns (best-effort).
    """
    try:
        # Hoisted: the original re-imported fisher_exact inside the pair loop.
        from scipy.stats import fisher_exact

        binary_missing = df.isnull().astype(int)

        if binary_missing.sum().sum() == 0:
            return {
                'status': 'skipped',
                'reason': 'No missing values found in dataset'
            }

        missingness_correlation = binary_missing.corr()

        strong_correlations = []

        for i in range(len(missingness_correlation.columns)):
            try:
                for j in range(i+1, len(missingness_correlation.columns)):
                    col1 = missingness_correlation.columns[i]
                    col2 = missingness_correlation.columns[j]
                    corr_value = missingness_correlation.iloc[i, j]

                    if abs(corr_value) > 0.5:
                        contingency = pd.crosstab(
                            binary_missing[col1],
                            binary_missing[col2]
                        )

                        try:
                            # Fisher's exact test when any cell is sparse.
                            if contingency.values.min() < 5:
                                _, p_value = fisher_exact(contingency)
                            else:
                                _, p_value, _, _ = chi2_contingency(contingency)

                            if p_value < 0.05:
                                strong_correlations.append({
                                    'column1': col1,
                                    'column2': col2,
                                    'correlation': corr_value,
                                    'p_value': p_value,
                                    'significant': True
                                })
                        except Exception:
                            # Significance undetermined; still record the pair.
                            strong_correlations.append({
                                'column1': col1,
                                'column2': col2,
                                'correlation': corr_value,
                                'significant': None
                            })
            except Exception:
                # BUG FIX: was a bare `except:`.
                continue

        co_occurrence = {}
        for i in range(len(df.columns)):
            try:
                for j in range(i+1, len(df.columns)):
                    col1 = df.columns[i]
                    col2 = df.columns[j]

                    both_missing = (df[col1].isnull() & df[col2].isnull()).sum()

                    if both_missing > 0:
                        col1_missing = df[col1].isnull().sum()
                        col2_missing = df[col2].isnull().sum()

                        union_missing = (df[col1].isnull() | df[col2].isnull()).sum()
                        jaccard = both_missing / union_missing if union_missing > 0 else 0

                        co_occurrence[(col1, col2)] = {
                            'both_missing': both_missing,
                            'col1_missing': col1_missing,
                            'col2_missing': col2_missing,
                            'co_occurrence_ratio': both_missing / min(col1_missing, col2_missing) if min(col1_missing, col2_missing) > 0 else 0,
                            'jaccard_coefficient': jaccard
                        }
            except Exception:
                # BUG FIX: was a bare `except:`.
                continue

        co_occurrence_list = [
            {
                'column1': cols[0],
                'column2': cols[1],
                'both_missing_count': data['both_missing'],
                'co_occurrence_ratio': data['co_occurrence_ratio'],
                'jaccard_coefficient': data['jaccard_coefficient']
            }
            for cols, data in co_occurrence.items()
        ]
        co_occurrence_list.sort(key=lambda x: x['co_occurrence_ratio'], reverse=True)

        # Best-effort hierarchical clustering of missingness patterns;
        # failures here must not sink the rest of the analysis.
        missingness_clusters = None
        try:
            if len(binary_missing.columns) > 1:
                cols_with_missing = [col for col in binary_missing.columns if binary_missing[col].sum() > 0]
                if len(cols_with_missing) > 1:
                    missing_data = binary_missing[cols_with_missing].T

                    dist_matrix = pdist(missing_data, metric='correlation')

                    linkage_matrix = linkage(dist_matrix, method='average')

                    clusters = fcluster(linkage_matrix, t=0.5, criterion='distance')

                    missingness_clusters = {}
                    for i, col in enumerate(cols_with_missing):
                        cluster_id = clusters[i]
                        if cluster_id not in missingness_clusters:
                            missingness_clusters[cluster_id] = []
                        missingness_clusters[cluster_id].append(col)

                    # Only multi-column clusters are interesting.
                    missingness_clusters = {k: v for k, v in missingness_clusters.items() if len(v) > 1}
        except Exception as e:
            logger.info(f"Warning: Clustering of missingness patterns failed: {str(e)}", log_type="data_quality_assessment", console=verbose)

        return {
            'status': 'completed',
            'strong_correlations': strong_correlations,
            'co_occurrence': co_occurrence_list[:15],
            'missingness_clusters': missingness_clusters
        }
    except Exception as e:
        logger.error(f"Error in analyze_mutual_missingness: {str(e)}", log_type='data_quality_assessment', console=verbose)
        return {'status': 'error', 'error': str(e)}
|
461 |
+
|
462 |
+
def segment_based_analysis(self, df: pd.DataFrame, verbose=False) -> Dict:
    """Analyze missingness grouped by categories.

    For every low-cardinality / categorical column, computes the percentage
    of missing values in the remaining columns within each category (top 10
    categories, minimum 5 rows per category).
    """
    try:
        results = {}

        # Candidate segmentation columns: object/categorical dtype, or
        # numeric columns with fewer than 10 distinct values.
        categorical_cols = [
            col for col in df.columns
            if pd.api.types.is_object_dtype(df[col]) or
            pd.api.types.is_categorical_dtype(df[col]) or
            (pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() < 10)
        ]

        if not categorical_cols:
            return {
                'status': 'skipped',
                'reason': 'No suitable categorical columns found for segmentation'
            }

        for cat_col in categorical_cols:
            try:
                # A mostly-missing column is not a useful segmentation axis.
                if df[cat_col].isnull().sum() > 0.5 * len(df):
                    continue

                df_temp = df.copy()
                df_temp[cat_col] = df_temp[cat_col].fillna('Missing')

                categories = df_temp[cat_col].value_counts().head(10).index.tolist()

                category_results = {}

                for category in categories:
                    subset = df_temp[df_temp[cat_col] == category]

                    if len(subset) < 5:
                        continue

                    missing_percentages = subset.drop(columns=[cat_col]).isnull().mean() * 100

                    missing_percentages = missing_percentages[missing_percentages > 0]

                    if not missing_percentages.empty:
                        category_results[category] = {
                            'sample_size': len(subset),
                            'missing_percentages': missing_percentages.to_dict()
                        }

                if category_results:
                    results[cat_col] = category_results

            except Exception as e:
                # BUG FIX: the original handler returned an error dict from
                # inside this loop, so one bad segmentation column aborted the
                # entire analysis. Log and move on to the next column instead.
                logger.error(f"Error segmenting by column '{cat_col}': {str(e)}", log_type='data_quality_assessment', console=verbose)
                continue

        return {
            'status': 'completed',
            'segments_analyzed': len(results),
            'details': results
        }
    except Exception as e:
        logger.error(f"Error in segment_based_analysis: {str(e)}", log_type='data_quality_assessment', console=verbose)
        return {'status': 'error', 'error': str(e)}
|
523 |
+
|
524 |
+
def generate_summary(self, results: Dict, verbose=False) -> str:
    """Generate a human-readable Markdown summary of missing data analysis results.

    Each section is built inside its own try/except so one malformed section
    of `results` cannot prevent the rest of the summary from being produced.

    Args:
        results: Aggregated output of the missing-data analysis steps.
        verbose: When True, section errors are also echoed to the console.

    Returns:
        A Markdown string (short-circuits after the overview if the dataset
        failed to load).
    """
    summary_lines = ["# Missing Data Analysis Summary"]

    # --- Dataset overview (early return when loading failed) ---
    try:
        if results["data_loaded"]:
            summary_lines.append("\n## Dataset Overview")
            summary_lines.append(f"- Successfully loaded dataset with shape: {results['data_shape'][0]} rows × {results['data_shape'][1]} columns")
        else:
            summary_lines.append("\n## Error Loading Dataset")
            summary_lines.append("- Failed to load the dataset. Please check the file path and format.")
            return "\n".join(summary_lines)
    except Exception as e:
        logger.error(f"Error in dataset loading section: {str(e)}", log_type='data_quality_assessment', console=verbose)

    # --- Missing rates overview ---
    try:
        if results.get("missing_rates") and "error" not in results["missing_rates"]:
            missing_rates = results["missing_rates"]
            summary_lines.append("\n## Missing Values Overview")
            summary_lines.append(f"- Overall missing data: {missing_rates['overall']['overall_missing_percentage']:.2f}% "
                                 f"({missing_rates['overall']['total_missing']} out of {missing_rates['overall']['total_cells']} cells)")

            if missing_rates['per_column']:
                summary_lines.append("\n### Top columns with missing values:")
                top_missing = sorted(missing_rates['per_column'], key=lambda x: x['Missing Percentage'], reverse=True)[:5]
                for col in top_missing:
                    if col['Missing Percentage'] > 0:
                        summary_lines.append(f"- {col['Column']}: {col['Missing Percentage']:.2f}% ({col['Missing Count']} values)")
    except Exception as e:
        logger.error(f"Error in missing values overview section: {str(e)}", log_type='data_quality_assessment', console=verbose)

    # --- Little's MCAR test ---
    try:
        little_test = results.get("little_mcar_test", {})
        if little_test.get("status") == "completed":
            summary_lines.append("\n## Little's MCAR Test Results")
            if little_test.get("is_mcar") is not None:
                # CONSOLIDATED: the two branches previously duplicated the
                # proportion/evidence lines verbatim; only the verdict text
                # and the fallback evidence label differ.
                if little_test["is_mcar"]:
                    summary_lines.append("- **Data appears to be Missing Completely At Random (MCAR)**")
                    default_evidence = 'Evidence supports MCAR'
                else:
                    summary_lines.append("- **Data does not appear to be Missing Completely At Random (MCAR)**")
                    default_evidence = 'Evidence against MCAR'
                if "proportion_significant" in little_test:
                    summary_lines.append(f"- Proportion of significant tests: {little_test['proportion_significant']:.2f}")
                summary_lines.append(f"- Evidence: {little_test.get('evidence_strength', default_evidence)}")
            else:
                summary_lines.append("- Could not determine MCAR status definitively")
        elif little_test.get("status") == "skipped":
            summary_lines.append("\n## Little's MCAR Test Results")
            summary_lines.append(f"- Test skipped: {little_test.get('reason', 'Unknown reason')}")
    except Exception as e:
        logger.error(f"Error in Little's MCAR test section: {str(e)}", log_type='data_quality_assessment', console=verbose)

    # --- MAR analysis ---
    try:
        mar_results = results.get("mar_correlations", {})
        if mar_results and "error" not in mar_results:
            summary_lines.append("\n## Missing At Random (MAR) Analysis")
            if mar_results.get("is_mar"):
                summary_lines.append("- **Evidence found that data is Missing At Random (MAR)**")
                summary_lines.append("- Missingness in some variables depends on observed values in other variables")
                mar_details = mar_results.get("details", {})
                for col in list(mar_details.keys())[:3]:
                    related_cols = list(mar_details[col].keys())
                    summary_lines.append(f"- Missingness in '{col}' depends on values in: {', '.join(related_cols[:3])}"
                                         + (f" and {len(related_cols) - 3} more columns" if len(related_cols) > 3 else ""))
            else:
                summary_lines.append("- No clear evidence that data is Missing At Random (MAR)")
    except Exception as e:
        logger.error(f"Error in MAR analysis section: {str(e)}", log_type='data_quality_assessment', console=verbose)

    # --- Mutual missingness ---
    try:
        mutual_results = results.get("mutual_missingness", {})
        if mutual_results.get("status") == "completed":
            summary_lines.append("\n## Mutual Missingness Analysis")
            strong_corrs = mutual_results.get("strong_correlations", [])
            if strong_corrs:
                summary_lines.append("- **Some variables tend to be missing together**")
                for corr in strong_corrs[:3]:
                    summary_lines.append(f"- '{corr['column1']}' and '{corr['column2']}' have strongly correlated missingness (r = {corr['correlation']:.2f})")
                if len(strong_corrs) > 3:
                    summary_lines.append(f"- {len(strong_corrs) - 3} other pairs of columns with strongly correlated missingness")
            else:
                summary_lines.append("- No strong patterns of mutual missingness detected")

            co_occurrence = mutual_results.get("co_occurrence", [])
            if co_occurrence:
                summary_lines.append("\n### Most common co-occurrences of missing values:")
                for co in co_occurrence[:3]:
                    summary_lines.append(f"- '{co['column1']}' and '{co['column2']}' are missing together in {co['both_missing_count']} rows")
    except Exception as e:
        logger.error(f"Error in mutual missingness section: {str(e)}", log_type='data_quality_assessment', console=verbose)

    # --- Segment-based analysis ---
    try:
        segment_results = results.get("segment_analysis", {})
        if segment_results.get("status") == "completed":
            summary_lines.append("\n## Segment-based Missing Value Analysis")
            segments = segment_results.get("details", {})
            if segments:
                summary_lines.append(f"- Analyzed missingness patterns across {len(segments)} different segmentations")
                example_segment = list(segments.keys())[0]
                summary_lines.append(f"\n### Example: Missingness by '{example_segment}' categories")
                for category in list(segments[example_segment].keys())[:3]:
                    cat_data = segments[example_segment][category]
                    sample_size = cat_data['sample_size']
                    missing_data = cat_data['missing_percentages']
                    top_missing = sorted(missing_data.items(), key=lambda x: x[1], reverse=True)[:2]
                    if top_missing:
                        summary_lines.append(f"- In category '{category}' (n={sample_size}):")
                        for col, pct in top_missing:
                            summary_lines.append(f"  - '{col}' has {pct:.1f}% missing values")
            else:
                summary_lines.append("- No significant segment-based missingness patterns found")
    except Exception as e:
        logger.error(f"Error in segment-based analysis section: {str(e)}", log_type='data_quality_assessment', console=verbose)

    # --- Recommendations ---
    try:
        summary_lines.append("\n## Recommendations for Missing Data")
        if results.get("missing_rates") and "error" not in results["missing_rates"]:
            missing_rates = results["missing_rates"]
            high_missing_cols = [col for col in missing_rates['per_column'] if col['Missing Percentage'] > 50]
            if high_missing_cols:
                summary_lines.append("- **Consider dropping columns** with high missing rates (>50%):")
                for col in high_missing_cols[:3]:
                    summary_lines.append(f"  - '{col['Column']}' ({col['Missing Percentage']:.1f}% missing)")
                if len(high_missing_cols) > 3:
                    summary_lines.append(f"  - and {len(high_missing_cols) - 3} other columns")

        if results.get("little_mcar_test", {}).get("is_mcar") is True:
            summary_lines.append("- Since data appears to be MCAR, **simple imputation** methods like mean/median/mode imputation are reasonable")

        if results.get("mar_correlations", {}).get("is_mar") is True:
            summary_lines.append("- Since evidence suggests data is MAR, consider **model-based imputation methods** like:")
            summary_lines.append("  - Multiple Imputation by Chained Equations (MICE)")
            summary_lines.append("  - K-Nearest Neighbors (KNN) imputation")
            summary_lines.append("  - Regression-based imputation")

        if results.get("mutual_missingness", {}).get("strong_correlations"):
            summary_lines.append("- For variables that are missing together, consider **multivariate imputation** approaches")
    except Exception as e:
        logger.error(f"Error in recommendations section: {str(e)}", log_type='data_quality_assessment', console=verbose)

    # --- Collected analysis issues ---
    try:
        if results.get("errors"):
            summary_lines.append("\n## Analysis Issues")
            summary_lines.append("The following issues were encountered during analysis:")
            for error in results["errors"]:
                summary_lines.append(f"- {error}")
    except Exception as e:
        logger.error(f"Error in final error reporting section: {str(e)}", log_type='data_quality_assessment', console=verbose)

    return "\n".join(summary_lines)
|
676 |
+
|
677 |
+
def agent_mva(self, report: str, verbose=False):
    """Run the agentic missing-value analysis.

    Builds a knowledge-base-backed agent, asks it to generate analysis code
    for the given report, writes that code to ``temp.py`` and executes its
    ``main`` entry point against ``self.data_source``.

    Returns:
        The result of the generated ``main(data_source)``, or "" on failure.
    """
    try:
        kbc = KnowledgeBaseClass()
        kb = kbc.initialize_knowledge_base(task_type=self.ml_task)
        agent = kbc.initialize_agent(
            agent_name="missing_value_analysis_agent",
            llm_choice=self.llm_choice,
            knowledge_base=kb
        )

        # Renamed from `input`, which shadowed the builtin.
        agent_input = f"Task type: {self.ml_task}\n\n\n{report}"
        res: RunResponse = agent.run(agent_input, stream=False)

        formatted_code = self.post_process_code(code=res.content.code_generated, verbose=verbose)

        with open("temp.py", "w") as f:
            f.write(formatted_code)

        # BUG FIX: `from temp import main` returns the cached module on repeat
        # calls, so a freshly generated temp.py would be silently ignored.
        # Import then reload to always execute the code just written.
        import importlib
        import temp
        temp = importlib.reload(temp)

        results = temp.main(self.data_source)

        return results
    except Exception as e:
        logger.error(f"Failed to run agentic analysis with error: {e}", log_type="data_quality_assessment", console=verbose)
        return ""
|
703 |
+
|
704 |
+
def post_process_code(self, code: str, verbose=False) -> str:
    """Normalize quoting in LLM-generated code.

    Rewrites single-quoted f-strings whose interpolations use double-quoted
    subscripts (``f'{d["k"]}'``) into double-quoted f-strings with
    single-quoted subscripts (``f"{d['k']}"``), then validates the result
    with ``ast.parse``. The code is returned even if validation fails; the
    syntax error is only logged (best-effort repair).
    """
    def fix_fstring_quotes(match):
        inner = match.group(1)
        # Swap d["key"] -> d['key'] inside the interpolation braces.
        fixed_inner = re.sub(r'\{([^{}]*?)\["([^"]+)"\]\}', r"{\1['\2']}", inner)
        return f'f"{fixed_inner}"'

    # Single-quoted f-string containing at least one {...} substitution.
    fstring_pattern = r"f'([^']*{[^}]+}[^']*)'"
    code = re.sub(fstring_pattern, fix_fstring_quotes, code)

    try:
        ast.parse(code)
    except SyntaxError as e:
        # Report but still return the (possibly broken) code; the redundant
        # trailing `pass` from the original handler was removed.
        logger.error(f"[Syntax Error after fix] Line {e.lineno}: {e.msg}", log_type="data_quality_assessment", console=verbose)

    return code
|
720 |
+
|
721 |
+
def detect_duplicates(
    self,
    key_columns: Optional[List[str]] = None,
    similarity_columns: Optional[List[str]] = None,
    similarity_threshold: float = 0.8,
    numeric_threshold: float = 0.05,
    sample_size: Optional[int] = None,
    verbose = False
) -> Dict[str, Any]:
    """Comprehensive duplicate detection function that can analyze any CSV dataset"""

    df = self.data

    # Result skeleton; each analysis stage below fills in its own section.
    results = {
        "dataset_info": {
            "original_rows": len(df),
            "original_columns": len(df.columns),
            "column_dtypes": {column: str(df[column].dtype) for column in df.columns}
        },
        "exact_duplicates": {},
        "key_based_duplicates": {},
        "near_duplicates": {},
        "distribution_impact": {},
        "summary": ""
    }

    # Optionally analyze a fixed-seed random sample to bound the cost.
    analysis_df = df
    if sample_size and len(df) > sample_size:
        try:
            analysis_df = df.sample(sample_size, random_state=42)
            results["dataset_info"]["sampled"] = True
            results["dataset_info"]["sample_size"] = sample_size
        except Exception as e:
            results["dataset_info"]["sampling_error"] = str(e)

    # Stage 1: byte-identical rows.
    try:
        results["exact_duplicates"] = self.analyze_exact_duplicates(analysis_df, verbose=verbose)
    except Exception as e:
        results["exact_duplicates"]["error"] = str(e)

    # Stage 2: duplicates on (detected or supplied) key columns.
    try:
        results["key_based_duplicates"] = self.analyze_key_based_duplicates(analysis_df, key_columns, verbose=verbose)
    except Exception as e:
        results["key_based_duplicates"]["error"] = str(e)

    # Stage 3: similarity-based near duplicates.
    try:
        results["near_duplicates"] = self.analyze_near_duplicates(
            analysis_df, similarity_columns, similarity_threshold, numeric_threshold, verbose=verbose
        )
    except Exception as e:
        results["near_duplicates"]["error"] = str(e)

    # Stage 4: how deduplication would shift distributions.
    try:
        results["distribution_impact"] = self.analyze_distribution_impact(analysis_df, results, verbose=verbose)
    except Exception as e:
        results["distribution_impact"]["error"] = str(e)

    # Stage 5: human-readable wrap-up of all stages.
    try:
        results["summary"] = self.generate_duplicate_analysis_summary(results, verbose=verbose)
    except Exception as e:
        results["summary"] = f"Error generating summary: {str(e)}"

    return results
|
789 |
+
|
790 |
+
def analyze_exact_duplicates(self, df: pd.DataFrame, verbose=False) -> Dict[str, Any]:
    """Detect and analyze exact duplicate rows.

    Reports the count/percentage of duplicated rows, the distinct duplicate
    patterns, the most frequent duplicates, and (for frames with at most 20
    columns) per-column duplication counts.
    """
    results = {}

    try:
        duplicate_mask = df.duplicated()
        duplicated_rows = df[duplicate_mask]

        # Hoisted: all rows participating in any duplicate group, computed
        # once instead of twice as in the original.
        all_duplicates = df[df.duplicated(keep=False)]
        unique_duplicate_patterns = all_duplicates.drop_duplicates()

        results["total_exact_duplicates"] = int(duplicate_mask.sum())
        results["unique_duplicate_patterns"] = len(unique_duplicate_patterns)
        results["duplicate_percentage"] = round(results["total_exact_duplicates"] / len(df) * 100, 2)

        if len(duplicated_rows) > 0:
            dup_counts = Counter(map(tuple, all_duplicates.itertuples(index=False)))
            most_common = [(str(k), v) for k, v in dup_counts.most_common(5)]
            results["most_common_duplicates"] = most_common

        # Per-column scan is O(columns * rows); only do it for narrow frames.
        if len(df.columns) <= 20:
            column_duplication = {}
            for col in df.columns:
                dup_count = df.duplicated(subset=[col], keep=False).sum()
                if dup_count > 0:
                    column_duplication[col] = int(dup_count)

            results["column_duplication_counts"] = column_duplication

    except Exception as e:
        results["error"] = f"Error in exact duplicates analysis: {str(e)}"
        logger.error(f"Error in exact duplicates analysis: {str(e)}", log_type="data_quality_assessment", console=verbose)

    return results
|
823 |
+
|
824 |
+
def analyze_key_based_duplicates(self, df: pd.DataFrame, key_columns: Optional[List[str]] = None, verbose=False) -> Dict[str, Any]:
    """Detect and analyze key-based duplicates.

    When ``key_columns`` is None, candidate keys are auto-detected via
    ``identify_key_candidates``. For each key the count/percentage of rows
    sharing a key value is reported, plus the top duplicated values; when
    several keys are given, the composite key is also checked.
    """
    results = {}

    try:
        if key_columns is None:
            key_columns = self.identify_key_candidates(df)
            results["detected_key_candidates"] = key_columns

        if not key_columns:
            results["message"] = "No key columns identified or provided"
            return results

        dup_counts = {}
        for key in key_columns:
            try:
                if key in df.columns:
                    dups = df.duplicated(subset=[key], keep=False)
                    dup_count = int(dups.sum())
                    dup_percentage = round(dup_count / len(df) * 100, 2)

                    if dup_count > 0:
                        dup_values = df[dups][key].value_counts().head(5).to_dict()
                        # Stringify keys / int-ify counts for JSON-friendliness.
                        dup_values = {str(k): int(v) for k, v in dup_values.items()}
                    else:
                        dup_values = {}

                    dup_counts[key] = {
                        "duplicate_count": dup_count,
                        "duplicate_percentage": dup_percentage,
                        "top_duplicated_values": dup_values
                    }
            except Exception as e:
                dup_counts[key] = {"error": str(e)}

        results["key_duplicate_analysis"] = dup_counts

        if len(key_columns) > 1:
            try:
                multi_key_dups = df.duplicated(subset=key_columns, keep=False)
                # Hoisted: the original summed the mask twice.
                multi_dup_count = int(multi_key_dups.sum())
                results["multi_key_duplicates"] = {
                    "count": multi_dup_count,
                    "percentage": round(multi_dup_count / len(df) * 100, 2)
                }
            except Exception as e:
                results["multi_key_duplicates"] = {"error": str(e)}

    except Exception as e:
        results["error"] = f"Error in key-based duplicates analysis: {str(e)}"
        logger.error(f"Error in key-based duplicates analysis: {str(e)}", log_type="data_quality_assessment", console=verbose)

    return results
|
876 |
+
|
877 |
+
def identify_key_candidates(self, df: pd.DataFrame) -> List[str]:
    """
    Identify potential primary key columns in the dataframe
    """
    candidates = []

    try:
        row_count = len(df)

        # Pass 1: fully populated columns whose values are all distinct.
        for col in df.columns:
            series = df[col]
            if series.notna().all() and series.nunique() == row_count:
                candidates.append(col)

        # Pass 2: fall back to nearly-unique (>90%) complete columns.
        if not candidates:
            for col in df.columns:
                if df[col].nunique() / row_count > 0.9 and df[col].notna().all():
                    candidates.append(col)

        # Pass 3: last resort — name-based heuristics.
        if not candidates:
            key_terms = ('id', 'key', 'code', 'num', 'uuid')
            for col in df.columns:
                lowered = col.lower()
                if any(term in lowered for term in key_terms):
                    candidates.append(col)

    except Exception:
        # Best-effort detection: return whatever was collected so far.
        pass

    return candidates
|
903 |
+
|
904 |
+
def analyze_near_duplicates(
    self,
    df: pd.DataFrame,
    similarity_columns: Optional[List[str]] = None,
    similarity_threshold: float = 0.8,
    numeric_threshold: float = 0.05,
    verbose: bool = False
) -> Dict[str, Any]:
    """
    Detect and analyze near-duplicate rows using similarity metrics.

    Runs three passes: per-column text similarity (difflib ratio),
    per-column numeric similarity (relative difference), and a combined
    multi-column TF-IDF / cosine-similarity pass.

    Args:
        df: Dataframe to analyze.
        similarity_columns: Columns to compare. When None, only the first
            five object-dtype (text) columns are auto-selected — numeric
            columns are listed but NOT auto-analyzed; pass them explicitly.
        similarity_threshold: Minimum similarity (0-1) for text / vector pairs.
        numeric_threshold: Maximum relative difference for numeric pairs.
        verbose: Forwarded to the error logger's console flag.

    Returns:
        Dict with keys such as "text_similarity", "numeric_similarity",
        "multi_column_similarity", plus sampling metadata; errors are
        recorded per-section rather than raised.
    """
    results = {}

    try:
        if similarity_columns is None:
            text_columns = [col for col in df.columns if df[col].dtype == 'object']
            numeric_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]

            # Cap auto-selection at 5 text columns to bound pairwise cost.
            similarity_columns = text_columns[:5] if len(text_columns) > 5 else text_columns
            results["auto_selected_text_columns"] = similarity_columns
            results["available_numeric_columns"] = numeric_columns

        if not similarity_columns:
            results["message"] = "No text columns identified for similarity analysis"
            return results

        # Pairwise comparison is O(n^2); work on at most 1000 sampled rows.
        sample_size = min(1000, len(df))
        if len(df) > sample_size:
            sample_df = df.sample(sample_size, random_state=42)
            results["sampled_for_similarity"] = True
            results["similarity_sample_size"] = sample_size
        else:
            sample_df = df

        # Pass 1: per-column text similarity.
        text_similarity_results = {}
        for col in similarity_columns:
            try:
                if col in df.columns and df[col].dtype == 'object':
                    col_data = sample_df[col].fillna("").astype(str)

                    # A constant column cannot contain near-duplicates.
                    if col_data.nunique() <= 1:
                        continue

                    similar_pairs = self.find_similar_text(col_data, similarity_threshold)

                    if similar_pairs:
                        text_similarity_results[col] = {
                            "similar_pairs_count": len(similar_pairs),
                            "examples": similar_pairs[:5]  # Limit to first 5 examples
                        }
            except Exception as e:
                text_similarity_results[col] = {"error": str(e)}

        results["text_similarity"] = text_similarity_results

        # Pass 2: per-column numeric similarity (explicitly listed columns only).
        numeric_similarity_results = {}
        numeric_cols = [col for col in df.columns if col in similarity_columns and pd.api.types.is_numeric_dtype(df[col])]

        for col in numeric_cols:
            try:
                if df[col].isna().sum() / len(df) > 0.3:  # More than 30% missing
                    continue

                col_data = sample_df[col].dropna()
                similar_numeric_pairs = self.find_similar_numeric(col_data, numeric_threshold)

                if similar_numeric_pairs:
                    numeric_similarity_results[col] = {
                        "similar_pairs_count": len(similar_numeric_pairs),
                        "examples": similar_numeric_pairs[:5]  # Limit to first 5 examples
                    }
            except Exception as e:
                numeric_similarity_results[col] = {"error": str(e)}

        results["numeric_similarity"] = numeric_similarity_results

        # Pass 3: whole-row similarity via TF-IDF over concatenated columns.
        try:
            if len(similarity_columns) >= 2:
                combined_text = sample_df[similarity_columns].fillna("").astype(str).apply(
                    lambda x: " ".join(x), axis=1
                )

                if len(combined_text) > 1:  # Need at least 2 rows for comparison
                    vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
                    try:
                        tfidf_matrix = vectorizer.fit_transform(combined_text)
                        similar_doc_pairs = self.find_similar_vectors(tfidf_matrix, similarity_threshold)

                        if similar_doc_pairs:
                            results["multi_column_similarity"] = {
                                "similar_rows_count": len(similar_doc_pairs),
                                "examples": similar_doc_pairs[:5]  # Limit to first 5 examples
                            }
                    except Exception as e:
                        # TfidfVectorizer raises e.g. on an all-stop-words vocabulary.
                        results["multi_column_similarity"] = {"error": str(e)}
        except Exception as e:
            results["multi_column_similarity_error"] = str(e)

    except Exception as e:
        results["error"] = f"Error in near-duplicates analysis: {str(e)}"
        logger.error(f"Error in near-duplicates analysis: {str(e)}", log_type="data_quality_assessment", console=verbose)

    return results
|
1008 |
+
|
1009 |
+
def find_similar_text(self, series: pd.Series, threshold: float) -> List[Tuple[str, str, float]]:
    """
    Find pairs of similar text values in a series.

    Uses difflib.SequenceMatcher ratios over all index pairs; for more
    than 200 values, at most 10000 randomly sampled pairs are compared
    (sampling is unseeded, so large inputs give non-deterministic picks).

    Args:
        series: Text values to compare (falsy values are skipped).
        threshold: Minimum similarity ratio (0-1) to report a pair.

    Returns:
        List of (value_a, value_b, similarity) tuples, similarity rounded
        to 2 decimals.
    """
    import random
    from itertools import combinations

    similar_pairs: List[Tuple[str, str, float]] = []
    values = series.tolist()

    max_comparisons = 10000
    # Unified pair generation: combinations(range, 2) yields the same
    # (i < j) order as the old nested loops for small inputs.
    pairs = combinations(range(len(values)), 2)
    if len(values) > 200:  # For large sets, sample comparisons
        all_pairs = list(pairs)
        if len(all_pairs) > max_comparisons:
            pairs = random.sample(all_pairs, max_comparisons)
        else:
            pairs = all_pairs

    for i, j in pairs:
        # Fix: the old exhaustive branch wrapped the whole inner loop in
        # one try, so a single bad value aborted every remaining
        # comparison for that i. Guard each pair individually instead.
        try:
            if not values[i] or not values[j]:
                continue

            similarity = difflib.SequenceMatcher(None, values[i], values[j]).ratio()
            if similarity >= threshold:
                similar_pairs.append((values[i], values[j], round(similarity, 2)))
        except Exception:
            continue

    return similar_pairs
|
1052 |
+
|
1053 |
+
def find_similar_numeric(self, series: pd.Series, threshold: float) -> List[Tuple[float, float, float]]:
    """
    Find pairs of numerically close values in a series.

    Two values are "similar" when their relative difference
    (max-min)/max, computed on absolute values, is <= threshold.
    Zero values are skipped. For more than 200 values, at most 10000
    randomly sampled pairs are compared (unseeded sampling).

    Args:
        series: Numeric values to compare (caller should drop NaNs).
        threshold: Maximum relative difference (0-1) to report a pair.

    Returns:
        List of (value_a, value_b, relative_difference) tuples, the
        difference rounded to 3 decimals.
    """
    import random
    from itertools import combinations

    similar_pairs: List[Tuple[float, float, float]] = []
    values = series.tolist()

    max_comparisons = 10000
    # Unified pair generation replaces the two duplicated comparison
    # loops of the original implementation.
    pairs = combinations(range(len(values)), 2)
    if len(values) > 200:  # For large sets, sample comparisons
        all_pairs = list(pairs)
        if len(all_pairs) > max_comparisons:
            pairs = random.sample(all_pairs, max_comparisons)
        else:
            pairs = all_pairs

    for i, j in pairs:
        # Fix: guard each pair individually — the old exhaustive branch
        # put one try around the whole inner loop, so a single bad value
        # silently skipped all remaining comparisons for that i.
        try:
            if values[i] == 0 or values[j] == 0:
                continue

            max_val = max(abs(values[i]), abs(values[j]))
            min_val = min(abs(values[i]), abs(values[j]))

            if max_val == 0:  # Both values are zero (defensive; zeros skipped above)
                continue

            rel_diff = (max_val - min_val) / max_val

            if rel_diff <= threshold:
                similar_pairs.append((values[i], values[j], round(rel_diff, 3)))
        except Exception:
            continue

    return similar_pairs
|
1111 |
+
|
1112 |
+
def find_similar_vectors(self, tfidf_matrix, threshold: float) -> List[Tuple[int, int, float]]:
    """
    Find pairs of similar documents from their TF-IDF vectors.

    Computes a cosine-similarity matrix (over at most 1000 randomly
    sampled rows) and reports every (i, j) pair, i < j, whose similarity
    meets the threshold.

    Returns:
        List of (row_index_a, row_index_b, similarity) tuples with the
        similarity rounded to 2 decimals.
    """
    max_rows = 1000
    n_rows = tfidf_matrix.shape[0]

    # Bound the O(n^2) similarity matrix by sampling large inputs.
    if n_rows > max_rows:
        import random
        indices = random.sample(range(n_rows), max_rows)
        sampled_matrix = tfidf_matrix[indices]
    else:
        sampled_matrix = tfidf_matrix
        indices = list(range(n_rows))

    similarity_matrix = cosine_similarity(sampled_matrix)

    similar_pairs: List[Tuple[int, int, float]] = []
    rows, cols = np.where(similarity_matrix >= threshold)
    for i, j in zip(rows, cols):
        try:
            # np.where returns both (i, j) and (j, i); keep i < j only,
            # which also drops the trivial diagonal matches.
            if i < j:
                similar_pairs.append((indices[i], indices[j], round(similarity_matrix[i, j], 2)))
        except:
            continue

    return similar_pairs
|
1138 |
+
|
1139 |
+
def analyze_distribution_impact(self, df: pd.DataFrame, results: Dict[str, Any], verbose: bool = False) -> Dict[str, Any]:
    """
    Assess impact of duplicates on distribution statistics.

    Compares summary statistics of the raw frame against its
    drop_duplicates() version: per-numeric-column mean/median/std/min/max
    percentage changes, per-categorical-column frequency shifts, and an
    overall impact level from get_impact_level().

    Args:
        df: Dataframe to analyze.
        results: NOTE(review): this parameter is never read in the body —
            presumably kept for interface symmetry with sibling analyzers.
        verbose: Forwarded to the error logger's console flag.

    Returns:
        Dict with "numeric_columns_impact", "categorical_columns_impact"
        and "overall_assessment" keys; errors are recorded per-section.
    """
    distribution_results = {}

    try:
        df_deduped = df.drop_duplicates()

        # No exact duplicates: the comparison would be a no-op.
        if len(df) == len(df_deduped):
            distribution_results["message"] = "No exact duplicates found for distribution impact analysis"
            return distribution_results

        numeric_cols = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]

        # Per-numeric-column before/after statistics.
        col_impacts = {}
        for col in numeric_cols:
            try:
                if df[col].isna().sum() / len(df) > 0.5:  # More than 50% missing
                    continue

                orig_stats = {
                    "mean": float(df[col].mean()),
                    "median": float(df[col].median()),
                    "std": float(df[col].std()),
                    "min": float(df[col].min()),
                    "max": float(df[col].max())
                }

                dedup_stats = {
                    "mean": float(df_deduped[col].mean()),
                    "median": float(df_deduped[col].median()),
                    "std": float(df_deduped[col].std()),
                    "min": float(df_deduped[col].min()),
                    "max": float(df_deduped[col].max())
                }

                # Relative change of each statistic after deduplication.
                pct_changes = {}
                for stat in orig_stats:
                    try:
                        if orig_stats[stat] != 0:
                            pct_changes[stat] = round(
                                (dedup_stats[stat] - orig_stats[stat]) / abs(orig_stats[stat]) * 100, 2
                            )
                        else:
                            # Baseline of 0: any nonzero new value is an infinite relative change.
                            pct_changes[stat] = 0.0 if dedup_stats[stat] == 0 else float('inf')
                    except:
                        continue

                col_impacts[col] = {
                    "original_stats": orig_stats,
                    "deduped_stats": dedup_stats,
                    "percentage_changes": pct_changes,
                    # A column is flagged when any statistic moved by more than 5%.
                    "significant_change": any(abs(chg) > 5 for chg in pct_changes.values())
                }
            except Exception as e:
                col_impacts[col] = {"error": str(e)}

        distribution_results["numeric_columns_impact"] = col_impacts

        cat_cols = [col for col in df.columns if df[col].dtype == 'object']

        # Cap categorical work at 10 columns.
        cat_cols = cat_cols[:10] if len(cat_cols) > 10 else cat_cols

        # Per-categorical-column frequency shifts (top-10 categories only).
        cat_impacts = {}
        for col in cat_cols:
            try:
                orig_counts = df[col].value_counts(normalize=True).head(10).to_dict()
                dedup_counts = df_deduped[col].value_counts(normalize=True).head(10).to_dict()

                # Stringify keys so downstream JSON serialization is safe.
                orig_counts = {str(k): float(v) for k, v in orig_counts.items()}
                dedup_counts = {str(k): float(v) for k, v in dedup_counts.items()}

                changed_cats = {}
                all_cats = set(list(orig_counts.keys()) + list(dedup_counts.keys()))

                for cat in all_cats:
                    orig_val = orig_counts.get(cat, 0)
                    dedup_val = dedup_counts.get(cat, 0)

                    if orig_val > 0:
                        pct_change = round((dedup_val - orig_val) / orig_val * 100, 2)
                        if abs(pct_change) > 5:  # 5% threshold for significant change
                            changed_cats[cat] = pct_change

                cat_impacts[col] = {
                    "significant_category_changes": changed_cats,
                    "has_significant_changes": len(changed_cats) > 0
                }
            except Exception as e:
                cat_impacts[col] = {"error": str(e)}

        distribution_results["categorical_columns_impact"] = cat_impacts

        # Roll the per-column flags up into a single impact level.
        try:
            significant_numeric_changes = sum(
                1 for col in col_impacts if "significant_change" in col_impacts[col] and col_impacts[col]["significant_change"]
            )

            significant_cat_changes = sum(
                1 for col in cat_impacts if "has_significant_changes" in cat_impacts[col] and cat_impacts[col]["has_significant_changes"]
            )

            distribution_results["overall_assessment"] = {
                "columns_with_significant_numeric_changes": significant_numeric_changes,
                "columns_with_significant_categorical_changes": significant_cat_changes,
                "total_columns_analyzed": len(col_impacts) + len(cat_impacts),
                "duplicates_impact_level": self.get_impact_level(
                    significant_numeric_changes + significant_cat_changes,
                    len(col_impacts) + len(cat_impacts)
                )
            }
        except Exception as e:
            distribution_results["overall_assessment"] = {"error": str(e)}

    except Exception as e:
        distribution_results["error"] = f"Error in distribution impact analysis: {str(e)}"
        logger.error(f"Error in distribution impact analysis: {str(e)}", log_type="data_quality_assessment", console=verbose)

    return distribution_results
|
1259 |
+
|
1260 |
+
def get_impact_level(self, significant_changes: int, total_cols: int) -> str:
    """
    Determine the impact level from the proportion of significantly
    changed columns.

    Args:
        significant_changes: Number of columns flagged as significantly changed.
        total_cols: Total number of columns analyzed.

    Returns:
        One of "Unknown", "None", "Minimal", "Low", "Moderate", "High",
        "Severe". "Unknown" is returned when no columns were analyzed or
        the ratio cannot be computed.
    """
    if total_cols == 0:
        return "Unknown"
    try:
        proportion = significant_changes / total_cols
    except Exception:
        # Fix: the original did `except: pass` and fell off the end,
        # implicitly returning None and breaking the -> str contract.
        return "Unknown"

    if proportion == 0:
        return "None"
    elif proportion < 0.1:
        return "Minimal"
    elif proportion < 0.3:
        return "Low"
    elif proportion < 0.5:
        return "Moderate"
    elif proportion < 0.7:
        return "High"
    else:
        return "Severe"
|
1283 |
+
|
1284 |
+
def generate_duplicate_analysis_summary(self, results: Dict[str, Any], verbose: bool = False) -> str:
    """
    Generate a text summary of duplicate detection results.

    Reads the sections produced by the duplicate-detection pipeline
    ("dataset_info", "exact_duplicates", "key_based_duplicates",
    "near_duplicates", "distribution_impact") and renders each as a short
    paragraph, finishing with a recommendation keyed to the exact
    duplicate percentage. Every section is individually guarded, so a
    malformed section yields an inline error line rather than an exception.

    Args:
        results: Combined results dict from the duplicate analyses.
        verbose: NOTE(review): unused in the body — presumably kept for
            signature symmetry with sibling summary generators.

    Returns:
        Newline-joined human-readable summary.
    """
    summary_parts = []

    # Section 1: dataset shape and sampling note.
    try:
        dataset_info = results.get("dataset_info", {})
        summary_parts.append(f"Dataset Summary: {dataset_info.get('original_rows', 'N/A')} rows, "
                             f"{dataset_info.get('original_columns', 'N/A')} columns")
        if dataset_info.get("sampled"):
            summary_parts.append(f"Analysis performed on a sample of {dataset_info.get('sample_size', 'N/A')} rows")
    except Exception as e:
        summary_parts.append(f"Error summarizing dataset info: {e}")

    # Section 2: exact duplicates.
    try:
        exact_dups = results.get("exact_duplicates", {})
        if "error" in exact_dups:
            summary_parts.append(f"Error in exact duplicate analysis summary: {exact_dups['error']}")
        else:
            dup_count = exact_dups.get("total_exact_duplicates", 0)
            dup_pct = exact_dups.get("duplicate_percentage", 0)
            summary_parts.append(f"Exact Duplicates: {dup_count} rows ({dup_pct}% of dataset)")
    except Exception as e:
        summary_parts.append(f"Error summarizing exact duplicates: {e}")

    # Section 3: key-based duplicates (candidates, per-key counts, multi-key).
    try:
        key_dups = results.get("key_based_duplicates", {})
        if "error" in key_dups:
            summary_parts.append(f"Error in key-based duplicate analysis summary: {key_dups['error']}")
        else:
            if "detected_key_candidates" in key_dups:
                candidates = key_dups.get("detected_key_candidates", [])
                if candidates:
                    summary_parts.append(f"Detected Key Candidates: {', '.join(candidates)}")
                else:
                    summary_parts.append("No key candidates detected")
            key_analysis = key_dups.get("key_duplicate_analysis", {})
            if key_analysis:
                key_summary = []
                for key, info in key_analysis.items():
                    # Per-key errors are simply omitted from the summary.
                    if "error" in info:
                        continue
                    key_summary.append(f"{key}: {info.get('duplicate_count', 0)} duplicates "
                                       f"({info.get('duplicate_percentage', 0)}%)")
                if key_summary:
                    summary_parts.append("Key-Based Duplicates Summary:\n- " + "\n- ".join(key_summary))
            multi_key_dups = key_dups.get("multi_key_duplicates", {})
            if multi_key_dups and "error" not in multi_key_dups:
                summary_parts.append(f"Multi-Column Key Duplicates: {multi_key_dups.get('count', 0)} rows "
                                     f"({multi_key_dups.get('percentage', 0)}%)")
    except Exception as e:
        summary_parts.append(f"Error summarizing key-based duplicates: {e}")

    # Section 4: near-duplicates (text, numeric, multi-column passes).
    try:
        near_dups = results.get("near_duplicates", {})
        if "error" in near_dups:
            summary_parts.append(f"Error in near-duplicate analysis summary: {near_dups['error']}")
        else:
            text_similarity = near_dups.get("text_similarity", {})
            if text_similarity:
                text_similar_count = sum(info.get("similar_pairs_count", 0)
                                         for info in text_similarity.values()
                                         if "error" not in info)
                if text_similar_count > 0:
                    summary_parts.append(f"Text Near-Duplicates: {text_similar_count} similar pairs identified")

            numeric_similarity = near_dups.get("numeric_similarity", {})
            if numeric_similarity:
                numeric_similar_count = sum(info.get("similar_pairs_count", 0)
                                            for info in numeric_similarity.values()
                                            if "error" not in info)
                if numeric_similar_count > 0:
                    summary_parts.append(f"Numeric Near-Duplicates: {numeric_similar_count} similar pairs identified")

            multi_col = near_dups.get("multi_column_similarity", {})
            if multi_col and "error" not in multi_col and multi_col.get("similar_rows_count", 0) > 0:
                summary_parts.append(f"Multi-Column Near-Duplicates: {multi_col.get('similar_rows_count', 0)} similar row pairs")
    except Exception as e:
        summary_parts.append(f"Error summarizing near duplicates: {e}")

    # Section 5: distribution impact roll-up.
    try:
        dist_impact = results.get("distribution_impact", {})
        if "error" in dist_impact:
            summary_parts.append(f"Error in distribution impact analysis summary: {dist_impact['error']}")
        else:
            overall = dist_impact.get("overall_assessment", {})
            if overall and "error" not in overall:
                impact_level = overall.get("duplicates_impact_level", "Unknown")
                sig_cols = overall.get("columns_with_significant_numeric_changes", 0) + \
                           overall.get("columns_with_significant_categorical_changes", 0)
                total_cols = overall.get("total_columns_analyzed", 0)
                if total_cols > 0:
                    summary_parts.append(f"Distribution Impact: {impact_level} "
                                         f"({sig_cols}/{total_cols} columns significantly affected)")
    except Exception as e:
        summary_parts.append(f"Error summarizing distribution impact: {e}")

    # Final recommendation, tiered by the exact-duplicate percentage.
    try:
        if "exact_duplicates" in results and "total_exact_duplicates" in results["exact_duplicates"]:
            dup_pct = results["exact_duplicates"].get("duplicate_percentage", 0)

            if dup_pct > 20:
                summary_parts.append("\nRECOMMENDATION: High duplicate percentage detected. "
                                     "Consider deduplicating the dataset before analysis.")
            elif dup_pct > 5:
                summary_parts.append("\nRECOMMENDATION: Moderate duplicate percentage detected. "
                                     "Consider the impact of duplicates on your analysis.")
            elif dup_pct > 0:
                summary_parts.append("\nRECOMMENDATION: Low duplicate percentage detected. "
                                     "Minimal impact expected on analysis.")
            else:
                summary_parts.append("\nRECOMMENDATION: No exact duplicates found.")
        else:
            summary_parts.append("\nRECOMMENDATION: Duplicate analysis complete.")
    except Exception as e:
        summary_parts.append(f"Error generating recommendation: {e}")

    return "\n".join(summary_parts)
|
1403 |
+
|
1404 |
+
def perform_consistency_checks(self, verbose=False) -> Dict[str, Any]:
    """Performs various data consistency checks.

    Each check is run independently; a failing check contributes an
    {"error": ...} entry and a log line instead of aborting the rest.
    """
    # (result key, bound check method, label used in the error log)
    checks = [
        ("cross_field_validation", self.check_cross_field_validity, "cross-field validation"),
        ("logical_relationship", self.verify_logical_relationships, "logical relationship verification"),
        ("data_type_validation", self.validate_data_types, "data type validation"),
        ("value_transition_validity", self.check_value_transitions, "value transition validity"),
        ("unit_consistency", self.check_unit_consistency, "unit consistency checks"),
        ("format_consistency", self.check_format_consistency, "format consistency tests"),
    ]

    results = {}
    for result_key, check_fn, label in checks:
        try:
            results[result_key] = check_fn()
        except Exception as e:
            results[result_key] = {"error": str(e)}
            logger.error(f"Error in {label}: {e}", log_type="consistency_check", console=verbose)

    return results
|
1445 |
+
|
1446 |
+
def check_cross_field_validity(self) -> Dict[str, Any]:
    """Example cross-field rule: a discount must never exceed the price."""
    data = self.data
    if data is None or 'price' not in data.columns or 'discount' not in data.columns:
        return {"warning": "Price or discount columns not found."}
    violations = data.loc[data['discount'] > data['price']]
    return {"invalid_count": len(violations), "invalid_indices": violations.index.tolist()}
|
1452 |
+
|
1453 |
+
def verify_logical_relationships(self) -> Dict[str, Any]:
    """Example logical rule: a 'shipped' order must carry a shipment_date."""
    data = self.data
    if data is None or 'order_status' not in data.columns or 'shipment_date' not in data.columns:
        return {"warning": "order_status or shipment_date columns not found."}
    shipped_without_date = data[(data['order_status'] == 'shipped') & (data['shipment_date'].isnull())]
    return {"invalid_count": len(shipped_without_date), "invalid_indices": shipped_without_date.index.tolist()}
|
1459 |
+
|
1460 |
+
def validate_data_types(self) -> Dict[str, Any]:
    """Report pandas' inferred dtype for every column (basic type sanity check)."""
    if self.data is None:
        return {"warning": "No data loaded."}

    inferred = {}
    for col in self.data.columns:
        try:
            inferred[col] = {"inferred_type": pd.api.types.infer_dtype(self.data[col])}
        except:
            continue
    return inferred
|
1472 |
+
|
1473 |
+
def check_value_transitions(self) -> Dict[str, Any]:
    """Example transition check: count rises vs falls in 'temperature'.

    NOTE(review): rows are compared in frame order; the 'time' column is
    only checked for presence, not used for sorting — confirm upstream
    data is already time-ordered.
    """
    if self.data is None or 'temperature' not in self.data.columns or 'time' not in self.data.columns:
        return {"warning": "temperature or time columns not found."}

    deltas = self.data['temperature'].diff()
    return {
        "decreasing_transitions": deltas[deltas < 0].count(),
        "increasing_transitions": deltas[deltas > 0].count(),
    }
|
1481 |
+
|
1482 |
+
def check_unit_consistency(self) -> Dict[str, Any]:
    """Example unit check: surface the distinct raw 'measurement' values.

    Real unit parsing is left to the caller; this only exposes the unique
    string forms so mixed units become visible.
    """
    if self.data is None or 'measurement' not in self.data.columns:
        return {"warning": "measurement column not found."}

    distinct_values = self.data['measurement'].astype(str).unique()
    return {"unique_measurement_values": list(distinct_values),
            "comment": "Implement more sophisticated unit parsing if needed."}
|
1489 |
+
|
1490 |
+
def check_format_consistency(self) -> Dict[str, Any]:
    """Example format check: flag 'date' values pandas cannot parse."""
    if self.data is None or 'date' not in self.data.columns:
        return {"warning": "date column not found."}

    unparseable = set()
    for value in self.data['date'].astype(str).dropna().unique():
        try:
            pd.to_datetime(value)
        except:
            unparseable.add(f"Inconsistent format: '{value}'")
    return {"inconsistent_formats": list(unparseable)}
|
1501 |
+
|
1502 |
+
def perform_data_completeness_checks(self, verbose=False) -> Dict[str, Any]:
    """Performs various data completeness checks.

    Each check runs independently; failures become {"error": ...} entries
    plus a log line rather than aborting the remaining checks.
    """
    # (result key, bound check method, label used in the error log)
    checks = [
        ("coverage_analysis", self.analyze_coverage, "coverage analysis"),
        ("time_period_completeness", self.check_time_completeness, "time period completeness checks"),
    ]

    results = {}
    for result_key, check_fn, label in checks:
        try:
            results[result_key] = check_fn()
        except Exception as e:
            results[result_key] = {"error": str(e)}
            logger.error(f"Error in {label}: {e}", log_type="completeness_check", console=verbose)

    return results
|
1519 |
+
|
1520 |
+
def analyze_coverage(self) -> Dict[str, Any]:
    """
    Report the percentage of non-missing values for each column.

    Returns:
        Mapping of column name to a formatted percentage string
        (e.g. {"a": "75.00%"}), or a {"warning": ...} dict when there is
        no data to analyze.
    """
    if self.data is None:
        return {"warning": "No data loaded."}

    n_rows = len(self.data)
    # Fix: an empty frame previously hit a ZeroDivisionError that the
    # bare `except: continue` swallowed, silently returning {}.
    if n_rows == 0:
        return {"warning": "Dataset is empty."}

    coverage = {}
    for col in self.data.columns:
        try:
            coverage[col] = f"{self.data[col].count() / n_rows * 100:.2f}%"
        except Exception:
            # Skip columns whose count/format fails; coverage is best-effort.
            continue
    return coverage
|
1531 |
+
|
1532 |
+
def check_time_completeness(self) -> Dict[str, Any]:
    """Example completeness check: compare the expected day span of the
    'timestamp' column against the number of days actually observed."""
    if self.data is None or 'timestamp' not in self.data.columns:
        return {"warning": "timestamp column not found."}
    try:
        stamps = pd.to_datetime(self.data['timestamp']).sort_values()
        if stamps.empty:
            return {"warning": "Timestamp column is empty."}
        # Inclusive calendar span between the first and last observation.
        span_days = (stamps.iloc[-1].date() - stamps.iloc[0].date()).days + 1
        observed_days = len(stamps.dt.date.unique())
        return {"expected_unique_days": span_days, "actual_unique_days": observed_days}
    except Exception as e:
        return {"error": f"Could not process timestamp column: {e}"}
|
1548 |
+
|
1549 |
+
def generate_consistency_analysis_summary(self, consistency_results: Dict[str, Any], completeness_results: Dict[str, Any]) -> str:
    """Build a human-readable summary of the consistency and completeness
    check results, one bullet per check."""
    parts = ["Data Quality Assessment Summary:\n"]

    def _render_section(title: str, section_results: Dict[str, Any], empty_message: str) -> None:
        # Append a titled bullet list for one group of check results.
        parts.append(title)
        if not section_results:
            parts.append(empty_message)
            return
        for check_name, outcome in section_results.items():
            try:
                parts.append(f"- {check_name}: {outcome}\n")
            except:
                continue

    _render_section("\nConsistency Checks:\n", consistency_results,
                    "- No consistency checks performed or results available.\n")
    _render_section("\nData Completeness Checks:\n", completeness_results,
                    "- No data completeness checks performed or results available.\n")

    return "".join(parts)
|
1574 |
+
|
1575 |
+
def generate_report_from_agent(self, input) -> str:
    '''Transform the json output to a user-readable report'''
    try:
        # Prefix the payload with the ML task so the writer agent has context.
        prompt = f"ML Task: {self.ml_task}\n{input}"
        response = self.writer.run(prompt, stream=False)
        return response.content
    except Exception as e:
        # Never propagate agent failures; return a diagnostic string instead.
        return f"Failed to generate report with error: {e}"
|
1583 |
+
|
1584 |
+
def convert_numpy_types(self, obj):
    """Recursively convert numpy scalars and arrays inside *obj* into
    native Python types so the structure is JSON-serializable."""
    if isinstance(obj, dict):
        return {key: self.convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [self.convert_numpy_types(element) for element in obj]
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Anything else (str, int, None, ...) is passed through untouched.
    return obj
|
1599 |
+
|
1600 |
+
|
1601 |
+
def run(self, verbose=False) -> Dict[str, dict]:
    '''Run the entire data-quality workflow.

    Performs missing-value analysis, duplicate detection and the
    consistency/completeness checks, then asks the writer agent to render a
    human-readable report for each section.

    Args:
        verbose: when True, also echo progress logs to the console.

    Returns:
        Dict with keys "missing_value_analysis", "duplicate_analysis" and
        "data_quality_analysis"; each entry holds the raw results under
        'dict' (numpy types converted) and a 'report' string ("" when
        report generation failed).
    '''
    def _render_report(payload: dict, label: str, extra_text: str = "") -> str:
        # Report rendering is best-effort: a failure must never lose the
        # raw analysis results, so errors are logged and "" is returned.
        try:
            payload_str = json.dumps(payload, indent=2, default=str)
            if extra_text:
                payload_str = payload_str + '\n' + extra_text
            return self.generate_report_from_agent(payload_str)
        except Exception:
            logger.error(f"Failed to generate report for {label}....", log_type='data_quality_assessment', console=verbose)
            return ""

    logger.info("Starting missing value analysis...", log_type="data_quality_assessment", console=verbose)
    mva_results = self.analyze_missing_data(verbose=verbose)

    logger.info("Starting Duplicate value analysis...", log_type="data_quality_assessment", console=verbose)
    duplicate_analysis_results = self.detect_duplicates(verbose=verbose)

    logger.info("Starting Data quality & Consistency checks...", log_type="data_quality_assessment", console=verbose)
    consistency_results = self.perform_consistency_checks(verbose=verbose)
    completeness_results = self.perform_data_completeness_checks(verbose=verbose)
    # Merge both check families into one result set; the textual summary is
    # appended to the report prompt so the agent sees the rolled-up view too.
    data_quality_results = consistency_results | completeness_results
    data_quality_summary = self.generate_consistency_analysis_summary(consistency_results, completeness_results)

    logger.info("Generating final reports....", log_type='data_quality_assessment', console=verbose)

    final_result = {
        "missing_value_analysis": {
            'dict': self.convert_numpy_types(mva_results),
            'report': _render_report(mva_results, "mva")
        },
        "duplicate_analysis": {
            'dict': self.convert_numpy_types(duplicate_analysis_results),
            'report': _render_report(duplicate_analysis_results, "duplicate analysis")
        },
        "data_quality_analysis": {
            'dict': self.convert_numpy_types(data_quality_results),
            'report': _render_report(data_quality_results, "data quality", extra_text=data_quality_summary)
        },
    }

    return final_result
|
src/app/pipelines/modules/data_statistics.py
ADDED
@@ -0,0 +1,1270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import math
|
4 |
+
import json
|
5 |
+
import psutil
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
import dateutil.parser
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from dateutil import parser
|
11 |
+
from datetime import datetime
|
12 |
+
from collections import Counter
|
13 |
+
from src.core.utils import logger
|
14 |
+
from agno.agent import Agent, RunResponse
|
15 |
+
from agno.models.openai import OpenAIChat
|
16 |
+
from typing import Union, List, Dict, Any, Tuple
|
17 |
+
|
18 |
+
load_dotenv()
|
19 |
+
|
20 |
+
class DataStatisticsWorkflow:
    """Computes descriptive statistics and type diagnostics for a CSV dataset,
    then uses an LLM 'writer' agent to turn the raw results into readable reports."""

    def __init__(
        self, data_source: str,
        llm_choice: str,
        ml_task: str
    ) -> None:
        """Load the CSV at `data_source` and configure the report-writer agent.

        Args:
            data_source: path or URL of the CSV file to analyze.
            llm_choice: OpenAI model id used by the writer agent.
            ml_task: free-text description of the downstream ML task.
        """
        self.data = None
        self.ml_task = ml_task
        _ = self.load_data(data_source=data_source)
        # Construct the chat model once (the original built it twice).
        self.llm = OpenAIChat(id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))
        self.writer: Agent = Agent(
            model=self.llm,
            instructions=[
                # Trailing space added: the implicit concatenation previously
                # produced "display thisin a nicely formatted manner".
                "You will be provided with lots of structured outputs. Your work is to display this "
                "in a nicely formatted manner. You must analyze the results and output a comprehensive and insightful report"
            ],
            markdown=True,
        )
|
40 |
+
|
41 |
+
def load_data(self, data_source: str) -> Union[None, bool]:
    """Read a CSV file into ``self.data``.

    Returns True on success; logs the failure and returns False otherwise
    (``self.data`` is left untouched on failure).
    """
    try:
        frame = pd.read_csv(data_source)
    except Exception as e:
        logger.error(
            f"Failed to read the file from the data source with error: {e}", log_type="data_statistics", console=True)
        return False
    self.data = frame
    return True
|
50 |
+
|
51 |
+
def format_json(self, results, indent: int = 4) -> str:
    """Serialize an analysis-result structure to a JSON string.

    Numpy scalars become native ints/floats/bools, ndarrays become lists and
    NaN/NaT leaves become null. Unlike the previous implementation, the
    conversion recurses into dicts and lists at ANY depth — before, a dict
    nested inside a list kept its numpy values and crashed json.dumps, and
    `pd.isna` on a nested list raised "truth value is ambiguous".

    Args:
        results: arbitrarily nested dict/list structure of analysis results.
        indent: indentation width passed to json.dumps.

    Returns:
        The pretty-printed JSON string.
    """
    def _to_serializable(obj):
        if isinstance(obj, dict):
            return {k: _to_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [_to_serializable(item) for item in obj]
        if isinstance(obj, (np.integer, np.floating, np.bool_)):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        try:
            # pd.isna handles NaN/NaT/None uniformly for scalar leaves.
            if pd.isna(obj):
                return None
        except (TypeError, ValueError):
            # Non-scalar leaf pd.isna can't judge — keep as-is.
            pass
        return obj

    return json.dumps(_to_serializable(results), indent=indent)
|
75 |
+
|
76 |
+
def build_statistical_summary(self, data_source: str = None, verbose=False) -> Dict[str, Any]:
    '''Get the basic central tendancy, dispersion, quantiles, distinct values, frequency distributions and sparsity'''
    # Returns a dict with a "dataset_info" header, one entry per column under
    # "columns", and (when >1 numeric column) a "correlations" matrix.
    # Each per-column computation is wrapped in try/except so a single bad
    # column cannot abort the whole summary.
    logger.info(f"Starting statistical analysis", log_type="data_statistics", console=verbose)
    if data_source:
        # Optional reload; NOTE(review): on load failure this only logs and
        # then continues against the previously loaded self.data — confirm
        # that is intended rather than an early return.
        status = self.load_data(data_source=data_source)
        if not status:
            logger.info("Failed to load data. Can't build statistical summary",
                        log_type='data_statistics', console=True)

    quantiles: List[float] = [0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
    num_freq_values: int = 10  # top-N values kept in each frequency table

    results = {
        "dataset_info": {
            "num_rows": len(self.data),
            "num_columns": len(self.data.columns),
            "memory_usage": self.data.memory_usage(deep=True).sum(),
            "dtypes": {col: str(dtype) for col, dtype in self.data.dtypes.items()}
        },
        "columns": {}
    }

    for column in self.data.columns:
        column_data = self.data[column]
        column_type = str(self.data[column].dtype)
        is_numeric = pd.api.types.is_numeric_dtype(column_data)
        is_datetime = pd.api.types.is_datetime64_any_dtype(column_data)

        # Base facts every column gets regardless of dtype.
        col_results = {
            "dtype": column_type,
            "count": len(column_data),
            "num_unique": column_data.nunique(),
            "num_missing": column_data.isna().sum(),
            "missing_percentage": (column_data.isna().sum() / len(column_data)) * 100,
            "sparsity": {
                "zeros": None,
                "zeros_percentage": None,
                "empty_strings": None,
                "empty_strings_percentage": None
            }
        }

        # Booleans are numeric to pandas; exclude them from numeric stats.
        is_boolean = pd.api.types.is_bool_dtype(column_data)

        if is_numeric and not is_boolean:
            # Sparsity for numerics = share of exact zeros among non-null values.
            try:
                num_zeros = (column_data == 0).sum()
                col_results["sparsity"]["zeros"] = num_zeros
                col_results["sparsity"]["zeros_percentage"] = (num_zeros / column_data.count()) * 100
            except Exception as e:
                logger.error(f"{e}", log_type='data_statistics', console=verbose)
        elif column_type.startswith('object') or column_type.startswith('string'):
            # Sparsity for text = share of empty/whitespace-only strings.
            try:
                empty_strings = column_data.fillna('').apply(lambda x: isinstance(x, str) and x.strip() == '').sum()
                col_results["sparsity"]["empty_strings"] = empty_strings
                col_results["sparsity"]["empty_strings_percentage"] = (empty_strings / column_data.count()) * 100
            except Exception as e:
                logger.error(f"{e}", log_type='data_statistics', console=verbose)

        if is_numeric and not is_boolean:
            try:
                col_results["central_tendency"] = {
                    "mean": column_data.mean() if not all(column_data.isna()) else None,
                    "median": column_data.median() if not all(column_data.isna()) else None,
                    "mode": column_data.mode().iloc[0] if not column_data.mode().empty else None
                }
            except Exception as e:
                logger.error(f"{e}", log_type='data_statistics', console=verbose)

            try:
                col_results["dispersion"] = {
                    "std": column_data.std() if not all(column_data.isna()) else None,
                    "variance": column_data.var() if not all(column_data.isna()) else None,
                    "range": {
                        "min": column_data.min() if not all(column_data.isna()) else None,
                        "max": column_data.max() if not all(column_data.isna()) else None
                    },
                    "iqr": (
                        column_data.quantile(0.75) - column_data.quantile(0.25)
                        if not pd.api.types.is_bool_dtype(column_data)
                        else None
                    )
                }
            except Exception as e:
                logger.error(f"{e}", log_type='data_statistics', console=verbose)

            # All-NaN columns still get the quantile keys, just with None values.
            if not all(column_data.isna()):
                col_results["quantiles"] = {
                    f"q{int(q*100)}": column_data.quantile(q) for q in quantiles
                }
            else:
                col_results["quantiles"] = {
                    f"q{int(q*100)}": None for q in quantiles}

        # Frequency table includes NaN as a value (dropna=False).
        value_counts = column_data.value_counts(
            dropna=False).head(num_freq_values)
        col_results["frequency_distribution"] = {
            "values": value_counts.index.tolist(),
            "counts": value_counts.tolist(),
            "percentages": (value_counts / len(column_data) * 100).tolist()
        }

        if is_datetime:
            try:
                col_results["datetime_info"] = {
                    "min_date": column_data.min().strftime('%Y-%m-%d %H:%M:%S') if not all(column_data.isna()) else None,
                    "max_date": column_data.max().strftime('%Y-%m-%d %H:%M:%S') if not all(column_data.isna()) else None,
                    "date_range_days": (column_data.max() - column_data.min()).days if not all(column_data.isna()) else None
                }
            except:
                col_results["datetime_info"] = "Error processing datetime information"

        results["columns"][column] = col_results

    # Pairwise Pearson correlations across all numeric columns (needs >= 2).
    numeric_cols = self.data.select_dtypes(include=['number']).columns.tolist()
    if len(numeric_cols) > 1:
        try:
            correlations = self.data[numeric_cols].corr().round(
                3).to_dict()
            results["correlations"] = correlations
        except:
            results["correlations"] = "Error computing correlations"

    return results
|
200 |
+
|
201 |
+
def analyze_data_types(self, sample_size=None, verbose=False):
    """Infer per-column data types, optionally on a random row sample.

    Empty strings are treated as missing. Columns that are entirely empty
    are labelled 'empty'; everything else is delegated to analyze_column.
    A '__summary__' entry aggregates the per-column results.
    """
    logger.info(f"Starting data-type analysis", log_type="data_statistics", console=verbose)

    frame = self.data
    if sample_size and sample_size < len(self.data):
        # Fixed seed keeps repeated runs comparable.
        frame = self.data.sample(sample_size, random_state=42)

    frame = frame.replace('', np.nan)

    results = {}
    for col_name in frame.columns:
        series = frame[col_name]
        if series.isna().all():
            results[col_name] = {
                "inferred_type": "empty",
                "description": "Column is entirely empty"
            }
            continue

        non_null_values = series.dropna().values
        if len(non_null_values) == 0:
            continue

        results[col_name] = self.analyze_column(non_null_values, col_name, frame)

    results["__summary__"] = self.generate_summary(results, frame)
    return results
|
230 |
+
|
231 |
+
def analyze_column(self, values, column_name, df):
    """Analyze a single column's values to determine data types and patterns (safe and error-resilient)

    Args:
        values: the column's non-null cell values (typically an ndarray).
        column_name: name of the column (used for the description text).
        df: the dataframe the column came from.

    Returns:
        dict with the inferred type, type mix, cardinality, binary/id flags,
        sample values and a human-readable description. Every sub-analysis is
        wrapped in try/except so a single failure cannot abort the others.
    """
    # Keep a few raw examples for the report; tolerate non-array inputs.
    try:
        sample_values = values[:5].tolist() if hasattr(values, 'tolist') else list(values)[:5]
    except Exception:
        sample_values = []

    # Per-type vote counts accumulated over all values.
    type_counts = {
        "integer": 0,
        "float": 0,
        "boolean": 0,
        "date": 0,
        "text": 0
    }

    unique_values = set()
    total_values = len(values)

    for val in values:
        try:
            unique_values.add(val)
            detected_type = self.detect_value_type(val)
            if detected_type:
                type_counts[detected_type] += 1
        except Exception:
            continue

    # Majority vote decides the main type; flag when several types occur.
    try:
        main_type = max(type_counts.items(), key=lambda x: x[1])[0]
        mixed_types = sum(1 for count in type_counts.values() if count > 0) > 1
        type_percentages = {
            t: (count / total_values) * 100 for t, count in type_counts.items() if count > 0
        }
    except Exception:
        main_type = "text"
        mixed_types = False
        type_percentages = {}

    result = {
        "inferred_type": main_type,
        "has_mixed_types": mixed_types,
        "type_percentages": type_percentages,
        "unique_count": len(unique_values),
        "unique_percentage": (len(unique_values) / total_values * 100) if total_values else 0,
        "sample_values": sample_values
    }

    # Type-specific drill-downs; each is optional and failure-tolerant.
    try:
        if main_type in ["integer", "float"]:
            numeric_analysis = self.analyze_numeric_column(values, result)
            result.update(numeric_analysis)
    except Exception:
        pass

    try:
        if main_type == "date":
            date_analysis = self.analyze_date_column(values)
            result.update(date_analysis)
    except Exception:
        pass

    if main_type == "text":
        # Text columns may hide numbers or dates stored as strings.
        try:
            numeric_as_text = self.check_numeric_as_text(values)
            if numeric_as_text.get("is_numeric_as_text"):
                result["numeric_as_text"] = True
                result["numeric_as_text_details"] = numeric_as_text
        except Exception:
            pass

        try:
            date_as_text = self.check_dates_as_text(values)
            if date_as_text.get("is_date_as_text"):
                result["date_as_text"] = True
                result["date_as_text_details"] = date_as_text
        except Exception:
            pass

    try:
        cardinality = self.analyze_cardinality(values, result["unique_count"], total_values)
        result.update(cardinality)
    except Exception:
        pass

    # At most two distinct values (and not id-like) -> treat as binary.
    try:
        if result["unique_count"] <= 2 and not result.get("is_id_like", False):
            result["is_binary"] = True
            result["binary_values"] = list(unique_values)
        else:
            result["is_binary"] = False
    except Exception:
        result["is_binary"] = False

    try:
        result["description"] = self.generate_column_description(result, column_name)
    except Exception:
        result["description"] = "Could not generate column description due to an error."

    return result
|
330 |
+
|
331 |
+
def detect_value_type(self, value, verbose=False):
    """Classify a single cell value as integer/float/boolean/date/text.

    Previously every non-string value (the common case for values coming
    from pandas `.values`, e.g. real ints/floats/bools) raised
    AttributeError on `.lower()`, was logged as an error, and fell out as
    "" — so whole numeric columns were never counted. Native types are now
    classified directly; string handling is unchanged.

    Returns one of "integer", "float", "boolean", "date", "text",
    or "" when classification fails entirely.
    """
    try:
        # Native (non-string) values: classify directly. bool before int
        # because bool is a subclass of int in Python.
        if isinstance(value, (bool, np.bool_)):
            return "boolean"
        if isinstance(value, (int, np.integer)):
            return "integer"
        if isinstance(value, (float, np.floating)):
            return "float"
        if isinstance(value, datetime):
            return "date"
        if not isinstance(value, str):
            return "text"

        lowered = value.lower()
        if lowered in ('true', 'false', 'yes', 'no', 'y', 'n', 't', 'f'):
            return "boolean"
        # '1'/'0' intentionally fall through so they classify as integers.

        if re.match(r'^-?\d+$', value):
            return "integer"

        if re.match(r'^-?\d+\.\d+$', value) or re.match(r'^-?\d+,\d+$', value):
            return "float"

        try:
            dateutil.parser.parse(value)
            return "date"
        except Exception:
            pass

        return "text"
    except Exception as e:
        logger.error(f"Value type detection failed with error: {e}", log_type='data_statistics', console=verbose)
        return ""
|
356 |
+
|
357 |
+
def analyze_date_column(self, values):
    """Inspect up to the first 100 values of a date-like column.

    Reports the dominant string formats (at most two), the min/max parsed
    dates, their span in days, and a confidence score (share of the sample
    that parsed as a date).
    """
    from datetime import datetime

    format_patterns = (
        (r'^\d{4}-\d{2}-\d{2}$', "YYYY-MM-DD"),
        (r'^\d{4}/\d{2}/\d{2}$', "YYYY/MM/DD"),
        (r'^\d{2}-\d{2}-\d{4}$', "DD-MM-YYYY"),
        (r'^\d{2}/\d{2}/\d{4}$', "DD/MM/YYYY"),
        (r'^\d{2}-\w{3}-\d{4}$', "DD-MMM-YYYY"),
        (r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$', "YYYY-MM-DD HH:MM:SS"),
    )

    seen_formats = []
    earliest = None
    latest = None

    for raw in values[:100]:
        if not raw:
            continue
        try:
            parsed = raw if isinstance(raw, datetime) else dateutil.parser.parse(str(raw))

            if earliest is None or parsed < earliest:
                earliest = parsed
            if latest is None or parsed > latest:
                latest = parsed

            text = str(raw).strip()
            label = next((name for pattern, name in format_patterns if re.match(pattern, text)), "Other")
            seen_formats.append(label)
        except Exception:
            continue

    top_two = Counter(seen_formats).most_common(2)

    return {
        "date_format": [name for name, _ in top_two] if top_two else ["Unknown"],
        "min_date": earliest.isoformat() if earliest else None,
        "max_date": latest.isoformat() if latest else None,
        "date_range_days": (latest - earliest).days if earliest and latest else None,
        "confidence": (len(seen_formats) / min(len(values), 100)) * 100,
    }
|
405 |
+
|
406 |
+
def analyze_numeric_column(self, values, current_results):
    """Compute min/max/mean/median over values coercible to float.

    Strings with a decimal comma are normalized ("1,5" -> 1.5); None and
    non-coercible items are skipped. For columns inferred as float, also
    reports the percentage of whole-number values.
    """
    stats = {}
    try:
        parsed = []
        for item in values:
            if item is None:
                continue
            try:
                if isinstance(item, (int, float)):
                    parsed.append(float(item))
                elif isinstance(item, str):
                    parsed.append(float(item.replace(',', '.').strip()))
            except (ValueError, TypeError):
                continue

        if not parsed:
            return {"warning": "No valid numeric values found"}

        parsed.sort()
        count = len(parsed)
        if count % 2:
            middle = parsed[count // 2]
        else:
            middle = (parsed[count // 2 - 1] + parsed[count // 2]) / 2

        stats["min"] = parsed[0]
        stats["max"] = parsed[-1]
        stats["mean"] = sum(parsed) / count
        stats["median"] = middle

        if current_results.get("inferred_type") == "float":
            whole = sum(1 for x in parsed if x == int(x))
            stats["integers_percentage"] = (whole / count) * 100

    except Exception as e:
        stats["error"] = f"Error in analyze_numeric_column: {str(e)}"

    return stats
|
451 |
+
|
452 |
+
def check_numeric_as_text(self, values):
    """Estimate whether string values encode numbers (currency, percents,
    thousand separators, fractions).

    Samples up to 100 values; the column is flagged when more than 80% of
    the sample matches one of the numeric-looking patterns.
    """
    outcome = {
        "is_numeric_as_text": False,
        "confidence": 0.0,
        "matched_patterns": {}
    }

    try:
        if not values:
            outcome["warning"] = "No values provided"
            return outcome

        sample_size = min(100, len(values))
        if sample_size == 0:
            outcome["warning"] = "No valid sample size"
            return outcome

        patterns = (
            r'^\d{1,3}(,\d{3})+(\.\d+)?$',  # US format: 1,234.56
            r'^\d{1,3}(\.\d{3})+(,\d+)?$',  # EU format: 1.234,56
            r'^-?\d+\s\d+(/\d+)?$',         # Fractions: 1 1/2
            r'^\$\d+(\.\d+)?$',             # Dollars: $123.45
            r'^€\d+(\.\d+)?$',              # Euros: €123.45
            r'^\d+(\.\d+)?%$',              # Percentages: 12.3%
        )

        hits = 0
        per_pattern = dict.fromkeys(patterns, 0)

        for item in values[:sample_size]:
            try:
                if not isinstance(item, str):
                    continue
                stripped = item.strip()
                for pattern in patterns:
                    if re.match(pattern, stripped):
                        hits += 1
                        per_pattern[pattern] += 1
                        break
            except Exception:
                continue

        share = (hits / sample_size) * 100
        outcome["is_numeric_as_text"] = share > 80
        outcome["confidence"] = share
        outcome["matched_patterns"] = {p: n for p, n in per_pattern.items() if n > 0}

    except Exception as e:
        outcome["error"] = f"Error in check_numeric_as_text: {str(e)}"

    return outcome
|
507 |
+
|
508 |
+
def check_dates_as_text(self, values):
    """Estimate whether string values are dates stored as text.

    Samples up to 100 values; a value counts as a date when dateutil can
    parse it. The column is flagged when more than 80% of the sample
    parses, and the two most common layout labels are reported.
    """
    outcome = {
        "is_date_as_text": False,
        "confidence": 0.0,
        "common_formats": []
    }

    try:
        if not values:
            outcome["warning"] = "No values provided"
            return outcome

        sample_size = min(100, len(values))
        parse_hits = 0
        format_labels = []

        for item in values[:sample_size]:
            try:
                if not isinstance(item, str):
                    continue

                parser.parse(item)  # raises when not date-like
                parse_hits += 1

                if re.match(r'^\d{4}-\d{2}-\d{2}', item):
                    format_labels.append("ISO")
                elif re.match(r'^\d{2}/\d{2}/\d{4}', item):
                    format_labels.append("US/UK")
                elif re.match(r'^\d{2}-\w{3}-\d{4}', item):
                    format_labels.append("DD-MMM-YYYY")
                else:
                    format_labels.append("Other")

            except Exception:
                continue

        share = (parse_hits / sample_size) * 100 if sample_size > 0 else 0.0
        outcome["is_date_as_text"] = share > 80
        outcome["confidence"] = share
        outcome["common_formats"] = Counter(format_labels).most_common(2) if format_labels else []

    except Exception as e:
        outcome["error"] = f"Error in check_dates_as_text: {str(e)}"

    return outcome
|
564 |
+
|
565 |
+
def analyze_cardinality(self, values, unique_count, total_count):
    """Classify a column as id-like / continuous / categorical from its
    uniqueness ratio, and report its top categories.

    Bug fix: the previous emptiness check (`if not values`) raised
    "truth value of an array is ambiguous" for multi-element numpy arrays —
    the actual input coming from analyze_column — so cardinality was always
    reported as an error for real data. A length check is array-safe.

    Returns a dict with cardinality_type ('id_like' | 'continuous' |
    'categorical'), categorical_level, top_categories, uniqueness_ratio and
    top_value_percentage; an 'error' key on unusable input.
    """
    result = {}

    try:
        # len() works for lists, tuples and ndarrays alike.
        if values is None or len(values) == 0 or total_count == 0:
            result["error"] = "Empty values or total_count is 0"
            return result

        uniqueness_ratio = unique_count / total_count if total_count else 0

        try:
            value_counts = Counter(values)
            most_common = value_counts.most_common(5)
            top_value_percentage = most_common[0][1] / total_count * 100 if most_common else 0
        except Exception as e:
            # Unhashable values: keep going with an empty top list.
            most_common = []
            top_value_percentage = 0
            result["warning"] = f"Error analyzing value counts: {str(e)}"

        # Thresholds: >90% unique looks like an identifier, >50% continuous.
        if uniqueness_ratio > 0.9:
            result["is_id_like"] = True
            result["cardinality_type"] = "id_like"
        elif uniqueness_ratio > 0.5:
            result["is_continuous"] = True
            result["cardinality_type"] = "continuous"
        else:
            result["is_categorical"] = True
            result["cardinality_type"] = "categorical"

        if unique_count <= 5:
            result["categorical_level"] = "low"
        elif unique_count <= 20:
            result["categorical_level"] = "medium"
        else:
            result["categorical_level"] = "high"

        result["top_categories"] = [
            {
                "value": val,
                "count": count,
                "percentage": (count / total_count * 100 if total_count else 0)
            }
            for val, count in most_common
        ]

        result["uniqueness_ratio"] = uniqueness_ratio
        result["top_value_percentage"] = top_value_percentage

    except Exception as e:
        result["error"] = f"Error in analyze_cardinality: {str(e)}"

    return result
|
618 |
+
|
619 |
+
def generate_column_description(self, analysis, column_name):
    """Build a human-readable sentence describing a column from its analysis dict.

    Each facet (type, structural role, storage quirks, numeric range, date
    metadata) is appended independently; a failure in one facet is reported
    inline instead of aborting the description. ``column_name`` is accepted
    for interface compatibility but does not appear in the text.
    """
    parts = []

    # Data type, flagging mixed-type columns.
    try:
        inferred = analysis.get("inferred_type", "unknown")
        if analysis.get("has_mixed_types", False):
            parts.append(f"Mixed data types with {inferred} being most common")
        else:
            parts.append(f"{inferred.capitalize()} data type")
    except Exception as e:
        parts.append(f"[Error identifying type]: {str(e)}")

    # Structural role: ID / binary / categorical / continuous (first match wins).
    try:
        if analysis.get("is_id_like", False):
            parts.append("Likely an ID field (high uniqueness)")
        elif analysis.get("is_binary", False):
            binary_values = ", ".join(str(v) for v in analysis.get("binary_values", []))
            parts.append(f"Binary field with values: {binary_values}")
        elif analysis.get("is_categorical", False):
            cardinality = analysis.get("categorical_level", "unknown")
            unique_count = analysis.get("unique_count", "unknown")
            parts.append(f"{cardinality.capitalize()} cardinality categorical field with {unique_count} unique values")
        elif analysis.get("is_continuous", False):
            parts.append("Continuous variable")
    except Exception as e:
        parts.append(f"[Error determining field category]: {str(e)}")

    # Storage quirks: values stored under the wrong dtype.
    try:
        if analysis.get("numeric_as_text", False):
            parts.append("Numeric values stored as text")
    except Exception:
        pass

    try:
        if analysis.get("date_as_text", False):
            parts.append("Date values stored as text")
    except Exception:
        pass

    # Numeric range, when both bounds are present.
    try:
        if "min" in analysis and "max" in analysis:
            parts.append(f"Range: {analysis['min']} to {analysis['max']}")
    except Exception as e:
        parts.append(f"[Error extracting range]: {str(e)}")

    # Date metadata.
    try:
        if "date_format" in analysis:
            parts.append(f"Date format: {analysis['date_format']}")
        if "date_range_days" in analysis:
            parts.append(f"Spans {analysis['date_range_days']} days")
    except Exception as e:
        parts.append(f"[Error in date format]: {str(e)}")

    return ". ".join(parts)
|
675 |
+
|
676 |
+
def generate_summary(self, results, df):
    """Aggregate per-column analyses into a dataset-level overview.

    ``results`` maps column names to analysis dicts (a ``__summary__`` key,
    if present, is excluded); ``df`` contributes only its row count. Returns
    a dict with type counts, data-quality tallies, structural tallies,
    recommendations and detailed findings. A malformed analysis entry is
    tolerated per-check rather than failing the whole summary.
    """
    analysis_results = {k: v for k, v in results.items() if k != "__summary__"}

    column_types = {}
    # Tally boolean flags across columns; each lookup is isolated so one bad
    # entry cannot abort the aggregation.
    flag_counts = {
        "has_mixed_types": 0,
        "numeric_as_text": 0,
        "date_as_text": 0,
        "is_binary": 0,
        "is_id_like": 0,
        "is_categorical": 0,
        "is_continuous": 0,
    }

    for col, analysis in analysis_results.items():
        try:
            col_type = analysis.get("inferred_type", "unknown")
            column_types[col_type] = column_types.get(col_type, 0) + 1
        except Exception:
            # Entry is not dict-like; bucket it as unknown.
            column_types["unknown"] = column_types.get("unknown", 0) + 1

        for flag in flag_counts:
            try:
                if analysis.get(flag, False):
                    flag_counts[flag] += 1
            except Exception:
                pass

    recommendations = []
    if flag_counts["has_mixed_types"] > 0:
        recommendations.append(f"Review {flag_counts['has_mixed_types']} columns with mixed data types")
    if flag_counts["numeric_as_text"] > 0:
        recommendations.append(f"Convert {flag_counts['numeric_as_text']} columns with numeric values stored as text")
    if flag_counts["date_as_text"] > 0:
        recommendations.append(f"Convert {flag_counts['date_as_text']} columns with date values stored as text")

    try:
        detailed_findings = self.generate_key_findings(analysis_results)
    except Exception as e:
        detailed_findings = [f"[Error generating key findings]: {str(e)}"]

    return {
        "total_rows": len(df),
        "total_columns": len(analysis_results),
        "column_type_counts": column_types,
        "data_quality": {
            "mixed_types_count": flag_counts["has_mixed_types"],
            "numeric_as_text_count": flag_counts["numeric_as_text"],
            "date_as_text_count": flag_counts["date_as_text"]
        },
        "structure": {
            "binary_columns": flag_counts["is_binary"],
            "id_like_columns": flag_counts["is_id_like"],
            "categorical_columns": flag_counts["is_categorical"],
            "continuous_columns": flag_counts["is_continuous"],
        },
        "recommendations": recommendations,
        "detailed_findings": detailed_findings
    }
|
764 |
+
|
765 |
+
def generate_key_findings(self, analysis_results):
    """Produce detailed per-column findings from the analysis results.

    Covers mixed-type columns, numeric/date values stored as text, and
    ID-like columns containing duplicates. Each check is isolated so a bad
    entry yields an inline error message instead of aborting the scan.
    """
    findings = []

    for col, analysis in analysis_results.items():
        # Mixed-type columns, with the per-type share.
        try:
            if analysis.get("has_mixed_types", False):
                type_percentages = analysis.get("type_percentages", {})
                breakdown = ", ".join(
                    f"{type_name} ({percentage:.1f}%)"
                    for type_name, percentage in type_percentages.items()
                )
                findings.append(f"Column '{col}' has mixed data types: " + breakdown)
        except Exception as e:
            findings.append(f"[Error analyzing mixed types for '{col}']: {e}")

        # Numeric values stored as text.
        try:
            if analysis.get("numeric_as_text", False):
                confidence = analysis.get("numeric_as_text_details", {}).get("confidence", 0.0)
                findings.append(
                    f"Column '{col}' likely contains numeric values stored as text ({confidence:.1f}% confidence)"
                )
        except Exception as e:
            findings.append(f"[Error analyzing numeric-as-text for '{col}']: {e}")

        # Dates stored as text, with the dominant formats.
        try:
            if analysis.get("date_as_text", False):
                details = analysis.get("date_as_text_details", {})
                confidence = details.get("confidence", 0.0)
                formats = details.get("common_formats", [])
                format_str = ", ".join(f"{fmt[0]}" for fmt in formats[:2]) if formats else "various"
                findings.append(
                    f"Column '{col}' likely contains dates in {format_str} format ({confidence:.1f}% confidence)"
                )
        except Exception as e:
            findings.append(f"[Error analyzing date-as-text for '{col}']: {e}")

        # ID-like columns that nevertheless contain duplicates.
        try:
            if analysis.get("is_id_like", False):
                uniqueness_ratio = analysis.get("uniqueness_ratio", 1.0)
                if uniqueness_ratio < 1.0:
                    dupe_percentage = (1 - uniqueness_ratio) * 100
                    findings.append(
                        f"ID-like column '{col}' has {dupe_percentage:.1f}% duplicate values"
                    )
        except Exception as e:
            findings.append(f"[Error analyzing ID-like column '{col}']: {e}")

    return findings
|
815 |
+
|
816 |
+
def analyze_dataset_dimensionality(self, sample_size=None, detailed=True, verbose=False):
    """Analyze the dimensionality characteristics of the dataset.

    Sections: basic shape, aspect ratio, sparsity (nulls / empty strings /
    zeros), memory usage (with a full-dataset estimate when sampling), and
    processing complexity. Each section fails independently; partial
    failures are listed under ``failed_analyses`` and reflected in ``status``
    ("success" / "partial_success" / "failed").

    Args:
        sample_size: optional number of rows to sample before analysis.
        detailed: when True, include a per-column sparsity breakdown.
        verbose: echo log messages to the console.

    Returns:
        dict with per-section results, a ``summary`` and a ``status``.
    """
    logger.info(f"Starting dataset dimensionality analysis", log_type="data_statistics", console=verbose)

    results = {
        "basic_info": {},
        "aspect_ratio": {},
        "sparsity": {},
        "memory_usage": {},
        "processing_complexity": {},
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    analysis_failures = []

    # --- Basic shape ------------------------------------------------------
    try:
        df = self.data

        # FIX: remember the pre-sampling row count so the memory section can
        # extrapolate; previously basic_info["rows"] was recorded after
        # sampling, so the full-dataset estimate never triggered.
        full_row_count = len(df)

        if sample_size and sample_size < full_row_count:
            df = df.sample(sample_size, random_state=42)

        results["basic_info"] = {
            "rows": len(df),
            "columns": len(df.columns),
            "total_cells": len(df) * len(df.columns),
            "column_names": list(df.columns)
        }
    except Exception as e:
        # Without basic shape info none of the later sections can run.
        results["basic_info"] = {"status": "failed", "error": str(e)}
        analysis_failures.append("basic_info")
        results["status"] = "failed"
        # FIX: the previous message claimed a CSV read failure, but this
        # section only accesses the already-loaded dataframe.
        results["error"] = f"Failed to access dataset: {str(e)}"
        return results

    # (Removed a duplicated sampling step that used to sit here: the
    # dataframe is already sampled above, so re-sampling was dead code.)

    # --- Aspect ratio -----------------------------------------------------
    try:
        rows = results["basic_info"]["rows"]
        cols = results["basic_info"]["columns"]

        results["aspect_ratio"] = {
            "rows_to_columns_ratio": rows / cols if cols > 0 else float('inf'),
            "columns_to_rows_ratio": cols / rows if rows > 0 else float('inf'),
            "is_wide": cols > rows,
            "is_tall": rows > cols,
            "shape_description": self.get_shape_description(rows, cols)
        }
    except Exception as e:
        results["aspect_ratio"] = {"status": "failed", "error": str(e)}
        analysis_failures.append("aspect_ratio")

    # --- Sparsity ---------------------------------------------------------
    try:
        null_count = df.isna().sum().sum()
        total_cells = results["basic_info"]["total_cells"]

        # Empty strings in object columns count toward sparsity too.
        empty_str_count = 0
        for col in df.select_dtypes(include=['object']).columns:
            try:
                empty_str_count += (df[col] == "").sum()
            except Exception:
                pass

        # Zeros are reported separately, not treated as missing.
        zero_count = 0
        for col in df.select_dtypes(include=['number']).columns:
            try:
                zero_count += (df[col] == 0).sum()
            except Exception:
                pass

        null_sparsity = null_count / total_cells if total_cells > 0 else 0
        total_sparsity = (null_count + empty_str_count) / total_cells if total_cells > 0 else 0

        results["sparsity"] = {
            "null_count": int(null_count),
            "empty_string_count": int(empty_str_count),
            "zero_count_in_numeric": int(zero_count),
            "null_sparsity": null_sparsity,
            "total_sparsity": total_sparsity,
            "sparsity_percentage": total_sparsity * 100,
            "sparsity_level": self.get_sparsity_level(total_sparsity)
        }

        if detailed:
            col_sparsity = {}
            for col in df.columns:
                null_pct = df[col].isna().mean() * 100

                empty_pct = 0
                if df[col].dtype == 'object':
                    try:
                        empty_pct = (df[col] == "").mean() * 100
                    except Exception:
                        pass

                col_sparsity[col] = {
                    "null_percentage": null_pct,
                    "empty_percentage": empty_pct,
                    "total_sparsity": null_pct + empty_pct
                }
            results["sparsity"]["column_sparsity"] = col_sparsity
    except Exception as e:
        results["sparsity"] = {"status": "failed", "error": str(e)}
        analysis_failures.append("sparsity")

    # --- Memory usage -----------------------------------------------------
    try:
        mem_usage = df.memory_usage(deep=True)
        total_mem = mem_usage.sum()

        mem_by_type = {}
        for dtype in df.dtypes.value_counts().index:
            dtype_cols = df.select_dtypes(include=[dtype]).columns
            dtype_mem = sum(mem_usage[col] for col in dtype_cols if col in mem_usage)
            mem_by_type[str(dtype)] = dtype_mem

        # Extrapolate the sampled memory footprint to the full dataset.
        full_dataset_mem = total_mem
        if sample_size and sample_size < full_row_count:
            full_dataset_mem = total_mem * (full_row_count / sample_size)

        try:
            system_mem = psutil.virtual_memory().total
        except Exception:
            system_mem = None  # psutil unavailable or failed; skip utilization

        results["memory_usage"] = {
            "total_bytes": int(total_mem),
            "total_mb": round(total_mem / (1024**2), 2),
            "memory_by_type": {k: int(v) for k, v in mem_by_type.items()},
            "average_bytes_per_row": int(total_mem / len(df)) if len(df) > 0 else 0,
            "full_dataset_estimate_mb": round(full_dataset_mem / (1024**2), 2),
            "system_memory_gb": round(system_mem / (1024**3), 2) if system_mem else None,
            "memory_utilization_percentage": round(full_dataset_mem / system_mem * 100, 2) if system_mem else None
        }
    except Exception as e:
        results["memory_usage"] = {"status": "failed", "error": str(e)}
        analysis_failures.append("memory_usage")

    # --- Processing complexity --------------------------------------------
    try:
        rows = results["basic_info"]["rows"]
        cols = results["basic_info"]["columns"]

        basic_complexity = rows * cols  # O(n*m) operations
        sorting_complexity = rows * math.log2(rows) if rows > 0 else 0  # O(n log n)

        unique_counts = {}
        for col in df.select_dtypes(include=['object']).columns:
            try:
                unique_counts[col] = df[col].nunique()
            except Exception:
                pass

        num_numeric = len(df.select_dtypes(include=['number']).columns)

        results["processing_complexity"] = {
            "basic_complexity": basic_complexity,
            "sorting_complexity": sorting_complexity,
            "unique_values_in_string_columns": unique_counts,
            "numeric_columns_count": num_numeric,
            "complexity_level": self.get_complexity_level(rows, cols),
            "processing_recommendation": self.get_processing_recommendation(rows, cols)
        }
    except Exception as e:
        results["processing_complexity"] = {"status": "failed", "error": str(e)}
        analysis_failures.append("processing_complexity")

    # --- Summary and overall status ---------------------------------------
    try:
        results["summary"] = self.generate_dimensionality_summary(results, analysis_failures)
    except Exception as e:
        results["summary"] = {"status": "failed", "error": str(e)}

    if analysis_failures:
        results["status"] = "partial_success"
        results["failed_analyses"] = analysis_failures
    else:
        results["status"] = "success"

    return results
|
1017 |
+
|
1018 |
+
def get_shape_description(self, rows, cols):
    """Map the rows-to-columns ratio onto a descriptive shape label."""
    ratio = rows / cols if cols > 0 else float('inf')

    # Ordered thresholds from tallest to widest; the first match wins.
    thresholds = (
        (1000, "extremely_tall"),
        (100, "very_tall"),
        (10, "tall"),
        (3, "moderately_tall"),
        (0.33, "balanced"),
        (0.1, "moderately_wide"),
        (0.01, "wide"),
        (0.001, "very_wide"),
    )
    for bound, label in thresholds:
        if ratio > bound:
            return label
    return "extremely_wide"
|
1040 |
+
|
1041 |
+
def get_sparsity_level(self, sparsity_ratio):
    """Map a sparsity ratio in [0, 1] onto a descriptive label."""
    # Ordered thresholds from sparsest to densest; the first match wins.
    for bound, label in (
        (0.9, "extremely_sparse"),
        (0.7, "very_sparse"),
        (0.5, "sparse"),
        (0.3, "moderately_sparse"),
        (0.1, "slightly_sparse"),
    ):
        if sparsity_ratio > bound:
            return label
    return "dense"
|
1055 |
+
|
1056 |
+
def get_complexity_level(self, rows, cols):
    """Label processing complexity from the total cell count (rows * cols)."""
    cells = rows * cols

    # Ordered thresholds from largest to smallest; the first match wins.
    for bound, label in (
        (1_000_000_000, "extremely_high"),  # 1 billion cells
        (100_000_000, "very_high"),         # 100 million cells
        (10_000_000, "high"),               # 10 million cells
        (1_000_000, "moderate"),            # 1 million cells
        (100_000, "low"),                   # 100,000 cells
    ):
        if cells > bound:
            return label
    return "very_low"
|
1072 |
+
|
1073 |
+
def get_processing_recommendation(self, rows, cols):
    """Return a processing-strategy recommendation based on dataset size."""
    cells = rows * cols

    # Guard clauses from largest to smallest dataset size.
    if cells > 1_000_000_000:  # 1 billion cells
        return "Distributed computing recommended (Spark, Dask). Consider data sampling or partitioning."
    if cells > 100_000_000:  # 100 million cells
        return "Optimized libraries recommended. Consider chunking data or using out-of-core processing."
    if cells > 10_000_000:  # 10 million cells
        return "Standard pandas may be slow. Consider optimizing memory usage or using more efficient libraries."
    if cells > 1_000_000:  # 1 million cells
        return "Standard pandas should work well with sufficient memory."
    return "Dataset can be easily processed with standard tools."
|
1087 |
+
|
1088 |
+
def generate_dimensionality_summary(self, results, analysis_failures):
    """Condense the dimensionality analysis into human-readable summary lines.

    Each section of ``results`` is read defensively: if a section failed
    upstream, its summary line degrades to a "could not determine" message
    and the related insights/recommendations are skipped.

    Args:
        results: output dict of analyze_dataset_dimensionality.
        analysis_failures: names of sections that failed upstream.

    Returns:
        dict with ``shape``/``sparsity``/``memory``/``complexity`` lines,
        ``key_insights``, ``recommendations`` and optionally ``failures``.
    """
    summary = {}

    # Shape line.
    try:
        rows = results["basic_info"]["rows"]
        cols = results["basic_info"]["columns"]
        shape_desc = results["aspect_ratio"]["shape_description"]
        summary["shape"] = f"Dataset has {rows:,} rows and {cols:,} columns ({shape_desc} shape)"
    except Exception:
        summary["shape"] = "Could not determine dataset shape"

    # Sparsity line. (Removed an unused null_count local that was read here.)
    try:
        sparsity_pct = results["sparsity"]["sparsity_percentage"]
        sparsity_level = results["sparsity"]["sparsity_level"]
        summary["sparsity"] = f"Dataset is {sparsity_level} with {sparsity_pct:.1f}% missing values"
    except Exception:
        summary["sparsity"] = "Could not determine dataset sparsity"

    # Memory line; only mention the full-dataset estimate when it differs.
    try:
        mem_mb = results["memory_usage"]["total_mb"]
        full_est_mb = results["memory_usage"]["full_dataset_estimate_mb"]
        if mem_mb == full_est_mb:
            summary["memory"] = f"Memory usage: {mem_mb:.1f} MB"
        else:
            summary["memory"] = f"Memory usage: {mem_mb:.1f} MB (estimated full dataset: {full_est_mb:.1f} MB)"
    except Exception:
        summary["memory"] = "Could not determine memory usage"

    # Complexity line.
    try:
        complexity = results["processing_complexity"]["complexity_level"]
        summary["complexity"] = f"Processing complexity: {complexity}"
    except Exception:
        summary["complexity"] = "Could not determine processing complexity"

    insights = []

    try:
        # FIX: derive the column count from results instead of relying on a
        # local leaked from the shape block above; previously a NameError was
        # silently swallowed whenever that block had failed, dropping these
        # insights even when aspect_ratio data was available.
        cols = results["basic_info"]["columns"]
        if results["aspect_ratio"]["is_wide"]:
            insights.append("Dataset has more columns than rows, which is unusual and may indicate a wide/transposed format")
        if cols > 100:
            insights.append(f"High number of columns ({cols}) may indicate a need for dimensionality reduction")
    except Exception:
        pass

    try:
        if results["sparsity"]["sparsity_percentage"] > 50:
            insights.append(f"Dataset is quite sparse ({results['sparsity']['sparsity_percentage']:.1f}% missing values), consider handling missing data")

        if "column_sparsity" in results["sparsity"]:
            sparse_cols = [col for col, data in results["sparsity"]["column_sparsity"].items()
                           if data["total_sparsity"] > 80]
            if sparse_cols:
                col_count = len(sparse_cols)
                if col_count <= 3:
                    insights.append(f"Consider dropping or imputing very sparse columns: {', '.join(sparse_cols)}")
                else:
                    insights.append(f"{col_count} columns have >80% missing values and may be candidates for removal")
    except Exception:
        pass

    try:
        if results["memory_usage"].get("memory_utilization_percentage", 0) > 50:
            insights.append("Dataset is using a significant portion of system memory, consider chunking or sampling")

        if "memory_by_type" in results["memory_usage"]:
            mem_by_type = results["memory_usage"]["memory_by_type"]
            if "object" in mem_by_type:
                object_mem_ratio = mem_by_type["object"] / results["memory_usage"]["total_bytes"]
                if object_mem_ratio > 0.7:
                    insights.append("String/object columns use most memory; consider category type conversion")
    except Exception:
        pass

    try:
        complexity_level = results["processing_complexity"]["complexity_level"]
        if complexity_level in ["high", "very_high", "extremely_high"]:
            insights.append(results["processing_complexity"]["processing_recommendation"])
    except Exception:
        pass

    summary["key_insights"] = insights if insights else ["No specific insights identified"]

    recommendations = []

    try:
        # FIX: same leak repair as above for the column count.
        cols = results["basic_info"]["columns"]
        if cols > 100:
            recommendations.append("Consider dimensionality reduction (PCA, feature selection)")
        if results["aspect_ratio"]["is_wide"]:
            recommendations.append("Check if data is in correct format (might need transposing)")
    except Exception:
        pass

    try:
        if results["sparsity"]["sparsity_percentage"] > 30:
            recommendations.append("Implement appropriate missing value strategy (imputation or removal)")
    except Exception:
        pass

    try:
        if "memory_by_type" in results["memory_usage"] and "object" in results["memory_usage"]["memory_by_type"]:
            object_mem_ratio = results["memory_usage"]["memory_by_type"]["object"] / results["memory_usage"]["total_bytes"]
            if object_mem_ratio > 0.5:
                recommendations.append("Optimize memory by converting string columns to categories or using more efficient data types")
    except Exception:
        pass

    try:
        complexity_level = results["processing_complexity"]["complexity_level"]
        if complexity_level in ["very_high", "extremely_high"]:
            recommendations.append("Use chunking, sampling or distributed processing for this dataset")
    except Exception:
        pass

    summary["recommendations"] = recommendations if recommendations else ["No specific recommendations"]

    if analysis_failures:
        summary["failures"] = f"Some analyses failed: {', '.join(analysis_failures)}"

    return summary
|
1211 |
+
|
1212 |
+
def generate_report_from_agent(self, input)->str:
    '''Transform the json output to a user-readable report.

    The payload is prefixed with the configured ML task and handed to the
    writer agent. Any failure is converted into an error string so the
    pipeline never raises from report generation.
    '''
    try:
        prompt = f"ML Task: {self.ml_task}\n{input}"
        response: RunResponse = self.writer.run(prompt, stream=False)
        return response.content
    except Exception as e:
        return f"Failed to generate report with error: {e}"
|
1220 |
+
|
1221 |
+
def convert_numpy_types(self, obj):
    """Recursively convert numpy scalars and arrays inside ``obj`` to plain
    Python types (int/float/bool/list) so the structure is JSON-serializable.
    Dicts and lists are rebuilt; unrecognized objects pass through untouched.
    """
    if isinstance(obj, dict):
        return {key: self.convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [self.convert_numpy_types(element) for element in obj]
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj
|
1236 |
+
|
1237 |
+
def run(self, verbose=False)-> Dict[str, dict]:
    '''Run the entire workflow.

    Executes the three analyses (statistical summary, data types, dataset
    dimensionality), renders each as a user-readable report via the writer
    agent, and returns both the raw (numpy-free) dicts and the reports,
    keyed by analysis name.
    '''
    # Run the three analyses first.
    stats_dict = self.build_statistical_summary(verbose=verbose)
    types_dict = self.analyze_data_types(verbose=verbose)
    dims_dict = self.analyze_dataset_dimensionality(verbose=verbose)

    # Serialize each result before report generation.
    stats_str = self.format_json(stats_dict)
    types_str = self.format_json(types_dict)
    dims_str = self.format_json(dims_dict)

    logger.info("Generating final reports....", log_type='data_statistics', console=verbose)
    stats_report = self.generate_report_from_agent(input=stats_str)
    types_report = self.generate_report_from_agent(input=types_str)
    dims_report = self.generate_report_from_agent(input=dims_str)

    # Numpy scalars are converted so the dicts are JSON-serializable.
    return {
        "statistical_analysis": {
            'dict': self.convert_numpy_types(stats_dict),
            'report': stats_report
        },
        "data_type_analysis": {
            'dict': self.convert_numpy_types(types_dict),
            'report': types_report
        },
        "dataset_dimensionality_analysis": {
            'dict': self.convert_numpy_types(dims_dict),
            'report': dims_report
        },
    }
|
1268 |
+
|
1269 |
+
|
1270 |
+
|
src/app/pipelines/modules/data_understanding_context.py
ADDED
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import random
|
3 |
+
import pandas as pd
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from src.core.utils import logger
|
6 |
+
from pydantic import BaseModel, Field
|
7 |
+
from agno.models.openai import OpenAIChat
|
8 |
+
from agno.agent import Agent, RunResponse
|
9 |
+
from typing import Optional, Union, List, Dict, Tuple
|
10 |
+
from agno.tools.duckduckgo import DuckDuckGoTools
|
11 |
+
|
12 |
+
load_dotenv()
|
13 |
+
|
14 |
+
# class BCAgentResponseSchema(BaseModel):
|
15 |
+
# executive_summary: Optional[str] = Field(
|
16 |
+
# default=None,
|
17 |
+
# description="Contains information about the industry recognized, business purpose, key insights and impact on analysis"
|
18 |
+
# )
|
19 |
+
# business_concept_mapping: Optional[str] = Field(
|
20 |
+
# default=None,
|
21 |
+
# description="Contains all the columns along with their business concept, definition, industry standard term and their metric"
|
22 |
+
# )
|
23 |
+
# business_dependencies: Optional[str] = Field(
|
24 |
+
# default=None,
|
25 |
+
# description="Contains all the columns along with their dependent variable(s), nature of dependency and business impact"
|
26 |
+
# )
|
27 |
+
# key_performance_indicators: Optional[str] = Field(
|
28 |
+
# default=None,
|
29 |
+
# description="Contains names of all the kpis along with their formula, business objective, target range and industry benchmark"
|
30 |
+
# )
|
31 |
+
# regulatory_compliance_considerations: Optional[List[str]] = Field(
|
32 |
+
# default=None,
|
33 |
+
# description="Contains list all the regulations along with their applicability, key requirements and affected variables"
|
34 |
+
# )
|
35 |
+
# business_rules: Optional[List[str]] = Field(
|
36 |
+
# default=None,
|
37 |
+
# description="Contains list of all the rules along with their description, affected variables, priority level"
|
38 |
+
# )
|
39 |
+
# impact_analysis: Optional[List[str]] = Field(
|
40 |
+
# default=None,
|
41 |
+
# description="Contains list of all the business context elements along with their data quality impact, analysis approach impact, modeling approach followed by their recommended approach"
|
42 |
+
# )
|
43 |
+
# references: Optional[List[str]] = Field(
|
44 |
+
# default=None,
|
45 |
+
# description="Contains the links for all the references and sources"
|
46 |
+
# )
|
47 |
+
|
48 |
+
# class SUAgentResponseSchema(BaseModel):
|
49 |
+
# variable_definitions: Optional[Dict[str, str]] = Field(
|
50 |
+
# default=None,
|
51 |
+
# description="Contains the concise definition for each variable along with its unit of measurement observed"
|
52 |
+
# )
|
53 |
+
# reference_codes: Optional[Dict[str, str]] = Field(
|
54 |
+
# default=None,
|
55 |
+
# description="Contains the meaning for various abbreviations"
|
56 |
+
# )
|
57 |
+
# heirarchical_relationships: Optional[Dict[str, str]] = Field(
|
58 |
+
# default=None,
|
59 |
+
# description="Contains some information about heirarchy between variables"
|
60 |
+
# )
|
61 |
+
|
62 |
+
|
63 |
+
# DATA SOURCE ASSESSMENT STILL INCOMPLETE
|
64 |
+
|
65 |
+
class DataUnderstandingContextWorkflow:
|
66 |
+
def __init__(
    self, data_source: str,
    llm_choice: str,
    source_context: Union[str, None] = None,
    business_context: Union[str, None] = None
) -> None:
    '''This is the very 1st module which will be executed for all EDA tasks regardless'''
    # data_source: path/URL readable by pandas.read_csv.
    # llm_choice: model id forwarded to OpenAIChat.
    # source_context / business_context: optional free-text context used by
    # later steps; None disables the related sections.

    self.data = None  # stays None when the CSV read below fails
    self.source_context = source_context
    self.business_context = business_context
    self.llm = OpenAIChat(
        id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))

    # Load the dataset eagerly; a failure is logged rather than raised, so
    # callers should verify self.data is not None before proceeding.
    try:
        self.data = pd.read_csv(data_source)
    except Exception as e:
        logger.error(
            f"Failed to read the file from the data source with error: {e}", log_type="data_understanding_context", console=True)
|
86 |
+
def extract_column_data(self, max_col_limit=15) -> Union[str, None]:
    """
    Extracts column names and samples rows from a dataset.

    Rules:
    - If the dataset has more than `max_col_limit` columns, sample 1 random row.
    - Otherwise, sample 3 random rows (fewer when the dataset itself is smaller).

    Returns:
        A formatted string containing column names and sampled data,
        or None when no data was loaded.
    """
    # __init__ leaves self.data as None when the CSV could not be read;
    # fail soft here instead of raising AttributeError on .columns.
    if self.data is None:
        return None

    columns = self.data.columns.tolist()
    num_columns = len(columns)

    # Wide tables get a single example row to keep the LLM prompt short.
    n_rows = 1 if num_columns > max_col_limit else 3
    # DataFrame.sample(n) raises ValueError when n > len(df); clamp for tiny datasets.
    n_rows = min(n_rows, len(self.data))
    sampled_rows = self.data.sample(
        n_rows, random_state=random.randint(0, 1000))

    doc = f"""
Dataset Summary
===============

Total Columns: {num_columns}
Column Names:
{columns}

Sampled Data:
{sampled_rows.to_string(index=False)}
"""

    return doc.strip()
|
121 |
+
|
122 |
+
def build_business_context_integration(self, verbose=False) -> Union[RunResponse, None]:
    """Run a web-research agent that maps the dataset columns onto their business
    context (industry domain, concept definitions, KPIs, regulations, rules,
    impact analysis, metadata analysis).

    Returns the agent's RunResponse on success, or None when no business
    context was supplied or when the agent call failed.
    """
    logger.info(f"Starting business context integration", log_type='data_understanding_context', console=verbose)
    # Without user-supplied business context this whole step is skipped by design.
    if not self.business_context:
        logger.info("No business context found. Skipping this section...",
                    log_type='data_understanding_context', console=verbose)
        return None
    else:

        try:
            # Reasoning-enabled agent with DuckDuckGo web search so claims can
            # be backed by links. NOTE(review): the structured response model
            # (BCAgentResponseSchema) is commented out, so the output is
            # free-form markdown rather than a validated schema.
            bc_agent = Agent(
                name="Businesss Context Agent",
                model=self.llm,
                markdown=True,
                reasoning=True,
                # response_model=BCAgentResponseSchema,
                instructions="""
                Business Context Integration Agent Prompt

                Your objective is to serve as a Business Context Integration Agent. Given a dataset with column names and the type of machine learning task (e.g., classification, regression, clustering), your responsibility is to gather authoritative, relevant business context by performing extensive web-based research.

                All claims must be supported with referenced links.

                Use the dataset columns and ML task as the anchor to drive your research and reasoning.

                -----------------------------------------------
                Research Goals:
                -----------------------------------------------

                1. executive_summary:
                - Identify the broader industry and functional domain that the dataset likely belongs to based on the column names.
                - Outline the business purpose for building the machine learning model.
                - Explain how incorporating domain knowledge impacts the model’s relevance and usefulness in a real-world context.
                - Highlight notable trends, challenges, or insights from the industry that are relevant to the problem space.

                2. business_concept_mapping:
                - Map each dataset column to its corresponding business concept and provide a clear and formal definition.
                - Identify industry-recognized terms and naming conventions related to the variables.
                - Include units of measurement, and where applicable, mathematical formulas or logic behind computed values.
                - Use standardized terminologies from domain-specific glossaries or data dictionaries to ensure consistency.

                3. business_dependencies:
                - For each variable, identify direct and indirect relationships with other variables in the dataset.
                - Define whether these relationships are statistical, causal, temporal, or logical.
                - Reference literature or domain analyses that justify these dependencies and describe their business implications.
                - Explain how these dependencies may affect downstream model features or interpretation.

                4. key_performance_indicators:
                - Determine the key performance indicators (KPIs) that are either directly present in the dataset or can be derived from it.
                - Provide detailed formulas, objectives behind each KPI, and their role in strategic or operational decision-making.
                - Include target thresholds or performance ranges based on benchmarks from industry or regulatory sources.
                - Document frameworks or methodologies used to calculate and track these KPIs.

                5. regulatory_compliance_considerations:
                - Identify regulations, compliance requirements, or legal constraints that apply to the variables or their use in machine learning models.
                - Summarize mandates from regulatory bodies, compliance frameworks, and data protection laws relevant to the domain.
                - Detail how these regulations affect data collection, storage, analysis, and reporting for the columns involved.
                - Note any enforcement actions or penalties associated with non-compliance, if applicable.

                6. business_rules:
                - Collect formal and informal business rules that define the valid behavior, constraints, or validation criteria for each variable.
                - Document rule sources such as process documentation, system specifications, industry best practices, and technical standards.
                - Classify rules by type (validation, constraint, workflow, consistency) and specify affected variables.
                - Assign a priority level (e.g., High, Medium, Low) to each rule based on its impact or frequency.

                7. impact_analysis:
                - Assess the influence of the business context elements (e.g., dependencies, rules, KPIs, regulations) on data quality and model effectiveness.
                - Highlight how they should inform exploratory analysis, feature selection, or modeling strategies.
                - Recommend specific techniques or precautions to address potential risks or quality issues stemming from business constraints or misalignments.
                - Discuss how different business-driven assumptions or constraints might affect model interpretability, performance, and deployment.

                8. references:
                - Provide a list of URLs for all sources consulted or referenced in each of the above categories.
                - Prioritize credible references including academic publications, regulatory websites, vendor documentation, whitepapers, and official data standards.


                9. metadata_analysis:
                - Evaluate the naming conventions of each column in the dataset to determine if they follow consistent, domain-aligned patterns.
                • Identify the naming style (e.g., snake_case, camelCase, abbreviations) and flag inconsistencies.
                • Recommend improvements to align with industry-standard naming practices where needed.
                • Justify suggestions based on business readability, clarity, and traceability.
                - Categorize all dataset variables into the following types:
                • Identifier Variables: Used to uniquely identify records (e.g., IDs, Ticket Numbers).
                • Analytical Variables: Variables that contain information relevant to prediction, segmentation, or business insight generation.
                • Target Variables: Outcome labels used in supervised machine learning tasks.
                • Derived or Redundant Variables: Columns generated from transformations, combinations, or duplicates of others.
                - Explain the business role and implications of each variable type:
                • Why are certain fields considered identifiers and not used for analysis?
                • Which analytical variables are central to decision-making and why?
                • How does identifying target and redundant variables support data quality and model interpretability?
                - Where applicable, reference industry data dictionaries or data modeling standards to support naming and classification.

                -----------------------------------------------
                Guidelines:
                -----------------------------------------------

                - Maintain a descriptive, business-friendly tone in all outputs.
                - Avoid technical jargon unless it is relevant to business understanding or regulatory compliance.
                - Do not make assumptions without a verifiable source; indicate low-confidence insights where applicable.
                - Ensure all responses are grounded in real-world context and supported by credible, traceable information.
                - Be comprehensive but concise—aim to deliver practical business value with each section of the output.
                """,
                tools=[DuckDuckGoTools(search=True, news=False)]
            )

            # Column names + sampled rows give the agent its dataset anchor.
            columns_data = self.extract_column_data()
            prompt = (
                f'Business Context: \n'
                f'{self.business_context}\n'
                f'{columns_data}'
            )

            response: RunResponse = bc_agent.run(prompt, stream=False)

            logger.info(f"Business Context integration finished....",
                        log_type="data_understanding_context", console=verbose)
            return response
        except Exception as e:
            # Any failure (network, model, tool) is logged and reported as None.
            logger.error(
                f"Failed to build business context integration with error: {e}", log_type="data_understanding_context", console=verbose)
            return None
|
242 |
+
|
243 |
+
def build_semantic_understanding(self, verbose=False) -> Union[RunResponse, None]:
    """Run an agent that infers per-column semantic metadata (definitions,
    units, reference codes, hierarchical relationships) from column names and
    sampled values alone — no web tools, no external documentation.

    Returns the agent's RunResponse on success, or None on failure.
    """
    logger.info(f"Starting to build semantic integration", log_type='data_understanding_context', console=verbose)
    try:
        # NOTE(review): unlike the business-context agent, this one has no
        # search tools, and its structured response model
        # (SUAgentResponseSchema) is commented out, so output is markdown.
        bc_agent = Agent(
            name="Semantic Understanding Agent",
            model=self.llm,
            markdown=True,
            reasoning=True,
            # response_model=SUAgentResponseSchema,
            instructions="""
            Objective:
            You are a Semantic Understanding Agent responsible for analyzing a dataset with no prior documentation. Your goal is to enrich each variable (column) by extracting and constructing meaningful metadata that helps clarify the variable’s semantic role in the dataset.
            This process is essential for enabling downstream tasks like data validation, feature engineering, explainability, and regulatory compliance.

            You must generate detailed descriptions and metadata across the following four components for each column in the dataset:

            -------------------------------------------------------------
            1. Create/Validate Variable Definitions
            -------------------------------------------------------------
            - Infer and construct a clear, concise, and human-readable definition for each variable based solely on the column name, data type, and observed value patterns.
            - Where the column contains obvious domain terms, use domain-specific language to define them precisely.
            - If a variable is ambiguous, document your reasoning and note the uncertainty.
            - Ensure that the definition is useful for a business analyst or data scientist trying to understand the purpose of the variable.

            -------------------------------------------------------------
            2. Document Units of Measurement for Each Variable
            -------------------------------------------------------------
            - Determine if the variable has an implicit or explicit unit of measurement.
            - Use value ranges and patterns to infer the unit (e.g., numeric age values imply years).
            - If no unit is applicable (e.g., categorical strings), indicate “N/A”.
            - Standardize the unit using singular form and SI/non-SI conventions when applicable.

            -------------------------------------------------------------
            3. Identify Reference Codes and Their Meanings
            -------------------------------------------------------------
            - Identify variables that represent coded values, whether single-letter, numeric, or short abbreviations.
            - For such variables, list all unique values and attempt to map each one to a meaningful label.
            - Use statistical reasoning (value distribution) and common domain knowledge to deduce what each code stands for.
            - If meanings are uncertain, flag them with a confidence level.


            -------------------------------------------------------------
            4. Document Any Known Hierarchical Relationships
            -------------------------------------------------------------
            - Identify columns that relate to one another hierarchically, structurally, or contextually.
            - Describe the hierarchy and explain the nature of the relationship (e.g., parent-child, group-subgroup, categorical-numerical link).
            - If patterns suggest derived or grouped relationships, explain your logic.

            -------------------------------------------------------------
            Output Guidelines:
            -------------------------------------------------------------
            - Be precise and neutral in language.
            - If uncertain, include a note explaining ambiguity or assumption.
            - Avoid inventing information—only reason from what's in the dataset.

            -------------------------------------------------------------
            Final Note:
            -------------------------------------------------------------
            You do not have access to external documentation or glossaries. All your outputs must be reasoned and extracted using the dataset column names and data types alone
            """
        )

        # The prompt is just the column summary; all reasoning is in the instructions.
        columns_data = self.extract_column_data()

        prompt = (
            f"{columns_data}"
        )

        response: RunResponse = bc_agent.run(prompt, stream=False)

        logger.info(f"Semantic Understanding Finished....",
                    log_type="data_understanding_context", console=verbose)
        return response
    except Exception as e:
        logger.error(
            f"Failed to build Semantic Understanding with error: {e}", log_type="data_understanding_context", console=verbose)
        return None
|
320 |
+
|
321 |
+
def run(self, verbose=False) -> Dict[str, dict]:
    """Execute both context-building steps and collect their reports.

    Either step may legitimately return None (no business context was
    supplied, or the agent call failed); in that case the corresponding
    report is None instead of dereferencing ``.content`` on None, which
    previously raised AttributeError on the documented skip path.
    """
    bci_result = self.build_business_context_integration(verbose=verbose)
    su_result = self.build_semantic_understanding(verbose=verbose)

    return {
        'business_context_integration': {
            "report": bci_result.content if bci_result is not None else None
        },
        'semantic_understanding': {
            "report": su_result.content if su_result is not None else None
        }
    }
|
src/app/pipelines/modules/univariate_analysis.py
ADDED
@@ -0,0 +1,1437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import math
|
3 |
+
import json
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
from tqdm import tqdm
|
7 |
+
import lmoments3 as lm
|
8 |
+
import scipy.stats as stats
|
9 |
+
from src.core.utils import logger
|
10 |
+
from agno.models.openai import OpenAIChat
|
11 |
+
from agno.agent import Agent, RunResponse
|
12 |
+
from sklearn.neighbors import KernelDensity
|
13 |
+
from sklearn.model_selection import GridSearchCV
|
14 |
+
from typing import Union, Tuple, Dict, Any, List, Optional
|
15 |
+
from scipy.stats import norm, shapiro, anderson, kstest, normaltest
|
16 |
+
|
17 |
+
class UnivariateAnalysisWorkflow:
|
18 |
+
def __init__(
    self, data_source: str,
    llm_choice: str,
    ml_task: str
) -> None:
    """Set up the univariate-analysis workflow: LLM handle, report-writer agent,
    and an eagerly loaded dataset (self.data is None when loading fails)."""
    self.data = None
    self.data_source = data_source
    self.llm_choice = llm_choice
    self.llm = OpenAIChat(id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))
    self.writer: Agent = Agent(
        model=self.llm,
        instructions=[
            # The two literals below are implicitly concatenated into ONE
            # instruction string; the trailing space is required so the words
            # "this" and "in" do not run together as "thisin" (previous bug).
            "You will be provided with lots of structured outputs. Your work is to display this "
            "in a nicely formatted manner. You must analayze the results and output a comprehensive and insightful report"
        ],
        markdown=True,
    )
    self.ml_task = ml_task
    # Load immediately; failures are logged inside load_data.
    _ = self.load_data(data_source=data_source)
|
38 |
+
|
39 |
+
def load_data(self, data_source: str) -> Union[None, bool]:
|
40 |
+
'''Load CSV into dataframe'''
|
41 |
+
try:
|
42 |
+
self.data = pd.read_csv(data_source)
|
43 |
+
return True
|
44 |
+
except Exception as e:
|
45 |
+
logger.error(f"Failed to read the file from the data source with error: {e}", log_type="data_quality_assessment", console=True)
|
46 |
+
return False
|
47 |
+
|
48 |
+
def detect_variable_types(self)->Tuple[Dict[str, str], Dict[str, str]]:
|
49 |
+
"""Automatically detect continuous vs categorical variables"""
|
50 |
+
continuous_vars = {}
|
51 |
+
categorical_vars = {}
|
52 |
+
|
53 |
+
for col in self.data.columns:
|
54 |
+
col_data = self.data[col].dropna()
|
55 |
+
|
56 |
+
if len(col_data) == 0:
|
57 |
+
continue
|
58 |
+
|
59 |
+
dtype = str(self.data[col].dtype)
|
60 |
+
nunique = col_data.nunique()
|
61 |
+
sample_values = col_data.head(5).tolist()
|
62 |
+
|
63 |
+
if np.issubdtype(self.data[col].dtype, np.datetime64):
|
64 |
+
continue
|
65 |
+
|
66 |
+
if dtype == 'bool' or nunique == 2 and set(col_data.unique()).issubset({0, 1, True, False}):
|
67 |
+
categorical_vars[col] = 'binary'
|
68 |
+
continue
|
69 |
+
|
70 |
+
if dtype == 'object':
|
71 |
+
try:
|
72 |
+
_ = pd.to_numeric(col_data)
|
73 |
+
col_data = pd.to_numeric(col_data)
|
74 |
+
except ValueError:
|
75 |
+
categorical_vars[col] = f'categorical (text, {nunique} unique values)'
|
76 |
+
continue
|
77 |
+
|
78 |
+
if np.issubdtype(col_data.dtype, np.number):
|
79 |
+
if nunique <= 20:
|
80 |
+
if (np.array_equal(col_data.unique(), np.arange(nunique)) or (nunique <= 10 and all(x in col_data.unique() for x in range(1, nunique+1)))):
|
81 |
+
categorical_vars[col] = f'categorical (ordinal, {nunique} levels)'
|
82 |
+
else:
|
83 |
+
categorical_vars[col] = f'categorical (nominal, {nunique} levels)'
|
84 |
+
else:
|
85 |
+
continuous_vars[col] = 'continuous'
|
86 |
+
else:
|
87 |
+
categorical_vars[col] = f'categorical (other, {nunique} unique values)'
|
88 |
+
|
89 |
+
return continuous_vars, categorical_vars
|
90 |
+
|
91 |
+
def analyze_distributions(self, verbose=False):
    """Run distribution analysis for all continuous variables."""
    logger.info("Starting to analyze distributions..", log_type='univariate_analysis', console=verbose)

    continuous_vars, _ = self.detect_variable_types()
    results = {}

    for var in tqdm(continuous_vars):
        try:
            # Each continuous variable gets a KDE profile plus the normality battery.
            analysis = {}
            analysis['kde'] = self.kernel_density_estimation(var)
            analysis['normality_tests'] = self.run_normality_tests(var)
        except Exception as e:
            if verbose:
                logger.error(f"Error analyzing variable {var}: {str(e)}", log_type="data_quality_assessment", console=verbose)
            analysis = {'error': str(e)}
        results[var] = analysis

    return results
|
110 |
+
|
111 |
+
def kernel_density_estimation(self, variable: str, verbose: bool = False) -> Dict[str, Any]:
    """Perform comprehensive kernel density estimation with optimal bandwidth selection, multiple kernel types, and comparative analysis.

    Each stage is wrapped in its own try/except so a single failure does not
    abort the rest; failed stages are logged and later stages degrade
    (e.g. parametric fits fail quietly when the KDE grid is unavailable).

    Raises:
        ValueError: if `variable` is not a column of the loaded dataset.
    """
    if variable not in self.data.columns:
        raise ValueError(f"Variable {variable} not found in dataset")

    # Work on the non-null values only.
    x = self.data[variable].dropna().values
    results = {}

    # --- Stage 1: descriptive statistics -------------------------------
    try:
        results['basic_stats'] = {
            'n': len(x),
            'mean': np.mean(x),
            'std': np.std(x),
            'min': np.min(x),
            'max': np.max(x),
            'skewness': stats.skew(x),
            'kurtosis': stats.kurtosis(x)
        }
    except Exception as e:
        logger.error(f"Failed to compute basic stats: {e}", log_type='univariate_analysis', console=verbose)

    # --- Stage 2: rule-of-thumb bandwidths (Silverman, Scott) ----------
    try:
        silverman_bandwidth = (4 * np.std(x)**5 / (3 * len(x)))**(1/5)
        scott_bandwidth = 1.06 * np.std(x) * len(x)**(-1/5)
    except Exception as e:
        silverman_bandwidth = scott_bandwidth = None
        logger.error(f"Failed to compute Silverman/Scott bandwidths: {e}", log_type='univariate_analysis', console=verbose)

    # --- Stage 3: cross-validated bandwidth over a fixed grid ----------
    try:
        grid = GridSearchCV(
            KernelDensity(kernel='gaussian'),
            {'bandwidth': np.linspace(0.1, 2, 30)},
            cv=5
        )
        grid.fit(x.reshape(-1, 1))
        cv_bandwidth = grid.best_params_['bandwidth']
    except Exception as e:
        cv_bandwidth = None
        logger.error(f"Grid search for bandwidth failed: {e}", log_type='univariate_analysis', console=verbose)

    # Preference order: cross-validated, then Silverman, then Scott.
    # NOTE(review): `or`-chaining also skips a legitimate 0.0 bandwidth,
    # not just None — confirm that is acceptable.
    results['bandwidth_selection'] = {
        'silverman': silverman_bandwidth,
        'scott': scott_bandwidth,
        'cross_validated': cv_bandwidth,
        'selected_bandwidth': cv_bandwidth or silverman_bandwidth or scott_bandwidth
    }
    bandwidth = results['bandwidth_selection']['selected_bandwidth']

    # --- Stage 4: fit one KDE per kernel type at the selected bandwidth
    kernels = ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']
    kde_models = {}
    for kernel in kernels:
        try:
            kde = KernelDensity(kernel=kernel, bandwidth=bandwidth)
            kde.fit(x.reshape(-1, 1))
            kde_models[kernel] = kde
        except Exception as e:
            logger.error(f"Failed to fit KDE with kernel {kernel}: {e}", log_type='univariate_analysis', console=verbose)

    # --- Stage 5: pick the best kernel by grid log-likelihood ----------
    # On failure, density/x_grid become None so later stages fail softly.
    try:
        x_grid = np.linspace(np.min(x), np.max(x), 1000)
        log_likelihoods = {}
        for kernel, model in kde_models.items():
            log_likelihoods[kernel] = model.score_samples(x_grid.reshape(-1, 1)).sum()

        results['kernel_comparison'] = {
            'log_likelihoods': log_likelihoods,
            'best_kernel': max(log_likelihoods, key=log_likelihoods.get)
        }
        best_kernel = results['kernel_comparison']['best_kernel']
        best_kde = kde_models[best_kernel]
        density = np.exp(best_kde.score_samples(x_grid.reshape(-1, 1)))
    except Exception as e:
        results['kernel_comparison'] = {}
        density = None
        x_grid = None
        logger.error(f"Kernel comparison failed: {e}", log_type='univariate_analysis', console=verbose)

    # --- Stage 6: compare against common parametric families -----------
    # KL divergence between the (smoothed) KDE density and each fitted pdf;
    # the 1e-10 offsets avoid log(0). When Stage 5 failed, every iteration
    # raises on the None density and is logged+skipped.
    dist_names = ['norm', 'lognorm', 'expon', 'gamma', 'beta']
    dist_results = []
    for dist_name in dist_names:
        try:
            params = getattr(stats, dist_name).fit(x)
            pdf = getattr(stats, dist_name).pdf(x_grid, *params)
            kl_div = stats.entropy((density + 1e-10), (pdf + 1e-10))
            dist_results.append({
                'distribution': dist_name,
                'params': params,
                'kl_divergence': kl_div
            })
        except Exception as e:
            logger.error(f"Failed to fit and evaluate {dist_name}: {e}", log_type='univariate_analysis', console=verbose)

    # Rank parametric fits by KL divergence (lower = closer to the KDE).
    # NOTE(review): dist_pdf is computed but never used afterwards — looks
    # like a leftover; confirm before removing.
    try:
        results['parametric_comparison'] = sorted(dist_results, key=lambda x: x['kl_divergence'])
        if results['parametric_comparison']:
            best_dist = results['parametric_comparison'][0]
            dist_pdf = getattr(stats, best_dist['distribution']).pdf(x_grid, *best_dist['params'])
    except Exception as e:
        results['parametric_comparison'] = []
        logger.error(f"Parametric comparison failed: {e}", log_type='univariate_analysis', console=verbose)

    # --- Stage 7: sensitivity of log-likelihood to the bandwidth -------
    # Uses best_kernel from Stage 5; if that stage failed, best_kernel is
    # undefined and this whole stage is logged as failed.
    try:
        bandwidths = np.linspace(bandwidth * 0.1, bandwidth * 2, 20)
        log_liks = []
        for bw in bandwidths:
            kde = KernelDensity(kernel=best_kernel, bandwidth=bw)
            kde.fit(x.reshape(-1, 1))
            log_liks.append(kde.score(x.reshape(-1, 1)))
        results['bandwidth_sensitivity'] = {
            'bandwidths': bandwidths.tolist(),
            'log_likelihoods': log_liks
        }
    except Exception as e:
        logger.error(f"Bandwidth sensitivity analysis failed: {e}", log_type='univariate_analysis', console=verbose)

    # --- Stage 8: human-readable roll-up of everything above -----------
    results['summary'] = self.summarize_kde_results(kde_results=results, variable=variable)

    return results
|
232 |
+
|
233 |
+
def summarize_kde_results(self, kde_results: Dict[str, Any], variable: str) -> Dict[str, Any]:
    """Generate a precise and insightful summary of kernel density estimation results.

    Args:
        kde_results: Raw KDE analysis output; recognised keys are
            'basic_stats', 'kernel_comparison', 'parametric_comparison'
            and 'bandwidth_selection'. Missing keys are simply skipped.
        variable: Name of the variable analysed (labelling only).

    Returns:
        Dict with 'distribution_characteristics', 'best_fit',
        'technical_details' and 'recommendations'. Each section degrades to
        an {'error': ...} entry on failure instead of aborting the summary.
    """
    summary = {
        'variable': variable,
        'distribution_characteristics': {},
        'best_fit': {},
        'technical_details': {}
    }

    try:
        if 'basic_stats' in kde_results:
            # Bound to a local named 'basic' (not 'stats') so we do not
            # shadow the scipy.stats module imported at file level.
            basic = kde_results['basic_stats']
            shape = "symmetric"
            if basic.get('skewness') is not None:
                if basic['skewness'] > 0.5:
                    shape = "right-skewed"
                elif basic['skewness'] < -0.5:
                    shape = "left-skewed"
            peakedness = "mesokurtic"
            if basic.get('kurtosis') is not None:
                if basic['kurtosis'] > 0.5:
                    peakedness = "leptokurtic (heavy-tailed)"
                elif basic['kurtosis'] < -0.5:
                    peakedness = "platykurtic (light-tailed)"
            range_width = basic.get('max', 0) - basic.get('min', 0)
            std_ranges = range_width / basic.get('std', 1) if basic.get('std') else None
            summary['distribution_characteristics'] = {
                'shape': shape,
                'peakedness': peakedness,
                'central_tendency': basic.get('mean'),
                'dispersion': basic.get('std'),
                # Explicit None check so a legitimate 0.0 is reported, not dropped.
                'range_in_std_units': round(std_ranges, 2) if std_ranges is not None else None,
                'sample_size': basic.get('n')
            }
    except Exception as e:
        if variable:
            summary['distribution_characteristics'] = {'error': str(e)}

    try:
        if 'kernel_comparison' in kde_results and kde_results['kernel_comparison']:
            summary['best_fit']['nonparametric'] = {
                'method': 'kernel_density',
                'best_kernel': kde_results['kernel_comparison'].get('best_kernel')
            }
    except Exception as e:
        summary['best_fit']['nonparametric'] = {'error': str(e)}

    try:
        if 'parametric_comparison' in kde_results and kde_results['parametric_comparison']:
            # Entries are sorted by KL divergence, so index 0 is the best fit.
            best_dist = kde_results['parametric_comparison'][0]
            kl_div = best_dist.get('kl_divergence', None)
            summary['best_fit']['parametric'] = {
                'distribution': best_dist.get('distribution'),
                # Explicit None check: a perfect fit (KL == 0.0) is a valid value.
                'kl_divergence': round(kl_div, 4) if kl_div is not None else None
            }
            if kl_div is not None:
                if kl_div < 0.05:
                    fit_quality = "excellent"
                elif kl_div < 0.1:
                    fit_quality = "good"
                elif kl_div < 0.3:
                    fit_quality = "moderate"
                else:
                    fit_quality = "poor"
                summary['best_fit']['parametric']['fit_quality'] = fit_quality
    except Exception as e:
        summary['best_fit']['parametric'] = {'error': str(e)}

    try:
        if 'bandwidth_selection' in kde_results:
            bw = kde_results['bandwidth_selection']
            selected = bw.get('selected_bandwidth')
            # Recover which estimation method produced the chosen bandwidth.
            method = next((k for k, v in bw.items()
                           if v == selected and k != 'selected_bandwidth'), None)
            summary['technical_details']['bandwidth'] = {
                'selected': selected,
                'selection_method': method
            }
    except Exception as e:
        summary['technical_details']['bandwidth'] = {'error': str(e)}

    try:
        if 'basic_stats' in kde_results and 'bandwidth_selection' in kde_results:
            range_width = kde_results['basic_stats'].get('max', 0) - kde_results['basic_stats'].get('min', 0)
            selected_bw = kde_results['bandwidth_selection'].get('selected_bandwidth', 0)
            if selected_bw and range_width:
                # A very narrow bandwidth relative to the data range hints at
                # several separate peaks in the estimated density.
                relative_bw = selected_bw / range_width
                if relative_bw < 0.03:
                    multimodality = "highly likely"
                elif relative_bw < 0.06:
                    multimodality = "possible"
                else:
                    multimodality = "unlikely"
                summary['distribution_characteristics']['multimodality'] = multimodality
    except Exception as e:
        summary['distribution_characteristics']['multimodality_error'] = str(e)

    try:
        recommendations = []
        if 'basic_stats' in kde_results and kde_results['basic_stats'].get('n', 0) < 30:
            recommendations.append("Sample size is small; interpret results with caution")
        # BUG FIX: the original grouping `(A and B) or C` evaluated C even when
        # 'basic_stats' was missing, raising KeyError and wiping all
        # recommendations; both skew and kurtosis checks now sit behind the
        # membership guard.
        if ('basic_stats' in kde_results and
                (abs(kde_results['basic_stats'].get('skewness', 0)) > 1.5 or
                 abs(kde_results['basic_stats'].get('kurtosis', 0)) > 2)):
            recommendations.append("Distribution has extreme values; consider transformation or robust methods")
        if ('parametric_comparison' in kde_results and kde_results['parametric_comparison'] and
                kde_results['parametric_comparison'][0].get('kl_divergence', 1) < 0.1):
            best_dist = kde_results['parametric_comparison'][0]['distribution']
            recommendations.append(f"Consider using {best_dist} distribution for parametric modeling")
        summary['recommendations'] = recommendations
    except Exception as e:
        summary['recommendations'] = [f"Failed to generate recommendations: {str(e)}"]

    return summary
|
348 |
+
|
349 |
+
def run_normality_tests(self, variable, verbose=False):
    """Run a battery of normality tests on one column of self.data.

    Which tests run depends on the working sample size:
      < 30       -> Shapiro-Wilk, D'Agostino-Pearson
      30..2000   -> Shapiro-Wilk, D'Agostino-Pearson, Anderson-Darling
      > 2000     -> D'Agostino-Pearson, Anderson-Darling, Kolmogorov-Smirnov

    Args:
        variable: Column name in self.data to test.
        verbose: Echo log messages to the console.

    Returns:
        Dict of per-test result dicts plus a 'summary' entry, or
        {'error': ...} if the column could not be processed at all.
    """
    try:
        data = self.data[variable].dropna()

        # Cap the working sample for large datasets: the tests get slow and
        # over-sensitive as n grows. Thresholds use the full dataset length,
        # not this column's non-null count — TODO confirm that is intended.
        # (The dead initialisation `sample_size = len(data)` was removed:
        # every branch below assigns sample_size.)
        if len(self.data) > 10000:
            sample_size = 1000
        elif 5000 < len(self.data) <= 10000:
            sample_size = int(0.2 * len(self.data))
        else:
            sample_size = len(self.data)

        if sample_size < len(data):
            data = data.sample(n=sample_size, random_state=42)

        normality_tests = {}

        try:
            if sample_size < 30:
                normality_tests['shapiro_wilk'] = self.shapiro_wilk_normality_test(data)
                normality_tests['dagostino_pearson'] = self.dagostino_pearson_normality_test(data)
            elif 30 <= sample_size <= 2000:
                normality_tests['shapiro_wilk'] = self.shapiro_wilk_normality_test(data)
                normality_tests['dagostino_pearson'] = self.dagostino_pearson_normality_test(data)
                normality_tests['anderson_darling'] = self.anderson_darling_normality_test(data)
            elif sample_size > 2000:
                normality_tests['dagostino_pearson'] = self.dagostino_pearson_normality_test(data)
                normality_tests['anderson_darling'] = self.anderson_darling_normality_test(data)
                normality_tests['kolmogorov_smirnov'] = self.kolmogorov_smirnov_normality_test(data)
        except Exception as e:
            logger.error(f"Error running normality tests for {variable}: {str(e)}", log_type="data_quality_assessment", console=verbose)
            normality_tests['error'] = str(e)

        normality_tests['summary'] = self.generate_normality_test_summary(normality_tests, variable)

        return normality_tests

    except Exception as e:
        logger.error(f"Error processing variable {variable}: {str(e)}", log_type="data_quality_assessment", console=verbose)
        return {'error': str(e)}
|
390 |
+
|
391 |
+
def shapiro_wilk_normality_test(self, data):
    """Run the Shapiro-Wilk normality test.

    Returns a dict with 'test_statistic', 'p_value' and a plain-English
    'interpretation'; on failure, an 'error' entry with None statistics.
    """
    try:
        statistic, p_value = shapiro(data)
    except Exception as exc:
        return {
            'error': f"Shapiro-Wilk test failed: {str(exc)}",
            'test_statistic': None,
            'p_value': None
        }
    verdict = (
        'Data looks normally distributed (fail to reject H0)'
        if p_value > 0.05
        else 'Data does not look normally distributed (reject H0)'
    )
    return {'test_statistic': statistic, 'p_value': p_value, 'interpretation': verdict}
|
406 |
+
|
407 |
+
def dagostino_pearson_normality_test(self, data):
    """Run the D'Agostino-Pearson omnibus normality test.

    Returns a dict with 'test_statistic', 'p_value' and a plain-English
    'interpretation'; on failure, an 'error' entry with None statistics.
    """
    try:
        statistic, p_value = normaltest(data)
    except Exception as exc:
        return {
            'error': f"D'Agostino-Pearson test failed: {str(exc)}",
            'test_statistic': None,
            'p_value': None
        }
    verdict = (
        'Data looks normally distributed (fail to reject H0)'
        if p_value > 0.05
        else 'Data does not look normally distributed (reject H0)'
    )
    return {'test_statistic': statistic, 'p_value': p_value, 'interpretation': verdict}
|
422 |
+
|
423 |
+
def anderson_darling_normality_test(self, data):
    """Run the Anderson-Darling normality test, judged at the 5% level.

    Returns the statistic, the critical values/significance levels reported
    by scipy, and a verdict based on the 5% critical value (index 2).
    """
    try:
        ad = anderson(data, dist='norm')
    except Exception as exc:
        return {
            'error': f"Anderson-Darling test failed: {str(exc)}",
            'test_statistic': None,
            'critical_values': None
        }
    # scipy orders critical values by significance level; index 2 is 5%.
    normal_at_5pct = ad.statistic < ad.critical_values[2]
    return {
        'test_statistic': ad.statistic,
        'critical_values': ad.critical_values,
        'significance_levels': ad.significance_level,
        'interpretation': 'Data looks normally distributed at 5% level' if normal_at_5pct else 'Data does not look normally distributed at 5% level'
    }
|
439 |
+
|
440 |
+
def kolmogorov_smirnov_normality_test(self, data):
    """Run a Kolmogorov-Smirnov test against a normal fitted to the sample.

    NOTE(review): mu/sigma are estimated from the same data being tested,
    so the standard KS p-value is approximate — confirm that is acceptable
    for this pipeline.
    """
    try:
        mu, std = norm.fit(data)
        statistic, p_value = kstest(data, 'norm', args=(mu, std))
    except Exception as exc:
        return {
            'error': f"Kolmogorov-Smirnov test failed: {str(exc)}",
            'test_statistic': None,
            'p_value': None
        }
    verdict = (
        'Data looks normally distributed (fail to reject H0)'
        if p_value > 0.05
        else 'Data does not look normally distributed (reject H0)'
    )
    return {'test_statistic': statistic, 'p_value': p_value, 'interpretation': verdict}
|
458 |
+
|
459 |
+
def generate_normality_test_summary(self, normality_tests_results, variable_name):
    """Generate a structured and insightful summary of normality test results"""
    # Short-circuit when the whole battery failed upstream (top-level 'error' key).
    try:
        if 'error' in normality_tests_results:
            return {
                'variable': variable_name,
                'normality_assessment': 'Error in analysis',
                'error_message': normality_tests_results['error'],
                'recommendation': 'Check the data quality or transform the variable'
            }
    except Exception as e:
        return {
            'variable': variable_name,
            'normality_assessment': 'Error in checking error flag',
            'error_message': str(e),
            'recommendation': 'Ensure error key structure is valid'
        }

    # Tally how many of the performed tests point towards normality.
    total_tests = 0
    normal_tests = 0
    tests_performed = []
    test_details = {}

    try:
        # Iterates every entry in the results dict; only the four known test
        # names contribute details — unknown entries still bump total_tests.
        for test_name, result in normality_tests_results.items():
            try:
                if 'error' in result:
                    test_details[test_name] = {
                        'status': 'Failed',
                        'reason': result['error']
                    }
                    continue

                total_tests += 1
                tests_performed.append(test_name)

                is_normal = False

                # p-value-based tests: normality is not rejected when p > 0.05.
                if test_name in ['shapiro_wilk', 'dagostino_pearson', 'kolmogorov_smirnov']:
                    is_normal = result.get('p_value', 0) > 0.05
                    test_details[test_name] = {
                        'test_statistic': result.get('test_statistic'),
                        'p_value': result.get('p_value'),
                        'suggests_normality': is_normal
                    }

                # Anderson-Darling: compare the statistic to the 5% critical value.
                elif test_name == 'anderson_darling':
                    is_normal = result.get('test_statistic', float('inf')) < result.get('critical_values', [0, 0, 0])[2]
                    test_details[test_name] = {
                        'test_statistic': result.get('test_statistic'),
                        'critical_value_5pct': result.get('critical_values', [0, 0, 0])[2],
                        'suggests_normality': is_normal
                    }

                if is_normal:
                    normal_tests += 1

            except Exception as e:
                # A malformed single result must not abort the whole summary.
                test_details[test_name] = {
                    'status': 'Failed',
                    'reason': f"Exception during processing: {str(e)}"
                }
    except Exception as e:
        return {
            'variable': variable_name,
            'normality_assessment': 'Error in parsing test results',
            'error_message': str(e),
            'recommendation': 'Ensure the test result structure is consistent'
        }

    # Convert the tally into a qualitative verdict with a recommendation.
    try:
        normality_score = normal_tests / total_tests if total_tests > 0 else 0

        if normality_score >= 0.75:
            normality_assessment = 'Normal'
            confidence = 'High'
            recommendation = 'Proceed with parametric tests'
        elif normality_score >= 0.5:
            normality_assessment = 'Likely Normal'
            confidence = 'Moderate'
            recommendation = 'Consider parametric tests, but verify with visual inspection'
        elif normality_score >= 0.25:
            normality_assessment = 'Likely Non-Normal'
            confidence = 'Moderate'
            recommendation = 'Consider non-parametric alternatives or data transformation'
        else:
            normality_assessment = 'Non-Normal'
            confidence = 'High'
            recommendation = 'Use non-parametric tests or transform the data'
    except Exception as e:
        return {
            'variable': variable_name,
            'normality_assessment': 'Error in calculating summary statistics',
            'error_message': str(e),
            'recommendation': 'Check calculations or inputs for division errors'
        }

    try:
        summary = {
            'variable': variable_name,
            'normality_assessment': normality_assessment,
            'confidence': confidence,
            'normal_tests_ratio': f"{normal_tests}/{total_tests}",
            'normality_score': normality_score,
            'tests_performed': tests_performed,
            'test_details': test_details,
            'recommendation': recommendation
        }
    except Exception as e:
        return {
            'variable': variable_name,
            'normality_assessment': 'Error in creating summary',
            'error_message': str(e),
            'recommendation': 'Inspect result structure and try again'
        }

    return summary
|
576 |
+
|
577 |
+
def fit_distributions(self, variable, verbose=False):
    """Fit common distributions and assess goodness-of-fit"""
    # TODO: not implemented yet — placeholder that does nothing and returns None.
    pass
|
580 |
+
|
581 |
+
def test_modality(self, variable, verbose=False):
    """ Test for unimodal vs multimodal distributions using Hartigan's Dip Test"""
    # NOTE: unlike most methods in this class, `variable` here is the data
    # itself (array-like), not a column name — it goes straight through
    # np.asarray below.
    try:
        x = np.asarray(variable).flatten()

        # Drop NaNs before testing.
        x = x[~np.isnan(x)]

        # The dip statistic is meaningless on 3 or fewer points.
        if len(x) <= 3:
            if verbose:
                logger.info("Warning: Too few data points for reliable modality testing.", log_type='univariate_analysis', console=verbose)
            return {
                'dip_statistic': None,
                'p_value': None,
                'conclusion': "Not enough data points for reliable testing (minimum 4 required)",
                'is_multimodal': None,
                'sample_size': len(x)
            }

        dip, p_value = self.hartigan_dip_test(x)

        # Reject the null hypothesis of unimodality at the 5% level.
        alpha = 0.05
        if p_value < alpha:
            conclusion = "Likely multimodal distribution (reject unimodality)"
            is_multimodal = True
        else:
            conclusion = "Likely unimodal distribution (fail to reject unimodality)"
            is_multimodal = False

        results = {
            'dip_statistic': dip,
            'p_value': p_value,
            'conclusion': conclusion,
            'is_multimodal': is_multimodal,
            'sample_size': len(x)
        }

        # Attach the human-readable interpretation of the raw numbers.
        results['summary'] = self.summarize_modality_test(results)

        return results

    except Exception as e:
        logger.error(f"Failed to run modality test with error: {e}", log_type="univariate_analysis", console=verbose)
        return {"error": str(e)}
|
624 |
+
|
625 |
+
def hartigan_dip_test(self, x: np.ndarray) -> Tuple[float, float]:
    """Run Hartigan's dip test for unimodality.

    Returns:
        (dip statistic, approximate p-value) for the sample.
    """
    sorted_vals = np.sort(x)
    size = len(sorted_vals)

    # Empirical CDF evaluated at each sorted observation.
    empirical_cdf = np.arange(1, size + 1) / size

    dip_stat = self.calculate_dip_statistic(sorted_vals, empirical_cdf)
    return dip_stat, self.calculate_dip_pvalue(dip_stat, size)
|
638 |
+
|
639 |
+
def calculate_dip_statistic(self, x: np.ndarray, ecdf: np.ndarray) -> float:
    """Calculate Hartigan's dip statistic.

    The dip is the largest pointwise distance between the empirical CDF and
    the farther of its greatest convex minorant (GCM) and least concave
    majorant (LCM).

    Args:
        x: Sorted sample values.
        ecdf: Empirical CDF values at each point of x.
    """
    # (Removed dead local `n = len(x)` — it was never used.)
    gcm = self.compute_gcm(x, ecdf)

    lcm = self.compute_lcm(x, ecdf)

    diffs = np.maximum(
        np.abs(ecdf - gcm),
        np.abs(ecdf - lcm)
    )

    dip = np.max(diffs)

    return dip
|
655 |
+
|
656 |
+
def compute_gcm(self, x: np.ndarray, ecdf: np.ndarray) -> np.ndarray:
    """Compute the greatest convex minorant (GCM) of the empirical CDF"""
    # NOTE(review): assumes x is sorted with distinct values — duplicates make
    # (x[i] - x[:i]) contain zeros and divide by zero; confirm callers
    # de-duplicate or accept the resulting inf slopes.
    n = len(x)
    gcm = np.zeros(n)

    gcm[0] = ecdf[0]

    for i in range(1, n):
        # Slope of the chord from every earlier GCM point up to the current ECDF value.
        slopes = (ecdf[i] - gcm[:i]) / (x[i] - x[:i])
        max_slope_idx = np.argmax(slopes)
        gcm[i] = gcm[max_slope_idx] + slopes[max_slope_idx] * (x[i] - x[max_slope_idx])

        # A minorant can never rise above the ECDF itself.
        gcm[i] = min(gcm[i], ecdf[i])

    return gcm
|
671 |
+
|
672 |
+
def compute_lcm(self, x: np.ndarray, ecdf: np.ndarray) -> np.ndarray:
    """Compute the least concave majorant (LCM) of the empirical CDF"""
    # NOTE(review): mirror image of compute_gcm, built right-to-left; same
    # assumption of sorted, distinct x values (ties would divide by zero).
    n = len(x)
    lcm = np.zeros(n)

    lcm[-1] = ecdf[-1]

    for i in range(n-2, -1, -1):
        # Slope of the chord from the current ECDF value to every later LCM point.
        slopes = (lcm[i+1:] - ecdf[i]) / (x[i+1:] - x[i])
        min_slope_idx = np.argmin(slopes) + i + 1
        lcm[i] = ecdf[i] + slopes[min_slope_idx - i - 1] * (x[min_slope_idx] - x[i])

        # A majorant can never fall below the ECDF itself.
        lcm[i] = max(lcm[i], ecdf[i])

    return lcm
|
687 |
+
|
688 |
+
def calculate_dip_pvalue(self, dip: float, n: int) -> float:
    """Approximate the p-value for a dip statistic.

    Scales the dip by a sample-size factor, then uses an exponential
    approximation for small adjusted values and a coarse tabulated tail
    for larger ones.
    """
    root_n = np.sqrt(n)
    adjusted = dip * (root_n + 0.12 + 0.11 / root_n)

    if adjusted < 0.01:
        return 1.0
    if adjusted < 0.7:
        return np.exp(-4.0 * adjusted * adjusted)

    # Tabulated tail: first bound the adjusted dip falls under wins.
    for bound, p in ((0.9, 0.15), (1.0, 0.10), (1.1, 0.05),
                     (1.2, 0.025), (1.3, 0.01), (1.5, 0.005)):
        if adjusted < bound:
            return p
    return 0.001
|
712 |
+
|
713 |
+
def summarize_modality_test(self, test_results: Dict) -> Dict:
    """Turn raw dip-test results into a human-readable interpretation.

    Args:
        test_results: Output of test_modality (keys: dip_statistic, p_value,
            sample_size, is_multimodal).

    Returns:
        Dict with a textual summary, evidence strength, reliability,
        recommendation and the raw technical details.
    """
    # No statistic means the sample was too small to test at all.
    if test_results.get('dip_statistic') is None:
        return {
            'summary': "Insufficient data for modality testing.",
            'recommendation': "Collect more data points (minimum 4 required).",
            'reliability': "Not applicable"
        }

    dip = test_results['dip_statistic']
    p_value = test_results['p_value']
    n = test_results['sample_size']
    is_multimodal = test_results['is_multimodal']

    confidence_level = 1 - p_value

    # Map the p-value onto qualitative evidence-strength / reliability labels.
    for cutoff, strength, reliability in (
        (0.001, "Very strong", "High"),
        (0.01, "Strong", "High"),
        (0.05, "Moderate", "Moderate"),
        (0.1, "Weak", "Low"),
    ):
        if p_value < cutoff:
            break
    else:
        strength, reliability = "Very weak", "Low"

    # Qualify reliability by sample size.
    if n < 20:
        reliability = "Low (small sample size)"
    elif n < 50:
        reliability = reliability + " (moderate sample size)"
    else:
        reliability = reliability + " (large sample size)"

    if is_multimodal:
        conclusion = f"{strength} evidence of multimodality (dip={dip:.6f}, p={p_value:.6f})"
        recommendation = "Consider mixture modeling or clustering approaches."
    else:
        conclusion = f"{strength} evidence for unimodality (dip={dip:.6f}, p={p_value:.6f})"
        recommendation = "Parametric distributional analyses may be appropriate."

    deviation = "significant" if is_multimodal else "non-significant"
    if n >= 50:
        adequacy = "Adequate"
    elif n >= 20:
        adequacy = "Limited"
    else:
        adequacy = "Inadequate"

    return {
        'summary': conclusion,
        'dip_statistic_interpretation': f"Dip statistic of {dip:.6f} indicates " +
                                        deviation +
                                        " deviation from unimodality.",
        'confidence_level': f"{confidence_level:.1%}",
        'evidence_strength': strength,
        'reliability': reliability,
        'recommendation': recommendation,
        'sample_size_adequacy': adequacy,
        'technical_details': {
            'dip_statistic': dip,
            'p_value': p_value,
            'sample_size': n,
            'critical_alpha': 0.05,
            'null_hypothesis': "The distribution is unimodal",
            'alternative_hypothesis': "The distribution is multimodal"
        }
    }
|
779 |
+
|
780 |
+
def calculate_shape_metrics(self, continuous_cols: List[str] = None, verbose=False):
    """Calculate shape characteristics for all continuous variables.

    For each column: skewness, kurtosis, L-moments, entropy, tail weight and
    a near-zero-variance check, plus a per-column and overall text summary.

    Args:
        continuous_cols: Columns to analyse; when None, numeric columns of
            self.data are auto-detected.
        verbose: Echo log messages to the console.

    Returns:
        Dict keyed by metric name (each mapping column -> result), plus
        'summary' and 'overall_summary' entries.
    """
    logger.info("Starting to calculate shape metrics..", log_type='univariate_analysis', console=verbose)
    results = {
        "skewness": {},
        "kurtosis": {},
        "l_moments": {},
        "entropy": {},
        "tail_weight": {},
        "zero_variance": {},
        "summary": {}
    }

    if continuous_cols is None:
        try:
            # BUG FIX: previously referenced an undefined name `df`; the
            # dataframe under analysis is self.data.
            continuous_cols = self.data.select_dtypes(include=['number']).columns.tolist()
        except Exception as e:
            logger.error(f"Error detecting continuous columns: {str(e)}", log_type="data_quality_assessment", console=verbose)
            continuous_cols = []

    if not continuous_cols:
        return {"error": "No continuous columns found or provided"}

    for col in continuous_cols:
        try:
            values = self.data[col].dropna().values

            if len(values) == 0:
                results["summary"][col] = "All values are NA"
                continue

            col_results = {}

            # Each metric is computed in its own try block so a single
            # failure does not abort the rest of the column's analysis.
            try:
                skewness = stats.skew(values)
                skewness_interpretation = self.interpret_skewness(skewness)
                results["skewness"][col] = {
                    "value": float(skewness),
                    "interpretation": skewness_interpretation
                }
                col_results["skewness"] = skewness_interpretation
            except Exception as e:
                results["skewness"][col] = {"error": str(e)}
                col_results["skewness"] = "Error calculating"

            try:
                kurtosis = stats.kurtosis(values)
                kurtosis_interpretation = self.interpret_kurtosis(kurtosis)
                results["kurtosis"][col] = {
                    "value": float(kurtosis),
                    "interpretation": kurtosis_interpretation
                }
                col_results["kurtosis"] = kurtosis_interpretation
            except Exception as e:
                results["kurtosis"][col] = {"error": str(e)}
                col_results["kurtosis"] = "Error calculating"

            try:
                l_moments = self.calculate_l_moments(values)
                results["l_moments"][col] = l_moments
                col_results["l_moments"] = f"L-CV: {l_moments['l_cv']:.3f}, L-skewness: {l_moments['l_skewness']:.3f}, L-kurtosis: {l_moments['l_kurtosis']:.3f}"
            except Exception as e:
                results["l_moments"][col] = {"error": str(e)}
                col_results["l_moments"] = "Error calculating"

            try:
                entropy_values = self.calculate_entropy(values)
                results["entropy"][col] = entropy_values
                col_results["entropy"] = f"Shannon: {entropy_values['shannon']:.3f}, Differential: {entropy_values['differential']:.3f}"
            except Exception as e:
                results["entropy"][col] = {"error": str(e)}
                col_results["entropy"] = "Error calculating"

            try:
                tail_weights = self.assess_tail_weight(values)
                results["tail_weight"][col] = tail_weights
                col_results["tail_weight"] = f"Left: {tail_weights['left_tail']:.3f}, Right: {tail_weights['right_tail']:.3f}"
            except Exception as e:
                results["tail_weight"][col] = {"error": str(e)}
                col_results["tail_weight"] = "Error calculating"

            try:
                zero_var_results = self.test_zero_variance(values)
                results["zero_variance"][col] = zero_var_results
                if zero_var_results["is_near_zero_variance"]:
                    col_results["zero_variance"] = "Near-zero variance detected"
                else:
                    col_results["zero_variance"] = "Variable has sufficient variance"
            except Exception as e:
                results["zero_variance"][col] = {"error": str(e)}
                col_results["zero_variance"] = "Error calculating"

            results["summary"][col] = self.generate_column_summary(col, col_results)

        except Exception as e:
            results["summary"][col] = f"Error processing column {col}: {str(e)}"
            if verbose:
                print(f"Error processing column {col}: {str(e)}")

    results["overall_summary"] = self.generate_overall_summary(results)

    return results
|
882 |
+
|
883 |
+
def interpret_skewness(self, skewness: float) -> str:
    """Map a skewness value to a qualitative description."""
    # |skew| below 0.5 is treated as symmetric; beyond 1 as "highly" skewed.
    if abs(skewness) < 0.5:
        return "Approximately symmetric"
    if skewness > 1:
        return "Highly positively skewed"
    if skewness >= 0.5:
        return "Moderately positively skewed"
    if skewness < -1:
        return "Highly negatively skewed"
    return "Moderately negatively skewed"
|
895 |
+
|
896 |
+
def interpret_kurtosis(self, kurtosis: float) -> str:
    """Map an excess-kurtosis value to a qualitative description."""
    if abs(kurtosis) < 0.5:
        return "Approximately mesokurtic (normal-like tails)"
    return ("Platykurtic (thinner tails than normal)"
            if kurtosis < -0.5
            else "Leptokurtic (heavier tails than normal)")
|
904 |
+
|
905 |
+
def calculate_l_moments(self, values: np.ndarray) -> Dict:
    """Calculate L-moments for a sample.

    Tries the `lm` (lmoments3) library first; if it is unavailable or
    fails, falls back to a direct estimate of the first two L-moments only
    (higher-order ratios are reported as NaN in that case).

    Returns:
        Dict with keys l1, l2, l_cv, l_skewness, l_kurtosis, tau5.

    Raises:
        Exception: re-raised when even the fallback fails, so callers see
            the real cause instead of a silent implicit None.
    """
    try:
        lmoms = lm.lmom_ratios(values, nmom=5)
        return {
            "l1": float(lmoms[0]),  # L-mean
            "l2": float(lmoms[1]),  # L-scale
            "l_cv": float(lmoms[1] / lmoms[0]) if lmoms[0] != 0 else float('nan'),  # L-coefficient of variation
            "l_skewness": float(lmoms[2]),  # L-skewness
            "l_kurtosis": float(lmoms[3]),  # L-kurtosis
            "tau5": float(lmoms[4]) if len(lmoms) > 4 else float('nan')  # 5th L-moment ratio
        }
    except Exception:
        try:
            sorted_data = np.sort(values)
            n = len(sorted_data)

            # First L-moment is just the sample mean.
            l1 = np.mean(sorted_data)

            # Rough order-statistic estimate of the L-scale.
            cumsum = np.cumsum(sorted_data)
            indices = np.arange(n)
            l2 = np.mean((indices / (n - 1)) * sorted_data - cumsum / (n - 1))

            return {
                "l1": float(l1),
                "l2": float(l2),
                "l_cv": float(l2 / l1) if l1 != 0 else float('nan'),
                "l_skewness": float('nan'),
                "l_kurtosis": float('nan'),
                "tau5": float('nan')
            }
        except Exception as e:
            logger.error(f"Error calculating L-moments: {str(e)}", log_type="data_quality_assessment", console=True)
            # BUG FIX: previously fell through and implicitly returned None,
            # giving callers a misleading TypeError later; re-raise instead.
            raise
|
938 |
+
|
939 |
+
def calculate_entropy(self, values: np.ndarray) -> Dict:
    """Calculate entropy measures for a sample.

    Returns:
        Dict with 'shannon' (histogram-based, bin-width corrected),
        'differential' (closed-form for a normal with the sample's std)
        and 'normalized_shannon'.

    Raises:
        Exception: re-raised after logging when computation fails, so
            callers see the real cause instead of a silent implicit None.
    """
    try:
        # Histogram-based Shannon entropy; empty bins are dropped before log.
        hist, bin_edges = np.histogram(values, bins='auto', density=True)
        hist = hist[hist > 0]
        shannon_entropy = -np.sum(hist * np.log(hist)) * (bin_edges[1] - bin_edges[0])

        # (Removed dead local `n = len(values)` — it was never used.)
        std = np.std(values)
        # Differential entropy of a normal with the same std; -inf for constant data.
        diff_entropy = 0.5 * np.log(2 * np.pi * np.e * std**2) if std > 0 else float('-inf')

        return {
            "shannon": float(shannon_entropy),
            "differential": float(diff_entropy),
            "normalized_shannon": float(shannon_entropy / np.log(len(hist))) if len(hist) > 1 else 0.0
        }
    except Exception as e:
        logger.error(f"Error calculating entropy: {str(e)}", log_type="data_quality_assessment", console=True)
        # BUG FIX: previously implicitly returned None after logging.
        raise
|
957 |
+
|
958 |
+
def assess_tail_weight(self, values: np.ndarray) -> Dict:
    """Assess tail weight of a distribution.

    Compares the 1st/99th percentile spans against the IQR and computes
    Hill-style tail indices on the outer 20% of each tail.

    Returns:
        Dict with 'left_tail', 'right_tail', 'left_tail_index',
        'right_tail_index' and a textual 'interpretation'.

    Raises:
        Exception: re-raised after logging when computation fails, so
            callers see the real cause instead of a silent implicit None.
    """
    try:
        q = np.quantile(values, [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

        iqr = q[4] - q[2]  # 75th - 25th

        # Tail spans relative to the IQR; inf when the IQR collapses.
        left_tail = (q[2] - q[0]) / iqr if iqr > 0 else float('inf')
        right_tail = (q[6] - q[4]) / iqr if iqr > 0 else float('inf')

        # Shift to strictly positive values before taking logs.
        left_sorted = np.sort(values - np.min(values) + 1e-10)
        left_sorted = left_sorted[:int(0.2 * len(left_sorted))]
        left_tail_index = 1 / np.mean(np.log(left_sorted[-1] / left_sorted)) if len(left_sorted) > 1 else float('nan')

        right_sorted = np.sort(-values + np.max(values) + 1e-10)
        right_sorted = right_sorted[:int(0.2 * len(right_sorted))]
        right_tail_index = 1 / np.mean(np.log(right_sorted[-1] / right_sorted)) if len(right_sorted) > 1 else float('nan')

        # A difference above 0.5 IQR units counts as asymmetric tails.
        interpretation = "Symmetric tails"
        if abs(left_tail - right_tail) > 0.5:
            if left_tail > right_tail:
                interpretation = "Heavier left tail"
            else:
                interpretation = "Heavier right tail"

        return {
            "left_tail": float(left_tail),
            "right_tail": float(right_tail),
            "left_tail_index": float(left_tail_index),
            "right_tail_index": float(right_tail_index),
            "interpretation": interpretation
        }
    except Exception as e:
        logger.error(f"Error assessing tail weight: {str(e)}", log_type="data_quality_assessment", console=True)
        # BUG FIX: previously implicitly returned None after logging.
        raise
|
992 |
+
|
993 |
+
def test_zero_variance(self, values: np.ndarray) -> Dict:
|
994 |
+
"""Test for zero or near-zero variance."""
|
995 |
+
try:
|
996 |
+
n = len(values)
|
997 |
+
variance = np.var(values)
|
998 |
+
|
999 |
+
n_unique = len(np.unique(values))
|
1000 |
+
unique_ratio = n_unique / n if n > 0 else 0
|
1001 |
+
|
1002 |
+
if n_unique > 1:
|
1003 |
+
counts = np.bincount(np.digitize(values, np.unique(values)))
|
1004 |
+
sorted_counts = np.sort(counts[counts > 0])
|
1005 |
+
freq_ratio = sorted_counts[-1] / sorted_counts[-2] if sorted_counts[-2] > 0 else float('inf')
|
1006 |
+
else:
|
1007 |
+
freq_ratio = float('inf')
|
1008 |
+
|
1009 |
+
is_zero_variance = variance < 1e-10
|
1010 |
+
is_near_zero_variance = (is_zero_variance or
|
1011 |
+
unique_ratio < 0.1 or
|
1012 |
+
freq_ratio > 20)
|
1013 |
+
|
1014 |
+
mean = np.mean(values)
|
1015 |
+
cv = np.sqrt(variance) / mean if mean != 0 else float('inf')
|
1016 |
+
|
1017 |
+
return {
|
1018 |
+
"variance": float(variance),
|
1019 |
+
"unique_ratio": float(unique_ratio),
|
1020 |
+
"n_unique": int(n_unique),
|
1021 |
+
"freq_ratio": float(freq_ratio),
|
1022 |
+
"cv": float(cv),
|
1023 |
+
"is_zero_variance": bool(is_zero_variance),
|
1024 |
+
"is_near_zero_variance": bool(is_near_zero_variance)
|
1025 |
+
}
|
1026 |
+
except Exception as e:
|
1027 |
+
logger.error(f"Error testing zero variance: {str(e)}", log_type="data_quality_assessment", console=True)
|
1028 |
+
|
1029 |
+
def generate_column_summary(self, col_name: str, col_results: Dict) -> str:
    """Generate a one-line, human-readable summary for a single column.

    Args:
        col_name: Name of the column being summarised.
        col_results: Per-column shape results; recognised string-valued keys
            are "skewness", "kurtosis", "tail_weight" and "zero_variance".

    Returns:
        "Column '<name>': <comma-joined findings>" or a fallback message
        when no shape metrics are available.
    """
    summary = f"Column '{col_name}': "

    points = []

    if "skewness" in col_results:
        points.append(f"distribution is {col_results['skewness'].lower()}")

    if "kurtosis" in col_results:
        points.append(f"{col_results['kurtosis'].lower()}")

    # BUGFIX: the original condition tested `"tail_weight" in col_results`
    # twice (copy-paste slip); a single membership check is intended.
    if "tail_weight" in col_results:
        # Only asymmetric ("Heavier ...") tails are worth surfacing.
        if "Heavier" in col_results["tail_weight"]:
            points.append(col_results["tail_weight"])

    if "zero_variance" in col_results and "Near-zero variance" in col_results["zero_variance"]:
        points.append("WARNING: near-zero variance detected, may not be useful for modeling")

    if points:
        summary += ", ".join(points)
    else:
        summary += "insufficient data to calculate shape metrics"

    return summary
1055 |
+
def generate_overall_summary(self, results: Dict) -> str:
    """Generate an overall summary of shape metrics for all columns.

    Scans the per-column summary strings in results["summary"], groups the
    columns into skewed / heavy-tailed / near-zero-variance buckets, and
    renders counts plus recommendations as a newline-joined report.
    """
    col_summaries = results["summary"]

    # Classify each column from keywords in its textual summary.
    skewed_cols = [c for c, s in col_summaries.items()
                   if "highly" in s.lower() and "skew" in s.lower()]
    heavy_tailed_cols = [c for c, s in col_summaries.items()
                         if "leptokurtic" in s.lower() or "heavier tail" in s.lower()]
    near_zero_variance_cols = [c for c, s in col_summaries.items()
                               if "zero variance" in s.lower()]

    def _listing(cols):
        # Show at most five column names, then an "and N more" suffix.
        suffix = f" and {len(cols) - 5} more" if len(cols) > 5 else ""
        return f"{', '.join(cols[:5])}{suffix}"

    lines = [f"Analyzed {len(col_summaries)} continuous variables."]

    if skewed_cols:
        lines.append(f"Found {len(skewed_cols)} highly skewed variables: {_listing(skewed_cols)}")

    if heavy_tailed_cols:
        lines.append(f"Found {len(heavy_tailed_cols)} variables with heavy tails: {_listing(heavy_tailed_cols)}")

    if near_zero_variance_cols:
        lines.append(f"WARNING: Found {len(near_zero_variance_cols)} variables with near-zero variance which may not be useful for modeling: {_listing(near_zero_variance_cols)}")

    if skewed_cols or heavy_tailed_cols:
        lines.append("Recommendation: Consider applying transformations (log, Box-Cox, etc.) to heavily skewed or heavy-tailed variables.")

    if near_zero_variance_cols:
        lines.append("Recommendation: Consider removing or carefully reviewing near-zero variance variables.")

    return "\n".join(lines)
1095 |
+
def detect_outliers(self, verbose=False):
    """Run multiple outlier detection methods.

    NOTE(review): placeholder — no detection logic is implemented yet; the
    method currently does nothing and implicitly returns None.
    """
    pass
1099 |
+
def analyze_categoricals(
        self,
        data: pd.DataFrame,
        categorical_columns: Optional[List[str]] = None,
        alpha: float = 0.05,
        rare_threshold: float = 0.05,
        verbose: bool = False) -> Dict[str, Any]:
    '''Perform comprehensive analysis on categorical variables in a dataset.

    For every categorical column this computes: a frequency table,
    per-category prevalence rates, rare categories (below rare_threshold),
    Shannon entropy, Gini impurity, a chi-square uniformity test (at level
    alpha), Simpson and Shannon diversity indices, and cardinality stats.
    Every sub-computation is wrapped in its own try/except so one failure
    does not abort the column; error messages accumulate in results["errors"].

    Args:
        data: Source DataFrame.
        categorical_columns: Columns to analyse; when None, object/category
            dtypes are auto-detected.
        alpha: Significance level for the chi-square uniformity test.
        rare_threshold: Prevalence below which a category counts as rare.
        verbose: Echo log messages to the console.

    Returns:
        Dict of per-metric sub-dicts keyed by column, plus "errors" and a
        textual "summary".
    '''
    results = {
        "frequency_tables": {},
        "prevalence_rates": {},
        "rare_categories": {},
        "entropy": {},
        "gini_impurity": {},
        "chi_square_results": {},
        "simpson_diversity": {},
        "shannon_diversity": {},
        "cardinality": {},
        "errors": [],
        "summary": ""
    }

    try:
        if categorical_columns is None:
            categorical_columns = list(data.select_dtypes(include=['object', 'category']).columns)

        # Treat low-cardinality numeric columns (<=30 distinct values AND
        # <5% unique ratio) as pseudo-categoricals.
        # NOTE(review): this loop also runs when an explicit column list was
        # passed, appending numeric pseudo-categoricals to (and mutating) the
        # caller's list — analyze_all_categoricals already supplies these, so
        # entries can be duplicated; confirm intended nesting.
        for col in data.select_dtypes(include=['int64', 'float64']).columns:
            if data[col].nunique() <= 30 and data[col].nunique() / len(data) < 0.05:
                categorical_columns.append(col)
    except Exception as e:
        error_msg = f"Error identifying categorical columns: {str(e)}"
        # NOTE(review): log_type 'univariate_analaysis' is misspelled
        # throughout this method (kept as-is: it is a runtime string).
        logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
        results["errors"].append(error_msg)
        categorical_columns = []

    if not categorical_columns:
        results["summary"] = "No categorical columns identified for analysis."
        return results

    for col in categorical_columns:
        try:
            if col not in data.columns:
                continue

            # Skip columns that are mostly missing — stats would be meaningless.
            missing_rate = data[col].isna().mean()
            if missing_rate > 0.8:
                results["errors"].append(f"Column {col} has {missing_rate:.2%} missing values - skipping analysis")
                continue

            column_data = data[col].dropna()

            if len(column_data) == 0:
                results["errors"].append(f"Column {col} has no valid values after removing NA - skipping analysis")
                continue

            # Frequency table (counts + percentages).
            # NOTE(review): the table is computed but storing it into
            # results["frequency_tables"] is commented out below.
            try:
                freq_table = column_data.value_counts(dropna=False).reset_index()
                freq_table.columns = ['value', 'frequency']
                freq_table['percentage'] = (freq_table['frequency'] / len(column_data)) * 100
                # results["frequency_tables"][col] = freq_table.to_dict('records')
            except Exception as e:
                error_msg = f"Error calculating frequency table for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

            # Normalised prevalence per category.
            # NOTE(review): `value_counts` defined here is reused by every
            # block below — if THIS try fails before the assignment, the
            # later blocks raise NameError (caught and logged per block).
            try:
                prevalence_rates = {}
                value_counts = column_data.value_counts(normalize=True)
                for value, rate in value_counts.items():
                    prevalence_rates[str(value)] = float(rate)
                # results["prevalence_rates"][col] = prevalence_rates
            except Exception as e:
                error_msg = f"Error calculating prevalence rates for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

            # Categories rarer than rare_threshold.
            try:
                rare_categories = {}
                for value, rate in value_counts.items():
                    if rate < rare_threshold:
                        rare_categories[str(value)] = float(rate)
                results["rare_categories"][col] = rare_categories
            except Exception as e:
                error_msg = f"Error detecting rare categories for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

            # Shannon entropy (base 2) of the category distribution.
            try:
                entropy_value = 0
                for prob in value_counts:
                    if prob > 0:
                        entropy_value -= prob * math.log2(prob)
                results["entropy"][col] = float(entropy_value)
            except Exception as e:
                error_msg = f"Error calculating entropy for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

            # Gini impurity: 1 - sum(p_i^2).
            try:
                gini_impurity = 1 - sum(prob ** 2 for prob in value_counts)
                results["gini_impurity"][col] = float(gini_impurity)
            except Exception as e:
                error_msg = f"Error calculating Gini impurity for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

            # Chi-square goodness-of-fit against a uniform distribution.
            try:

                observed = column_data.value_counts().values
                n_categories = len(observed)
                expected = np.ones(n_categories) * len(column_data) / n_categories

                if n_categories >= 2:
                    chi2_stat, p_value = stats.chisquare(observed, expected)
                    results["chi_square_results"][col] = {
                        "chi2_statistic": float(chi2_stat),
                        "p_value": float(p_value),
                        "reject_null_hypothesis": p_value < alpha,
                        "interpretation": "Distribution is not uniform" if p_value < alpha else "Distribution may be uniform"
                    }
                else:
                    results["chi_square_results"][col] = {
                        "chi2_statistic": None,
                        "p_value": None,
                        "reject_null_hypothesis": None,
                        "interpretation": "Not enough categories for chi-square test"
                    }
            except Exception as e:
                error_msg = f"Error performing chi-square test for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

            # Simpson diversity: 1 - sum(p_i^2) (numerically equal to Gini impurity).
            try:
                simpson_index = sum(prob ** 2 for prob in value_counts)
                simpson_diversity = 1 - simpson_index
                results["simpson_diversity"][col] = float(simpson_diversity)
            except Exception as e:
                error_msg = f"Error calculating Simpson's diversity index for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

            # Shannon diversity index (natural log, unlike the base-2 entropy above).
            try:
                shannon_diversity = -sum(prob * np.log(prob) for prob in value_counts if prob > 0)
                results["shannon_diversity"][col] = float(shannon_diversity)
            except Exception as e:
                error_msg = f"Error calculating Shannon's diversity index for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

            # Cardinality: number of distinct categories and their ratio to row count.
            try:
                cardinality = len(value_counts)
                results["cardinality"][col] = {
                    "unique_values": int(cardinality),
                    "total_records": int(len(column_data)),
                    "ratio": float(cardinality / len(column_data))
                }
            except Exception as e:
                error_msg = f"Error performing cardinality analysis for {col}: {str(e)}"
                logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
                results["errors"].append(error_msg)

        except Exception as e:
            error_msg = f"Error analyzing column {col}: {str(e)}"
            logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
            results["errors"].append(error_msg)

    # Build the human-readable summary from the accumulated metrics.
    try:
        summary_lines = []
        summary_lines.append(f"Categorical Analysis Summary for {len(categorical_columns)} columns:")

        if results["cardinality"]:
            # NOTE(review): the comprehension variable `data` shadows the
            # DataFrame parameter inside this expression's scope only.
            high_cardinality = [col for col, data in results["cardinality"].items()
                                if data["ratio"] > 0.5 and data["unique_values"] > 10]
            if high_cardinality:
                summary_lines.append(f"- High cardinality columns ({len(high_cardinality)}): {', '.join(high_cardinality)}")

        if results["rare_categories"]:
            cols_with_rare = [col for col, rare_cats in results["rare_categories"].items() if rare_cats]
            if cols_with_rare:
                summary_lines.append(f"- Columns with rare categories ({len(cols_with_rare)}): {', '.join(cols_with_rare)}")

        if results["shannon_diversity"]:
            high_diversity = [col for col, value in results["shannon_diversity"].items() if value > 2.0]
            low_diversity = [col for col, value in results["shannon_diversity"].items() if value < 0.5]
            if high_diversity:
                summary_lines.append(f"- High diversity columns ({len(high_diversity)}): {', '.join(high_diversity)}")
            if low_diversity:
                summary_lines.append(f"- Low diversity columns ({len(low_diversity)}): {', '.join(low_diversity)}")

        if results["chi_square_results"]:
            non_uniform = [col for col, result in results["chi_square_results"].items()
                           if result.get("reject_null_hypothesis") is True]
            if non_uniform:
                summary_lines.append(f"- Columns with non-uniform distributions ({len(non_uniform)}): {', '.join(non_uniform)}")

        if results["errors"]:
            summary_lines.append(f"- Analysis encountered {len(results['errors'])} errors during processing")

        results["summary"] = "\n".join(summary_lines)
    except Exception as e:
        error_msg = f"Error generating summary: {str(e)}"
        logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
        results["errors"].append(error_msg)
        results["summary"] = "Error generating summary."

    return results
1306 |
+
def analyze_all_categoricals(self, verbose=False):
    """Analyze every categorical variable in the loaded dataset.

    Builds the candidate column list (object/category dtypes plus
    low-cardinality numeric columns) and delegates to analyze_categoricals.
    Returns an error dict when no data is loaded or the analysis fails.
    """
    logger.info("Starting to analyze all categories..", log_type='univariate_analysis', console=verbose)
    try:
        if self.data is None:
            return {"error": "No data loaded", "summary": "Error: No data loaded"}

        df = self.data

        # Explicit categoricals first, then numeric pseudo-categoricals
        # (<=30 distinct values and <5% unique ratio).
        candidate_cols = list(df.select_dtypes(include=['object', 'category']).columns)
        candidate_cols.extend(
            col for col in df.select_dtypes(include=['int64', 'float64']).columns
            if df[col].nunique() <= 30 and df[col].nunique() / len(df) < 0.05
        )

        return self.analyze_categoricals(
            data=df,
            categorical_columns=candidate_cols,
            verbose=verbose
        )

    except Exception as e:
        error_msg = f"Categorical analysis failed with error: {str(e)}"
        logger.error(error_msg, log_type="univariate_analysis", console=True)
        return {"error": error_msg, "summary": f"Error in categorical analysis: {str(e)}"}
1332 |
+
def generate_report_from_agent(self, input) -> str:
    '''Transform the json output to a user-readable report.

    Args:
        input: JSON-ish analysis payload (string) to hand to the writer agent.

    Returns:
        The writer agent's report text, or a "Failed to generate report..."
        message if anything goes wrong (best-effort: never raises).
    '''
    try:
        # Prefix the ML task so the writer agent has task context.
        # BUGFIX: kept in a new local instead of rebinding the `input`
        # parameter, which shadowed the builtin and hid the original argument.
        prompt = f"ML Task: {self.ml_task}\n{input}"
        response: RunResponse = self.writer.run(prompt, stream=False)
        return response.content
    except Exception as e:
        return f"Failed to generate report with error: {e}"
1341 |
+
def convert_numpy_types(self, obj):
    """Recursively convert numpy values in *obj* to plain Python types.

    Dicts, lists and ndarrays are walked recursively; numpy integers,
    floats and booleans become native int/float/bool. NaN and +/-inf
    (whether numpy or native float) are mapped to None so the result is
    JSON-serialisable. Any other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: self.convert_numpy_types(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return [self.convert_numpy_types(item) for item in obj]
    if isinstance(obj, np.ndarray):
        # Lists from tolist() may still hold nested floats -> recurse.
        return self.convert_numpy_types(obj.tolist())
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, (np.floating, float)):
        as_float = float(obj)
        # Non-finite floats are not valid JSON numbers.
        return None if math.isnan(as_float) or math.isinf(as_float) else as_float
    return obj
1365 |
+
def run(self, verbose=False) -> Dict[str, dict]:
    '''Trigger point of the entire univariate-analysis pipeline.

    Runs the distribution, shape-metric and categorical analyses, converts
    the numeric results to JSON-safe Python types, and asks the writer agent
    for a human-readable report per section (best-effort: report failures
    are logged and leave an empty report string).

    Returns:
        Mapping of section name -> {"dict": <results>, "report": <text>}
        (the categorical section carries only the report, as before).
    '''
    distribution_results = self.analyze_distributions(verbose=verbose)
    continuous_vars, _ = self.detect_variable_types()
    shape_metrics_results = self.calculate_shape_metrics(continuous_cols=continuous_vars, verbose=verbose)
    categorical_results = self.analyze_all_categoricals(verbose=verbose)

    kde_results = {}
    normality_results = {}
    kde_summary = {}
    normality_summary = {}

    for column in distribution_results:
        # BUGFIX: was a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        try:
            column_result = distribution_results[column]
            kde_results[column] = column_result['kde']
            normality_results[column] = column_result['normality_tests']
            kde_summary[column] = column_result['kde']['summary']
            normality_summary[column] = column_result['normality_tests']['summary']
        except Exception:
            # Columns whose distribution analysis is missing pieces are skipped.
            continue

    logger.info("Generating final reports....", log_type='univariate_analysis', console=verbose)

    final_result = {
        "kde_analysis": {
            'dict': self.convert_numpy_types(kde_results),
            'report': ""
        },
        "normality_analysis": {
            'dict': self.convert_numpy_types(normality_results),
            'report': ""
        },
        "shape_metrics_analysis": {
            "dict": self.convert_numpy_types(shape_metrics_results),
            "report": ""
        },
        "categorical_analysis": {
            "report": ""
        }
    }

    # One (section key, log label, payload) triple per report; payloads
    # mirror the original behaviour: summaries for kde/normality, full
    # result dicts for shape metrics and categoricals.
    report_jobs = [
        ("kde_analysis", "kde", kde_summary),
        ("normality_analysis", "normality", normality_summary),
        ("shape_metrics_analysis", "shape metrics", shape_metrics_results),
        ("categorical_analysis", "categorical analysis", categorical_results),
    ]
    for section, label, payload in report_jobs:
        # BUGFIX: four copy-pasted bare-except blocks collapsed into one
        # loop; exceptions narrowed to Exception.
        try:
            payload_str = json.dumps(payload, indent=2, default=str, allow_nan=True)
            final_result[section]['report'] = self.generate_report_from_agent(payload_str)
        except Exception:
            logger.error(f"Failed to generate report for {label}....", log_type='univariate_analysis', console=verbose)

    return final_result
src/app/pipelines/task_analysis/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .ml_analysis_workflow import MLAnalysisWorkflow
|
2 |
+
from .ml_implementation_planner_workflow import MLImplementationPlannerWorkflow
|
src/app/pipelines/task_analysis/ml_analysis_workflow.py
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from agno.models.openai import OpenAIChat # type: ignore
|
2 |
+
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from src.core.utils import logger
|
5 |
+
from typing import Optional
|
6 |
+
from .model import RequirementsAnalysis, TechnicalResearch, ModelResponseStatus
|
7 |
+
from agno.agent import Agent, RunResponse # type: ignore
|
8 |
+
from agno.tools.duckduckgo import DuckDuckGoTools # type: ignore
|
9 |
+
from typing import Iterator, List
|
10 |
+
import json
|
11 |
+
|
12 |
+
load_dotenv()
|
13 |
+
|
14 |
+
class MLAnalysisWorkflow:
    """Turn a business problem statement into validated ML technical specs.

    Flow (see run()): analyze requirements -> if fields are missing or the
    analyst proposed search queries, research them on the web -> finalize
    the requirements with the research folded in.
    """

    def __init__(self, user_prompt: str):
        self.user_prompt = user_prompt
        self.llm = OpenAIChat(id="gpt-4o", api_key=os.getenv('OPENAI_API_KEY'))
        # Agents are created lazily by the methods that need them.
        self.requirements_analyst: Optional[Agent] = None
        self.writer: Optional[Agent] = None

    def _get_writer(self) -> Agent:
        """Return the shared markdown-formatting agent, creating it on first use.

        BUGFIX: the writer agent was previously created only inside
        write_research_post(), so calling write_requirements_post() first
        raised AttributeError on self.writer.
        """
        if self.writer is None:
            self.writer = Agent(
                model=self.llm,
                instructions=[
                    "You will be provided with lots of structured outputs. Your work is to display this"
                    "in a nicely formatted manner without changing any of the content. Present all the links"
                    "as they are, with explicitly mentioned hyperlinks. Do not change any content."
                ],
                markdown=True,
            )
        return self.writer

    def analyze_requirements(self, verbose=False) -> Optional[RequirementsAnalysis]:
        """Run the requirements-analyst agent on the user's business problem.

        Returns:
            A structured RequirementsAnalysis (or None if the agent produced
            no content).
        """
        logger.info("Analyzing requirements...", log_type="pipeline: task_analysis", console=verbose)

        prompt = f"Analyze this business problem and provide initial technical specifications: {self.user_prompt}"

        self.requirements_analyst = Agent(
            name="ML Requirements Analyst",
            model=self.llm,
            description="Expert ML Solutions Architect specialized in analyzing business requirements",
            instructions=[
                "Analyze business problems and translate them into technical ML specifications.",
                "1. Understand the core business problem and objectives",
                "2. Identify the type of ML task required",
                "3. Determine data requirements and constraints",
                "4. List unclear points that need clarification",
                "5. Specify areas that need technical research",
                "Be precise in identifying what information is missing or needs validation."
            ],
            response_model=RequirementsAnalysis,
            structured_outputs=True,
            reasoning=True,
        )

        analyse_stream = self.requirements_analyst.run(prompt, stream=False)
        return analyse_stream.content

    def write_requirements_post(self, requirements_results: RequirementsAnalysis, verbose=False) -> Iterator[RunResponse]:
        """Stream a user-readable rendering of a requirements analysis.

        :param requirements_results: requirements_analyst response
        :return: iterator for the workflow response
        """
        logger.info("Writing requirements analysis...", log_type="pipeline: task_analysis", console=verbose)

        writer_input = {"model_response": requirements_results.model_response.model_dump(),
                        "unclear_points": requirements_results.unclear_points,
                        "search_queries": requirements_results.search_queries,
                        "business_understanding": requirements_results.business_understanding
                        }

        # BUGFIX: go through _get_writer() instead of assuming self.writer
        # already exists (it was only assigned by write_research_post()).
        model_response = self._get_writer().run(json.dumps(writer_input, indent=4), stream=True)

        return model_response

    def write_research_post(self, research_results: TechnicalResearch, verbose=False) -> Iterator[RunResponse]:
        """Stream a user-readable rendering of the technical research findings.

        :param research_results: research content
        :return: iterator for the workflow response
        """
        logger.info("Writing research findings...", log_type="pipeline: task_analysis", console=verbose)

        writer_input = {"research_findings": research_results.research_findings,
                        "reference_implementations": research_results.reference_implementations,
                        "sources": research_results.sources
                        }

        model_response = self._get_writer().run(json.dumps(writer_input, indent=4), stream=True)

        return model_response

    def validate_model_response(self, response: ModelResponseStatus, verbose=False) -> List[str]:
        """Check for missing or incomplete fields in ModelResponseStatus.

        A field counts as missing when it still holds the "..." placeholder
        (as a scalar or a single-element list) or is an empty list.
        """
        logger.info("Checking for missing or incomplete fields in ModelResponseStatus...", log_type="pipeline: task_analysis", console=verbose)

        missing_fields = []
        response_dict = response.model_dump()

        for field, value in response_dict.items():
            if value == "..." or value == ["..."]:
                missing_fields.append(field)
            elif isinstance(value, list) and not value:
                missing_fields.append(field)

        return missing_fields

    def conduct_research(self, research_prompt: str, verbose=False) -> Optional[TechnicalResearch]:
        """Run the DuckDuckGo-backed research agent on the given prompt."""
        logger.info("Conducting technical research...", log_type="pipeline: task_analysis", console=verbose)

        self.technical_researcher: Agent = Agent(
            name="ML Technical Researcher",
            model=self.llm,
            description="ML Expert specialized in researching technical implementations",
            tools=[DuckDuckGoTools(search=True, news=False)],
            instructions=[
                "Research and validate technical aspects of ML solutions.",
                "1. Search for similar ML implementations and best practices",
                "2. Find recommended models and architectures",
                "3. Research typical hyperparameters and evaluation metrics",
                "4. Look for implementation constraints and requirements",
                "5. Validate technical feasibility",
                "Provide sources for all technical information.",
                "Focus on recent and reliable technical sources."
            ],
            response_model=TechnicalResearch,
            structured_outputs=True,
            reasoning=True,
            # debug_mode=True,
        )

        conduct_stream = self.technical_researcher.run(research_prompt)
        return conduct_stream.content

    def finalize_analysis(self, final_prompt: str, verbose=False) -> Optional[RequirementsAnalysis]:
        """Re-run the requirements analyst with the research folded in.

        Must be called after analyze_requirements(), which creates the agent.
        """
        logger.info("Finalizing analysis...", log_type="pipeline: task_analysis", console=verbose)

        finalise_stream = self.requirements_analyst.run(final_prompt)
        return finalise_stream.content

    def run(self, verbose=False):
        """
        Run the ML analysis workflow.

        Args:
            verbose: Echo progress log messages to the console.

        Returns:
            (final_requirements, research) when research was needed,
            (initial_requirements, None) when the first analysis was already
            complete, or None if the workflow failed.
        """
        try:
            requirements_result: Optional[RequirementsAnalysis] = self.analyze_requirements(verbose=verbose)

            '''Check what needs research'''
            missing_fields = self.validate_model_response(requirements_result.model_response, verbose=verbose)
            logger.info("Missing fields found!", log_type="pipeline: task_analysis", console=verbose)
            search_queries = requirements_result.search_queries
            logger.info("Search queries found!", log_type="pipeline: task_analysis", console=verbose)
            unclear_points = requirements_result.unclear_points
            logger.info("Unclear points found!", log_type="pipeline: task_analysis", console=verbose)

            if missing_fields or search_queries:
                '''Conduct technical research'''
                logger.info("Researching technical specifications...", log_type="pipeline: task_analysis", console=verbose)

                research_prompt = (
                    f"Research the following for this ML problem: {self.user_prompt}\n"
                    f"Missing information needed for: {', '.join(missing_fields)}\n"
                    f"Specific topics to research: {', '.join(search_queries)}\n"
                    f"Points needing clarification: {', '.join(unclear_points)}\n"
                    f"Current understanding: {requirements_result.business_understanding}"
                )
                logger.info("Conducting research...", log_type="pipeline: task_analysis", console=verbose)

                research_result: Optional[TechnicalResearch] = self.conduct_research(research_prompt, verbose=verbose)

                final_prompt = (
                    f"Original problem: {self.user_prompt}\n"
                    f"Research findings: {research_result.research_findings}\n"
                    "Please provide final technical specifications incorporating this research."
                )

                logger.info("Obtaining final requirements", log_type="pipeline: task_analysis", console=verbose)
                final_result: Optional[RequirementsAnalysis] = self.finalize_analysis(final_prompt, verbose=verbose)

                return (final_result, research_result)

            # BUGFIX: previously this path fell off the end and implicitly
            # returned None, discarding a perfectly complete analysis.
            return (requirements_result, None)

        except Exception as e:
            logger.error(f"Workflow error: {str(e)}", log_type="pipeline: task_analysis", console=verbose)
            return None
src/app/pipelines/task_analysis/ml_implementation_planner_workflow.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from agno.models.openai import OpenAIChat # type: ignore
|
2 |
+
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from src.core.utils import logger
|
5 |
+
from typing import Optional
|
6 |
+
from .model import RequirementsAnalysis, TechnicalResearch, ImplementationPlan
|
7 |
+
from agno.agent import Agent, RunResponse # type: ignore
|
8 |
+
from typing import Iterator
|
9 |
+
import json
|
10 |
+
|
11 |
+
load_dotenv()
|
12 |
+
|
13 |
+
class MLImplementationPlannerWorkflow:
|
14 |
+
def __init__(self, requirements_analysis: RequirementsAnalysis, technical_research: Optional[TechnicalResearch] = None):
    """Store the upstream analysis artefacts and configure the LLM backend.

    Args:
        requirements_analysis: Finalised requirements produced upstream
            (see RequirementsAnalysis in .model).
        technical_research: Optional research findings backing the
            requirements; may be None when no research was needed.
    """
    self.requirements_analysis = requirements_analysis
    self.technical_research = technical_research
    # All planning agents share this model; the API key comes from the
    # environment (.env loaded at module import).
    self.llm = OpenAIChat(id="gpt-4o", api_key=os.getenv('OPENAI_API_KEY'))
19 |
+
def create_implementation_plan(self, planning_prompt: str, verbose=False) -> Optional[ImplementationPlan]:
|
20 |
+
"""Stream implementation plan creation"""
|
21 |
+
logger.info("Creating implementation plan...", log_type="pipeline: task_analysis", console=verbose)
|
22 |
+
|
23 |
+
self.architect: Agent = Agent(
|
24 |
+
name="ML System Architect",
|
25 |
+
model=self.llm,
|
26 |
+
description="Expert ML System Architect specialized in detailed implementation planning",
|
27 |
+
instructions=[
|
28 |
+
"Create detailed technical implementation plans for ML systems.",
|
29 |
+
"1. Break down the system into logical components",
|
30 |
+
"2. Define detailed function specifications for each component",
|
31 |
+
"3. Specify clear interfaces between components",
|
32 |
+
"4. Consider error handling and edge cases",
|
33 |
+
"5. Plan testing and deployment strategies",
|
34 |
+
"Be extremely specific about function signatures and component interactions.",
|
35 |
+
"Focus on maintainability and scalability in the design."
|
36 |
+
],
|
37 |
+
response_model=ImplementationPlan,
|
38 |
+
structured_outputs=True,
|
39 |
+
reasoning=True,
|
40 |
+
# debug_mode=True,
|
41 |
+
)
|
42 |
+
|
43 |
+
planning_stream = self.architect.run(planning_prompt)
|
44 |
+
return planning_stream.content
|
45 |
+
|
46 |
+
def validate_interfaces(self, validation_prompt: str, verbose=False) -> Optional[ImplementationPlan]:
|
47 |
+
"""Stream interface validation"""
|
48 |
+
logger.info("Validating interfaces...", log_type="pipeline: task_analysis", console=verbose)
|
49 |
+
|
50 |
+
architect_stream = self.architect.run(validation_prompt)
|
51 |
+
return architect_stream.content
|
52 |
+
|
53 |
+
def write_implementation_post(self, implementation_results: ImplementationPlan, verbose=False) -> Iterator[RunResponse]:
|
54 |
+
"""
|
55 |
+
Write a blog post on a topic.
|
56 |
+
:param implementation_results: implementation plan results
|
57 |
+
:return: iterator for the workflow response
|
58 |
+
"""
|
59 |
+
logger.info("Writing implementation plan...", log_type="pipeline: task_analysis", console=verbose)
|
60 |
+
|
61 |
+
writer_input = {"components": [comp.model_dump() for comp in implementation_results.components],
|
62 |
+
"system_requirements": implementation_results.system_requirements,
|
63 |
+
"deployment_notes": implementation_results.deployment_notes,
|
64 |
+
"testing_strategy": implementation_results.testing_strategy,
|
65 |
+
"implementation_order": implementation_results.implementation_order
|
66 |
+
}
|
67 |
+
|
68 |
+
self.writer: Agent = Agent(
|
69 |
+
model=self.llm,
|
70 |
+
instructions=[
|
71 |
+
"You will be provided with lots of structured outputs. Your work is to display this"
|
72 |
+
"in a nicely formatted manner without changing any of the content."
|
73 |
+
],
|
74 |
+
markdown=True,
|
75 |
+
)
|
76 |
+
|
77 |
+
model_response = self.writer.run(json.dumps(writer_input, indent=4), stream=True)
|
78 |
+
|
79 |
+
return model_response.content
|
80 |
+
|
81 |
+
def run(self, verbose=False):
|
82 |
+
"""
|
83 |
+
Create implementation plan based on requirements analysis and research
|
84 |
+
|
85 |
+
Args:
|
86 |
+
requirements_analysis: Results from requirements analysis
|
87 |
+
technical_research: Optional results from technical research
|
88 |
+
"""
|
89 |
+
try:
|
90 |
+
logger.info("Starting planning workflow...", log_type="pipeline: task_analysis", console=verbose)
|
91 |
+
|
92 |
+
'''Prepare comprehensive prompt for the architect'''
|
93 |
+
planning_prompt = (
|
94 |
+
f"Create a detailed implementation plan for this ML system.\n\n"
|
95 |
+
f"Business Understanding:\n{self.requirements_analysis.business_understanding}\n\n"
|
96 |
+
f"Technical Specifications:\n"
|
97 |
+
f"- Task Type: {self.requirements_analysis.model_response.task}\n"
|
98 |
+
f"- Models: {', '.join(self.requirements_analysis.model_response.models)}\n"
|
99 |
+
f"- Data Requirements: {self.requirements_analysis.model_response.data_source}\n"
|
100 |
+
f"- Technical Requirements: {self.requirements_analysis.model_response.technical_requirements}\n"
|
101 |
+
)
|
102 |
+
|
103 |
+
if self.technical_research:
|
104 |
+
logger.info("Technical Research found! Modifying context...", log_type="pipeline: task_analysis", console=verbose)
|
105 |
+
|
106 |
+
planning_prompt += (
|
107 |
+
f"\nResearch Findings:\n{self.technical_research.research_findings}\n"
|
108 |
+
f"Reference Implementations:\n"
|
109 |
+
f"{chr(10).join(self.technical_research.reference_implementations)}"
|
110 |
+
)
|
111 |
+
|
112 |
+
'''Stream implementation plan'''
|
113 |
+
|
114 |
+
logger.info("generating implementation plan...", log_type="pipeline: task_analysis", console=verbose)
|
115 |
+
plan_result: Optional[ImplementationPlan] = self.create_implementation_plan(planning_prompt, verbose=verbose)
|
116 |
+
|
117 |
+
if plan_result:
|
118 |
+
validation_prompt = (
|
119 |
+
"Validate the interfaces between these components "
|
120 |
+
"and ensure all dependencies are properly specified:\n"
|
121 |
+
f"{plan_result.components}"
|
122 |
+
)
|
123 |
+
logger.info("validating results...", log_type="pipeline: task_analysis", console=verbose)
|
124 |
+
|
125 |
+
validate_result: Optional[ImplementationPlan] = self.validate_interfaces(validation_prompt, verbose=verbose)
|
126 |
+
# logger.info("writing validated implementation plan...", log_type="pipeline: task_analysis", console=verbose)
|
127 |
+
# final_response = self.write_implementation_post(validate_result, verbose=verbose)
|
128 |
+
|
129 |
+
return validate_result
|
130 |
+
|
131 |
+
except Exception as e:
|
132 |
+
logger.error("Error in planning workflow".format(e), log_type="pipeline: task_analysis", console=verbose)
|
src/app/pipelines/task_analysis/model.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
from typing import Iterator, List, Optional
|
3 |
+
from enum import Enum
|
4 |
+
from pydantic import BaseModel, Field
|
5 |
+
|
6 |
+
class MLTaskType(str, Enum):
    """Closed set of ML task categories used across the task-analysis pipeline.

    Inherits from ``str`` so members compare and serialize as their plain
    string values (e.g. inside pydantic models and JSON payloads).
    """
    CLASSIFICATION = "classification"
    REGRESSION = "regression"
    CLUSTERING = "clustering"
    NLP = "natural_language_processing"
    COMPUTER_VISION = "computer_vision"
    TIME_SERIES = "time_series"
    ANOMALY_DETECTION = "anomaly_detection"
    RECOMMENDATION = "recommendation"
    OTHER = "other"  # fallback when no other category fits
|
16 |
+
|
17 |
+
|
18 |
+
class ModelResponseStatus(BaseModel):
    """Technical specification for ML implementation"""

    data_source: str = Field(description="Required data sources and their characteristics")
    data_format: str = Field(description="Expected format of input data")
    additional_data_requirement: bool = Field(description="Whether additional data is needed")
    constraints: str = Field(description="Business and technical constraints")
    task: MLTaskType = Field(description="Type of ML task")
    models: List[str] = Field(description="Suggested ML models")
    hyperparameters: List[str] = Field(description="Key hyperparameters to consider")
    eval_metrics: List[str] = Field(description="Evaluation metrics for the solution")
    technical_requirements: str = Field(description="Technical implementation requirements")
|
56 |
+
|
57 |
+
|
58 |
+
class RequirementsAnalysis(BaseModel):
    """Initial analysis of business requirements"""

    model_response: ModelResponseStatus
    unclear_points: List[str] = Field(default_factory=list, description="Points needing clarification")
    search_queries: List[str] = Field(default_factory=list, description="Topics to research")
    business_understanding: str = Field(description="Summary of business problem understanding")
|
72 |
+
|
73 |
+
|
74 |
+
class TechnicalResearch(BaseModel):
    """Results from technical research"""

    model_response: ModelResponseStatus
    research_findings: str = Field(description="Key findings from research")
    reference_implementations: List[str] = Field(default_factory=list, description="Similar implementation examples found")
    sources: List[str] = Field(default_factory=list, description="Sources of information")
|
88 |
+
|
89 |
+
|
90 |
+
# Implementation Planning Models
|
91 |
+
class ComponentType(str, Enum):
    """Kinds of system components an implementation plan can contain.

    Inherits from ``str`` so members serialize as their plain string values.
    """
    DATA_PIPELINE = "data_pipeline"
    PREPROCESSOR = "preprocessor"
    MODEL = "model"
    EVALUATOR = "evaluator"
    INFERENCE = "inference"
    MONITORING = "monitoring"
    UTILITY = "utility"
|
99 |
+
|
100 |
+
|
101 |
+
class ParameterSpec(BaseModel):
    """Specification for a single parameter"""

    name: str = Field(description="Name of the parameter")
    param_type: str = Field(description="Type of the parameter")
    description: str = Field(description="Description of the parameter")
    default_value: str = Field(description="Default value if any")
    required: bool = Field(description="Whether the parameter is required")
|
108 |
+
|
109 |
+
|
110 |
+
class ConfigParam(BaseModel):
    """Specification for a configuration parameter"""

    name: str = Field(description="Name of the configuration parameter")
    value_type: str = Field(description="Type of value expected")
    description: str = Field(description="Description of the configuration parameter")
    default: str = Field(description="Default value if any")
|
116 |
+
|
117 |
+
|
118 |
+
class FunctionSpec(BaseModel):
    """Detailed specification for a single function"""

    name: str = Field(description="Name of the function")
    description: str = Field(description="Detailed description of function's purpose")
    input_params: List[ParameterSpec] = Field(description="List of input parameters and their specifications")
    return_type: str = Field(description="Return type and description")
    dependencies: List[str] = Field(description="Required dependencies/imports")
    error_handling: List[str] = Field(description="Expected errors and handling strategies")
|
132 |
+
|
133 |
+
|
134 |
+
class ComponentSpec(BaseModel):
    """Specification for a component (module) of the system"""

    name: str = Field(description="Name of the component")
    type: ComponentType = Field(description="Type of component")
    description: str = Field(description="Detailed description of component's purpose")
    functions: List[FunctionSpec] = Field(description="Functions within this component")
    dependencies: List[str] = Field(description="External package dependencies")
    config_params: List[ConfigParam] = Field(description="Configuration parameters needed")
|
146 |
+
|
147 |
+
|
148 |
+
class ImplementationPlan(BaseModel):
    """Complete implementation plan for the ML system"""

    components: List[ComponentSpec] = Field(description="System components")
    system_requirements: List[str] = Field(description="System-level requirements and dependencies")
    deployment_notes: str = Field(description="Notes on deployment and infrastructure")
    testing_strategy: str = Field(description="Strategy for testing components")
    implementation_order: List[str] = Field(description="Suggested order of implementation")
|
src/app/schemas/requests/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .task_analysis import TaskAnalysisRequestSchema
|
src/app/schemas/requests/eda.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel, Field
|
2 |
+
from src.app.pipelines.task_analysis.model import ImplementationPlan, RequirementsAnalysis, TechnicalResearch
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
class EdaRequestSchema(BaseModel):
    """Request payload for the EDA pipeline endpoints."""

    # BUG FIX: the original annotated dataset_path and user_prompt as plain
    # ``str`` while giving them ``default=None``; ``Optional[str]`` matches
    # the actual default and is consistent with the other fields below.
    dataset_path: Optional[str] = Field(
        default=None,
        description="Path of the dataset stored locally"
    )
    user_prompt: Optional[str] = Field(
        default=None,
        description="Contains the initial prompt given by the user"
    )
    requirement_analysis: Optional[RequirementsAnalysis] = Field(
        default=None,
        description="Contains the analysis of the user task/prompt"
    )
    technical_research: Optional[TechnicalResearch] = Field(
        default=None,
        description="Contains the technical research of the user task/prompt"
    )
    implementation_plan: Optional[ImplementationPlan] = Field(
        default=None,
        description="Includes the detailed plan for what steps to take"
    )
|
src/app/schemas/requests/task_analysis.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel, Field
|
2 |
+
|
3 |
+
class TaskAnalysisRequestSchema(BaseModel):
    """Schema for accepting user instructions/prompts/task and dataset (Currently only supporting CSV)"""

    user_prompt: str = Field(default=None, description="Defines the user prompt/instructions/task")
    file_name: str = Field(default=None, description="Contains the filename of the dataset. Stored in a temporary storage")
|
src/app/schemas/responses/eda.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel, Field
|
2 |
+
from typing import List
|
3 |
+
from src.app.pipelines.eda.agents.models import OrchestratorAgentResponseSchema, ExecuterAgentResponseSchema, AnalyzerAgentResponseSchema, JudgingAgentResponseSchema
|
4 |
+
|
5 |
+
class IterationDetails(BaseModel):
    """Per-iteration record of every agent's response in the EDA loop."""

    iteration_number: int = Field(default=None, description="Contains the iteration number")
    orchestrator_response: OrchestratorAgentResponseSchema = Field(default=None, description="Contains orchestrator agent's response for this iteration")
    executer_response: ExecuterAgentResponseSchema = Field(default=None, description="Contains executer agent's response for this iteration")
    analyzer_response: AnalyzerAgentResponseSchema = Field(default=None, description="Contains analyzer agent's response for this iteration")
    judge_response: JudgingAgentResponseSchema = Field(default=None, description="Contains judging agent's response for this iteration")
|
26 |
+
|
27 |
+
class IterationLogs(BaseModel):
    """Container for the full history of EDA iteration records."""

    logs: List[IterationDetails] = Field(default=None, description="Contains a list of logs for each iteration")
|
src/core/cache/redis_cache.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import redis.asyncio as redis # type: ignore
|
3 |
+
from typing import Any, Optional
|
4 |
+
from src.core.utils import read_config
|
5 |
+
|
6 |
+
|
7 |
+
class RedisCache:
    '''Reads the server settings from config file. Can import from anywhere'''

    def __init__(self):
        settings = read_config(config_path="config.yaml")
        server_cfg = settings['redis_server']

        self._client = redis.Redis(
            host=server_cfg['host'],
            port=server_cfg['port'],
            db=server_cfg['db'],
            decode_responses=True,
        )

    async def set(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key`` as a JSON string."""
        await self._client.set(key, json.dumps(value))

    async def get(self, key: str) -> Optional[Any]:
        """Return the deserialized value for ``key``, or None when absent."""
        raw = await self._client.get(key)
        return json.loads(raw) if raw else None

    async def delete(self, key: str) -> None:
        """Remove ``key`` from the cache."""
        await self._client.delete(key)

    async def clear(self) -> None:
        """Flush the entire Redis database."""
        await self._client.flushdb()


# Module-level singleton shared across the application.
cache = RedisCache()
|
src/core/server.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI
|
2 |
+
from src.api import router
|
3 |
+
from src.core.utils import logger, read_config
|
4 |
+
|
5 |
+
def init_routers(app_: FastAPI) -> None:
    """Attach the top-level API router to the application."""
    app_.include_router(router)
|
7 |
+
|
8 |
+
def create_app() -> FastAPI:
    """Build the FastAPI application: load config, register routers, log startup."""
    config = read_config("config.yaml")

    application = FastAPI(
        title="Franky API",
        description="In Development",
        version="1.0.0",
    )

    # Expose the parsed config to request handlers via app.state.
    application.state.config = config

    init_routers(app_=application)
    logger.info("Server started successfully", log_type="server", console=True)
    return application


app = create_app()
|
src/core/utils/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .read_config import read_config
|
2 |
+
from .logger import logger
|
3 |
+
from .knowledge_base import KnowledgeBaseClass
|
src/core/utils/knowledge_base.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import chromadb
|
3 |
+
from typing import List
|
4 |
+
from agno.agent import Agent
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from pydantic import BaseModel, Field
|
7 |
+
from agno.models.openai import OpenAIChat
|
8 |
+
from llama_index.core import StorageContext
|
9 |
+
from llama_index.core import VectorStoreIndex
|
10 |
+
from llama_index.core.retrievers import VectorIndexRetriever
|
11 |
+
from agno.knowledge.llamaindex import LlamaIndexKnowledgeBase
|
12 |
+
from llama_index.vector_stores.chroma import ChromaVectorStore
|
13 |
+
from .prompts import missing_value_analysis_agent_desc, missing_value_analysis_agent_instructions
|
14 |
+
|
15 |
+
load_dotenv()
|
16 |
+
|
17 |
+
class ResponseSchema(BaseModel):
    """Structured output contract for knowledge-base-backed agents."""

    code_generated: str = Field(..., description="Python code generated for the identified statistical tests.")
    libraries_necessary: List[str] = Field(..., description="List of necessary Python libraries with preferred versions (e.g., ['pandas>=1.5.0', 'numpy>=1.24.0', 'scipy>=1.10.0']).")
    reasoning: str = Field(..., description="Detailed reasoning for choosing the specific statistical tests based on the input report and the knowledge base.")
|
21 |
+
|
22 |
+
class KnowledgeBaseClass:
    """Factory for task-specific vector knowledge bases and the agents that use them."""

    def __init__(self) -> None:
        # BUG FIX: the original used Windows-only backslash path literals
        # (e.g. r"knowledge_base\raw\classification"), which do not resolve
        # on the Linux containers this service deploys to (see Dockerfile).
        # os.path.join keeps the paths portable on every OS.
        def _kb_paths(task: str) -> dict:
            # One entry per task type: raw documents, persisted vector index,
            # and the Chroma collection name.
            return {
                "raw_data_path": os.path.join("knowledge_base", "raw", task),
                "vector_index_path": os.path.join("knowledge_base", "vector", task),
                "collection_name": f"{task}_db",
            }

        # NOTE: attribute name keeps the original (misspelled) spelling
        # "knowlede_base_map" for backward compatibility with callers.
        self.knowlede_base_map = {
            "classification": _kb_paths("classification"),
            "regression": _kb_paths("regression"),
            "time_series": _kb_paths("time_series"),
        }

        self.agent_map = {
            'missing_value_analysis_agent': {
                'description': missing_value_analysis_agent_desc,
                'instructions': missing_value_analysis_agent_instructions
            }
        }

    def initialize_knowledge_base(self, task_type: str) -> LlamaIndexKnowledgeBase:
        """Open (or create) the Chroma-backed vector index for ``task_type``
        and wrap it in an agno ``LlamaIndexKnowledgeBase``.

        :param task_type: one of "classification", "regression", "time_series"
        :raises KeyError: when ``task_type`` is not a known knowledge base
        """
        selected_knowledge_base = self.knowlede_base_map[task_type]
        db = chromadb.PersistentClient(path=selected_knowledge_base['vector_index_path'])
        chroma_collection = db.get_or_create_collection(selected_knowledge_base['collection_name'])
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)
        retriever = VectorIndexRetriever(index)
        knowledge_base = LlamaIndexKnowledgeBase(retriever=retriever)

        return knowledge_base

    def initialize_agent(self, agent_name: str, llm_choice: str, knowledge_base: LlamaIndexKnowledgeBase) -> Agent:
        """Build an agent wired to ``knowledge_base`` with knowledge search enabled.

        :param agent_name: key into ``self.agent_map`` (e.g. 'missing_value_analysis_agent')
        :param llm_choice: OpenAI model id to use for the agent
        :param knowledge_base: knowledge base returned by ``initialize_knowledge_base``
        :raises KeyError: when ``agent_name`` is unknown
        """
        selected_agent = self.agent_map[agent_name]
        llm = OpenAIChat(id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))

        agent = Agent(
            model=llm,
            description=selected_agent['description'],
            instructions=selected_agent['instructions'],
            knowledge=knowledge_base,
            search_knowledge=True,
            response_model=ResponseSchema
        )

        return agent
|
76 |
+
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
|
src/core/utils/logger.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from datetime import datetime
|
3 |
+
|
4 |
+
class LogManager:
    """File-based logger with optional per-call console echo.

    All instances share the underlying ``logging.getLogger("LogManager")``
    logger; only the first instantiation in a process attaches the file
    handler, so the first log file path wins. Later instances get
    ``console_handler = None`` and cannot echo to the console.
    """

    def __init__(self, log_file_path: str):
        import os

        self.logger = logging.getLogger("LogManager")
        self.logger.setLevel(logging.DEBUG)

        if not self.logger.handlers:
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - [%(log_type)s] - %(message)s')

            # BUG FIX: logging.FileHandler raises FileNotFoundError when the
            # target directory does not exist (e.g. a fresh container without
            # src/core/logs/); create it up front.
            log_dir = os.path.dirname(log_file_path)
            if log_dir:
                os.makedirs(log_dir, exist_ok=True)

            file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
            file_handler.setFormatter(formatter)
            file_handler.setLevel(logging.DEBUG)
            self.logger.addHandler(file_handler)

            # Kept detached; attached only for calls that request console output.
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            self.console_handler.setLevel(logging.DEBUG)

        else:
            self.console_handler = None

    def log(self, level, message, log_type: str, console=False):
        """Emit ``message`` at ``level``, tagged with ``log_type``; optionally
        echo this single record to the console as well."""
        extra = {'log_type': log_type}

        if console and self.console_handler:
            self.logger.addHandler(self.console_handler)

        self.logger.log(level, message, extra=extra)

        if console and self.console_handler:
            self.logger.removeHandler(self.console_handler)

    def info(self, message, log_type: str, console=False):
        """Log ``message`` at INFO level."""
        self.log(logging.INFO, message, log_type, console)

    def error(self, message, log_type: str, console=False):
        """Log ``message`` at ERROR level."""
        self.log(logging.ERROR, message, log_type, console)

    def debug(self, message, log_type: str, console=False):
        """Log ``message`` at DEBUG level."""
        self.log(logging.DEBUG, message, log_type, console)
|
43 |
+
|
44 |
+
|
45 |
+
# One log file per hour; this LogManager instance is the shared,
# module-wide logger imported across the project.
timestamp = datetime.now().strftime("%Y-%m-%d_%H")
logger = LogManager(log_file_path=f"src/core/logs/log_{timestamp}.log")
|