architojha committed on
Commit 8675ade
1 Parent(s): 7d927f4

pushing api

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .env.template +1 -0
  2. .gitignore +8 -0
  3. Dockerfile +16 -0
  4. README.md +111 -10
  5. config.yaml +13 -0
  6. main.py +47 -0
  7. prepare_kb.ipynb +118 -0
  8. requirements.txt +201 -0
  9. src/api/__init__.py +5 -0
  10. src/api/v1/__init__.py +9 -0
  11. src/api/v1/build_ml_plan/__init__.py +7 -0
  12. src/api/v1/build_ml_plan/eda.py +25 -0
  13. src/api/v1/build_ml_plan/task_analysis.py +54 -0
  14. src/api/v1/eda_engine/__init__.py +12 -0
  15. src/api/v1/eda_engine/data_quality.py +58 -0
  16. src/api/v1/eda_engine/data_statistics.py +61 -0
  17. src/api/v1/eda_engine/data_understanding.py +61 -0
  18. src/api/v1/eda_engine/univariate_analysis.py +59 -0
  19. src/app/pipelines/eda/agents/agents.py +65 -0
  20. src/app/pipelines/eda/agents/models.py +77 -0
  21. src/app/pipelines/eda/agents/prompts.py +333 -0
  22. src/app/pipelines/eda/helper.py +89 -0
  23. src/app/pipelines/eda/model.py +78 -0
  24. src/app/pipelines/eda/pipeline.py +256 -0
  25. src/app/pipelines/eda/tools/analysis_tools/__init__.py +3 -0
  26. src/app/pipelines/eda/tools/analysis_tools/bivariate_analysis.py +1028 -0
  27. src/app/pipelines/eda/tools/analysis_tools/multivariate_analysis.py +1039 -0
  28. src/app/pipelines/eda/tools/analysis_tools/univariate_analysis.py +517 -0
  29. src/app/pipelines/eda/tools/data_cleaning_tools/__init__.py +2 -0
  30. src/app/pipelines/eda/tools/data_cleaning_tools/handle_missing_values.py +64 -0
  31. src/app/pipelines/eda/tools/data_cleaning_tools/handle_outliers.py +83 -0
  32. src/app/pipelines/eda/tools/lib.py +59 -0
  33. src/app/pipelines/modules/__init__.py +4 -0
  34. src/app/pipelines/modules/data_quality_assessment.py +1657 -0
  35. src/app/pipelines/modules/data_statistics.py +1270 -0
  36. src/app/pipelines/modules/data_understanding_context.py +332 -0
  37. src/app/pipelines/modules/univariate_analysis.py +1437 -0
  38. src/app/pipelines/task_analysis/__init__.py +2 -0
  39. src/app/pipelines/task_analysis/ml_analysis_workflow.py +202 -0
  40. src/app/pipelines/task_analysis/ml_implementation_planner_workflow.py +132 -0
  41. src/app/pipelines/task_analysis/model.py +162 -0
  42. src/app/schemas/requests/__init__.py +1 -0
  43. src/app/schemas/requests/eda.py +25 -0
  44. src/app/schemas/requests/task_analysis.py +12 -0
  45. src/app/schemas/responses/eda.py +31 -0
  46. src/core/cache/redis_cache.py +33 -0
  47. src/core/server.py +23 -0
  48. src/core/utils/__init__.py +3 -0
  49. src/core/utils/knowledge_base.py +81 -0
  50. src/core/utils/logger.py +46 -0
.env.template ADDED
@@ -0,0 +1 @@
+ OPENAI_API_KEY=
.gitignore ADDED
@@ -0,0 +1,8 @@
+ __pycache__
+ *.lock
+ .env
+ dev
+ *.log
+ *.csv
+ raw/
+ vector/
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM python:3.11.4
+
+ WORKDIR /app
+
+ COPY . /app/
+
+ RUN apt-get update && \
+     apt-get install -y \
+     redis-server \
+     && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ RUN pip install --upgrade pip && \
+     pip install uv && \
+     uv pip install --system -r requirements.txt
+
+ CMD ["python", "-m", "main"]
README.md CHANGED
@@ -1,10 +1,111 @@
- ---
- title: Eda Franky V1
- emoji: 📊
- colorFrom: green
- colorTo: gray
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Franky EDA Engine
+
+ ## Project Structure
+
+ ```plaintext
+ FRANKY-API/
+ ├── knowledge_base/
+ │   ├── raw/
+ │   └── vector/
+ ├── src/
+ │   ├── api/
+ │   │   ├── v1/
+ │   │   │   ├── build_ml_plan/
+ │   │   │   │   ├── __init__.py
+ │   │   │   │   ├── eda.py
+ │   │   │   │   └── task_analysis.py
+ │   │   │   ├── eda_engine/
+ │   │   │   │   ├── __init__.py
+ │   │   │   │   ├── data_quality.py
+ │   │   │   │   ├── data_statistics.py
+ │   │   │   │   ├── data_understanding.py
+ │   │   │   │   └── univariate_analysis.py
+ │   │   │   └── __init__.py
+ │   │   └── __init__.py
+ │   ├── app/
+ │   │   ├── pipelines/
+ │   │   │   ├── eda/
+ │   │   │   │   ├── agents/
+ │   │   │   │   │   ├── agents.py
+ │   │   │   │   │   ├── models.py
+ │   │   │   │   │   └── prompts.py
+ │   │   │   │   ├── tools/
+ │   │   │   │   │   ├── analysis_tools/
+ │   │   │   │   │   │   ├── __init__.py
+ │   │   │   │   │   │   ├── bivariate_analysis.py
+ │   │   │   │   │   │   ├── multivariate_analysis.py
+ │   │   │   │   │   │   └── univariate_analysis.py
+ │   │   │   │   │   ├── data_cleaning_tools/
+ │   │   │   │   │   │   ├── __init__.py
+ │   │   │   │   │   │   ├── handle_missing_values.py
+ │   │   │   │   │   │   └── handle_outliers.py
+ │   │   │   │   │   ├── transformations_tools/
+ │   │   │   │   │   └── lib.py
+ │   │   │   │   ├── helper.py
+ │   │   │   │   ├── model.py
+ │   │   │   │   └── pipeline.py
+ │   │   │   ├── modules/
+ │   │   │   │   ├── __init__.py
+ │   │   │   │   ├── data_quality_assessment.py
+ │   │   │   │   ├── data_statistics.py
+ │   │   │   │   ├── data_understanding_context.py
+ │   │   │   │   └── univariate_analysis.py
+ │   │   │   └── task_analysis/
+ │   │   │       ├── __init__.py
+ │   │   │       ├── ml_analysis_workflow.py
+ │   │   │       ├── ml_implementation_planner_workflow.py
+ │   │   │       └── model.py
+ │   │   └── schemas/
+ │   │       ├── requests/
+ │   │       │   ├── __init__.py
+ │   │       │   ├── eda.py
+ │   │       │   └── task_analysis.py
+ │   │       └── responses/
+ │   │           └── eda.py
+ │   └── core/
+ │       ├── cache/
+ │       │   ├── code_generated/
+ │       │   ├── dataset_logs/
+ │       │   ├── downloads/
+ │       │   └── redis_cache.py
+ │       ├── database/
+ │       ├── logs/
+ │       ├── utils/
+ │       │   ├── __init__.py
+ │       │   ├── knowledge_base.py
+ │       │   ├── logger.py
+ │       │   ├── prompts.py
+ │       │   └── read_config.py
+ │       └── server.py
+ ├── .env.template
+ ├── .gitignore
+ ├── config.yaml
+ ├── Dockerfile
+ ├── main.py
+ ├── prepare_kb.ipynb
+ ├── README.md
+ └── requirements.txt
+ ```
+
+ ## Getting Started
+
+ ### 1. Clone the Repository
+
+ ### 2. Running the Docker Container
+
+ To build the Docker image, run the following command:
+
+ ```bash
+ docker build -t franky-api .
+ ```
+
+ To run the Docker container, run the following command:
+
+ ```bash
+ docker run -d -p 8000:8000 --name franky-api-container franky-api
+ ```
+
+ You can access the API docs at:
+
+ [http://localhost:8000/docs](http://localhost:8000/docs)
+
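For reference, once the container is up, the task-analysis route added later in this commit (`/v1/build-ml-plan/analyze-task/`) can be exercised with a short client script. This is only a sketch: it assumes the API router is mounted at the application root in `src/core/server.py` (whose diff is not shown in this 50-file view) and uses a hypothetical local CSV path.

```python
import requests

# Hypothetical local dataset; the endpoint only accepts .csv uploads.
csv_path = "train.csv"

with open(csv_path, "rb") as f:
    response = requests.post(
        "http://localhost:8000/v1/build-ml-plan/analyze-task/",
        data={"user_input": "Predict customer churn from this dataset"},
        files={"file": (csv_path, f, "text/csv")},
    )

# On success the route returns {"response": <EdaRequestSchema>}.
print(response.status_code)
print(response.json())
```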
config.yaml ADDED
@@ -0,0 +1,13 @@
+ app:
+   verbose: True
+
+ server:
+   host: "127.0.0.1"
+   port: 8000
+   reload: True
+   workers: 1
+
+ redis_server:
+   host: "127.0.0.1"
+   port: 6379
+   db: 0
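`main.py` below loads this file through `read_config` from `src/core/utils`; that module (`read_config.py`) falls outside the 50-file limit of this view, so the following is only a minimal sketch of what such a loader typically looks like, assuming it simply parses the YAML into a dict.

```python
import yaml  # PyYAML is pinned in requirements.txt


def read_config(config_path: str) -> dict:
    """Hypothetical minimal loader: parse the YAML config into a plain dict."""
    with open(config_path, "r") as f:
        return yaml.safe_load(f)


if __name__ == "__main__":
    config = read_config("config.yaml")
    print(config["server"]["host"], config["server"]["port"])  # 127.0.0.1 8000
```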
main.py ADDED
@@ -0,0 +1,47 @@
+ import uvicorn
+ import subprocess
+ import shutil
+ import time
+ import asyncio
+ from src.core.utils import logger, read_config
+
+ def start_redis_server(redis_config: dict):
+     redis_path = shutil.which("redis-server")
+     if not redis_path:
+         raise RuntimeError("redis-server is not installed or not in PATH")
+
+     process = subprocess.Popen(
+         [redis_path, "--port", str(redis_config['port']), "--bind", redis_config['host']],
+         stdout=subprocess.DEVNULL,
+         stderr=subprocess.DEVNULL
+     )
+
+     time.sleep(1)
+     logger.info(
+         f"Redis server started successfully on {redis_config['host']}:{redis_config['port']}",
+         log_type="server",
+         console=True
+     )
+     return process
+
+ def initialize_config() -> dict:
+     return read_config(config_path="config.yaml")
+
+ async def main():
+     config = initialize_config()
+     redis_process = start_redis_server(redis_config=config['redis_server'])
+
+     try:
+         uvicorn.run(
+             app="src.core.server:app",
+             host=config['server']['host'],
+             port=config['server']['port'],
+             reload=config['server']['reload'],
+             workers=config['server']['workers']
+         )
+     finally:
+         logger.info("Shutting down Redis server...", log_type="server", console=config['app']['verbose'])
+         redis_process.terminate()
+
+ if __name__ == "__main__":
+     asyncio.run(main())
prepare_kb.ipynb ADDED
@@ -0,0 +1,118 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "f5a0d75d",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import chromadb\n",
+     "from llama_index.core import StorageContext\n",
+     "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
+     "# from llama_index.embeddings.fastembed import FastEmbedEmbedding\n",
+     "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
+     "from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex\n",
+     "\n",
+     "# embed_model = FastEmbedEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
+     "data_dir = r\"knowledge_base\\raw\\classification\"\n",
+     "\n",
+     "documents = SimpleDirectoryReader(str(data_dir)).load_data()\n",
+     "data_path = r\"knowledge_base\\vector\\classification\"\n",
+     "db = chromadb.PersistentClient(path=data_path)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "b52b6ba8",
+    "metadata": {},
+    "source": [
+     "### Storing the data locally"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "348df588",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "chroma_collection = db.get_or_create_collection(\"classification_db\")\n",
+     "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
+     "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
+     "index = VectorStoreIndex.from_documents(\n",
+     "    documents=documents,\n",
+     "    storage_context=storage_context,\n",
+     "    show_progress=True,\n",
+     "    # embed_model=embed_model\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "f7411c03",
+    "metadata": {},
+    "source": [
+     "### Loading the locally stored vector index"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "id": "4d9cbd1b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import chromadb\n",
+     "from llama_index.core import StorageContext\n",
+     "from llama_index.core import VectorStoreIndex\n",
+     "from llama_index.core.retrievers import VectorIndexRetriever\n",
+     "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
+     "# from llama_index.embeddings.fastembed import FastEmbedEmbedding\n",
+     "\n",
+     "# embed_model = FastEmbedEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
+     "\n",
+     "data_path = r\"knowledge_base\\vector\\classification\"\n",
+     "db = chromadb.PersistentClient(path=data_path)\n",
+     "chroma_collection = db.get_or_create_collection(\"classification_db\")\n",
+     "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
+     "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
+     "\n",
+     "index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)\n",
+     "retriever = VectorIndexRetriever(\n",
+     "    index, \n",
+     "    # embed_model=embed_model\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "05804310",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "dev",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.4"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
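The notebook ends with an empty cell after building the retriever; a follow-up query cell might look like the sketch below. It assumes an embedding model is available to the index (the default OpenAI embeddings, given that the FastEmbed lines are commented out) and that `retriever` comes from the cell above.

```python
# Hypothetical query against the persisted classification knowledge base.
nodes = retriever.retrieve("How should class imbalance be handled in classification tasks?")

for node_with_score in nodes:
    # Each result is a NodeWithScore: similarity score plus the retrieved text chunk.
    print(node_with_score.score, node_with_score.node.get_content()[:200])
```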
requirements.txt ADDED
@@ -0,0 +1,201 @@
+ agno==1.1.3
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.11.18
+ aiosignal==1.3.2
+ annotated-types==0.7.0
+ anyio==4.9.0
+ appnope==0.1.4
+ asgiref==3.8.1
+ asttokens==3.0.0
+ attrs==25.3.0
+ backoff==2.2.1
+ banks==2.1.2
+ bcrypt==4.3.0
+ build==1.2.2.post1
+ cachetools==5.5.2
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ chromadb==1.0.9
+ click==8.1.8
+ colorama==0.4.6
+ coloredlogs==15.0.1
+ comm==0.2.2
+ contourpy==1.3.1
+ cycler==0.12.1
+ dataclasses-json==0.6.7
+ debugpy==1.8.14
+ decorator==5.2.1
+ Deprecated==1.2.18
+ dirtyjson==1.0.8
+ distro==1.9.0
+ dnspython==2.7.0
+ docling-core==2.28.1
+ docstring_parser==0.16
+ duckduckgo_search==8.0.0
+ durationpy==0.9
+ email_validator==2.2.0
+ executing==2.2.0
+ faiss-cpu==1.11.0
+ fastapi==0.115.9
+ fastapi-cli==0.0.7
+ fastembed==0.6.1
+ filelock==3.18.0
+ filetype==1.2.0
+ flatbuffers==25.2.10
+ fonttools==4.57.0
+ frozenlist==1.6.0
+ fsspec==2025.3.2
+ gitdb==4.0.12
+ GitPython==3.1.44
+ google-auth==2.40.1
+ googleapis-common-protos==1.70.0
+ greenlet==3.2.1
+ griffe==1.7.3
+ groq==0.23.1
+ grpcio==1.71.0
+ h11==0.14.0
+ httpcore==1.0.8
+ httptools==0.6.4
+ httpx==0.28.1
+ huggingface-hub==0.30.2
+ humanfriendly==10.0
+ idna==3.10
+ importlib_metadata==8.6.1
+ importlib_resources==6.5.2
+ ipykernel==6.29.5
+ ipython==9.1.0
+ ipython_pygments_lexers==1.1.1
+ jedi==0.19.2
+ Jinja2==3.1.6
+ jiter==0.9.0
+ joblib==1.4.2
+ jsonref==1.1.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2025.4.1
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ kiwisolver==1.4.8
+ kubernetes==32.0.1
+ latex2mathml==3.77.0
+ llama-index-core==0.12.34.post1
+ llama-index-embeddings-fastembed==0.3.1
+ llama-index-embeddings-openai==0.3.1
+ llama-index-vector-stores-chroma==0.4.1
+ llama-index-vector-stores-faiss==0.4.0
+ lmoments3==1.0.8
+ loguru==0.7.3
+ lxml==5.3.2
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ marshmallow==3.26.1
+ matplotlib==3.10.1
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ missingno==0.5.2
+ mmh3==5.1.0
+ mpmath==1.3.0
+ multidict==6.4.3
+ mypy_extensions==1.1.0
+ nest-asyncio==1.6.0
+ networkx==3.4.2
+ nltk==3.9.1
+ numpy==2.2.4
+ oauthlib==3.2.2
+ onnxruntime==1.21.1
+ openai==1.72.0
+ opentelemetry-api==1.33.1
+ opentelemetry-exporter-otlp-proto-common==1.33.1
+ opentelemetry-exporter-otlp-proto-grpc==1.33.1
+ opentelemetry-instrumentation==0.54b1
+ opentelemetry-instrumentation-asgi==0.54b1
+ opentelemetry-instrumentation-fastapi==0.54b1
+ opentelemetry-proto==1.33.1
+ opentelemetry-sdk==1.33.1
+ opentelemetry-semantic-conventions==0.54b1
+ opentelemetry-util-http==0.54b1
+ orjson==3.10.18
+ overrides==7.7.0
+ packaging==24.2
+ pandas==2.2.3
+ parso==0.8.4
+ patsy==1.0.1
+ pexpect==4.9.0
+ pillow==11.1.0
+ platformdirs==4.3.7
+ posthog==4.0.1
+ primp==0.14.0
+ prompt_toolkit==3.0.50
+ propcache==0.3.1
+ protobuf==5.29.4
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ py_rust_stemmers==0.1.5
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.2
+ pydantic==2.11.3
+ pydantic-settings==2.8.1
+ pydantic_core==2.33.1
+ Pygments==2.19.1
+ pyparsing==3.2.3
+ PyPika==0.48.9
+ pyproject_hooks==1.2.0
+ pyreadline3==3.5.4
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.0
+ python-multipart==0.0.20
+ pytz==2025.2
+ pywin32==310
+ PyYAML==6.0.2
+ pyzmq==26.4.0
+ RapidFuzz==3.13.0
+ redis==5.2.1
+ redis-cli==1.0.1
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ requests-oauthlib==2.0.0
+ rich==14.0.0
+ rich-toolkit==0.14.1
+ rpds-py==0.24.0
+ rsa==4.9.1
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ seaborn==0.13.2
+ shellingham==1.5.4
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ SQLAlchemy==2.0.40
+ stack-data==0.6.3
+ starlette==0.45.3
+ statsmodels==0.14.4
+ sympy==1.13.3
+ tabulate==0.9.0
+ tenacity==9.1.2
+ threadpoolctl==3.6.0
+ tiktoken==0.9.0
+ tokenizers==0.21.1
+ tomli==2.2.1
+ torch==2.7.0
+ tornado==6.4.2
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.51.3
+ typer==0.15.2
+ typing-inspect==0.9.0
+ typing-inspection==0.4.0
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ urllib3==2.4.0
+ uv==0.6.14
+ uvicorn==0.34.0
+ watchfiles==1.0.5
+ wcwidth==0.2.13
+ websocket-client==1.8.0
+ websockets==15.0.1
+ win32_setctime==1.2.0
+ wrapt==1.17.2
+ yarl==1.20.0
+ zipp==3.21.0
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+ from .v1 import v1_router
3
+
4
+ router = APIRouter()
5
+ router.include_router(v1_router, prefix='/v1')
src/api/v1/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from fastapi import APIRouter
+ from .build_ml_plan import build_ml_plan_router
+ from .eda_engine import eda_engine_router
+
+ v1_router = APIRouter()
+
+ v1_router.include_router(build_ml_plan_router, prefix='/build-ml-plan')
+ v1_router.include_router(eda_engine_router, prefix='/eda-engine')
+
src/api/v1/build_ml_plan/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from fastapi import APIRouter
+ from .task_analysis import analysis_router
+ from .eda import eda_router
+
+ build_ml_plan_router = APIRouter()
+ build_ml_plan_router.include_router(analysis_router, prefix="/analyze-task")
+ # build_ml_plan_router.include_router(eda_router, prefix="/run-eda")
src/api/v1/build_ml_plan/eda.py ADDED
@@ -0,0 +1,25 @@
+ from fastapi import APIRouter
+ from src.app.schemas.requests.eda import EdaRequestSchema
+ from src.app.pipelines.eda.pipeline import EdaLoop
+ import os
+
+ eda_router = APIRouter()
+
+ def delete_dir_contents(directory: str) -> None:
+     for filename in os.listdir(directory):
+         file_path = os.path.join(directory, filename)
+         if os.path.isfile(file_path):
+             os.remove(file_path)
+
+ @eda_router.post('/')
+ async def main(response: EdaRequestSchema):
+
+     eda = EdaLoop(payload=response, verbose=True)
+     logs = eda.loop(verbose=True)
+
+     delete_dir_contents(directory="src/core/cache/dataset_logs")
+     delete_dir_contents(directory="src/core/cache/downloads")
+
+     return {
+         "execution_logs": logs
+     }
src/api/v1/build_ml_plan/task_analysis.py ADDED
@@ -0,0 +1,54 @@
+ from fastapi import APIRouter, File, UploadFile, Form, HTTPException, Request
+ from src.app.pipelines.task_analysis import MLImplementationPlannerWorkflow, MLAnalysisWorkflow
+ from src.app.schemas.requests.eda import EdaRequestSchema
+ from src.core.utils import logger
+ from typing import Optional
+ import os
+ import shutil
+
+ analysis_router = APIRouter()
+
+ UPLOAD_DIR = "src/core/cache/downloads"
+
+ @analysis_router.post("/")
+ async def main(
+     request: Request,
+     user_input: str = Form(...),
+     file: UploadFile = File(...)
+ ):
+     '''Retrieving config from app state. This config is also stored in the Redis cache.'''
+     config = request.app.state.config
+
+     if not file.filename.endswith('.csv'):
+         logger.error("Only CSV files are allowed", log_type="api: /analyze-task", console=config['app']['verbose'])
+         raise HTTPException(status_code=400, detail="Only CSV files are allowed.")
+
+     file_path = os.path.join(UPLOAD_DIR, file.filename)
+
+     '''Storing the file in /downloads'''
+     try:
+         with open(file_path, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+         logger.info("File uploaded successfully!", log_type="api: /analyze-task", console=config['app']['verbose'])
+     except Exception as e:
+         logger.error(f"Error saving file: {str(e)}", log_type="api: /analyze-task", console=config['app']['verbose'])
+         raise HTTPException(status_code=500, detail=f"Error saving file: {str(e)}")
+
+
+     ml_analysis_wf = MLAnalysisWorkflow(user_prompt=user_input)
+     ml_analysis_results = ml_analysis_wf.run(verbose=True)
+
+     ml_imp_planner_wf = MLImplementationPlannerWorkflow(requirements_analysis=ml_analysis_results[0], technical_research=ml_analysis_results[1])
+     ml_imp_planner_results = ml_imp_planner_wf.run(verbose=True)
+
+     api_response = EdaRequestSchema(
+         dataset_path = file_path,
+         user_prompt = user_input,
+         requirement_analysis = ml_analysis_results[0],
+         technical_research = ml_analysis_results[1],
+         implementation_plan = ml_imp_planner_results
+     )
+
+     return {
+         "response": api_response
+     }
src/api/v1/eda_engine/__init__.py ADDED
@@ -0,0 +1,12 @@
+ from fastapi import APIRouter
+ from .data_understanding import data_understanding_router
+ from .data_statistics import data_statistics_router
+ from .data_quality import data_quality_router
+ from .univariate_analysis import univariate_analysis_router
+
+ eda_engine_router = APIRouter()
+ eda_engine_router.include_router(data_understanding_router, prefix="/get-data-understanding-context")
+ eda_engine_router.include_router(data_statistics_router, prefix="/get-data-statistics")
+ eda_engine_router.include_router(data_quality_router, prefix="/get-data-quality")
+ eda_engine_router.include_router(univariate_analysis_router, prefix="/get-univariate-analysis")
+
src/api/v1/eda_engine/data_quality.py ADDED
@@ -0,0 +1,58 @@
+ import os
+ import shutil
+ from fastapi import APIRouter
+ from src.core.utils import logger
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Form
+ from src.app.pipelines.modules import DataQualityAssessmentWorkflow
+
+ data_quality_router = APIRouter()
+
+ def delete_dir_contents(directory: str) -> None:
+     for filename in os.listdir(directory):
+         file_path = os.path.join(directory, filename)
+         if os.path.isfile(file_path):
+             os.remove(file_path)
+
+ @data_quality_router.post('/')
+ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
+     ''' ## This endpoint accepts a CSV file upload to initiate the Data Quality Workflow.
+
+     ### Parameters:
+     -----------
+     - file : CSV file for the dataset
+
+     ### Returns:
+     --------
+     - dict: Markdown report
+     '''
+
+     if not file.filename.endswith('.csv'):
+         raise HTTPException(status_code=400, detail="Only CSV files are allowed.")
+
+     '''Clears the /downloads folder and stores the received file as 'dataset.csv' '''
+
+     downloads_path = "src/core/cache/downloads"
+     os.makedirs(downloads_path, exist_ok=True)
+     delete_dir_contents(downloads_path)
+     destination_path = os.path.join(downloads_path, "dataset.csv")
+     with open(destination_path, "wb") as buffer:
+         shutil.copyfileobj(file.file, buffer)
+
+     logger.info(f"CSV file saved to {destination_path}", log_type='eda-engine/data_quality', console=True)
+
+     '''Runs the data quality assessment workflow'''
+
+     try:
+         ds_wf = DataQualityAssessmentWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
+         results = ds_wf.run(verbose=True)
+
+         return {
+             "status": "Pipeline finished running",
+             "results": results
+         }
+
+     except Exception as e:
+         logger.error(f"DataQualityAssessmentWorkflow failed with error: {e}", log_type='eda-engine/data_quality', console=True)
+         return {
+             "status": "Pipeline failed to finish",
+         }
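For reference, a client call to this route (mounted at `/v1/eda-engine/get-data-quality/` via the routers above) might look like the following sketch; it assumes the server is running locally on the default port and uses a hypothetical CSV path.

```python
import requests

# Hypothetical dataset path; ml_task is optional form data.
with open("dataset.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/eda-engine/get-data-quality/",
        files={"file": ("dataset.csv", f, "text/csv")},
        data={"ml_task": "binary classification of churn"},
    )

body = resp.json()
print(body["status"])        # "Pipeline finished running" on success
print(body.get("results"))   # Markdown report when the workflow succeeds
```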
src/api/v1/eda_engine/data_statistics.py ADDED
@@ -0,0 +1,61 @@
+ import os
+ import shutil
+ from fastapi import APIRouter
+ from src.core.utils import logger
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Form
+ from src.app.pipelines.modules import DataStatisticsWorkflow
+
+ data_statistics_router = APIRouter()
+
+ def delete_dir_contents(directory: str) -> None:
+     for filename in os.listdir(directory):
+         file_path = os.path.join(directory, filename)
+         if os.path.isfile(file_path):
+             os.remove(file_path)
+
+ @data_statistics_router.post('/')
+ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
+     ''' ## This endpoint accepts a CSV file upload to initiate the Data Statistics Workflow.
+
+     ### Parameters:
+     -----------
+     - file : CSV file for the dataset
+     \n
+     - ml_task : Final machine learning task/target
+
+     ### Returns:
+     --------
+     - dict: Markdown report
+     '''
+
+     if not file.filename.endswith('.csv'):
+         raise HTTPException(status_code=400, detail="Only CSV files are allowed.")
+
+     '''Clears the /downloads folder and stores the received file as 'dataset.csv' '''
+
+     downloads_path = "src/core/cache/downloads"
+     os.makedirs(downloads_path, exist_ok=True)
+     delete_dir_contents(downloads_path)
+     destination_path = os.path.join(downloads_path, "dataset.csv")
+     with open(destination_path, "wb") as buffer:
+         shutil.copyfileobj(file.file, buffer)
+
+     logger.info(f"CSV file saved to {destination_path}", log_type='eda-engine/data_statistics', console=True)
+
+     '''Runs the data statistics workflow'''
+     try:
+         ds_wf = DataStatisticsWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
+         results = ds_wf.run(verbose=True)
+
+         return {
+             "status": "Pipeline finished running",
+             "results": results
+         }
+
+     except Exception as e:
+         logger.error(f"DataStatisticsWorkflow failed with error: {e}", log_type='eda-engine/data_statistics', console=True)
+         return {
+             "status": "Pipeline failed to finish",
+         }
+
+
src/api/v1/eda_engine/data_understanding.py ADDED
@@ -0,0 +1,61 @@
+ import os
+ import shutil
+ from typing import Optional
+ from src.core.utils import logger
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Form
+ from src.app.pipelines.modules import DataUnderstandingContextWorkflow
+
+ data_understanding_router = APIRouter()
+
+ def delete_dir_contents(directory: str) -> None:
+     for filename in os.listdir(directory):
+         file_path = os.path.join(directory, filename)
+         if os.path.isfile(file_path):
+             os.remove(file_path)
+
+ @data_understanding_router.post('/')
+ async def main(file: UploadFile = File(...), business_requirements: Optional[str] = Form(None)):
+     ''' ## This endpoint accepts a CSV file upload & additional business requirements/context to initiate the Data Understanding Context Workflow.
+
+     ### Parameters:
+     -----------
+     - file : CSV file for the dataset
+     \n
+     - business_context : Additional business context information about the dataset
+
+     ### Returns:
+     --------
+     - dict: Markdown report
+     '''
+
+     if not file.filename.endswith('.csv'):
+         raise HTTPException(status_code=400, detail="Only CSV files are allowed.")
+
+     '''Clears the /downloads folder and stores the received file as 'dataset.csv' '''
+
+     downloads_path = "src/core/cache/downloads"
+     os.makedirs(downloads_path, exist_ok=True)
+     delete_dir_contents(downloads_path)
+     destination_path = os.path.join(downloads_path, "dataset.csv")
+     with open(destination_path, "wb") as buffer:
+         shutil.copyfileobj(file.file, buffer)
+
+     logger.info(f"CSV file saved to {destination_path}", log_type='eda-engine/data_understanding', console=True)
+
+
+     '''Runs the data understanding workflow'''
+
+     try:
+         duc_wf = DataUnderstandingContextWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", business_context=business_requirements)
+         results = duc_wf.run(verbose=True)
+
+         return {
+             "status": "Pipeline finished running",
+             "results": results
+         }
+
+     except Exception as e:
+         logger.error(f"DataUnderstandingContextWorkflow failed with error: {e}", log_type='eda-engine/data_understanding', console=True)
+         return {
+             "status": "Pipeline failed to finish",
+         }
src/api/v1/eda_engine/univariate_analysis.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import shutil
+ from typing import Optional
+ from src.core.utils import logger
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Form
+ from src.app.pipelines.modules import UnivariateAnalysisWorkflow
+
+ univariate_analysis_router = APIRouter()
+
+ def delete_dir_contents(directory: str) -> None:
+     for filename in os.listdir(directory):
+         file_path = os.path.join(directory, filename)
+         if os.path.isfile(file_path):
+             os.remove(file_path)
+
+ @univariate_analysis_router.post('/')
+ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
+     ''' ## This endpoint accepts a CSV file upload to initiate the Univariate Analysis Workflow.
+
+     ### Parameters:
+     -----------
+     - file : CSV file for the dataset
+     \n
+     - ml_task : Final machine learning task/target
+
+     ### Returns:
+     --------
+     - dict: Markdown report
+     '''
+
+     if not file.filename.endswith('.csv'):
+         raise HTTPException(status_code=400, detail="Only CSV files are allowed.")
+
+     '''Clears the /downloads folder and stores the received file as 'dataset.csv' '''
+
+     downloads_path = "src/core/cache/downloads"
+     os.makedirs(downloads_path, exist_ok=True)
+     delete_dir_contents(downloads_path)
+     destination_path = os.path.join(downloads_path, "dataset.csv")
+     with open(destination_path, "wb") as buffer:
+         shutil.copyfileobj(file.file, buffer)
+
+     logger.info(f"CSV file saved to {destination_path}", log_type='eda-engine/univariate_analysis', console=True)
+
+     '''Runs the univariate analysis workflow'''
+     try:
+         ua_wf = UnivariateAnalysisWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
+         results = ua_wf.run(verbose=True)
+
+         return {
+             "status": "Pipeline finished running",
+             "results": results
+         }
+
+     except Exception as e:
+         logger.error(f"UnivariateAnalysisWorkflow failed with error: {e}", log_type='eda-engine/univariate_analysis', console=True)
+         return {
+             "status": "Pipeline failed to finish"
+         }
src/app/pipelines/eda/agents/agents.py ADDED
@@ -0,0 +1,65 @@
+ from .prompts import intel_agent_desc, intel_agent_instructions, orchestrator_agent_desc, orchestrator_agent_instructions, analyzer_agent_desc, analyzer_agent_instructions, judging_agent_desc, judging_agent_instructions
+ from .models import IntelAgentResponseSchema, OrchestratorAgentResponseSchema, AnalyzerAgentResponseSchema, JudgingAgentResponseSchema
+ from agno.models.openai import OpenAIChat  # type: ignore
+ from agno.agent import Agent  # type: ignore
+ from dotenv import load_dotenv
+ from typing import List
+ import os
+
+ load_dotenv()
+
+ class AgentClass:
+     def __init__(self):
+         self.llm = OpenAIChat(id="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
+         self.agents = {}
+
+         self.build_agent(
+             agent_name = "intel_agent",
+             agent_desc = intel_agent_desc,
+             agent_instructions = intel_agent_instructions,
+             agent_response_model = IntelAgentResponseSchema,
+             tools = None
+         )
+
+         self.build_agent(
+             agent_name = "orchestrator_agent",
+             agent_desc = orchestrator_agent_desc,
+             agent_instructions = orchestrator_agent_instructions,
+             agent_response_model = OrchestratorAgentResponseSchema,
+             tools = None
+         )
+
+         self.build_agent(
+             agent_name = "analyzer_agent",
+             agent_desc = analyzer_agent_desc,
+             agent_instructions = analyzer_agent_instructions,
+             agent_response_model = AnalyzerAgentResponseSchema,
+             tools = None
+         )
+
+         self.build_agent(
+             agent_name = "judging_agent",
+             agent_desc = judging_agent_desc,
+             agent_instructions = judging_agent_instructions,
+             agent_response_model = JudgingAgentResponseSchema,
+             tools = None
+         )
+
+
+     def build_agent(self, agent_name: str, agent_desc: str, agent_instructions: List[str], agent_response_model, tools=None, debug_mode=False):
+         '''
+         Builds or re-builds an agent dynamically
+         '''
+         self.agents[agent_name] = Agent(
+             model = self.llm,
+             description = agent_desc,
+             instructions = agent_instructions,
+             response_model = agent_response_model,
+             structured_outputs = True,
+             tools = tools,
+             debug_mode=debug_mode,
+         )
+
+
+
+
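A rough sketch of how this registry might be consumed elsewhere in the EDA pipeline is shown below; it assumes agno's `Agent.run()` interface and a caller-assembled prompt, neither of which is shown in this diff.

```python
from src.app.pipelines.eda.agents.agents import AgentClass

# Build the four agents once and pick one from the registry.
agents = AgentClass().agents
orchestrator = agents["orchestrator_agent"]

# Hypothetical prompt; the real pipeline assembles this from dataset context.
run_response = orchestrator.run("Dataset overview: ...\nPrevious pre-processing tools executed: none")

# With response_model set, the parsed OrchestratorAgentResponseSchema is available on .content.
print(run_response.content)
```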
src/app/pipelines/eda/agents/models.py ADDED
@@ -0,0 +1,77 @@
+ from pydantic import Field, BaseModel
+ from ..model import DatasetSummary
+ from typing import Optional
+
+ class IntelAgentResponseSchema(BaseModel):
+     dataset_description: str = Field(
+         default=None,
+         description="Gives an idea about the dataset while also containing insights from the business understanding and task goal"
+     )
+     ml_task: str = Field(
+         default=None,
+         description="Contains the task type which will help in the EDA process"
+     )
+     dataset_summary: DatasetSummary = Field(
+         default=None,
+         description="Contains the summary of the dataset including number of rows, columns, features, and target variables"
+     )
+
+ class OrchestratorAgentResponseSchema(BaseModel):
+     tool_name: str = Field(
+         default=None,
+         description="Name of the tool selected for execution"
+     )
+     justification: str = Field(
+         default=None,
+         description="Reason for selecting the tool"
+     )
+
+ class AnalyzerAgentResponseSchema(BaseModel):
+     key_insights: str = Field(
+         default=None,
+         description="Detected patterns, key findings, anomalies, and detected trends must all be discussed here"
+     )
+     potential_issues: str = Field(
+         default=None,
+         description="Detected issues, any problems found in the data, steps that have not yet been performed, any unresolved problems with the dataset, **Unmet Business Requirements**"
+     )
+     recommendations: str = Field(
+         default=None,
+         description="Areas that still need analysis or processing"
+     )
+     final_task_achieved: bool = Field(
+         default=None,
+         description="Whether the user task has been completed or not"
+     )
+
+ class ExecuterAgentResponseSchema(BaseModel):
+     executed_tool: str = Field(
+         default=None,
+         description="Tool that was executed"
+     )
+     execution_details: str = Field(
+         default=None,
+         description="Summary of the execution process"
+     )
+     error_logs: Optional[str] = Field(
+         default=None,
+         description="Error logs if execution failed"
+     )
+     output_file_path: Optional[str] = Field(
+         default=None,
+         description="Path of the stored output file"
+     )
+
+ class JudgingAgentResponseSchema(BaseModel):
+     detailed_context: str = Field(
+         default=None,
+         description="Contains a detailed context about all the previously executed tools/functions"
+     )
+     stop_loop: bool = Field(
+         default=None,
+         description="Boolean value indicating whether the EDA loop must be stopped or not"
+     )
+     justification: str = Field(
+         default=None,
+         description="Contains the justification for the decision to stop the loop or not"
+     )
src/app/pipelines/eda/agents/prompts.py ADDED
@@ -0,0 +1,333 @@
1
+ intel_agent_desc = '''
2
+ You are a Dataset Understanding and Problem Analysis Expert.
3
+ Your task is to analyze the provided dataset, interpret the user's problem statement, and extract meaningful insights from both the dataset and business context.
4
+ Your analysis will guide the Exploratory Data Analysis (EDA) and Machine Learning (ML) pipeline setup.
5
+ You will identify target variables, infer feature meanings, and ensure dataset insights are actionable and relevant for model building.
6
+ '''
7
+
8
+ intel_agent_instructions = [
9
+ '''1. Business Understanding:
10
+ - Analyze the user's goal from the user's prompt and business requirements to determine the nature of the problem.
11
+ - Identify the type of ML task (e.g., classification, regression, clustering, ranking, recommendation, etc.).
12
+ - Determine relevant target variable(s) based on the problem statement and dataset structure.
13
+ - For numerical features, interpret the statistics like mean, min., max. and std value and use this analysis to generate informative feature descriptions
14
+ ''',
15
+ '''2. Dataset Analysis:
16
+ - Extract key dataset metadata, including the number of rows, columns, and data types.
17
+ - Identify the target variable(s) that align with the task goal.
18
+ - Examine a **random sample of 10 rows** to infer initial insights about categorical and numerical features.
19
+ - For numerical features, interpret the statistics like mean, min., max. and std value and use this analysis to generate informative feature descriptions
20
+ ''',
21
+ '''3. Output Generation:
22
+ - Construct a structured dataset summary with:
23
+ - Feature descriptions, feature types (numerical, categorical, text), and missing value counts.
24
+ - Key statistics for numerical features (mean, min, max, std deviation) with interpretations.
25
+ - Define a clear **ML task type** that informs feature selection and EDA.
26
+ - Provide a refined list of **target variables** aligned with the business problem.
27
+ - Ensure that dataset insights are **actionable** and relevant for feature engineering and model training.
28
+ - For numerical features, interpret the statistics like mean, min., max. and std value and use this analysis to generate informative feature descriptions
29
+ ''',
30
+ ]
31
+
32
+ orchestrator_agent_desc = '''
33
+ You are the Adaptive Data Analysis & Processing Decision-Making Agent. You are an expert at dynamically selecting and sequencing the most relevant data analysis, preprocessing, and transformation steps based on user instructions, dataset characteristics, and analysis reports from previously executed functions.
34
+ Your primary responsibility is to create a customized analytical pathway for each dataset, ensuring that prerequisite tasks are completed in an optimal sequence while avoiding unnecessary steps.
35
+ You operate in a continuous feedback loop, evaluating each new analysis report to refine your understanding of the dataset and adjust subsequent decisions accordingly.
36
+ '''
37
+
38
+ orchestrator_agent_instructions = [
39
+ '''
40
+ 1. Understand User Intent & Business Requirements:
41
+ - Extract the 'Business Objective', 'ML Task', and 'Final Task' from the prompt.
42
+ - Identify whether the user request requires simple preprocessing, in-depth analysis, feature engineering, or another transformation.
43
+ - Map the requested task to its position in a typical data science workflow to identify prerequisite and dependent steps.
44
+ - Understand the end goal to prioritize steps that directly contribute to achieving the business objective.
45
+ ''',
46
+ '''
47
+ 2. Analyze Dataset Characteristics & Processing History:
48
+ - Carefully examine 'Dataset Description' and 'Dataset Overview' to identify data types, distributions, outliers, and quality issues.
49
+ - Review past tool executions from 'Previous Pre-processing tools executed' chronologically to build context.
50
+ - Critically evaluate the most recent analysis reports to understand:
51
+ * What issues were detected and resolved
52
+ * What issues remain unaddressed
53
+ * How the data distribution and characteristics have changed after previous transformations
54
+ * What new patterns or relationships have emerged from previous analyses
55
+ - Detect dataset-specific challenges that might require specialized handling.
56
+ ''',
57
+ '''
58
+ 3. Make Data-Driven Decisions on Next Steps:
59
+ - Always start with the Univariate Analysis before executing any other analysis or pre-processing / transformation tool.
60
+ - Assess whether the requested operation has dependencies and determine if those dependencies have been satisfied based on previous analysis reports.
61
+ - Select the most appropriate tool from 'Available Tools for Execution' considering:
62
+ * Current dataset state as revealed by the most recent analysis
63
+ * Required prerequisites for the user's requested task
64
+ * Potential impact on downstream analyses and transformations
65
+ - For complex tasks like feature engineering, ensure that sufficient data understanding has been established through appropriate exploratory analyses.
66
+ - Adapt the analytical pathway based on insights from previous steps—if an analysis reveals unexpected patterns, reprioritize subsequent steps accordingly.
67
+ - Identify when a specific analysis can be skipped because previous reports have already provided the necessary information.
68
+ ''',
69
+ '''
70
+ 4. Provide Comprehensive Justification Grounded in Analysis Reports:
71
+ - Reference specific findings from previous analysis reports to justify your current decision.
72
+ - Explain how the chosen step builds upon or addresses issues identified in previous analyses.
73
+ - When bypassing a seemingly logical step, cite specific evidence from previous reports showing why it's unnecessary.
74
+ - Connect your decision to both immediate needs and long-term business objectives.
75
+ - When recommending prerequisite steps, clearly articulate how they enable the user's requested task.
76
+ ''',
77
+ '''
78
+ 5. Output Format:
79
+ - Your response must contain:
80
+ 1. Step / Iteration number the entire loop is currently at
81
+ 2. Function Name to Execute: Choose from the available tools and return the exact name of the tool. Don't add function. to the tool name. Be very precise and under no circumstance will you return a tool name out of the list.
82
+ 3. Comprehensive Justification for Selection that references:
83
+ * Specific findings from previous analysis reports
84
+ * Current state of the dataset
85
+ * How this step advances toward the business objective and user's requested task
86
+ * Why alternative approaches were not selected
87
+ ''',
88
+ '''
89
+ 6. Critical Rules to Follow:
90
+ - Never proceed with feature engineering or advanced transformations without first verifying data quality through previous analysis reports.
91
+ - Continuously adapt your strategy based on new insights—previous analysis results should directly influence current decisions.
92
+ - Balance thoroughness with efficiency—skip steps only when analysis reports provide evidence they're unnecessary.
93
+ - Recognize dataset-specific characteristics that require specialized treatment rather than applying a generic approach.
94
+ - Be opportunistic in identifying when multiple objectives can be achieved with a single analytical step.
95
+ - When analysis reports reveal unexpected data characteristics, be prepared to recommend a different path than originally anticipated.
96
+ - Always validate that a user-requested step is appropriate given the current dataset state—recommend alternatives when necessary.
97
+ - Consider the computational cost of each step relative to its potential value—prioritize high-impact analyses.
98
+ ''',
99
+ '''
100
+ 7. Special Case: First Iteration Handling:
101
+ - If there is no history/logs of any executed tools, always start with univariate analysis.
102
+ - If the user has requested a very specific function, check if it requires any prerequisite steps.
103
+ - If prerequisites are needed, execute those first before running the requested function.
104
+ - If no specific function is requested, always execute univariate analysis as the first step.
105
+ ''',
106
+ '''
107
+ 8. **New Rule: Ensure Each Function is Only Executed Once:**
108
+ - If a function or type of analysis has already been executed, **do not execute it again** under any circumstance.
109
+ - Keep track of all previously executed functions using the history/logs.
110
+ - If a user requests a function that has already been run, return a justification explaining why it will not be repeated.
111
+ - Instead of re-running a function, suggest alternative actions based on current dataset needs and past analysis results.
112
+ '''
113
+ ]
114
+
115
+ analyzer_agent_desc = """
116
+ You are the Analysis Interpretation Agent.
117
+ You are an expert at extracting meaningful insights from analysis results, identifying issues, and determining logical next steps in a data processing workflow.
118
+ Your task is to interpret the results of previously executed functions, provide detailed insights, and suggest relevant next steps based on the dataset characteristics, business requirements, and detected issues.
119
+ Your output will serve as input for a decision-making agent that determines which function to execute next.
120
+ Additionally, you must determine whether the user's task has been successfully completed based on the business objective and user objective.
121
+ """
122
+
123
+ analyzer_agent_instructions = [
124
+ """
125
+ 1. Understand the Received Inputs:
126
+ - Analyze the 'Function Execution Details' (a short summary of results) to grasp key takeaways.
127
+ - Review 'Function Metadata' to understand what steps were taken to generate the results.
128
+ - Carefully examine the full structured output of the function execution to extract insights.
129
+ - Understand the 'Business Objective' and 'User Objective' to assess completeness.
130
+ """,
131
+ """
132
+ 2. Analyze & Interpret Function Results
133
+ - Examine the structure of the function output.
134
+ - Determine what insights are meaningful based on the type of analysis performed.
135
+ - Adaptively interpret key takeaways without assuming a fixed output schema.
136
+ """,
137
+ """
138
+ 3. Identify Issues & Challenges
139
+ - If the function output provides statistical information, look for:
140
+ * Patterns, anomalies, or inconsistencies.
141
+ - If it generates transformed data, check:
142
+ * Completeness, correctness, and adherence to expected formats.
143
+ - If it's a model training result, extract:
144
+ * Performance metrics, overfitting risks, and areas for improvement.
145
+ - For any other type of output, assess:
146
+ * How well the function achieved its intended goal.
147
+ """,
148
+ """
149
+ 4. Determine Logical Next Steps
150
+ - Based on the extracted insights, suggest what should logically follow:
151
+ * Further data cleaning or transformation?
152
+ * A different type of analysis?
153
+ - Ensure that recommendations align with the broader business objective.
154
+ """,
155
+ """
156
+ 5. Identify Unmet Business Requirements & Task Completion Check
157
+ - Check if there are remaining gaps in the analysis pipeline.
158
+ - Verify if the extracted insights align with the original 'Business Objective' and 'User Objective'.
159
+ - Determine if the previous function output sufficiently addresses the business goal.
160
+ - Identify any aspects of the dataset that haven’t been sufficiently analyzed yet.
161
+ - If certain transformations or feature engineering steps are necessary for the final objective, highlight them.
162
+ - Highlight any missing steps required to complete the workflow.
163
+ - **Task Completion Check:**
164
+ - If all required steps have been performed and the results meet the business objective and user goal, return `task_completed = True`.
165
+ - If there are still outstanding issues, required steps, or missing insights, return `task_completed = False`.
166
+ """,
167
+ """
168
+ 6. Output Format:
169
+ - Your response must contain a structured summary with the following components:
170
+ 1. key_insights: Detected patterns, key findings, anomalies, and detected trends must all be discussed here.
171
+ 2. potential_issues: Detected Issues, Any problems found in the data, Steps that have not yet been performed, Any unresolved problems with the dataset, **Unmet Business Requirements**.
172
+ 3. recommendations: Areas that still need analysis or processing.
173
+ 4. task_completed: Boolean (True/False) indicating if the user’s objective has been met.
174
+ """
175
+ ]
176
+
177
+ executer_agent_desc = '''
178
+ You are an intelligent ML workflow execution agent responsible for executing selected tools on the dataset and generating a detailed execution report. Your primary role is to apply the given tool to the dataset, monitor the process, and log the results.
179
+
180
+ At each iteration, you must:
181
+ 1. Receive execution parameters, including the tool name, expected changes, and dataset description.
182
+ 2. Execute the specified tool with appropriate parameters.
183
+ 3. Capture the execution outcome, including success status, modifications made, and any errors encountered.
184
+ 4. Generate a structured execution report.
185
+
186
+ Your goal is to ensure the successful execution of ML workflow steps, log any issues encountered, and maintain detailed execution records.
187
+ '''
188
+
189
+ executer_agent_instructions = [
190
+ '''1. Input Processing
191
+ Upon receiving an execution request, analyze the following inputs:
192
+
193
+ - Step Number: The iteration number in the pipeline.
194
+ - Tool Name: The tool to be executed.
195
+ - Expected Changes: Modifications expected after execution.
196
+ - Dataset Description: A textual overview of dataset characteristics.
197
+ - Dataset Overview: Key statistics, structure, feature types, and known issues.
198
+
199
+ Validate whether the provided tool is applicable based on the dataset state.
200
+ ''',
201
+ '''2. Tool Execution
202
+ Execute the specified tool using the appropriate method and parameters. Ensure:
203
+
204
+ - Correct application of the tool based on dataset properties.
205
+ - Efficient execution without unnecessary operations.
206
+ - Handling of potential issues such as missing data, outliers, or incompatible transformations.
207
+
208
+ If execution fails, capture detailed error logs for debugging.
209
+ ''',
210
+ '''3. Capture Execution Outcome
211
+ After execution, document the results:
212
+
213
+ - execution_successful: Whether the tool was executed without errors.
214
+ - execution_details: Summary of operations performed by the tool.
215
+ In this execution_details analyze the python code and summarize whats happening in the code. Don't be vague be very precise. For eg. Detected 12 outliers and removed them etc
216
+ - error_logs: If execution failed, log relevant error messages.
217
+
218
+ Ensure all outputs are structured and informative.
219
+ ''',
220
+ '''4. Generate Execution Report
221
+ Construct a structured execution report with:
222
+
223
+ - step_number: Iteration number.
224
+ - executed_tool: Name of the executed tool.
225
+ - execution_successful: Boolean flag indicating success or failure.
226
+ - execution_details: Summary of what was done.
227
+ - error_logs: Error messages, if any.
228
+
229
+ This report must be clear, precise, and provide actionable insights for further processing.
230
+ ''',
231
+ '''5. You must also output the path of file stored after executing the function. You must output the exact path where the file is stored after executing the function
232
+ Map this to field: 'output_file_path'
233
+
234
+ If there is no output path provided explicitly then return None
235
+
236
+ '''
237
+ ]
238
+
239
+ judging_agent_desc = '''
240
+ You are an intelligent agent that analyzes preprocessing function execution history,
241
+ provides detailed context summaries, and determines when to stop machine learning loops
242
+ based on goal achievement and tool exhaustion analysis.
243
+
244
+ This agent serves two primary functions:
245
+ 1. Synthesize detailed contextual summaries of executed preprocessing functions
246
+ 2. Make informed decisions about stopping ML loops with clear justifications
247
+ '''
248
+
249
+ judging_agent_instructions = [
250
+ '''1. PREPROCESSING FUNCTION ANALYSIS
251
+
252
+ The agent will receive list of previously executed preprocessing functions including:
253
+ - Function names
254
+ - Function logic descriptions
255
+ - Function execution results
256
+
257
+ For each function, the agent must:
258
+ a) Understand the purpose and logic of the function
259
+ b) Analyze what the function accomplished based on its results
260
+ c) Track data transformations and their significance
261
+ d) Identify potential issues or limitations in the preprocessing steps
262
+ e) Recognize how each function contributes to the overall ML pipeline
263
+
264
+ The agent must then generate a comprehensive paragraph that:
265
+ - Chronologically describes the preprocessing journey
266
+ - Highlights key transformations and their significance
267
+ - Identifies data quality improvements
268
+ - Notes important statistical properties revealed
269
+ - Explains how each preprocessing step prepares data for modeling
270
+ - Uses technical but accessible language
271
+ - Provides quantitative details where relevant
272
+
273
+ 2. ML LOOP TERMINATION DECISION
274
+ The agent will receive:
275
+ - The user's stated ML goal/task
276
+ - A list of available tools/functions in the system (provided explicitly)
277
+ - The history of previously executed tools/functions
278
+
279
+ The agent must decide whether to stop the ML loop by analyzing only the provided list and history, without speculating about unlisted tools:
280
+ a) Goal Achievement Analysis:
281
+ - Compare current state to user's goal
282
+ - Check if performance metrics meet or exceed success criteria
283
+ - Confirm if the primary objective has been satisfied
284
+ - Determine if further iterations yield meaningful improvements
285
+
286
+ b) Tool Exhaustion Analysis:
287
+ - Identify which available tools from the provided list have been executed
288
+ - Determine which remaining tools (if any) from the provided list are still unused
289
+ - If no unused tools remain, the loop must terminate
290
+
291
+ c) Diminishing Returns Analysis:
292
+ - Review recent iteration metrics for signs of plateau or minimal gains
293
+ - Evaluate cost-effectiveness of additional iterations
294
+
295
+ Decision Rules (based strictly on the provided tools list):
296
+ - STOP the loop (stop_loop = True) if ANY of the following conditions are met:
297
+ 1. The user's goal has been achieved with satisfactory results
298
+ 2. All tools in the provided list have been executed
299
+ 3. Remaining provided tools (if any) cannot meaningfully improve results toward the goal
300
+ 4. Recent iterations show diminishing returns below a meaningful threshold
301
+ - CONTINUE the loop (stop_loop = False) if ALL of the following conditions are met:
302
+ 1. The user's goal is not yet achieved
303
+ 2. There are unused tools in the provided list
304
+ 3. There is clear evidence that applying an unused tool could improve results
305
+
306
+ Additional Constraints:
307
+ - Each available tool may be executed at most once in the ML loop
308
+ - The agent must not refer to hypothetical or non-existent tools
309
+ - Decisions must be based exclusively on the provided list of tools
310
+
311
+ Justification Requirements:
312
+ For any decision, provide a detailed justification that:
313
+ - References specific evidence from the execution history
314
+ - Cites relevant performance metrics and their values
315
+ - Explains the reasoning process step-by-step
316
+ - Acknowledges any limitations or uncertainties
317
+ - Connects the decision directly to the user's stated goal
318
+ - If continuing, recommends which unused tool to apply next and why
319
+
320
+ 3. OUTPUT FORMATTING
321
+ The agent must return a JSON object matching JudgeAgentResponseSchema with the following keys:
322
+
323
+ a) detailed_context: A comprehensive paragraph describing all executed preprocessing steps
324
+ and their results, explaining the ML pipeline's current state.
325
+
326
+ b) stop_loop: A boolean value (True or False) indicating whether the ML loop should stop.
327
+
328
+ c) justification: A detailed explanation of the decision to stop or continue the loop,
329
+ based on goal achievement, tool exhaustion, and diminishing returns analyses.
330
+
331
+ NOTE: If no additional tools from the provided list remain that can further the user's goal, the agent must set stop_loop to True.
332
+ '''
333
+ ]
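For reference, a minimal sketch of the JudgeAgentResponseSchema these instructions target, assuming the actual model in agents/models.py carries exactly the three keys listed above:

from pydantic import BaseModel

class JudgeAgentResponseSchema(BaseModel):
    # Sketch only -- the actual definition lives in src/app/pipelines/eda/agents/models.py
    detailed_context: str  # narrative summary of every executed preprocessing step
    stop_loop: bool        # True when the ML loop should terminate
    justification: str     # evidence-based reasoning for stopping or continuing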
src/app/pipelines/eda/helper.py ADDED
@@ -0,0 +1,89 @@
1
+ import pandas as pd
2
+ from typing import Union
3
+ from .model import DatasetSummary, FeatureStatistics, Feature
4
+ from src.core.utils import logger
5
+
6
+ def get_summary(dataframe: pd.DataFrame)->DatasetSummary:
7
+ """
8
+ Generates a summary of the dataset, including feature details such as type, missing values,
9
+ and basic statistics for numerical columns.
10
+
11
+ Args:
12
+ dataframe (pd.DataFrame): The input dataset as a Pandas DataFrame.
13
+
14
+ Returns:
15
+ DatasetSummary: A structured summary of the dataset.
16
+ """
17
+
18
+ feature_list = []
19
+
20
+ for col in dataframe.columns:
21
+ column_name = col
22
+ dtype = dataframe[col].dtype
23
+
24
+ if pd.api.types.is_numeric_dtype(dataframe[col]):
25
+ category = "Numerical"
26
+ statistics = FeatureStatistics(
27
+ mean = dataframe[col].mean(),
28
+ min_value = dataframe[col].min(),
29
+ max_value = dataframe[col].max(),
30
+ standard_deviation_value = dataframe[col].std()
31
+ )
32
+ else:
33
+ category = "Non-Numerical"
34
+ statistics = None
35
+
36
+ null_count = dataframe[col].isnull().sum()
37
+ unique_count = dataframe[col].nunique()
38
+
39
+ feature = Feature(
40
+ feature_name = column_name,
41
+ feature_description = None,
42
+ feature_category = category,
43
+ feature_data_type = str(dtype),
44
+ missing_values = null_count,
45
+ unique_values = unique_count,
46
+ statistics = statistics
47
+ )
48
+
49
+ feature_list.append(feature)
50
+
51
+ return DatasetSummary(
52
+ num_rows = dataframe.shape[0],
53
+ num_features = dataframe.shape[1],
54
+ features = feature_list,
55
+ target_features = None
56
+ )
57
+
58
+ def sample_dataset(dataset: pd.DataFrame):
59
+ '''Sample 10 random values from the dataset for prompts'''
60
+ try:
61
+ sampled_data = dataset.sample(n=10, random_state=42).to_string(index=False)
62
+ return sampled_data
63
+ except ValueError:
64
+ logger.error("Failed to sample from the dataset", log_type="eda", console=True)
65
+ return None
66
+
67
+ def get_feature_summary(data: DatasetSummary):
68
+ '''Prepares a string which includes all features with their respective details'''
69
+ feature_details = []
70
+
71
+ for index, feature in enumerate(data.features):
72
+ stats = (
73
+ f"Mean: {feature.statistics.mean}, Min: {feature.statistics.min_value}, "
74
+ f"Max: {feature.statistics.max_value}, Std Dev: {feature.statistics.standard_deviation_value}"
75
+ if feature.statistics else "No statistics available"
76
+ )
77
+
78
+ feature_details.append(f"""
79
+ {index + 1}. '{feature.feature_name}'
80
+ - Description: {feature.feature_description}
81
+ - Category: {feature.feature_category}
82
+ - Data type: {feature.feature_data_type}
83
+ - Unique Values: {feature.unique_values}
84
+ - Missing Values: {feature.missing_values}
85
+ - Statistics: {stats}
86
+ """)
87
+
88
+ feature_summary = "\n".join(feature_details)
89
+ return feature_summary
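A quick usage sketch of these helpers (the CSV path below is hypothetical):

import pandas as pd
# from src.app.pipelines.eda.helper import get_summary, get_feature_summary, sample_dataset

df = pd.read_csv("data/example.csv")     # hypothetical path
summary = get_summary(df)                # structured DatasetSummary
print(get_feature_summary(summary))      # per-feature text block for prompt building
print(sample_dataset(df))                # 10 random rows rendered as a string (None on failure)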
src/app/pipelines/eda/model.py ADDED
@@ -0,0 +1,78 @@
1
+ from pydantic import Field, BaseModel
2
+ from typing import Optional, List
3
+
4
+ class FeatureStatistics(BaseModel):
5
+ """
6
+ Provides the statistics of numerical features (columns) in the dataset.
7
+ """
8
+ mean: float = Field(
9
+ default=None,
10
+ description = "The mean value of this feature"
11
+ )
12
+ min_value: float = Field(
13
+ default=None,
14
+ description = "The minimum value of this feature"
15
+ )
16
+ max_value: float = Field(
17
+ default=None,
18
+ description = "The maximum value of this feature"
19
+ )
20
+ standard_deviation_value: float = Field(
21
+ default=None,
22
+ description = "The standard deviation of this feature"
23
+ )
24
+
25
+ class Feature(BaseModel):
26
+ """
27
+ Represents a feature (column) in the dataset.
28
+ """
29
+ feature_name: str = Field(
30
+ default=None,
31
+ description = "The name of the feature/column"
32
+ )
33
+ feature_description: Optional[str] = Field(
34
+ default=None,
35
+ description = "A short description of the feature/column"
36
+ )
37
+ feature_category: str = Field(
38
+ default=None,
39
+ description= "Whether a feature is numerical or textual"
40
+ )
41
+ feature_data_type: str = Field(
42
+ default=None,
43
+ description= "The data type of the feature/column"
44
+ )
45
+ unique_values: int = Field(
46
+ default=None,
47
+ description= "The number of unique values in this particular column"
48
+ )
49
+ missing_values: int = Field(
50
+ default=None,
51
+ description= "The number of missing values in this particular column"
52
+ )
53
+ statistics: Optional[FeatureStatistics] = Field(
54
+ default=None,
55
+ description= "Provides statistics for numerical feature"
56
+ )
57
+
58
+ class DatasetSummary(BaseModel):
59
+ """
60
+ Provides a summary of the dataset, including its structure and target features.
61
+ """
62
+ num_rows: int = Field(
63
+ default=None,
64
+ description = "The number of rows in the dataset"
65
+ )
66
+ num_features: int = Field(
67
+ default=None,
68
+ description = "The number of features in the dataset"
69
+ )
70
+ features: List[Feature] = Field(
71
+ default=None,
72
+ description = "A list of features"
73
+ )
74
+ target_features: Optional[List[str]] = Field(
75
+ default=None,
76
+ description = "A list of target features relevant to the business task"
77
+ )
78
+
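How the three models nest, as a small hand-built example (values are illustrative only):

example = DatasetSummary(
    num_rows=100,
    num_features=1,
    features=[
        Feature(
            feature_name="age",
            feature_category="Numerical",
            feature_data_type="int64",
            unique_values=42,
            missing_values=0,
            statistics=FeatureStatistics(
                mean=35.2, min_value=18.0, max_value=79.0, standard_deviation_value=11.4
            ),
        )
    ],
)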
src/app/pipelines/eda/pipeline.py ADDED
@@ -0,0 +1,256 @@
1
+ from dotenv import load_dotenv
2
+ from src.app.schemas.requests.eda import EdaRequestSchema
3
+ from .helper import get_summary, get_feature_summary, sample_dataset
4
+ from .agents.models import IntelAgentResponseSchema, ExecuterAgentResponseSchema
5
+ from src.app.schemas.responses.eda import IterationDetails, IterationLogs
6
+ from .agents.prompts import executer_agent_desc, executer_agent_instructions
7
+ from .agents.agents import AgentClass
8
+ from .tools.lib import tool_library
9
+ from src.core.utils import logger
10
+ from agno.agent import RunResponse # type: ignore
11
+ from typing import Union
12
+ import os
13
+ import pandas as pd
14
+
15
+ load_dotenv()
16
+
17
+ class EdaLoop:
18
+ def __init__(self, payload: EdaRequestSchema, verbose=False):
19
+ self.ag = AgentClass()
20
+ self.payload = payload
21
+ self.dataset_overview = self._extract_details(verbose=verbose)
22
+
23
+ def _extract_details(self, verbose=False)->Union[IntelAgentResponseSchema, None]:
24
+ '''Prepares an initial dataset summary with feature descriptions'''
25
+ try:
26
+ logger.info("Preparing dataset summary....", log_type="eda", console=verbose)
27
+
28
+ dataset = pd.read_csv(self.payload.dataset_path)
29
+ dataset_summary = get_summary(dataframe=dataset)
30
+
31
+ feature_summary = get_feature_summary(data=dataset_summary)
32
+ sampled_data = sample_dataset(dataset=dataset)
33
+
34
+ prompt = (
35
+ f'User prompt: {self.payload.user_prompt}\n'
36
+ f'Business Understanding: {self.payload.requirement_analysis.business_understanding}\n'
37
+                f'ML task identified: {self.payload.technical_research.model_response.task}\n'
38
+ f'Dataset format: {self.payload.requirement_analysis.model_response.data_format}\n'
39
+ f'Features:\n {feature_summary}\n'
40
+ f'10 Sample Values from dataset:\n {sampled_data}\n'
41
+ )
42
+
43
+ intel_agent_response: RunResponse = self.ag.agents['intel_agent'].run(prompt, stream=False)
44
+ return intel_agent_response.content
45
+
46
+ except Exception as e:
47
+ logger.error(f"Dataset summary preparation failed with error: {e}", log_type="eda", console=verbose)
48
+ return None
49
+
50
+ def _build_orchestrator_prompt(self, execution_logs=None)->str:
51
+ available_tools_str = ""
52
+        for idx, tool in enumerate(tool_library, start=1):
54
+            tool_data = f"{idx}. Tool name: {tool}\n Tool details: {tool_library[tool]['metadata']}\n"
55
+            available_tools_str += tool_data
56
+
57
+ execution_history = ""
58
+ if execution_logs:
59
+ execution_history = '\n'.join(execution_logs)
60
+
61
+ prompt = (
62
+ f'User Prompt: {self.payload.user_prompt}\n'
63
+ f'Task: {self.dataset_overview.ml_task}\n'
64
+ f'Business Understanding: {self.payload.requirement_analysis.business_understanding}\n'
65
+ f'Requirements: {self.payload.requirement_analysis.model_response.technical_requirements}\n'
66
+ f'Constraints: {self.payload.requirement_analysis.model_response.constraints}\n\n'
67
+ # f'Previous Pre-Processing Steps Done: \n{execution_history}\n\n'
68
+ f'Detailed Execution Summary: \n{self.execution_summary}\n\n'
69
+ f'Tool Library: \n{available_tools_str}\n'
70
+ )
71
+
72
+ return prompt
73
+
74
+ def _build_judging_prompt(self, execution_logs)->str:
75
+ execution_history = '\n'.join(execution_logs)
76
+
77
+ available_tools_str = ""
78
+        for idx, tool in enumerate(tool_library, start=1):
80
+            tool_data = f"{idx}. Tool name: {tool}\n"
81
+            available_tools_str += tool_data
82
+
83
+ prompt = (
84
+ f'User Task:\n {self.payload.user_prompt}\n'
85
+            f'Business Understanding / Goals:\n {self.payload.requirement_analysis.business_understanding}\n'
86
+            f'Execution History:\n {execution_history}\n'
87
+ f'Available Tools:\n {available_tools_str}\n'
88
+ )
89
+
90
+ return prompt
91
+
92
+ def _build_analyzer_prompt(self, llm_response, tool_executed)->str:
93
+ prompt = (
94
+ f"Name of the function executed: {tool_executed['name']}\n"
95
+ f"Function Details: {tool_executed['metadata']}"
96
+ f'Function execution details:\n{llm_response.content.execution_details}\n'
97
+ f'Function results:\n{llm_response.messages[3].content}'
98
+ )
99
+ return prompt
100
+
101
+ def loop(self, verbose=False)->IterationLogs:
102
+
103
+ logger.info(f"Starting EDA loop with available tools: {tool_library.keys()}", log_type="eda", console=verbose)
104
+
105
+ execution_logs = []
106
+ iteration_logs = []
107
+ self.execution_summary = None
108
+ iteration_count = 1
109
+
110
+ recent_data_stored_path = self.payload.dataset_path
111
+
112
+ while(True):
113
+
114
+ logger.info(f"Running Iteration {iteration_count}", log_type="eda", console=verbose)
115
+
116
+ '''==================== ORCHESTRATOR SEGMENT ===================='''
117
+
118
+ orchestrator_prompt = self._build_orchestrator_prompt(execution_logs=execution_logs)
119
+
120
+ try:
121
+ orchestrator_agent_response: RunResponse = self.ag.agents['orchestrator_agent'].run(orchestrator_prompt)
122
+ except Exception as e:
123
+ logger.error(f"Failed to generate response from orchestator with error: {e}", log_type='eda', console=verbose)
124
+ continue
125
+
126
+ if isinstance(orchestrator_agent_response.content, str):
127
+ logger.error("Failed to fit orchestrator response to data model", log_type='eda', console=verbose)
128
+ continue
129
+
130
+ try:
131
+ selected_tool_for_executioner = tool_library[orchestrator_agent_response.content.tool_name]
132
+ except Exception as e:
133
+ logger.error(f"Tool allocation failed with error: {e}", log_type='eda', console=verbose)
134
+ continue
135
+
136
+ logger.info(f"Executing tool: {orchestrator_agent_response.content.tool_name}. Justification: {orchestrator_agent_response.content.justification}", log_type="eda", console=verbose)
137
+
138
+ '''==================== TOOL EXECUTION SEGMENT ===================='''
139
+
140
+ target_features = ""
141
+ if self.dataset_overview.dataset_summary.target_features:
142
+ target_features = '\n'.join(self.dataset_overview.dataset_summary.target_features)
143
+
144
+ while(True):
145
+ self.ag.build_agent(
146
+ agent_name="executer_agent",
147
+ agent_desc=executer_agent_desc,
148
+ agent_instructions=executer_agent_instructions,
149
+ agent_response_model=ExecuterAgentResponseSchema,
150
+ tools=[selected_tool_for_executioner['function']]
151
+ )
152
+
153
+                prompt = f"Read the data from path: '{recent_data_stored_path}'. Use this file path to execute the '{selected_tool_for_executioner}' tool. Target Features/Columns: {target_features}"
160
+
161
+ try:
162
+ executor_agent_response: RunResponse = self.ag.agents['executer_agent'].run(prompt, stream=False)
163
+ # logger.info(f'{executor_agent_response.messages[3].content}', log_type='eda', console=verbose)
164
+ _ = executor_agent_response.messages[3].content
165
+
166
+                    if executor_agent_response.content.output_file_path is not None:
168
+                        recent_data_stored_path = executor_agent_response.content.output_file_path
169
+
170
+                    logger.info("Tool executed successfully", log_type='eda', console=verbose)
171
+ break
172
+ except Exception as e:
173
+ logger.error(f"Tool execution failed with error: {e}. Trying Again....", log_type='eda', console=verbose)
174
+
175
+ '''==================== ANALYZER SEGMENT ===================='''
176
+
177
+ while(True):
178
+ try:
179
+ logger.info("Generating analysis....", log_type='eda', console=verbose)
180
+
181
+ analyzer_prompt = self._build_analyzer_prompt(llm_response=executor_agent_response, tool_executed=selected_tool_for_executioner)
182
+ analyzer_agent_response: RunResponse = self.ag.agents['analyzer_agent'].run(analyzer_prompt, stream=False)
183
+
184
+ if not isinstance(analyzer_agent_response.content, str):
185
+ break
186
+
187
+ except Exception as e:
188
+ logger.error(f"Failed to generate response from analyzer with error: {e}. Trying again.....", log_type='eda', console=verbose)
189
+
190
+ execution_details = (
191
+ f'Iteration / Step Number: {iteration_count}\n'
192
+ # f'{analyzer_prompt}\n'
193
+ f"Name of the function executed: {selected_tool_for_executioner['name']}\n"
194
+ f"Function Details: {selected_tool_for_executioner['metadata']}"
195
+ f'Function execution details:\n{executor_agent_response.content.execution_details}\n'
196
+ f'Key Details: {analyzer_agent_response.content.key_insights}\n'
197
+ # f'Potential Issues: {analyzer_agent_response.content.potential_issues}\n'
198
+ # f'Recommendations: {analyzer_agent_response.content.recommendations}\n'
199
+ )
200
+
201
+ # logger.info(f"Execution Details: \n{execution_details}", log_type='eda', console=verbose)
202
+ execution_logs.append(execution_details)
203
+
204
+ '''==================== JUDGING SEGMENT ===================='''
205
+
206
+ while(True):
207
+ try:
208
+ logger.info("Evaluating Iteration with a Judge....", log_type='eda', console=verbose)
209
+
210
+                    judging_prompt = self._build_judging_prompt(execution_logs=execution_logs)
211
+                    judging_agent_response: RunResponse = self.ag.agents['judging_agent'].run(judging_prompt, stream=False)
212
+
213
+ if not isinstance(judging_agent_response.content, str):
214
+ break
215
+
216
+ except Exception as e:
217
+ logger.error(f"Failed to generate response from judge with error: {e}. Trying again.....", log_type='eda', console=verbose)
218
+
219
+
220
+ '''==================== STORING ITERATION DETAILS SEGMENT ===================='''
221
+
222
+ iteration_logs.append(IterationDetails(
223
+ iteration_number = iteration_count,
224
+ orchestrator_response = orchestrator_agent_response.content,
225
+ executer_response = executor_agent_response.content,
226
+ analyzer_response = analyzer_agent_response.content,
227
+ judge_response = judging_agent_response.content
228
+ ))
229
+
230
+ iteration_count+=1
231
+
232
+ if judging_agent_response.content.stop_loop:
233
+ logger.info(f"Stop Loop = {judging_agent_response.content.stop_loop}. Justification: {judging_agent_response.content.justification}", log_type='eda', console=verbose)
234
+ break
235
+ else:
236
+ self.execution_summary = judging_agent_response.content.detailed_context
237
+ logger.info(f"Stop Loop = {judging_agent_response.content.stop_loop}. Justification: {judging_agent_response.content.justification}", log_type='eda', console=verbose)
238
+
239
+ return IterationLogs(logs=iteration_logs)
240
+
241
+
242
+
243
+
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
src/app/pipelines/eda/tools/analysis_tools/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .univariate_analysis import univariate_analysis
2
+ from .bivariate_analysis import bivariate_analysis
3
+ from .multivariate_analysis import multivariate_analysis
src/app/pipelines/eda/tools/analysis_tools/bivariate_analysis.py ADDED
@@ -0,0 +1,1028 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from scipy import stats
4
+ from sklearn.metrics import mutual_info_score
5
+ from statsmodels.stats.multicomp import pairwise_tukeyhsd
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from scipy.stats import pointbiserialr
9
+ from agno.utils.log import logger
10
+ import json
11
+
12
+ def comprehensive_bivariate_analysis(df, visualize=False, output_dir="bivariate_plots"):
13
+ """
14
+ Performs comprehensive bivariate analysis on all pairs of variables in the dataframe.
15
+
16
+ Parameters:
17
+ -----------
18
+ df : pandas.DataFrame
19
+ The input dataframe to analyze
20
+        visualize : bool, default=False
21
+ Whether to generate visualizations
22
+ output_dir : str, default="bivariate_plots"
23
+ Directory to save visualization plots if visualize=True
24
+
25
+ Returns:
26
+ --------
27
+ dict
28
+ A dictionary containing the results of all bivariate analyses and a summary
29
+ """
30
+ results = {
31
+ 'numerical_vs_numerical': {},
32
+ 'categorical_vs_categorical': {},
33
+ 'numerical_vs_categorical': {},
34
+ 'summary': {
35
+ 'strongest_correlations': [],
36
+ 'significant_category_associations': [],
37
+ 'significant_group_differences': [],
38
+ 'key_insights': []
39
+ }
40
+ }
41
+
42
+ # Identify variable types
43
+ numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
44
+ categorical_features = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
45
+
46
+ # Convert any low-cardinality numerical features to categorical
47
+ for col in numerical_features.copy():
48
+ if df[col].nunique() <= 10: # Threshold for low cardinality
49
+ categorical_features.append(col)
50
+ numerical_features.remove(col)
51
+
52
+ # 1. Numerical vs Numerical Analysis
53
+ if len(numerical_features) >= 2:
54
+ print("Starting Numerical-numerical analysis")
55
+ results['numerical_vs_numerical'] = analyze_numerical_numerical(df, numerical_features, visualize, output_dir)
56
+
57
+ # 2. Categorical vs Categorical Analysis
58
+ if len(categorical_features) >= 2:
59
+ print("Starting categorical-categorical analysis")
60
+ results['categorical_vs_categorical'] = analyze_categorical_categorical(df, categorical_features, visualize, output_dir)
61
+
62
+ # 3. Numerical vs Categorical Analysis
63
+ if len(numerical_features) >= 1 and len(categorical_features) >= 1:
64
+ print("Starting numerical-categorical analysis")
65
+ results['numerical_vs_categorical'] = analyze_numerical_categorical(df, numerical_features, categorical_features, visualize=visualize, output_dir=output_dir)
66
+
67
+ # Generate summary of findings
68
+ print("Generating summaries")
69
+ results['summary'] = generate_summary(results)
70
+
71
+ return results
72
+
73
+ def analyze_numerical_numerical(df, numerical_features, visualize=False, output_dir="bivariate_plots"):
74
+ """Analyze relationships between numerical features"""
75
+ results = {
76
+ 'correlations': {
77
+ 'pearson': {},
78
+ 'spearman': {},
79
+ 'kendall': {}
80
+ },
81
+ 'covariance': {},
82
+ 'best_fit_relationships': {},
83
+ 'significant_correlations': []
84
+ }
85
+
86
+ # Create output directory if it doesn't exist and visualization is enabled
87
+ if visualize:
88
+ import os
89
+ os.makedirs(output_dir, exist_ok=True)
90
+
91
+ # Calculate correlation matrices
92
+ if len(numerical_features) > 1:
93
+ # Pearson correlation (linear)
94
+ pearson_corr = df[numerical_features].corr(method='pearson')
95
+ # Spearman correlation (monotonic)
96
+ spearman_corr = df[numerical_features].corr(method='spearman')
97
+ # Kendall correlation (ordinal)
98
+ kendall_corr = df[numerical_features].corr(method='kendall')
99
+
100
+        # Initialize the Pearson p-value matrix (filled pairwise in the loop below)
101
+ pearson_p_values = pd.DataFrame(np.zeros_like(pearson_corr),
102
+ index=pearson_corr.index,
103
+ columns=pearson_corr.columns)
104
+
105
+        # Initialize the Spearman p-value matrix (filled pairwise in the loop below)
106
+ spearman_p_values = pd.DataFrame(np.zeros_like(spearman_corr),
107
+ index=spearman_corr.index,
108
+ columns=spearman_corr.columns)
109
+
110
+ # Visualize correlation heatmaps if enabled
111
+ if visualize:
112
+ plt.figure(figsize=(12, 10))
113
+ sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
114
+ plt.title('Pearson Correlation Heatmap')
115
+ plt.tight_layout()
116
+ plt.savefig(f"{output_dir}/pearson_correlation_heatmap.png")
117
+ plt.close()
118
+
119
+ plt.figure(figsize=(12, 10))
120
+ sns.heatmap(spearman_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
121
+ plt.title('Spearman Correlation Heatmap')
122
+ plt.tight_layout()
123
+ plt.savefig(f"{output_dir}/spearman_correlation_heatmap.png")
124
+ plt.close()
125
+
126
+ for i in range(len(numerical_features)):
127
+ for j in range(i+1, len(numerical_features)):
128
+ x = df[numerical_features[i]].values
129
+ y = df[numerical_features[j]].values
130
+
131
+ # Remove NaN values
132
+ mask = ~(np.isnan(x) | np.isnan(y))
133
+ x_clean = x[mask]
134
+ y_clean = y[mask]
135
+
136
+ if len(x_clean) > 2: # Need at least 3 points for correlation
137
+ # Calculate Pearson correlation and p-value
138
+ r_pearson, p_pearson = stats.pearsonr(x_clean, y_clean)
139
+ pearson_p_values.loc[numerical_features[i], numerical_features[j]] = p_pearson
140
+ pearson_p_values.loc[numerical_features[j], numerical_features[i]] = p_pearson
141
+
142
+ # Calculate Spearman correlation and p-value
143
+ r_spearman, p_spearman = stats.spearmanr(x_clean, y_clean)
144
+ spearman_p_values.loc[numerical_features[i], numerical_features[j]] = p_spearman
145
+ spearman_p_values.loc[numerical_features[j], numerical_features[i]] = p_spearman
146
+
147
+ # Analyze best fit relationship (linear, polynomial, logarithmic, exponential)
148
+ if len(x_clean) >= 10: # Only try to fit models if we have enough data
149
+ best_fit_info = analyze_best_fit_relationship(x_clean, y_clean)
150
+ results['best_fit_relationships'][f"{numerical_features[i]}_vs_{numerical_features[j]}"] = best_fit_info
151
+
152
+ # Visualize scatterplot and regression line if enabled
153
+ if visualize and abs(r_pearson) >= 0.3: # Only plot more significant relationships
154
+ plt.figure(figsize=(10, 6))
155
+ sns.regplot(x=x_clean, y=y_clean, line_kws={"color":"red"})
156
+ plt.xlabel(numerical_features[i])
157
+ plt.ylabel(numerical_features[j])
158
+ plt.title(f'{numerical_features[i]} vs {numerical_features[j]} (r={r_pearson:.3f}, p={p_pearson:.4f})')
159
+ plt.savefig(f"{output_dir}/{numerical_features[i]}_vs_{numerical_features[j]}_scatter.png")
160
+ plt.close()
161
+
162
+ # Store results
163
+ results['correlations']['pearson'] = {
164
+ 'correlation_matrix': pearson_corr.to_dict(),
165
+ 'p_values': pearson_p_values.to_dict()
166
+ }
167
+
168
+ results['correlations']['spearman'] = {
169
+ 'correlation_matrix': spearman_corr.to_dict(),
170
+ 'p_values': spearman_p_values.to_dict()
171
+ }
172
+
173
+ results['correlations']['kendall'] = {
174
+ 'correlation_matrix': kendall_corr.to_dict()
175
+ }
176
+
177
+ # Calculate covariance matrix
178
+ covariance_matrix = df[numerical_features].cov()
179
+ results['covariance'] = covariance_matrix.to_dict()
180
+
181
+ # Find pairs with significant correlation
182
+ significant_pairs = []
183
+ for i in range(len(numerical_features)):
184
+ for j in range(i+1, len(numerical_features)):
185
+ feat_i = numerical_features[i]
186
+ feat_j = numerical_features[j]
187
+
188
+ if abs(pearson_corr.loc[feat_i, feat_j]) >= 0.3: # Lower threshold to capture more relationships
189
+ p_value = pearson_p_values.loc[feat_i, feat_j]
190
+ correlation_type = "positive" if pearson_corr.loc[feat_i, feat_j] > 0 else "negative"
191
+ correlation_strength = ""
192
+
193
+ if abs(pearson_corr.loc[feat_i, feat_j]) >= 0.8:
194
+ correlation_strength = "very strong"
195
+ elif abs(pearson_corr.loc[feat_i, feat_j]) >= 0.6:
196
+ correlation_strength = "strong"
197
+ elif abs(pearson_corr.loc[feat_i, feat_j]) >= 0.4:
198
+ correlation_strength = "moderate"
199
+ else:
200
+ correlation_strength = "weak"
201
+
202
+ # Compare with Spearman to detect non-linear relationships
203
+ pearson_spearman_diff = abs(pearson_corr.loc[feat_i, feat_j] - spearman_corr.loc[feat_i, feat_j])
204
+ relationship_type = "linear"
205
+ if pearson_spearman_diff > 0.1:
206
+ relationship_type = "potentially non-linear"
207
+
208
+ significant_pairs.append({
209
+ 'feature_1': feat_i,
210
+ 'feature_2': feat_j,
211
+ 'pearson_correlation': pearson_corr.loc[feat_i, feat_j],
212
+ 'spearman_correlation': spearman_corr.loc[feat_i, feat_j],
213
+ 'p_value': p_value,
214
+ 'significant': p_value < 0.05,
215
+ 'correlation_type': correlation_type,
216
+ 'correlation_strength': correlation_strength,
217
+ 'relationship_type': relationship_type,
218
+ 'pearson_spearman_diff': pearson_spearman_diff
219
+ })
220
+
221
+ # Sort by absolute correlation value
222
+ significant_pairs.sort(key=lambda x: abs(x['pearson_correlation']), reverse=True)
223
+ results['significant_correlations'] = significant_pairs
224
+
225
+ return results
226
+
227
+ def analyze_best_fit_relationship(x, y):
228
+ """Analyze which type of relationship (linear, polynomial, log, exponential) best fits the data"""
229
+ results = {}
230
+
231
+ # Check for non-positive values that would cause issues with log/exponential fits
232
+ x_min, y_min = np.min(x), np.min(y)
233
+
234
+ # Linear fit
235
+ try:
236
+ slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
237
+ results['linear'] = {
238
+ 'equation': f'y = {slope:.4f}x + {intercept:.4f}',
239
+ 'r_squared': r_value**2,
240
+ 'p_value': p_value
241
+ }
242
+ except:
243
+ results['linear'] = {'error': 'Failed to fit linear model'}
244
+
245
+ # Polynomial fit (degree 2)
246
+ try:
247
+ coeffs = np.polyfit(x, y, 2)
248
+ p = np.poly1d(coeffs)
249
+ # Calculate R-squared
250
+ yhat = p(x)
251
+ ybar = np.mean(y)
252
+ ssreg = np.sum((yhat - ybar)**2)
253
+ sstot = np.sum((y - ybar)**2)
254
+ r_squared = ssreg / sstot
255
+
256
+ results['polynomial'] = {
257
+ 'equation': f'y = {coeffs[0]:.4f}x² + {coeffs[1]:.4f}x + {coeffs[2]:.4f}',
258
+ 'r_squared': r_squared
259
+ }
260
+ except:
261
+ results['polynomial'] = {'error': 'Failed to fit polynomial model'}
262
+
263
+ # Logarithmic fit (if x > 0)
264
+ if x_min > 0:
265
+ try:
266
+ coeffs = np.polyfit(np.log(x), y, 1)
267
+ # Calculate R-squared
268
+ yhat = coeffs[0] * np.log(x) + coeffs[1]
269
+ ybar = np.mean(y)
270
+ ssreg = np.sum((yhat - ybar)**2)
271
+ sstot = np.sum((y - ybar)**2)
272
+ r_squared = ssreg / sstot
273
+
274
+ results['logarithmic'] = {
275
+ 'equation': f'y = {coeffs[0]:.4f}ln(x) + {coeffs[1]:.4f}',
276
+ 'r_squared': r_squared
277
+ }
278
+ except:
279
+ results['logarithmic'] = {'error': 'Failed to fit logarithmic model'}
280
+
281
+ # Exponential fit (if y > 0)
282
+ if y_min > 0:
283
+ try:
284
+ coeffs = np.polyfit(x, np.log(y), 1)
285
+ # Calculate R-squared
286
+ yhat = np.exp(coeffs[1]) * np.exp(coeffs[0] * x)
287
+ ybar = np.mean(y)
288
+ ssreg = np.sum((yhat - ybar)**2)
289
+ sstot = np.sum((y - ybar)**2)
290
+ r_squared = ssreg / sstot
291
+
292
+ results['exponential'] = {
293
+ 'equation': f'y = {np.exp(coeffs[1]):.4f}e^({coeffs[0]:.4f}x)',
294
+ 'r_squared': r_squared
295
+ }
296
+ except:
297
+ results['exponential'] = {'error': 'Failed to fit exponential model'}
298
+
299
+ # Find best fit model
300
+ best_fit = {'model': None, 'r_squared': -1}
301
+ for model_type in results:
302
+ if 'r_squared' in results[model_type] and results[model_type]['r_squared'] > best_fit['r_squared']:
303
+ best_fit = {'model': model_type, 'r_squared': results[model_type]['r_squared']}
304
+
305
+ results['best_fit'] = best_fit
306
+
307
+ return results
308
+
309
+ def analyze_categorical_categorical(df, categorical_features, visualize=False, output_dir="bivariate_plots"):
310
+ """Analyze relationships between categorical features"""
311
+ results = {
312
+ 'chi_square_tests': {},
313
+ 'cramers_v': {},
314
+ 'contingency_tables': {},
315
+ 'phi_coefficients': {},
316
+ 'lambda_coefficients': {},
317
+ 'significant_associations': []
318
+ }
319
+
320
+ # Create output directory if it doesn't exist and visualization is enabled
321
+ if visualize:
322
+ import os
323
+ os.makedirs(output_dir, exist_ok=True)
324
+
325
+ # Chi-square tests & Cramer's V
326
+ for i in range(len(categorical_features)):
327
+ for j in range(i+1, len(categorical_features)):
328
+ feat_i = categorical_features[i]
329
+ feat_j = categorical_features[j]
330
+
331
+ # Get clean data (remove NaNs)
332
+ data_ij = df[[feat_i, feat_j]].dropna()
333
+
334
+ # Skip if either feature has only one category after dropping NAs
335
+ if data_ij[feat_i].nunique() <= 1 or data_ij[feat_j].nunique() <= 1:
336
+ continue
337
+
338
+ # Create contingency table
339
+ contingency_table = pd.crosstab(data_ij[feat_i], data_ij[feat_j])
340
+ results['contingency_tables'][f"{feat_i}_vs_{feat_j}"] = contingency_table.to_dict()
341
+
342
+ # Create normalized contingency tables (row and column proportions)
343
+ row_proportions = contingency_table.div(contingency_table.sum(axis=1), axis=0)
344
+ col_proportions = contingency_table.div(contingency_table.sum(axis=0), axis=1)
345
+
346
+ # Check if we have enough samples for chi-square test
347
+ # Rule of thumb: 80% of cells should have expected frequencies >= 5
348
+ chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
349
+ expected_array = np.array(expected)
350
+
351
+ # Calculate percentage of cells with expected frequency >= 5
352
+ valid_expected_percentage = np.sum(expected_array >= 5) / expected_array.size
353
+
354
+ # Calculate Cramer's V (measure of association)
355
+ n = contingency_table.sum().sum()
356
+ min_dim = min(contingency_table.shape) - 1
357
+ if min_dim > 0: # Avoid division by zero
358
+ cramers_v = np.sqrt(chi2 / (n * min_dim))
359
+ else:
360
+ cramers_v = np.nan
361
+
362
+ results['chi_square_tests'][f"{feat_i}_vs_{feat_j}"] = {
363
+ 'chi2': chi2,
364
+ 'p_value': p,
365
+ 'degrees_of_freedom': dof,
366
+ 'valid_expected_percentage': valid_expected_percentage,
367
+ 'reliable_test': valid_expected_percentage >= 0.8 # Generally accepted threshold
368
+ }
369
+
370
+ results['cramers_v'][f"{feat_i}_vs_{feat_j}"] = cramers_v
371
+
372
+ # For 2x2 contingency tables, calculate Phi coefficient
373
+ if contingency_table.shape == (2, 2):
374
+ phi_coef = np.sqrt(chi2 / n)
375
+ results['phi_coefficients'][f"{feat_i}_vs_{feat_j}"] = phi_coef
376
+
377
+ # Calculate Lambda coefficient (asymmetric measure of association)
378
+ try:
379
+ # Lambda for predicting feat_j from feat_i
380
+ lambda_ij = calculate_lambda(contingency_table)
381
+ # Lambda for predicting feat_i from feat_j
382
+ lambda_ji = calculate_lambda(contingency_table.T)
383
+
384
+ results['lambda_coefficients'][f"{feat_i}_vs_{feat_j}"] = {
385
+ 'lambda_ij': lambda_ij, # For predicting j from i
386
+ 'lambda_ji': lambda_ji # For predicting i from j
387
+ }
388
+ except:
389
+ pass
390
+
391
+ # Visualize contingency table as heatmap if enabled
392
+ if visualize and valid_expected_percentage >= 0.8 and p < 0.05:
393
+ plt.figure(figsize=(10, 8))
394
+ sns.heatmap(contingency_table, annot=True, fmt='d', cmap='YlGnBu')
395
+ plt.title(f'{feat_i} vs {feat_j} (Cramer\'s V={cramers_v:.3f}, p={p:.4f})')
396
+ plt.savefig(f"{output_dir}/{feat_i}_vs_{feat_j}_contingency.png")
397
+ plt.close()
398
+
399
+ # Also plot normalized contingency table (row proportions)
400
+ plt.figure(figsize=(10, 8))
401
+ sns.heatmap(row_proportions, annot=True, fmt='.2f', cmap='YlGnBu')
402
+ plt.title(f'{feat_i} vs {feat_j} (Row Proportions)')
403
+ plt.savefig(f"{output_dir}/{feat_i}_vs_{feat_j}_row_proportions.png")
404
+ plt.close()
405
+
406
+ # Check for significant association
407
+ if p < 0.05 and valid_expected_percentage >= 0.8:
408
+ association_strength = ""
409
+ if cramers_v >= 0.5:
410
+ association_strength = "strong"
411
+ elif cramers_v >= 0.3:
412
+ association_strength = "moderate"
413
+ elif cramers_v >= 0.1:
414
+ association_strength = "weak"
415
+ else:
416
+ association_strength = "very weak"
417
+
418
+ # Check if we calculated lambda coefficients
419
+ lambda_info = {}
420
+ if f"{feat_i}_vs_{feat_j}" in results['lambda_coefficients']:
421
+ lambda_info = results['lambda_coefficients'][f"{feat_i}_vs_{feat_j}"]
422
+
423
+ # Check if we calculated phi coefficient
424
+ phi_coef = None
425
+ if f"{feat_i}_vs_{feat_j}" in results['phi_coefficients']:
426
+ phi_coef = results['phi_coefficients'][f"{feat_i}_vs_{feat_j}"]
427
+
428
+ results['significant_associations'].append({
429
+ 'feature_1': feat_i,
430
+ 'feature_2': feat_j,
431
+ 'chi_square': chi2,
432
+ 'p_value': p,
433
+ 'cramers_v': cramers_v,
434
+ 'association_strength': association_strength,
435
+ 'phi_coefficient': phi_coef,
436
+ 'lambda_coefficients': lambda_info
437
+ })
438
+
439
+ # Sort by Cramer's V
440
+ results['significant_associations'].sort(key=lambda x: x['cramers_v'], reverse=True)
441
+
442
+ return results
443
+
444
+ def calculate_lambda(contingency_table):
445
+ """
446
+ Calculate Goodman and Kruskal's lambda
447
+ Lambda measures the proportional reduction in error when values of one variable are used to predict values of another
448
+ """
449
+ # Convert to numpy array
450
+ cont_array = np.array(contingency_table)
451
+
452
+ # Total observations
453
+ n = np.sum(cont_array)
454
+
455
+ # Sum of maximum frequency in each row
456
+ row_max_sum = np.sum(np.max(cont_array, axis=1))
457
+
458
+ # Mode of column sums (most frequent category overall)
459
+ col_sums = np.sum(cont_array, axis=0)
460
+ overall_mode = np.max(col_sums)
461
+
462
+ # Calculate lambda
463
+ if overall_mode == n:
464
+ # No predictive ability (all observations are in one category)
465
+ return 0
466
+ else:
467
+ return (row_max_sum - overall_mode) / (n - overall_mode)
468
+
469
+ def analyze_numerical_categorical(df, numerical_features, categorical_features, max_categories=30, sample_size=None, skip_tukey=False, visualize=False, output_dir="bivariate_plots"):
470
+ """
471
+ Optimized analysis of relationships between numerical and categorical features
472
+
473
+ Parameters:
474
+ -----------
475
+ df : pandas.DataFrame
476
+ The input dataframe
477
+ numerical_features : list
478
+ List of numerical feature names
479
+ categorical_features : list
480
+ List of categorical feature names
481
+ max_categories : int, default=30
482
+ Skip categorical features with more unique values than this
483
+ sample_size : int or None, default=None
484
+ If provided, analyze a random sample of this size for large datasets
485
+        skip_tukey : bool, default=False
486
+ Skip Tukey's test which can be computationally expensive
487
+        visualize : bool, default=False
488
+ Whether to generate visualizations
489
+ output_dir : str, default="bivariate_plots"
490
+ Directory to save visualization plots if visualize=True
491
+
492
+ Returns:
493
+ --------
494
+ dict
495
+ Results of numerical-categorical analysis
496
+ """
497
+ results = {
498
+ 'anova_tests': {},
499
+ 'effect_sizes': {},
500
+ 'group_statistics': {},
501
+ 'point_biserial_correlations': {},
502
+ 'significant_differences': []
503
+ }
504
+
505
+ # Create output directory if it doesn't exist and visualization is enabled
506
+ if visualize:
507
+ import os
508
+ os.makedirs(output_dir, exist_ok=True)
509
+
510
+ # Performance optimization: Use a sample for large datasets
511
+ if sample_size is not None and len(df) > sample_size:
512
+ df_sample = df.sample(sample_size, random_state=42)
513
+ else:
514
+ df_sample = df
515
+
516
+ # Pre-compute feature cardinality to avoid repeated calculations
517
+ cat_feature_cardinality = {feat: df_sample[feat].nunique() for feat in categorical_features}
518
+
519
+ for num_feat in numerical_features:
520
+ for cat_feat in categorical_features:
521
+ # Skip high cardinality categorical features
522
+ if cat_feature_cardinality[cat_feat] > max_categories:
523
+ continue
524
+
525
+ # Get clean data (remove NaNs)
526
+ data = df_sample[[num_feat, cat_feat]].dropna()
527
+
528
+ # Skip if categorical feature has only one category after dropping NAs
529
+ if data[cat_feat].nunique() <= 1:
530
+ continue
531
+
532
+ # Calculate group statistics more efficiently using agg
533
+ group_stats = data.groupby(cat_feat)[num_feat].agg(['count', 'mean', 'std', 'min', 'max', 'median'])
534
+
535
+ # Calculate IQR and outlier bounds for each group
536
+ group_stats['q1'] = data.groupby(cat_feat)[num_feat].quantile(0.25)
537
+ group_stats['q3'] = data.groupby(cat_feat)[num_feat].quantile(0.75)
538
+ group_stats['iqr'] = group_stats['q3'] - group_stats['q1']
539
+ group_stats['lower_bound'] = group_stats['q1'] - 1.5 * group_stats['iqr']
540
+ group_stats['upper_bound'] = group_stats['q3'] + 1.5 * group_stats['iqr']
541
+
542
+ # Convert to dictionary with simplified structure to save memory
543
+ results['group_statistics'][f"{num_feat}_by_{cat_feat}"] = {
544
+ 'counts': group_stats['count'].to_dict(),
545
+ 'means': group_stats['mean'].to_dict(),
546
+ 'stds': group_stats['std'].to_dict(),
547
+ 'medians': group_stats['median'].to_dict(),
548
+ 'q1': group_stats['q1'].to_dict(),
549
+ 'q3': group_stats['q3'].to_dict()
550
+ }
551
+
552
+ # Calculate point-biserial correlation for binary categorical variables
553
+ if data[cat_feat].nunique() == 2:
554
+ try:
555
+ # For binary categories, use point-biserial correlation
556
+ # Need to convert categorical to numeric first
557
+ cat_values = data[cat_feat].unique()
558
+ binary_map = {cat_values[0]: 0, cat_values[1]: 1}
559
+ data_numeric = data.copy()
560
+ data_numeric[cat_feat] = data_numeric[cat_feat].map(binary_map)
561
+
562
+ # Calculate point-biserial correlation
563
+ corr, p_value = pointbiserialr(data_numeric[cat_feat], data_numeric[num_feat])
564
+
565
+ results['point_biserial_correlations'][f"{num_feat}_by_{cat_feat}"] = {
566
+ 'correlation': corr,
567
+ 'p_value': p_value,
568
+ 'categorical_mapping': binary_map
569
+ }
570
+ except Exception as e:
571
+ pass
572
+
573
+ # Get groups for ANOVA more efficiently
574
+ groups = [data[data[cat_feat] == cat][num_feat].values
575
+ for cat in data[cat_feat].unique()
576
+ if len(data[data[cat_feat] == cat]) > 0]
577
+
578
+ group_labels = [cat for cat in data[cat_feat].unique()
579
+ if len(data[data[cat_feat] == cat]) > 0]
580
+
581
+ # Create visualizations if enabled
582
+ if visualize:
583
+ # Create box plot
584
+ plt.figure(figsize=(12, 6))
585
+ sns.boxplot(x=cat_feat, y=num_feat, data=data)
586
+ plt.title(f'{num_feat} by {cat_feat}')
587
+ plt.xticks(rotation=45)
588
+ plt.tight_layout()
589
+ plt.savefig(f"{output_dir}/{num_feat}_by_{cat_feat}_boxplot.png")
590
+ plt.close()
591
+
592
+ # Create violin plot for more detail on distributions
593
+ plt.figure(figsize=(12, 6))
594
+ sns.violinplot(x=cat_feat, y=num_feat, data=data)
595
+ plt.title(f'{num_feat} by {cat_feat} (Distribution)')
596
+ plt.xticks(rotation=45)
597
+ plt.tight_layout()
598
+ plt.savefig(f"{output_dir}/{num_feat}_by_{cat_feat}_violinplot.png")
599
+ plt.close()
600
+
601
+ # Run ANOVA if we have at least 2 groups
602
+ if len(groups) >= 2:
603
+ try:
604
+ # One-way ANOVA
605
+ f_stat, p_value = stats.f_oneway(*groups)
606
+
607
+ # Calculate effect size (eta-squared) more efficiently
608
+ grand_mean = data[num_feat].mean()
609
+
610
+ # Vectorized computation of SS between
611
+ group_means = np.array([group.mean() for group in groups])
612
+ group_sizes = np.array([len(group) for group in groups])
613
+ ss_between = np.sum(group_sizes * (group_means - grand_mean)**2)
614
+
615
+ # Vectorized computation of SS total
616
+ ss_total = np.sum((data[num_feat].values - grand_mean)**2)
617
+
618
+ eta_squared = ss_between / ss_total if ss_total != 0 else 0
619
+
620
+ # Calculate omega-squared (less biased estimate of effect size than eta-squared)
621
+ k = len(groups) # Number of groups
622
+ n = len(data) # Total sample size
623
+ df_between = k - 1
624
+ df_within = n - k
625
+ ms_between = ss_between / df_between if df_between > 0 else 0
626
+ ss_within = ss_total - ss_between
627
+ ms_within = ss_within / df_within if df_within > 0 else 0
628
+
629
+ omega_squared = (ss_between - (df_between * ms_within)) / (ss_total + ms_within) if (ss_total + ms_within) != 0 else 0
630
+ omega_squared = max(0, omega_squared) # Ensure non-negative
631
+
632
+ results['anova_tests'][f"{num_feat}_by_{cat_feat}"] = {
633
+ 'f_statistic': float(f_stat), # Convert to native Python types to reduce memory
634
+ 'p_value': float(p_value),
635
+ 'significant': p_value < 0.05,
636
+ 'degrees_of_freedom_between': df_between,
637
+ 'degrees_of_freedom_within': df_within,
638
+ 'ss_between': float(ss_between),
639
+ 'ss_within': float(ss_within),
640
+ 'ss_total': float(ss_total)
641
+ }
642
+
643
+ results['effect_sizes'][f"{num_feat}_by_{cat_feat}"] = {
644
+ 'eta_squared': float(eta_squared),
645
+ 'omega_squared': float(omega_squared)
646
+ }
647
+
648
+ # Calculate Levene's test for homogeneity of variances
649
+ try:
650
+ levene_stat, levene_p = stats.levene(*groups)
651
+ results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["levene_test"] = {
652
+ 'statistic': float(levene_stat),
653
+ 'p_value': float(levene_p),
654
+ 'equal_variances': levene_p >= 0.05
655
+ }
656
+
657
+ # If variances are not equal, calculate Welch's ANOVA
658
+ if levene_p < 0.05:
659
+ try:
660
+ from scipy.stats import f_oneway
661
+ import statsmodels.api as sm
662
+ from statsmodels.formula.api import ols
663
+
664
+ # Create a new dataframe for Welch's test
665
+ welch_data = pd.DataFrame({
666
+ 'value': data[num_feat],
667
+ 'group': data[cat_feat]
668
+ })
669
+
670
+ # Fit the model
671
+ model = ols('value ~ C(group)', data=welch_data).fit()
672
+
673
+ # Perform Welch's ANOVA
674
+ welch_table = sm.stats.anova_lm(model, typ=2)
675
+
676
+ # Extract statistics
677
+ welch_f = welch_table.loc['C(group)', 'F']
678
+ welch_p = welch_table.loc['C(group)', 'PR(>F)']
679
+
680
+ results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["welch_anova"] = {
681
+ 'f_statistic': float(welch_f),
682
+ 'p_value': float(welch_p),
683
+ 'significant': welch_p < 0.05
684
+ }
685
+ except:
686
+ # If Welch's ANOVA fails, skip it
687
+ pass
688
+ except:
689
+ # If Levene's test fails, skip it
690
+ pass
691
+
692
+ # If ANOVA is significant, perform post-hoc Tukey's test (optional)
693
+ if p_value < 0.05 and len(groups) > 2 and not skip_tukey:
694
+ # Prepare data for Tukey's test
695
+ all_data = np.concatenate(groups)
696
+ group_labels_for_tukey = np.repeat(group_labels, [len(group) for group in groups])
697
+
698
+ # Perform Tukey's test (computationally expensive)
699
+ try:
700
+ tukey_results = pairwise_tukeyhsd(all_data, group_labels_for_tukey)
701
+ tukey_summary = pd.DataFrame(data=tukey_results._results_table.data[1:],
702
+ columns=tukey_results._results_table.data[0])
703
+
704
+ # Store only significant pairs to save memory
705
+ significant_pairs = tukey_summary[tukey_summary['p-adj'] < 0.05]
706
+ significant_pairs_dict = significant_pairs.to_dict('records') if not significant_pairs.empty else []
707
+
708
+ results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["tukey_posthoc"] = {
709
+ 'significant_pairs': significant_pairs_dict
710
+ }
711
+
712
+ # Visualize Tukey's test results if enabled
713
+ if visualize and not significant_pairs.empty:
714
+ plt.figure(figsize=(12, len(significant_pairs) * 0.5 + 2))
715
+ significant_pairs_plot = [(f"{row['group1']} vs {row['group2']}",
716
+ row['meandiff'],
717
+ row['lower'],
718
+ row['upper'])
719
+ for _, row in significant_pairs.iterrows()]
720
+
721
+ # Sort by mean difference
722
+ significant_pairs_plot.sort(key=lambda x: x[1])
723
+
724
+ # Plot
725
+ for i, (pair, diff, lower, upper) in enumerate(significant_pairs_plot):
726
+ plt.plot([lower, upper], [i, i], 'b-')
727
+ plt.plot([diff], [i], 'bo')
728
+
729
+ plt.axvline(x=0, color='r', linestyle='--')
730
+ plt.yticks(range(len(significant_pairs_plot)), [pair for pair, _, _, _ in significant_pairs_plot])
731
+ plt.xlabel('Mean Difference')
732
+ plt.title(f'Tukey\'s HSD: Significant Differences in {num_feat} by {cat_feat}')
733
+ plt.tight_layout()
734
+ plt.savefig(f"{output_dir}/{num_feat}_by_{cat_feat}_tukey.png")
735
+ plt.close()
736
+ except Exception as e:
737
+ pass
738
+
739
+ # Calculate Kruskal-Wallis test (non-parametric alternative to ANOVA)
740
+ try:
741
+ h_stat, kw_p_value = stats.kruskal(*groups)
742
+ results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["kruskal_wallis"] = {
743
+ 'h_statistic': float(h_stat),
744
+ 'p_value': float(kw_p_value),
745
+ 'significant': kw_p_value < 0.05
746
+ }
747
+
748
+ # If Kruskal-Wallis is significant, perform post-hoc Dunn's test
749
+ if kw_p_value < 0.05 and len(groups) > 2 and not skip_tukey:
750
+ try:
751
+ from scikit_posthocs import posthoc_dunn
752
+
753
+ # Create a new dataframe for Dunn's test
754
+ dunn_data = pd.DataFrame({
755
+ 'value': data[num_feat],
756
+ 'group': data[cat_feat]
757
+ })
758
+
759
+ # Perform Dunn's test
760
+ dunn_results = posthoc_dunn(dunn_data, val_col='value', group_col='group', p_adjust='bonferroni')
761
+
762
+ # Store results
763
+ results['anova_tests'][f"{num_feat}_by_{cat_feat}"]["dunn_posthoc"] = {
764
+ 'p_values': dunn_results.to_dict()
765
+ }
766
+ except:
767
+ # If Dunn's test fails, skip it
768
+ pass
769
+ except:
770
+ # If Kruskal-Wallis test fails, skip it
771
+ pass
772
+
773
+ # Add to significant differences only if significant with reasonable effect size
774
+ if p_value < 0.05 and eta_squared >= 0.01: # Filter out very small effects
775
+ effect_size_category = ""
776
+ if eta_squared >= 0.14:
777
+ effect_size_category = "strong"
778
+ elif eta_squared >= 0.06:
779
+ effect_size_category = "moderate"
780
+ else:
781
+ effect_size_category = "weak"
782
+
783
+ # Calculate group means more efficiently
784
+ group_means = {group: float(data[data[cat_feat] == group][num_feat].mean())
785
+ for group in data[cat_feat].unique()}
786
+
787
+ # Get point-biserial correlation if available
788
+ point_biserial_info = {}
789
+ if f"{num_feat}_by_{cat_feat}" in results['point_biserial_correlations']:
790
+ point_biserial_info = results['point_biserial_correlations'][f"{num_feat}_by_{cat_feat}"]
791
+
792
+ results['significant_differences'].append({
793
+ 'numerical_feature': num_feat,
794
+ 'categorical_feature': cat_feat,
795
+ 'f_statistic': float(f_stat),
796
+ 'anova_p_value': float(p_value),
797
+ 'eta_squared': float(eta_squared),
798
+ 'omega_squared': float(omega_squared),
799
+ 'effect_size_category': effect_size_category,
800
+ 'group_means': group_means,
801
+ 'point_biserial_correlation': point_biserial_info if point_biserial_info else None,
802
+ 'equal_variances': results['anova_tests'][f"{num_feat}_by_{cat_feat}"].get("levene_test", {}).get("equal_variances", None)
803
+ })
804
+
805
+ except Exception as e:
806
+ # If tests fail, just continue to next pair
807
+ continue
808
+
809
+ # Calculate mutual information for only significant pairs found above
810
+ # This reduces unnecessary calculations
811
+ if results['significant_differences']:
812
+ mutual_info = {}
813
+ sig_pairs = [(d['numerical_feature'], d['categorical_feature']) for d in results['significant_differences']]
814
+
815
+ for num_feat, cat_feat in sig_pairs:
816
+ # Get clean data
817
+ data = df_sample[[num_feat, cat_feat]].dropna()
818
+
819
+ if data.empty or data[cat_feat].nunique() <= 1:
820
+ continue
821
+
822
+ # Discretize numerical feature (needed for mutual information)
823
+ try:
824
+ # Use quantiles to discretize into fewer bins for efficiency
825
+ num_bins = min(5, data[num_feat].nunique()) # Reduced from 10 to 5
826
+ if num_bins > 1:
827
+ data['num_binned'] = pd.qcut(data[num_feat], num_bins, duplicates='drop')
828
+
829
+ # Calculate mutual information
830
+ mi = mutual_info_score(
831
+ data['num_binned'].astype(str).values,
832
+ data[cat_feat].astype(str).values
833
+ )
834
+ mutual_info[f"{num_feat}_vs_{cat_feat}"] = float(mi)
835
+ except Exception as e:
836
+ pass
837
+
838
+ # Only add mutual_information if we calculated something
839
+ if mutual_info:
840
+ results['mutual_information'] = mutual_info
841
+
842
+ # Sort significant differences by effect size
843
+ results['significant_differences'].sort(key=lambda x: x['eta_squared'], reverse=True)
844
+
845
+ return results
846
+
847
+ def generate_summary(results):
848
+
849
+ """Generate a summary of key findings from bivariate analysis"""
850
+ summary = {
851
+ 'strongest_correlations': [],
852
+ 'significant_category_associations': [],
853
+ 'significant_group_differences': [],
854
+ 'key_insights': []
855
+ }
856
+
857
+ # Extract strongest numerical correlations
858
+ if 'numerical_vs_numerical' in results and 'significant_correlations' in results['numerical_vs_numerical']:
859
+ for corr in results['numerical_vs_numerical']['significant_correlations'][:5]: # Top 5
860
+ summary['strongest_correlations'].append({
861
+ 'features': f"{corr['feature_1']} and {corr['feature_2']}",
862
+ 'correlation': corr['pearson_correlation'],
863
+ 'type': corr['correlation_type'],
864
+ 'strength': corr['correlation_strength'],
865
+ 'p_value': corr['p_value'],
866
+ 'relationship_type': corr.get('relationship_type', 'linear')
867
+ })
868
+
869
+ # Extract strongest categorical associations
870
+ if 'categorical_vs_categorical' in results and 'significant_associations' in results['categorical_vs_categorical']:
871
+ for assoc in results['categorical_vs_categorical']['significant_associations'][:5]: # Top 5
872
+ summary['significant_category_associations'].append({
873
+ 'features': f"{assoc['feature_1']} and {assoc['feature_2']}",
874
+ 'cramer_v': assoc['cramers_v'],
875
+ 'strength': assoc['association_strength'],
876
+ 'p_value': assoc['p_value'],
877
+ 'phi_coefficient': assoc.get('phi_coefficient', None)
878
+ })
879
+
880
+ # Extract most significant group differences
881
+ if 'numerical_vs_categorical' in results and 'significant_differences' in results['numerical_vs_categorical']:
882
+ for diff in results['numerical_vs_categorical']['significant_differences'][:5]: # Top 5
883
+ summary['significant_group_differences'].append({
884
+ 'numerical': diff['numerical_feature'],
885
+ 'categorical': diff['categorical_feature'],
886
+ 'eta_squared': diff['eta_squared'],
887
+ 'omega_squared': diff.get('omega_squared', None),
888
+ 'effect_size': diff['effect_size_category'],
889
+ 'anova_p_value': diff['anova_p_value'],
890
+ 'equal_variances': diff.get('equal_variances', None)
891
+ })
892
+
893
+ # Generate key insights
894
+ insights = []
895
+
896
+ # Insight from numerical correlations
897
+ if summary['strongest_correlations']:
898
+ top_corr = summary['strongest_correlations'][0]
899
+ insights.append(f"The strongest numerical relationship is between {top_corr['features']} with a {top_corr['strength']} "
900
+ f"{top_corr['type']} correlation of {top_corr['correlation']:.3f} (p={top_corr['p_value']:.4f}).")
901
+
902
+ # Additional insight on relationship type
903
+ if top_corr.get('relationship_type') == 'potentially non-linear':
904
+ insights.append(f"The relationship between {top_corr['features']} appears to be non-linear, "
905
+ f"as indicated by the difference between Pearson and Spearman correlations.")
906
+
907
+ # Insight from categorical associations
908
+ if summary['significant_category_associations']:
909
+ top_assoc = summary['significant_category_associations'][0]
910
+ insights.append(f"The strongest association between categorical variables is between {top_assoc['features']} "
911
+ f"with a {top_assoc['strength']} relationship (Cramer's V={top_assoc['cramer_v']:.3f}, p={top_assoc['p_value']:.4f}).")
912
+
913
+ # Insight from group differences
914
+ if summary['significant_group_differences']:
915
+ top_diff = summary['significant_group_differences'][0]
916
+ insights.append(f"The categorical variable {top_diff['categorical']} has a {top_diff['effect_size']} effect "
917
+ f"on {top_diff['numerical']} (η²={top_diff['eta_squared']:.3f}, p={top_diff['anova_p_value']:.4f}).")
918
+
919
+ # Add insight about equal variances assumption
920
+ if top_diff.get('equal_variances') is not None:
921
+ if top_diff['equal_variances']:
922
+ insights.append(f"The equal variances assumption is met for {top_diff['numerical']} across {top_diff['categorical']} groups, "
923
+ f"supporting the validity of the ANOVA results.")
924
+ else:
925
+ insights.append(f"The equal variances assumption is violated for {top_diff['numerical']} across {top_diff['categorical']} groups, "
926
+ f"suggesting Welch's ANOVA may be more appropriate.")
927
+
928
+ # Check for data quality issues
929
+ data_quality_issues = []
930
+
931
+ # Check for potentially misleading relationships
932
+ if 'numerical_vs_numerical' in results and 'significant_correlations' in results['numerical_vs_numerical']:
933
+ for corr in results['numerical_vs_numerical']['significant_correlations']:
934
+ if corr['pearson_correlation'] > 0.9 or corr['pearson_correlation'] < -0.9:
935
+ data_quality_issues.append(f"The very strong correlation between {corr['feature_1']} and {corr['feature_2']} "
936
+ f"may indicate multicollinearity issues in predictive modeling.")
937
+
938
+ # Check for imbalanced categorical variables
939
+ if 'numerical_vs_categorical' in results and 'group_statistics' in results['numerical_vs_categorical']:
940
+ for key, stats in results['numerical_vs_categorical']['group_statistics'].items():
941
+ if 'counts' in stats:
942
+ counts = list(stats['counts'].values())
943
+ if counts and max(counts) / min(counts) > 10: # Imbalanced if one group is 10x larger than smallest
944
+ features = key.split('_by_')
945
+ if len(features) == 2:
946
+ data_quality_issues.append(f"The categorical variable {features[1]} has highly imbalanced groups "
947
+ f"which may affect the reliability of {features[0]} analysis.")
948
+
949
+ # Add data quality issues as insights
950
+ for issue in data_quality_issues[:2]: # Limit to top 2 issues
951
+ insights.append(issue)
952
+
953
+ # Add final observations
954
+ if 'numerical_vs_numerical' in results:
955
+ # Check for non-linear relationships
956
+ non_linear_count = 0
957
+ if 'significant_correlations' in results['numerical_vs_numerical']:
958
+ for corr in results['numerical_vs_numerical']['significant_correlations']:
959
+ if corr.get('relationship_type') == 'potentially non-linear':
960
+ non_linear_count += 1
961
+
962
+ if non_linear_count > 0:
963
+ insights.append(f"Found {non_linear_count} potentially non-linear relationships among numerical variables, "
964
+ f"suggesting that linear models may not fully capture the complexity of the data.")
965
+
966
+ # Cross-analysis insights
967
+ has_num_num = 'numerical_vs_numerical' in results and 'significant_correlations' in results['numerical_vs_numerical']
968
+ has_cat_cat = 'categorical_vs_categorical' in results and 'significant_associations' in results['categorical_vs_categorical']
969
+ has_num_cat = 'numerical_vs_categorical' in results and 'significant_differences' in results['numerical_vs_categorical']
970
+
971
+ if has_num_num and has_num_cat:
972
+ insights.append("Both numerical correlations and categorical group differences were detected, "
973
+ "suggesting a mix of continuous and segmented relationships in the data.")
974
+
975
+ # Add mention of mutual information if present
976
+ if 'numerical_vs_categorical' in results and 'mutual_information' in results['numerical_vs_categorical']:
977
+ mutual_info = results['numerical_vs_categorical']['mutual_information']
978
+ if mutual_info:
979
+ # Find highest mutual information score
980
+ top_mi_pair = max(mutual_info.items(), key=lambda x: x[1])
981
+ features = top_mi_pair[0].split('_vs_')
982
+ if len(features) == 2:
983
+ insights.append(f"The strongest general statistical dependency (mutual information) was found between "
984
+ f"{features[0]} and {features[1]} with score {top_mi_pair[1]:.3f}, "
985
+ f"capturing both linear and non-linear relationships.")
986
+
987
+ # Overall data structure insight
988
+ if has_num_num and has_cat_cat and has_num_cat:
989
+ insights.append("The data shows a complex structure with significant relationships across all variable types, "
990
+ "suggesting potential for both feature engineering and dimensionality reduction.")
991
+
992
+ # Add insights to summary
993
+ summary['key_insights'] = insights
994
+
995
+ return summary
996
+
997
+ def bivariate_analysis(data_path: str):
998
+ """
999
+ Perform a comprehensive bivariate analysis on a dataset.
1000
+
1001
+ Args:
1002
+ data_path: Path to a CSV data file
1003
+ Returns:
1004
+ A JSON string summarizing the analysis results, or an error dictionary on failure
1005
+ """
1006
+ try:
1007
+ if isinstance(data_path, str):
1008
+ data = pd.read_csv(data_path)
1009
+ else:
1010
+ logger.error(f"Unsupported file format: {data_path}")
1011
+ return {"error": f"Unsupported file format: {data_path}"}
1012
+
1013
+ if not isinstance(data, pd.DataFrame):
1014
+ logger.error(f"Input is not a valid pandas DataFrame")
1015
+ return {"error": "Input is not a valid pandas DataFrame"}
1016
+
1017
+ logger.warn("Staring Analysis.....")
1018
+
1019
+ results = comprehensive_bivariate_analysis(df=data)
1020
+ summary = generate_summary(results=results)
1021
+
1022
+ serialized_summary = json.dumps(summary, default=lambda o: o.item() if isinstance(o, np.generic) else str(o))
1023
+
1024
+ return serialized_summary
1025
+
1026
+ except Exception as e:
1027
+ logger.error(f"Error in analyze_dataset: {str(e)}")
1028
+ return {"error": str(e)}
src/app/pipelines/eda/tools/analysis_tools/multivariate_analysis.py ADDED
@@ -0,0 +1,1039 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from scipy import stats
4
+ import statsmodels.api as sm
5
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.decomposition import PCA, FactorAnalysis
8
+ from sklearn.manifold import TSNE, MDS
9
+ from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
10
+ from sklearn.mixture import GaussianMixture
11
+ from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
12
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
13
+ from sklearn.feature_selection import mutual_info_classif, mutual_info_regression, f_classif, f_regression, chi2
14
+ from sklearn.feature_selection import SelectKBest, RFE
15
+ from sklearn.neighbors import LocalOutlierFactor
16
+ from sklearn.ensemble import IsolationForest
17
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
18
+ from sklearn.model_selection import cross_val_score
19
+ from sklearn.feature_selection import VarianceThreshold
20
+ from agno.utils.log import logger
21
+ import warnings
22
+ import json
23
+
24
+ warnings.filterwarnings("ignore")
25
+
26
+ def preprocess_dataframe(df, target_column=None, categorical_columns=None, verbose=True):
27
+ """
28
+ Preprocess a dataframe by identifying categorical columns, handling missing values,
29
+ and separating the target variable if specified.
30
+
31
+ Parameters:
32
+ -----------
33
+ df : pandas.DataFrame
34
+ The dataframe to preprocess
35
+ target_column : str, optional
36
+ The name of the target variable column (if any)
37
+ categorical_columns : list, optional
38
+ List of categorical column names (will be auto-detected if None)
39
+ verbose : bool, default=True
40
+ Whether to print detailed messages during preprocessing
41
+
42
+ Returns:
43
+ --------
44
+ dict
45
+ A dictionary containing the preprocessed data and metadata
46
+ """
47
+ # Create a deep copy to avoid modifying the original dataframe
48
+ data = df.copy()
49
+
50
+ # Initialize results dictionary to store preprocessing info
51
+ results = {}
52
+
53
+ # Separate target if specified
54
+ target = None
55
+
56
+ if target_column is not None and target_column in data.columns:
57
+ target = data[target_column].copy()
58
+ data = data.drop(columns=[target_column])
59
+ target_is_numeric = pd.api.types.is_numeric_dtype(target)
60
+ results['target_column'] = target_column
61
+ results['target_is_numeric'] = target_is_numeric
62
+ if verbose:
63
+ logger.warning(f"Target variable '{target_column}' detected as {'numeric' if target_is_numeric else 'categorical'}")
64
+
65
+ # Identify categorical and numerical columns
66
+ if categorical_columns is None:
67
+ categorical_columns = []
68
+ for col in data.columns:
69
+ if pd.api.types.is_object_dtype(data[col]) or pd.api.types.is_categorical_dtype(data[col]) or len(data[col].unique()) < 10:
70
+ categorical_columns.append(col)
71
+
72
+ numerical_columns = [col for col in data.columns if col not in categorical_columns]
73
+
74
+ results['categorical_columns'] = categorical_columns
75
+ results['numerical_columns'] = numerical_columns
76
+
77
+ if verbose:
78
+ logger.warning(f"Detected {len(numerical_columns)} numerical columns and {len(categorical_columns)} categorical columns")
79
+
80
+ # Handle missing values
81
+ missing_data = data.isnull().sum()
82
+ results['missing_values'] = {col: count for col, count in missing_data.items() if count > 0}
83
+ results['missing_values_percentage'] = {col: count/len(data)*100 for col, count in missing_data.items() if count > 0}
84
+
85
+ # For analysis, we'll do simple imputation to handle missing values
86
+ for col in numerical_columns:
87
+ if data[col].isnull().sum() > 0:
88
+ data[col].fillna(data[col].median(), inplace=True)
89
+
90
+ for col in categorical_columns:
91
+ if data[col].isnull().sum() > 0:
92
+ data[col].fillna(data[col].mode()[0], inplace=True)
93
+
94
+ # Create a dataframe of numerical data only for correlation analysis
95
+ numerical_data = data[numerical_columns].copy()
96
+
97
+ # Standardize numerical data
98
+ scaler = StandardScaler()
99
+ if len(numerical_columns) > 0:
100
+ scaled_data = scaler.fit_transform(numerical_data)
101
+ results['preprocessing'] = {
102
+ 'scaler_mean': scaler.mean_.tolist(),
103
+ 'scaler_scale': scaler.scale_.tolist()
104
+ }
105
+ else:
106
+ scaled_data = np.array([])
107
+
108
+ results['data'] = data
109
+ results['numerical_data'] = numerical_data
110
+ results['scaled_data'] = scaled_data
111
+ results['target'] = target
112
+
113
+ return results
114
+
115
+ def analyze_correlations(numerical_data, verbose=True):
116
+ """
117
+ Perform correlation analysis on numerical data.
118
+
119
+ Parameters:
120
+ -----------
121
+ numerical_data : pandas.DataFrame
122
+ DataFrame containing only numerical columns
123
+ verbose : bool, default=True
124
+ Whether to print detailed messages during analysis
125
+
126
+ Returns:
127
+ --------
128
+ dict
129
+ A dictionary containing correlation analysis results
130
+ """
131
+ if len(numerical_data.columns) <= 1:
132
+ if verbose:
133
+ logger.warning("Correlation analysis requires at least 2 numerical columns")
134
+ return {}
135
+
136
+ correlation_results = {}
137
+
138
+ # Generate correlation matrix
139
+ correlation_matrix = numerical_data.corr()
140
+ correlation_results['matrix'] = correlation_matrix
141
+
142
+ # Identify highly correlated features
143
+ high_corr_pairs = []
144
+ for i in range(len(correlation_matrix.columns)):
145
+ for j in range(i+1, len(correlation_matrix.columns)):
146
+ col1 = correlation_matrix.columns[i]
147
+ col2 = correlation_matrix.columns[j]
148
+ corr_value = correlation_matrix.iloc[i, j]
149
+ if abs(corr_value) > 0.7:
150
+ high_corr_pairs.append((col1, col2, corr_value))
151
+
152
+ correlation_results['high_correlation_pairs'] = high_corr_pairs
153
+
154
+ # Multicollinearity using VIF (Only for numeric predictors)
155
+ if len(numerical_data.columns) > 1:
156
+ try:
157
+ X = sm.add_constant(numerical_data)
158
+ vif_data = pd.DataFrame()
159
+ vif_data["Variable"] = X.columns
160
+ vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
161
+ correlation_results['vif'] = vif_data
162
+
163
+ # Identify multicollinearity issues
164
+ multicollinearity_issues = vif_data[vif_data["VIF"] > 10].to_dict('records')
165
+ correlation_results['multicollinearity_issues'] = multicollinearity_issues
166
+ except Exception as e:
167
+ if verbose:
168
+ logger.error(f"VIF calculation failed: {str(e)}")
169
+
170
+ return correlation_results
171
+
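analyze_correlations flags multicollinearity by computing a variance inflation factor (VIF) for each predictor and reporting variables with VIF > 10. A standalone sketch of the same statsmodels call on a small synthetic frame (column names are made up; x2 is nearly collinear with x1):

import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

num = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    "x2": [2.1, 4.0, 6.2, 7.9, 10.1, 12.0],  # roughly 2 * x1
    "x3": [5.0, 3.0, 6.0, 2.0, 7.0, 4.0],
})
X = sm.add_constant(num)
vif = pd.Series(
    [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    index=X.columns,
)
print(vif[vif > 10])  # same threshold as the multicollinearity check above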
172
+ def perform_dimensionality_reduction(scaled_data, numerical_columns, data_size, verbose=True):
173
+ """
174
+ Perform various dimensionality reduction techniques on scaled numerical data.
175
+
176
+ Parameters:
177
+ -----------
178
+ scaled_data : numpy.ndarray
179
+ Standardized numerical data
180
+ numerical_columns : list
181
+ List of numerical column names
182
+ data_size : int
183
+ Number of rows in the dataset
184
+ verbose : bool, default=True
185
+ Whether to print detailed messages during analysis
186
+
187
+ Returns:
188
+ --------
189
+ dict
190
+ A dictionary containing dimensionality reduction results
191
+ """
192
+ if len(numerical_columns) <= 1:
193
+ if verbose:
194
+ logger.warning("Dimensionality reduction requires at least 2 numerical columns")
195
+ return {}
196
+
197
+ dr_results = {}
198
+
199
+ # PCA
200
+ try:
201
+ pca = PCA(random_state=42)
202
+ pca_results = pca.fit_transform(scaled_data)
203
+
204
+ # Explained variance
205
+ explained_variance = pca.explained_variance_ratio_
206
+ cumulative_variance = np.cumsum(explained_variance)
207
+
208
+ # Find components that explain at least 80% of variance
209
+ components_80pct = np.argmax(cumulative_variance >= 0.8) + 1
210
+
211
+ # Component loadings
212
+ loadings = pd.DataFrame(
213
+ pca.components_.T,
214
+ columns=[f'PC{i+1}' for i in range(pca.n_components_)],
215
+ index=numerical_columns
216
+ )
217
+
218
+ dr_results['pca'] = {
219
+ 'explained_variance_ratio': explained_variance.tolist(),
220
+ 'cumulative_variance': cumulative_variance.tolist(),
221
+ 'components_for_80_percent_variance': components_80pct,
222
+ 'component_loadings': loadings,
223
+ 'pca_object': pca
224
+ }
225
+
226
+ # Factor Analysis
227
+ if len(numerical_columns) >= 3:
228
+ try:
229
+ factor = FactorAnalysis(n_components=min(5, len(numerical_columns)), random_state=42)
230
+ factor.fit(scaled_data)
231
+
232
+ loadings = pd.DataFrame(
233
+ factor.components_.T,
234
+ columns=[f'Factor{i+1}' for i in range(factor.n_components)],
235
+ index=numerical_columns
236
+ )
237
+
238
+ dr_results['factor_analysis'] = {
239
+ 'loadings': loadings,
240
+ 'factor_object': factor
241
+ }
242
+ except Exception as e:
243
+ if verbose:
244
+ logger.error(f"Factor Analysis failed: {str(e)}")
245
+
246
+ # t-SNE (for datasets that aren't too large)
247
+ if data_size <= 5000:
248
+ try:
249
+ tsne = TSNE(n_components=2, random_state=42)
250
+ tsne_results = tsne.fit_transform(scaled_data)
251
+
252
+ dr_results['tsne'] = {
253
+ 'coordinates': tsne_results,
254
+ 'tsne_object': tsne
255
+ }
256
+ except Exception as e:
257
+ if verbose:
258
+ logger.error(f"t-SNE failed: {str(e)}")
259
+
260
+ # MDS (for smaller datasets)
261
+ if data_size <= 2000:
262
+ try:
263
+ mds = MDS(n_components=2, random_state=42)
264
+ mds_results = mds.fit_transform(scaled_data)
265
+
266
+ dr_results['mds'] = {
267
+ 'coordinates': mds_results,
268
+ 'mds_object': mds
269
+ }
270
+ except Exception as e:
271
+ if verbose:
272
+ logger.error(f"MDS failed: {str(e)}")
273
+
274
+ except Exception as e:
275
+ if verbose:
276
+ logger.error(f"PCA failed: {str(e)}")
277
+
278
+ return dr_results
279
+
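The PCA branch above standardizes the numerical features, fits PCA, and records how many components are needed to reach 80% of the variance. A condensed sketch of that calculation on synthetic data:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 6))
X[:, 1] = 0.9 * X[:, 0] + rng.normal(scale=0.1, size=200)  # one correlated pair

pca = PCA(random_state=42)
pca.fit(StandardScaler().fit_transform(X))
cumulative = np.cumsum(pca.explained_variance_ratio_)
print("components for 80% variance:", int(np.argmax(cumulative >= 0.8)) + 1)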
280
+ def perform_cluster_analysis(scaled_data, numerical_data, data_size, verbose=True):
281
+ """
282
+ Perform various clustering techniques on scaled numerical data.
283
+
284
+ Parameters:
285
+ -----------
286
+ scaled_data : numpy.ndarray
287
+ Standardized numerical data
288
+ numerical_data : pandas.DataFrame
289
+ DataFrame containing numerical data (for cluster analysis)
290
+ data_size : int
291
+ Number of rows in the dataset
292
+ verbose : bool, default=True
293
+ Whether to print detailed messages during analysis
294
+
295
+ Returns:
296
+ --------
297
+ dict
298
+ A dictionary containing clustering results
299
+ """
300
+ if len(numerical_data.columns) <= 1 or data_size <= 10:
301
+ if verbose:
302
+ logger.warning("Clustering requires at least 2 numerical columns and more than 10 data points")
303
+ return {}
304
+
305
+ clustering_results = {}
306
+
307
+ # K-means clustering
308
+ try:
309
+ # Determine optimal number of clusters using silhouette score
310
+ sil_scores = []
311
+ max_clusters = min(10, data_size // 10) # reasonable max number of clusters
312
+
313
+ for k in range(2, max_clusters + 1):
314
+ kmeans = KMeans(n_clusters=k, random_state=42)
315
+ labels = kmeans.fit_predict(scaled_data)
316
+ sil_score = silhouette_score(scaled_data, labels)
317
+ sil_scores.append((k, sil_score))
318
+
319
+ # Find the best k
320
+ best_k = max(sil_scores, key=lambda x: x[1])[0]
321
+
322
+ # Run K-means with optimal k
323
+ kmeans = KMeans(n_clusters=best_k, random_state=42)
324
+ labels = kmeans.fit_predict(scaled_data)
325
+
326
+ # Calculate cluster evaluation metrics
327
+ sil_avg = silhouette_score(scaled_data, labels)
328
+ ch_score = calinski_harabasz_score(scaled_data, labels)
329
+ db_score = davies_bouldin_score(scaled_data, labels)
330
+
331
+ # Analyze cluster characteristics
332
+ cluster_analysis = {}
333
+ for cluster in range(best_k):
334
+ cluster_data = numerical_data.iloc[labels == cluster]
335
+ cluster_analysis[f'Cluster_{cluster}'] = {
336
+ 'size': len(cluster_data),
337
+ 'percentage': len(cluster_data) / data_size * 100,
338
+ 'mean': cluster_data.mean().to_dict(),
339
+ 'std': cluster_data.std().to_dict()
340
+ }
341
+
342
+ clustering_results['kmeans'] = {
343
+ 'best_k': best_k,
344
+ 'silhouette_scores': dict(sil_scores),
345
+ 'silhouette_avg': sil_avg,
346
+ 'calinski_harabasz_score': ch_score,
347
+ 'davies_bouldin_score': db_score,
348
+ 'cluster_analysis': cluster_analysis,
349
+ 'kmeans_object': kmeans
350
+ }
351
+
352
+ # DBSCAN for density-based clustering
353
+ if data_size <= 5000: # DBSCAN can be slow on large datasets
354
+ try:
355
+ dbscan = DBSCAN(eps=0.5, min_samples=min(5, data_size // 100))
356
+ dbscan_labels = dbscan.fit_predict(scaled_data)
357
+
358
+ # Count number of clusters and noise points
359
+ n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
360
+ n_noise = list(dbscan_labels).count(-1)
361
+
362
+ # Only calculate silhouette if we have clusters and not all points are noise
363
+ if n_clusters > 1 and n_noise < data_size:
364
+ valid_data = scaled_data[dbscan_labels != -1]
365
+ valid_labels = dbscan_labels[dbscan_labels != -1]
366
+ if len(set(valid_labels)) > 1:
367
+ db_sil_score = silhouette_score(valid_data, valid_labels)
368
+ else:
369
+ db_sil_score = None
370
+ else:
371
+ db_sil_score = None
372
+
373
+ clustering_results['dbscan'] = {
374
+ 'n_clusters': n_clusters,
375
+ 'n_noise': n_noise,
376
+ 'silhouette_score': db_sil_score,
377
+ 'dbscan_object': dbscan
378
+ }
379
+ except Exception as e:
380
+ if verbose:
381
+ logger.error(f"DBSCAN failed: {str(e)}")
382
+
383
+ # Gaussian Mixture Models
384
+ try:
385
+ gmm = GaussianMixture(n_components=best_k, random_state=42)
386
+ gmm_labels = gmm.fit_predict(scaled_data)
387
+
388
+ gmm_sil_score = silhouette_score(scaled_data, gmm_labels)
389
+ gmm_ch_score = calinski_harabasz_score(scaled_data, gmm_labels)
390
+ gmm_db_score = davies_bouldin_score(scaled_data, gmm_labels)
391
+
392
+ clustering_results['gmm'] = {
393
+ 'n_components': best_k,
394
+ 'silhouette_score': gmm_sil_score,
395
+ 'calinski_harabasz_score': gmm_ch_score,
396
+ 'davies_bouldin_score': gmm_db_score,
397
+ 'gmm_object': gmm
398
+ }
399
+ except Exception as e:
400
+ if verbose:
401
+ logger.error(f"GMM failed: {str(e)}")
402
+
403
+ except Exception as e:
404
+ if verbose:
405
+ logger.error(f"Clustering analysis failed: {str(e)}")
406
+
407
+ return clustering_results
408
+
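perform_cluster_analysis picks k for K-means by maximizing the silhouette score over a small range of candidate values. A standalone sketch of that selection loop on synthetic blobs:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
X = np.vstack([
    rng.normal(loc=(0, 0), scale=0.5, size=(50, 2)),
    rng.normal(loc=(5, 0), scale=0.5, size=(50, 2)),
    rng.normal(loc=(0, 5), scale=0.5, size=(50, 2)),
])

scores = {}
for k in range(2, 7):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    scores[k] = silhouette_score(X, labels)
best_k = max(scores, key=scores.get)
print("best k:", best_k, "silhouette:", round(scores[best_k], 3))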
409
+ def analyze_feature_importance(data, target, numerical_columns, categorical_columns, target_is_numeric, verbose=True):
410
+ """
411
+ Analyze feature importance using various techniques.
412
+
413
+ Parameters:
414
+ -----------
415
+ data : pandas.DataFrame
416
+ The preprocessed dataframe
417
+ target : pandas.Series
418
+ The target variable
419
+ numerical_columns : list
420
+ List of numerical column names
421
+ categorical_columns : list
422
+ List of categorical column names
423
+ target_is_numeric : bool
424
+ Whether the target variable is numeric
425
+ verbose : bool, default=True
426
+ Whether to print detailed messages during analysis
427
+
428
+ Returns:
429
+ --------
430
+ dict
431
+ A dictionary containing feature importance analysis results
432
+ """
433
+ if target is None:
434
+ if verbose:
435
+ logger.warning("Feature importance analysis requires a target variable")
436
+ return {}
437
+
438
+ feature_importance_results = {}
439
+
440
+ # Prepare data for feature importance
441
+ X = data.copy()
442
+ y = target.copy()
443
+
444
+ # Encode categorical features for feature importance
445
+ X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
446
+
447
+ # Univariate feature importance
448
+ try:
449
+ if target_is_numeric:
450
+ # Numeric target: use F-test for numeric features
451
+ selector = SelectKBest(score_func=f_regression, k='all')
452
+ selector.fit(X_encoded, y)
453
+ f_scores = pd.Series(selector.scores_, index=X_encoded.columns)
454
+ p_values = pd.Series(selector.pvalues_, index=X_encoded.columns)
455
+
456
+ # Mutual information
457
+ try:
458
+ var_thresh = VarianceThreshold(threshold=0.01)
459
+ X_encoded = pd.DataFrame(var_thresh.fit_transform(X_encoded), columns=X_encoded.columns[var_thresh.get_support()])
460
+ mi_scores = mutual_info_regression(X_encoded, y)
461
+ mi_series = pd.Series(mi_scores, index=X_encoded.columns)
462
+ feature_importance_results['mutual_info'] = mi_series.sort_values(ascending=False).to_dict()
463
+ except Exception as e:
464
+ if verbose:
465
+ logger.error(f"Mutual information calculation failed: {str(e)}")
466
+
467
+ feature_importance_results['f_regression'] = {
468
+ 'scores': f_scores.sort_values(ascending=False).to_dict(),
469
+ 'p_values': p_values.sort_values().to_dict()
470
+ }
471
+ else:
472
+ # Categorical target: use chi2 for numeric features
473
+ # Need non-negative features for chi2
474
+ X_chi = X_encoded.copy()
475
+ for col in X_chi.columns:
476
+ if X_chi[col].min() < 0:
477
+ X_chi[col] = X_chi[col] - X_chi[col].min()
478
+
479
+ selector = SelectKBest(score_func=chi2, k='all')
480
+ selector.fit(X_chi, y)
481
+ chi2_scores = pd.Series(selector.scores_, index=X_encoded.columns)
482
+ p_values = pd.Series(selector.pvalues_, index=X_encoded.columns)
483
+
484
+ # F-test for classification
485
+ f_selector = SelectKBest(score_func=f_classif, k='all')
486
+ f_selector.fit(X_encoded, y)
487
+ f_scores = pd.Series(f_selector.scores_, index=X_encoded.columns)
488
+ f_p_values = pd.Series(f_selector.pvalues_, index=X_encoded.columns)
489
+
490
+ # Mutual information
491
+ try:
492
+ mi_scores = mutual_info_classif(X_encoded, y)
493
+ mi_series = pd.Series(mi_scores, index=X_encoded.columns)
494
+ feature_importance_results['mutual_info'] = mi_series.sort_values(ascending=False).to_dict()
495
+ except Exception as e:
496
+ if verbose:
497
+ logger.error(f"Mutual information calculation failed: {str(e)}")
498
+
499
+ feature_importance_results['chi2'] = {
500
+ 'scores': chi2_scores.sort_values(ascending=False).to_dict(),
501
+ 'p_values': p_values.sort_values().to_dict()
502
+ }
503
+
504
+ feature_importance_results['f_classif'] = {
505
+ 'scores': f_scores.sort_values(ascending=False).to_dict(),
506
+ 'p_values': f_p_values.sort_values().to_dict()
507
+ }
508
+
509
+ # Tree-based feature importance
510
+ try:
511
+ if target_is_numeric:
512
+ model = RandomForestRegressor(n_estimators=100, random_state=42)
513
+ else:
514
+ model = RandomForestClassifier(n_estimators=100, random_state=42)
515
+
516
+ model.fit(X_encoded, y)
517
+ importances = pd.Series(model.feature_importances_, index=X_encoded.columns)
518
+
519
+ feature_importance_results['random_forest'] = importances.sort_values(ascending=False).to_dict()
520
+
521
+ # Recursive Feature Elimination
522
+ if len(X_encoded.columns) > 5:
523
+ rfe = RFE(estimator=model, n_features_to_select=min(10, len(X_encoded.columns)), step=1)
524
+ rfe.fit(X_encoded, y)
525
+ rfe_ranking = pd.Series(rfe.ranking_, index=X_encoded.columns)
526
+
527
+ feature_importance_results['rfe'] = {
528
+ 'selected_features': X_encoded.columns[rfe.support_].tolist(),
529
+ 'feature_ranking': rfe_ranking.sort_values().to_dict()
530
+ }
531
+ except Exception as e:
532
+ if verbose:
533
+ logger.error(f"Tree-based feature importance calculation failed: {str(e)}")
534
+
535
+ except Exception as e:
536
+ if verbose:
537
+ logger.error(f"Feature importance analysis failed: {str(e)}")
538
+
539
+ return feature_importance_results
540
+
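analyze_feature_importance one-hot encodes categoricals and ranks features with a random forest (alongside univariate tests and optional RFE). A compact sketch of the tree-based path on a hypothetical toy frame:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

df = pd.DataFrame({
    "age": [23, 45, 31, 52, 38, 27, 61, 44],
    "plan": ["basic", "pro", "basic", "pro", "pro", "basic", "pro", "basic"],
    "churn": [0, 1, 0, 1, 1, 0, 1, 0],
})
X = pd.get_dummies(df.drop(columns=["churn"]), drop_first=True)
y = df["churn"]

model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))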
541
+ def detect_outliers(scaled_data, data_size, verbose=True):
542
+ """
543
+ Detect multivariate outliers using different techniques.
544
+
545
+ Parameters:
546
+ -----------
547
+ scaled_data : numpy.ndarray
548
+ Standardized numerical data
549
+ data_size : int
550
+ Number of rows in the dataset
551
+ verbose : bool, default=True
552
+ Whether to print detailed messages during analysis
553
+
554
+ Returns:
555
+ --------
556
+ dict
557
+ A dictionary containing outlier detection results
558
+ """
559
+ if scaled_data.size == 0 or scaled_data.shape[1] <= 1:
560
+ if verbose:
561
+ logger.warning("Outlier detection requires at least 2 numerical columns")
562
+ return {}
563
+
564
+ outlier_results = {}
565
+
566
+ # Isolation Forest
567
+ try:
568
+ iso = IsolationForest(contamination=0.05, random_state=42)
569
+ outliers_isof = iso.fit_predict(scaled_data)
570
+ outlier_indices_isof = np.where(outliers_isof == -1)[0]
571
+
572
+ outlier_results['isolation_forest'] = {
573
+ 'num_outliers': len(outlier_indices_isof),
574
+ 'outlier_percentage': len(outlier_indices_isof) / data_size * 100
575
+ }
576
+ except Exception as e:
577
+ if verbose:
578
+ logger.error(f"Isolation Forest failed: {str(e)}")
579
+
580
+ # Local Outlier Factor (for smaller datasets)
581
+ if data_size <= 5000:
582
+ try:
583
+ lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
584
+ outliers_lof = lof.fit_predict(scaled_data)
585
+ outlier_indices_lof = np.where(outliers_lof == -1)[0]
586
+
587
+ outlier_results['local_outlier_factor'] = {
588
+ 'num_outliers': len(outlier_indices_lof),
589
+ 'outlier_indices': outlier_indices_lof.tolist(),
590
+ 'outlier_percentage': len(outlier_indices_lof) / data_size * 100
591
+ }
592
+ except Exception as e:
593
+ if verbose:
594
+ logger.error(f"Local Outlier Factor failed: {str(e)}")
595
+
596
+ return outlier_results
597
+
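detect_outliers treats roughly the configured contamination fraction of rows as multivariate outliers. A standalone sketch of the Isolation Forest step with a few injected anomalies:

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(1)
X = np.vstack([
    rng.normal(0, 1, size=(195, 3)),
    rng.normal(8, 1, size=(5, 3)),  # injected outliers
])

iso = IsolationForest(contamination=0.05, random_state=42)
flags = iso.fit_predict(X)  # -1 marks outliers
print("outliers flagged:", int((flags == -1).sum()), "of", len(X))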
598
+ def perform_statistical_testing(data, target, scaled_data, numerical_columns, categorical_columns, target_is_numeric, verbose=True):
599
+ """
600
+ Perform various statistical tests on the data.
601
+
602
+ Parameters:
603
+ -----------
604
+ data : pandas.DataFrame
605
+ The preprocessed dataframe
606
+ target : pandas.Series
607
+ The target variable
608
+ scaled_data : numpy.ndarray
609
+ Standardized numerical data
610
+ numerical_columns : list
611
+ List of numerical column names
612
+ categorical_columns : list
613
+ List of categorical column names
614
+ target_is_numeric : bool
615
+ Whether the target variable is numeric
616
+ verbose : bool, default=True
617
+ Whether to print detailed messages during analysis
618
+
619
+ Returns:
620
+ --------
621
+ dict
622
+ A dictionary containing statistical testing results
623
+ """
624
+ if target is None:
625
+ if verbose:
626
+ print("Statistical testing requires a target variable")
627
+ return {}
628
+
629
+ statistical_testing_results = {}
630
+
631
+ # MANOVA (only for multivariate numeric data with categorical target)
632
+ if not target_is_numeric and len(numerical_columns) > 1:
633
+ try:
634
+ unique_classes = target.unique()
635
+ if len(unique_classes) > 1 and len(unique_classes) <= 10:
636
+ # Prepare groups for MANOVA
637
+ numerical_data = data[numerical_columns]
638
+ groups = [numerical_data.iloc[target == val].values for val in unique_classes]
639
+
640
+ # Run MANOVA
641
+ manova_result = stats.manova(groups)
642
+
643
+ statistical_testing_results['manova'] = {
644
+ 'test_statistic': float(manova_result.statistic),
645
+ 'p_value': float(manova_result.pvalue),
646
+ 'significant': float(manova_result.pvalue) < 0.05
647
+ }
648
+ except Exception as e:
649
+ if verbose:
650
+ print(f"MANOVA failed: {str(e)}")
651
+
652
+ # Linear Discriminant Analysis (for classification problems)
653
+ if not target_is_numeric:
654
+ try:
655
+ lda = LinearDiscriminantAnalysis()
656
+ X_lda = lda.fit_transform(scaled_data, target)
657
+
658
+ statistical_testing_results['lda'] = {
659
+ 'explained_variance_ratio': lda.explained_variance_ratio_.tolist(),
660
+ 'coordinates': X_lda.tolist(),
661
+ 'lda_object': lda
662
+ }
663
+ except Exception as e:
664
+ if verbose:
665
+ print(f"LDA failed: {str(e)}")
666
+
667
+ # # Multivariate regression for numeric target
668
+ # if target_is_numeric:
669
+ # # try:
670
+ # for col in numerical_columns:
671
+ # data[col] = pd.to_numeric(data[col], errors='coerce')
672
+
673
+ # data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].mean())
674
+
675
+ # X_encoded = pd.get_dummies(data[categorical_columns], drop_first=True)
676
+
677
+ # X_final = pd.concat([data[numerical_columns], X_encoded], axis=1)
678
+ # target = pd.to_numeric(target, errors='coerce').fillna(target.mean())
679
+
680
+ # X_final = X_final.apply(pd.to_numeric, errors='coerce')
681
+ # X_final = X_final.fillna(0) # optional: or use mean imputation
682
+
683
+ # X_final = X_final.astype('float64')
684
+ # target = target.astype('float64')
685
+
686
+ # X_sm = sm.add_constant(X_final)
687
+
688
+ # # Fit model
689
+ # model = sm.OLS(target, X_sm).fit()
690
+
691
+ # statistical_testing_results['multivariate_regression'] = {
692
+ # 'r_squared': model.rsquared,
693
+ # 'adj_r_squared': model.rsquared_adj,
694
+ # 'f_statistic': float(model.fvalue),
695
+ # 'f_pvalue': float(model.f_pvalue),
696
+ # 'coefficients': model.params.to_dict(),
697
+ # 'p_values': model.pvalues.to_dict(),
698
+ # 'significant_features': [feature for feature, p_val in model.pvalues.items() if p_val < 0.05]
699
+ # }
700
+ # # except Exception as e:
701
+ # if verbose:
702
+ # print(f"Multivariate regression failed: {str(e)}")
703
+
704
+ return statistical_testing_results
705
+
706
+ def generate_summary(results, df, verbose=True):
707
+ """
708
+ Generate a summary of analysis results.
709
+
710
+ Parameters:
711
+ -----------
712
+ results : dict
713
+ Dictionary containing all analysis results
714
+ df : pandas.DataFrame
715
+ The original dataframe
716
+ verbose : bool, default=True
717
+ Whether to print the summary
718
+
719
+ Returns:
720
+ --------
721
+ dict
722
+ A dictionary containing the summary
723
+ """
724
+ summary = {}
725
+
726
+ summary['dataset'] = {
727
+ 'rows': len(df),
728
+ 'columns': len(df.columns),
729
+ 'numerical_features': len(results.get('numerical_columns', [])),
730
+ 'categorical_features': len(results.get('categorical_columns', [])),
731
+ 'missing_values': sum(df.isnull().sum() > 0)
732
+ }
733
+
734
+ if 'correlation' in results:
735
+ high_corr_pairs = results['correlation'].get('high_correlation_pairs', [])
736
+ high_corr_count = len(high_corr_pairs)
737
+ multicollinearity_issues = results['correlation'].get('multicollinearity_issues', [])
738
+
739
+ summary['correlation'] = {
740
+ 'highly_correlated_pairs': high_corr_count,
741
+ 'correlation_threshold': 0.7,
742
+ 'top_correlations': high_corr_pairs[:5] if high_corr_pairs else [],
743
+ 'multicollinearity_issues': len(multicollinearity_issues),
744
+ 'vif_threshold': 10
745
+ }
746
+
747
+ # Dimensionality reduction summary
748
+ if 'dimensionality_reduction' in results and 'pca' in results['dimensionality_reduction']:
749
+ pca_results = results['dimensionality_reduction']['pca']
750
+
751
+ summary['dimensionality_reduction'] = {
752
+ 'components_for_80_percent_variance': pca_results['components_for_80_percent_variance'],
753
+ 'total_components': len(pca_results['explained_variance_ratio']),
754
+ 'first_component_variance': pca_results['explained_variance_ratio'][0] * 100
755
+ }
756
+
757
+ # Clustering summary
758
+ if 'clustering' in results and 'kmeans' in results['clustering']:
759
+ kmeans_results = results['clustering']['kmeans']
760
+
761
+ summary['clustering'] = {
762
+ 'optimal_clusters': kmeans_results['best_k'],
763
+ 'silhouette_score': kmeans_results['silhouette_avg'],
764
+ 'clearly_separable': kmeans_results['silhouette_avg'] > 0.5
765
+ }
766
+
767
+ # Feature importance summary
768
+ if 'feature_importance' in results:
769
+ top_features_with_explanations = []
770
+
771
+ # Get top features from RF if available
772
+ if 'random_forest' in results['feature_importance']:
773
+ rf_importances = results['feature_importance']['random_forest']
774
+ top_rf = sorted(rf_importances.items(), key=lambda x: x[1], reverse=True)[:5]
775
+ for feature, importance in top_rf:
776
+ explanation = f"{feature}: Importance {importance:.3f} (Random Forest: relative feature contribution to prediction accuracy)."
777
+ top_features_with_explanations.append(explanation)
778
+
779
+ # Get top features from mutual info if available
780
+ elif 'mutual_info' in results['feature_importance']:
781
+ mi_importances = results['feature_importance']['mutual_info']
782
+ top_mi = sorted(mi_importances.items(), key=lambda x: x[1], reverse=True)[:5]
783
+ for feature, importance in top_mi:
784
+ explanation = f"{feature}: Importance {importance:.3f} (Mutual Information: degree of dependency with target)."
785
+ top_features_with_explanations.append(explanation)
786
+
787
+ summary['feature_importance'] = {
788
+ 'top_features': top_features_with_explanations
789
+ }
790
+
791
+ # Outlier summary
792
+ if 'outlier_detection' in results and 'isolation_forest' in results['outlier_detection']:
793
+ iso_results = results['outlier_detection']['isolation_forest']
794
+
795
+ summary['outliers'] = {
796
+ 'percentage': iso_results['outlier_percentage'],
797
+ 'count': iso_results['num_outliers']
798
+ }
799
+
800
+ # Statistical testing summary
801
+ if 'statistical_testing' in results:
802
+ stat_tests = []
803
+
804
+ if 'manova' in results['statistical_testing']:
805
+ significant = results['statistical_testing']['manova']['significant']
806
+ stat_tests.append(f"MANOVA: {'Significant' if significant else 'Not significant'}")
807
+
808
+ if 'lda' in results['statistical_testing']:
809
+ stat_tests.append("LDA performed")
810
+
811
+ if 'multivariate_regression' in results['statistical_testing']:
812
+ reg_results = results['statistical_testing']['multivariate_regression']
813
+ r_squared = reg_results['r_squared']
814
+ significant_features = len(reg_results['significant_features'])
815
+ stat_tests.append(f"Regression: R² = {r_squared:.3f}, Significant features: {significant_features}")
816
+
817
+ summary['statistical_tests'] = stat_tests
818
+
819
+ output_string = ""
820
+
821
+ output_string = ""
822
+
823
+ if verbose:
824
+ output_string += "\n=== MULTIVARIATE ANALYSIS SUMMARY ===\n"
825
+ output_string += f"Dataset: {summary['dataset']['rows']} rows, {summary['dataset']['columns']} columns\n"
826
+ output_string += f"Features: {summary['dataset']['numerical_features']} numerical, {summary['dataset']['categorical_features']} categorical\n"
827
+
828
+ if 'correlation' in summary:
829
+ output_string += f"Correlations (Threshold: {summary['correlation']['correlation_threshold']}):\n"
830
+ if summary['correlation']['highly_correlated_pairs'] > 0:
831
+ for col1, col2, corr_value in summary['correlation']['top_correlations']:
832
+ output_string += f" - {col1} & {col2}: {corr_value:.3f}\n"
833
+ else:
834
+ output_string += " - No highly correlated feature pairs found.\n"
835
+
836
+ output_string += f"Multicollinearity (VIF Threshold: {summary['correlation']['vif_threshold']}): {summary['correlation']['multicollinearity_issues']} issues\n"
837
+
838
+ if 'dimensionality_reduction' in summary and pca_results:
839
+ output_string += f"\nPCA: {pca_results['components_for_80_percent_variance']} components explain 80% of variance\n"
840
+ output_string += f"First component explains {pca_results['explained_variance_ratio'][0] * 100:.2f}% of variance\n"
841
+
842
+ output_string += "\nComponent Loadings:\n"
843
+ output_string += str(pca_results['component_loadings']) + "\n"
844
+
845
+ output_string += "\nComponent Interpretation:\n"
846
+ for pc in pca_results['component_loadings'].columns:
847
+ output_string += f" - {pc}:\n"
848
+ top_features = pca_results['component_loadings'][pc].abs().sort_values(ascending=False).head(3)
849
+ for feature, loading in top_features.items():
850
+ output_string += f" {feature}: {loading:.3f}\n"
851
+ output_string += "-" * 10 + "\n"
852
+
853
+ output_string += "\nPCA Meaning:\n"
854
+ output_string += " - PCA has reduced the dimensionality of the data while retaining 80% of the variance.\n"
855
+ output_string += " - The components represent combinations of the original variables, with loadings indicating the strength and direction of each variable's influence on the component.\n"
856
+ output_string += " - The top loading features for each component can help in understanding what the components represent.\n"
857
+
858
+ if 'clustering' in summary and kmeans_results:
859
+ output_string += f"\nOptimal clusters: {kmeans_results['best_k']}\n"
860
+ output_string += f"Cluster separation: {kmeans_results['silhouette_avg']:.3f} (silhouette score)\n"
861
+
862
+ output_string += "\nCluster Descriptions:\n"
863
+ for cluster, analysis in kmeans_results['cluster_analysis'].items():
864
+ output_string += f" - {cluster}: Size={analysis['size']} ({analysis['percentage']:.2f}%)\n"
865
+ output_string += " Mean values:\n"
866
+ for feature, mean_val in analysis['mean'].items():
867
+ output_string += f" {feature}: {mean_val:.3f}\n"
868
+ output_string += " Standard deviations:\n"
869
+ for feature, std_val in analysis['std'].items():
870
+ output_string += f" {feature}: {std_val:.3f}\n"
871
+ output_string += "-" * 20 + "\n"
872
+
873
+ output_string += "\nCluster Differences:\n"
874
+ if kmeans_results['best_k'] > 1:
875
+ feature_means = {}
876
+ for cluster, analysis in kmeans_results['cluster_analysis'].items():
877
+ for feature, mean_val in analysis['mean'].items():
878
+ if feature not in feature_means:
879
+ feature_means[feature] = {}
880
+ feature_means[feature][cluster] = mean_val
881
+
882
+ for feature, cluster_means in feature_means.items():
883
+ output_string += f" - {feature}:\n"
884
+ for cluster, mean_val in cluster_means.items():
885
+ output_string += f" {cluster}: {mean_val:.3f}\n"
886
+
887
+ mean_values = list(cluster_means.values())
888
+ if len(set(mean_values)) > 1:
889
+ max_diff = max(mean_values) - min(mean_values)
890
+ output_string += f" Max difference: {max_diff:.3f}\n"
891
+ output_string += "-" * 10 + "\n"
892
+
893
+ output_string += "\nClustering Meaning:\n"
894
+ output_string += " - The clusters represent distinct groups within the data, characterized by differences in the mean values of the numerical features.\n"
895
+ if kmeans_results['silhouette_avg'] > 0.5:
896
+ output_string += " - The high silhouette score indicates that the clusters are well-separated and meaningful.\n"
897
+ else:
898
+ output_string += " - The silhouette score suggests the clusters are reasonably separated.\n"
899
+
900
+ if 'feature_importance' in summary:
901
+ output_string += "Top Features:\n"
902
+ for feature_explanation in summary['feature_importance']['top_features']:
903
+ output_string += f" - {feature_explanation}\n"
904
+
905
+ if 'outliers' in summary:
906
+ output_string += f"Outliers: {summary['outliers']['count']} ({summary['outliers']['percentage']:.2f}%)\n"
907
+
908
+ if 'statistical_tests' in summary:
909
+ output_string += "Statistical tests:\n"
910
+ for test in summary['statistical_tests']:
911
+ output_string += f" - {test}\n"
912
+
913
+ return output_string
914
+
915
+ def safe_prepare(results):
916
+ cleaned_results = []
917
+ for item in results:
918
+ if item is None:
919
+ continue
920
+ elif isinstance(item, dict):
921
+ cleaned_results.append(json.dumps(item))
922
+ elif isinstance(item, (list, str)):
923
+ cleaned_results.append(item)
924
+ else:
925
+ cleaned_results.append(str(item))
926
+ return json.dumps(cleaned_results)
927
+
928
+ def multivariate_analysis(data_path: str, target_column_name: str):
929
+ """
930
+ Perform comprehensive multivariate analysis on a dataset from the specified file path.
931
+
932
+ Parameters:
933
+ -----------
934
+ data_path : str
935
+ Path to the data file (supports CSV, Excel, etc. based on pandas read capabilities)
936
+ target_column_name : str
937
+ Name of the target column / feature
938
+
939
+ Returns:
940
+ --------
941
+ dict
942
+ A dictionary containing all analysis results
943
+ """
944
+ target_column = target_column_name
945
+ # target_column='Transported'
946
+ categorical_columns = None
947
+ verbose=True
948
+
949
+ try:
950
+ # Determine file type and read data
951
+ if verbose:
952
+ logger.warning(f"Reading data from {data_path}...")
953
+
954
+ if data_path.endswith('.csv'):
955
+ df = pd.read_csv(data_path)
956
+ elif data_path.endswith(('.xls', '.xlsx')):
957
+ df = pd.read_excel(data_path)
958
+ elif data_path.endswith('.json'):
959
+ df = pd.read_json(data_path)
960
+ elif data_path.endswith('.parquet'):
961
+ df = pd.read_parquet(data_path)
962
+ else:
963
+ raise ValueError("Unsupported file format. Please provide a CSV, Excel, JSON, or Parquet file.")
964
+
965
+ if verbose:
966
+ logger.warning(f"Data loaded successfully. Shape: {df.shape}")
967
+
968
+ results = {}
969
+
970
+ if verbose:
971
+ logger.warning("Preprocessing data...")
972
+
973
+ preprocess_results = preprocess_dataframe(df, target_column, categorical_columns, verbose)
974
+ results.update(preprocess_results)
975
+
976
+ data = results['data']
977
+ numerical_data = results['numerical_data']
978
+ scaled_data = results['scaled_data']
979
+ target = results['target']
980
+ numerical_columns = results['numerical_columns']
981
+ categorical_columns = results['categorical_columns']
982
+ target_is_numeric = results.get('target_is_numeric', None)
983
+
984
+ if verbose:
985
+ logger.warning("Analyzing correlations...")
986
+
987
+ correlation_results = analyze_correlations(numerical_data, verbose)
988
+ results['correlation'] = correlation_results
989
+
990
+ if verbose:
991
+ logger.warning("Performing dimensionality reduction...")
992
+
993
+ dr_results = perform_dimensionality_reduction(scaled_data, numerical_columns, len(data), verbose)
994
+ results['dimensionality_reduction'] = dr_results
995
+
996
+ if verbose:
997
+ logger.warning("Performing cluster analysis...")
998
+
999
+ clustering_results = perform_cluster_analysis(scaled_data, numerical_data, len(data), verbose)
1000
+ results['clustering'] = clustering_results
1001
+
1002
+ if target is not None:
1003
+ if verbose:
1004
+ logger.warning("Analyzing feature importance...")
1005
+
1006
+ feature_importance_results = analyze_feature_importance(
1007
+ data, target, numerical_columns, categorical_columns, target_is_numeric, verbose
1008
+ )
1009
+ results['feature_importance'] = feature_importance_results
1010
+
1011
+ if verbose:
1012
+ logger.warning("Detecting outliers...")
1013
+
1014
+ outlier_results = detect_outliers(scaled_data, len(data), verbose)
1015
+ results['outlier_detection'] = outlier_results
1016
+
1017
+ if target is not None:
1018
+ if verbose:
1019
+ logger.warning("Performing statistical tests...")
1020
+
1021
+ statistical_testing_results = perform_statistical_testing(
1022
+ data, target, scaled_data, numerical_columns, categorical_columns, target_is_numeric, verbose
1023
+ )
1024
+ results['statistical_testing'] = statistical_testing_results
1025
+
1026
+ if verbose:
1027
+ logger.warning("Generating summary...")
1028
+
1029
+ summary = generate_summary(results, df, verbose)
1030
+ results['summary'] = summary
1031
+
1032
+ if verbose:
1033
+ logger.warning("Multivariate analysis completed successfully!")
1034
+
1035
+ return results['summary']
1036
+
1037
+ except Exception as e:
1038
+ logger.error(f"Error in multivariate analysis: {str(e)}")
1039
+ return {"error": str(e)}
src/app/pipelines/eda/tools/analysis_tools/univariate_analysis.py ADDED
@@ -0,0 +1,517 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from scipy import stats
4
+ import missingno as msno
5
+ from typing import Dict, List, Tuple, Optional, Union
6
+ from agno.utils.log import logger
7
+ import json
8
+
9
+ def data_overview(data: pd.DataFrame, categorical_threshold) -> Dict:
10
+ """
11
+ Generate a high-level overview of the dataset structure.
12
+
13
+ Returns:
14
+ A dictionary containing dataset dimensions and data types
15
+ """
16
+ logger.warn("Analyzing data overview...")
17
+
18
+ rows, cols = data.shape
19
+
20
+ dtypes = data.dtypes.value_counts().to_dict()
21
+
22
+ total_observations = rows * cols
23
+ total_missing = data.isna().sum().sum()
24
+ completeness_ratio = (total_observations - total_missing) / total_observations
25
+
26
+ feature_types = {}
27
+ for col in data.columns:
28
+ unique_count = data[col].nunique()
29
+ dtype = data[col].dtype
30
+
31
+ if pd.api.types.is_numeric_dtype(dtype):
32
+ if unique_count <= categorical_threshold:
33
+ feature_types[col] = 'categorical (numeric)'
34
+ else:
35
+ feature_types[col] = 'numerical'
36
+ elif pd.api.types.is_string_dtype(dtype) or pd.api.types.is_object_dtype(dtype):
37
+ if unique_count <= categorical_threshold:
38
+ feature_types[col] = 'categorical'
39
+ else:
40
+ feature_types[col] = 'text'
41
+ elif pd.api.types.is_datetime64_dtype(dtype):
42
+ feature_types[col] = 'datetime'
43
+ elif pd.api.types.is_bool_dtype(dtype):
44
+ feature_types[col] = 'boolean'
45
+ else:
46
+ feature_types[col] = 'other'
47
+
48
+ overview = {
49
+ 'dimensions': {'rows': rows, 'columns': cols},
50
+ 'data_types': dtypes,
51
+ 'feature_types': feature_types,
52
+ 'observations': {
53
+ 'total': total_observations,
54
+ 'missing': total_missing,
55
+ 'completeness_ratio': completeness_ratio
56
+ }
57
+ }
58
+
59
+ return overview
60
+
61
+ def missing_values_analysis(data: pd.DataFrame) -> Dict:
62
+ """
63
+ Analyze missing values in the dataset.
64
+
65
+ Returns:
66
+ A dictionary containing missing value statistics
67
+ """
68
+ logger.warn("Analyzing missing values...")
69
+
70
+ missing_counts = data.isnull().sum()
71
+ missing_percentage = (missing_counts / len(data)) * 100
72
+
73
+ columns_with_missing = missing_counts[missing_counts > 0].index.tolist()
74
+
75
+ missing_correlation = None
76
+ if len(columns_with_missing) > 1:
77
+ missing_mask = data[columns_with_missing].isna()
78
+ missing_correlation = missing_mask.corr()
79
+
80
+ missing_analysis = {
81
+ 'missing_counts': missing_counts.to_dict(),
82
+ 'missing_percentage': missing_percentage.to_dict(),
83
+ 'columns_with_missing': columns_with_missing,
84
+ 'total_missing_percentage': (missing_counts.sum() / (len(data) * len(data.columns))) * 100
85
+ }
86
+
87
+ if missing_correlation is not None:
88
+ missing_analysis['missing_correlation'] = missing_correlation.to_dict()
89
+
90
+ return missing_analysis
91
+
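missing_values_analysis also checks whether columns tend to be missing together by correlating their null-indicator masks. A tiny sketch of that idea on a made-up frame:

import pandas as pd

df = pd.DataFrame({
    "a": [1, None, 3, None, 5],
    "b": [None, None, 3, None, 5],
    "c": [1, 2, 3, 4, 5],
})
cols_with_missing = df.columns[df.isna().any()].tolist()
print(df[cols_with_missing].isna().corr())  # values near 1 mean the columns go missing together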
92
+ def analyze_features(data: pd.DataFrame, results: dict) -> Dict:
93
+ """
94
+ Perform type-specific analysis for each feature.
95
+
96
+ Returns:
97
+ A dictionary containing feature-level analysis results
98
+ """
99
+ logger.warn("Analyzing individual features...")
100
+
101
+ feature_results = {}
102
+
103
+ for column in data.columns:
104
+ feature_type = results['data_overview']['feature_types'][column]
105
+
106
+ if 'numerical' in feature_type:
107
+ feature_results[column] = analyze_numerical(data, column)
108
+ elif 'categorical' in feature_type or feature_type == 'boolean':
109
+ feature_results[column] = analyze_categorical(data, column)
110
+ elif feature_type == 'datetime':
111
+ feature_results[column] = analyze_datetime(data, column)
112
+ elif feature_type == 'text':
113
+ feature_results[column] = analyze_text(data, column)
114
+ else:
115
+ feature_results[column] = {'type': feature_type, 'message': 'No specific analysis available for this type'}
116
+
117
+ return feature_results
118
+
119
+ def analyze_numerical(df: pd.DataFrame, column: str) -> Dict:
120
+ """
121
+ Perform comprehensive analysis for a numerical feature.
122
+
123
+ Args:
124
+ column: The name of the column to analyze
125
+
126
+ Returns:
127
+ A dictionary containing numerical statistics
128
+ """
129
+ data = df[column].dropna()
130
+
131
+ if len(data) == 0:
132
+ return {'type': 'numerical', 'error': 'No non-null values found'}
133
+
134
+ try:
135
+ stats_dict = {
136
+ 'type': 'numerical',
137
+ 'count': len(data),
138
+ 'missing': df[column].isna().sum(),
139
+ 'min': float(data.min()),
140
+ 'max': float(data.max()),
141
+ 'range': float(data.max() - data.min()),
142
+ 'mean': float(data.mean()),
143
+ 'median': float(data.median()),
144
+ 'std': float(data.std()),
145
+ 'variance': float(data.var()),
146
+ 'quantiles': {
147
+ '25%': float(data.quantile(0.25)),
148
+ '50%': float(data.quantile(0.5)),
149
+ '75%': float(data.quantile(0.75)),
150
+ '90%': float(data.quantile(0.9)),
151
+ '95%': float(data.quantile(0.95)),
152
+ '99%': float(data.quantile(0.99))
153
+ }
154
+ }
155
+
156
+ stats_dict['skewness'] = float(stats.skew(data))
157
+ stats_dict['kurtosis'] = float(stats.kurtosis(data))
158
+
159
+ if len(data) >= 8: # Minimum required for Shapiro-Wilk test
160
+ # Sample data if too large for tests
161
+ sample_data = data if len(data) < 5000 else data.sample(5000)
162
+
163
+ try:
164
+ shapiro_test = stats.shapiro(sample_data)
165
+ stats_dict['normality_tests'] = {
166
+ 'shapiro_wilk': {
167
+ 'statistic': float(shapiro_test[0]),
168
+ 'p_value': float(shapiro_test[1]),
169
+ 'is_normal': shapiro_test[1] > 0.05
170
+ }
171
+ }
172
+
173
+ if len(data) >= 20: # Minimum for Anderson-Darling
174
+ anderson_test = stats.anderson(sample_data, dist='norm')
175
+ stats_dict['normality_tests']['anderson_darling'] = {
176
+ 'statistic': float(anderson_test.statistic),
177
+ 'critical_values': list(anderson_test.critical_values),
178
+ 'significance_levels': list(anderson_test.significance_level),
179
+ 'is_normal': anderson_test.statistic < anderson_test.critical_values[2] # 5% significance
180
+ }
181
+ except Exception:
182
+ stats_dict['normality_tests'] = {'error': 'Could not perform normality tests on this data'}
183
+
184
+ q1 = data.quantile(0.25)
185
+ q3 = data.quantile(0.75)
186
+ iqr = q3 - q1
187
+ lower_bound = q1 - 1.5 * iqr
188
+ upper_bound = q3 + 1.5 * iqr
189
+
190
+ outliers = data[(data < lower_bound) | (data > upper_bound)]
191
+
192
+ stats_dict['outliers'] = {
193
+ 'count': len(outliers),
194
+ 'percentage': (len(outliers) / len(data)) * 100 if len(data) > 0 else 0,
195
+ 'lower_bound': float(lower_bound),
196
+ 'upper_bound': float(upper_bound)
197
+ }
198
+
199
+ value_counts = data.value_counts().head(10).to_dict()
200
+ stats_dict['frequent_values'] = {str(k): int(v) for k, v in value_counts.items()}
201
+
202
+ return stats_dict
203
+
204
+ except Exception as e:
205
+ logger.warn(f"Error analyzing numerical column {column}: {str(e)}")
206
+ return {'type': 'numerical', 'error': str(e)}
207
+
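analyze_numerical combines quantiles, a Shapiro-Wilk normality check (on a sample when the column is large), and 1.5*IQR outlier fences. A condensed sketch of those pieces on a synthetic series:

import numpy as np
import pandas as pd
from scipy import stats

s = pd.Series(np.random.default_rng(7).normal(50, 10, 500))

q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
outliers = s[(s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)]

sample = s if len(s) < 5000 else s.sample(5000, random_state=0)
stat, p = stats.shapiro(sample)
print(f"outliers: {len(outliers)}, shapiro p-value: {p:.3f}, looks normal: {p > 0.05}")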
208
+ def analyze_categorical(df: pd.DataFrame, column: str) -> Dict:
209
+ """
210
+ Perform comprehensive analysis for a categorical feature.
211
+
212
+ Args:
213
+ column: The name of the column to analyze
214
+
215
+ Returns:
216
+ A dictionary containing categorical statistics
217
+ """
218
+ data = df[column].dropna()
219
+
220
+ # Skip if no data available
221
+ if len(data) == 0:
222
+ return {'type': 'categorical', 'error': 'No non-null values found'}
223
+
224
+ try:
225
+ # Basic statistics
226
+ value_counts = data.value_counts()
227
+ value_percentages = data.value_counts(normalize=True) * 100
228
+
229
+ stats_dict = {
230
+ 'type': 'categorical',
231
+ 'count': len(data),
232
+ 'missing': df[column].isna().sum(),
233
+ 'unique_values': data.nunique(),
234
+ 'mode': str(data.mode().iloc[0]) if not data.mode().empty else None,
235
+ 'entropy': float(stats.entropy(value_counts / len(data))) if len(value_counts) > 1 else 0
236
+ }
237
+
238
+ # Category frequencies
239
+ max_categories = 30 # Limit to top 30 categories to prevent huge outputs
240
+ if len(value_counts) <= max_categories:
241
+ stats_dict['categories'] = {
242
+ str(category): {
243
+ 'count': int(count),
244
+ 'percentage': float(value_percentages[category])
245
+ } for category, count in value_counts.items()
246
+ }
247
+ else:
248
+ # Include top categories and group the rest as "other"
249
+ top_categories = value_counts.head(max_categories)
250
+ other_count = value_counts.iloc[max_categories:].sum()
251
+ other_percentage = value_percentages.iloc[max_categories:].sum()
252
+
253
+ stats_dict['categories'] = {
254
+ str(category): {
255
+ 'count': int(count),
256
+ 'percentage': float(value_percentages[category])
257
+ } for category, count in top_categories.items()
258
+ }
259
+
260
+ stats_dict['categories']['other'] = {
261
+ 'count': int(other_count),
262
+ 'percentage': float(other_percentage)
263
+ }
264
+
265
+ stats_dict['note'] = f"Showing top {max_categories} of {len(value_counts)} categories. Remaining grouped as 'other'."
266
+
267
+ return stats_dict
268
+
269
+ except Exception as e:
270
+ logger.warn(f"Error analyzing categorical column {column}: {str(e)}")
271
+ return {'type': 'categorical', 'error': str(e)}
272
+
273
+ def analyze_datetime(df: pd.DataFrame, column: str) -> Dict:
274
+ """
275
+ Perform comprehensive analysis for a datetime feature.
276
+
277
+ Args:
278
+ df: The DataFrame containing the column
+ column: The name of the column to analyze
279
+
280
+ Returns:
281
+ A dictionary containing datetime statistics
282
+ """
283
+ data = df[column].dropna()
284
+
285
+ # Skip if no data available
286
+ if len(data) == 0:
287
+ return {'type': 'datetime', 'error': 'No non-null values found'}
288
+
289
+ try:
290
+ stats_dict = {
291
+ 'type': 'datetime',
292
+ 'count': len(data),
293
+ 'missing': df[column].isna().sum(),
294
+ 'min_date': str(data.min()),
295
+ 'max_date': str(data.max()),
296
+ 'range_days': (data.max() - data.min()).days if hasattr((data.max() - data.min()), 'days') else None
297
+ }
298
+
299
+ # Distribution by year, month, day of week if there are enough dates
300
+ if len(data) >= 10:
301
+ try:
302
+ stats_dict['year_counts'] = data.dt.year.value_counts().sort_index().to_dict()
303
+ stats_dict['month_counts'] = data.dt.month.value_counts().sort_index().to_dict()
304
+ stats_dict['day_of_week_counts'] = data.dt.dayofweek.value_counts().sort_index().to_dict()
305
+ stats_dict['hour_counts'] = data.dt.hour.value_counts().sort_index().to_dict() if hasattr(data.dt, 'hour') else None
306
+ except Exception:
307
+ # Some datetime objects might not support certain attributes
308
+ pass
309
+
310
+ return stats_dict
311
+
312
+ except Exception as e:
313
+ logger.warn(f"Error analyzing datetime column {column}: {str(e)}")
314
+ return {'type': 'datetime', 'error': str(e)}
315
+
316
+ def analyze_text(df: pd.DataFrame, column: str) -> Dict:
317
+ """
318
+ Perform basic analysis for a text feature.
319
+
320
+ Args:
321
+ df: The DataFrame containing the column
+ column: The name of the column to analyze
322
+
323
+ Returns:
324
+ A dictionary containing text statistics
325
+ """
326
+ data = df[column].dropna().astype(str)
327
+
328
+ # Skip if no data available
329
+ if len(data) == 0:
330
+ return {'type': 'text', 'error': 'No non-null values found'}
331
+
332
+ try:
333
+ text_lengths = data.str.len()
334
+ word_counts = data.str.split().str.len()
335
+
336
+ stats_dict = {
337
+ 'type': 'text',
338
+ 'count': len(data),
339
+ 'missing': df[column].isna().sum(),
340
+ 'unique_values': data.nunique(),
341
+ 'text_length': {
342
+ 'min': int(text_lengths.min()),
343
+ 'max': int(text_lengths.max()),
344
+ 'mean': float(text_lengths.mean()),
345
+ 'median': float(text_lengths.median())
346
+ }
347
+ }
348
+
349
+ # Word count statistics if available
350
+ if not word_counts.isna().all():
351
+ stats_dict['word_count'] = {
352
+ 'min': int(word_counts.min()),
353
+ 'max': int(word_counts.max()),
354
+ 'mean': float(word_counts.mean()),
355
+ 'median': float(word_counts.median())
356
+ }
357
+
358
+ # Sample values (first few characters)
359
+ max_samples = 5
360
+ max_length = 100 # Show only first 100 chars for each sample
361
+
362
+ if len(data) <= max_samples:
363
+ samples = data.tolist()
364
+ else:
365
+ samples = data.sample(max_samples).tolist()
366
+
367
+ stats_dict['samples'] = [f"{s[:max_length]}{'...' if len(s) > max_length else ''}" for s in samples]
368
+
369
+ return stats_dict
370
+
371
+ except Exception as e:
372
+ logger.warning(f"Error analyzing text column {column}: {str(e)}")
373
+ return {'type': 'text', 'error': str(e)}
374
+
375
+ def generate_summary_report(results: dict) -> Dict:
376
+ """
377
+ Generate a summary of the univariate analysis results.
378
+
379
+ Returns:
380
+ A dictionary containing the summary report
381
+ """
382
+ logger.info("Generating summary report...")
383
+
384
+ # Extract key insights
385
+ summary = {
386
+ 'dataset_summary': {
387
+ 'dimensions': results['data_overview']['dimensions'],
388
+ 'completeness': f"{results['data_overview']['observations']['completeness_ratio']*100:.2f}%"
389
+ },
390
+ 'feature_types': {
391
+ 'numerical': [],
392
+ 'categorical': [],
393
+ 'datetime': [],
394
+ 'text': [],
395
+ 'other': []
396
+ },
397
+ 'missing_values': {
398
+ 'total_missing_percentage': f"{results['missing_values']['total_missing_percentage']:.2f}%",
399
+ 'features_with_high_missingness': []
400
+ },
401
+ 'numerical_features': {
402
+ 'highly_skewed': [],
403
+ 'potentially_non_normal': []
404
+ },
405
+ 'categorical_features': {
406
+ 'high_cardinality': [],
407
+ 'binary': []
408
+ }
409
+ }
410
+
411
+ # Categorize features by type
412
+ for col, feat_type in results['data_overview']['feature_types'].items():
413
+ if 'numerical' in feat_type:
414
+ summary['feature_types']['numerical'].append(col)
415
+ elif 'categorical' in feat_type or feat_type == 'boolean':
416
+ summary['feature_types']['categorical'].append(col)
417
+ elif feat_type == 'datetime':
418
+ summary['feature_types']['datetime'].append(col)
419
+ elif feat_type == 'text':
420
+ summary['feature_types']['text'].append(col)
421
+ else:
422
+ summary['feature_types']['other'].append(col)
423
+
424
+ # Features with high missingness (>10%)
425
+ for col, miss_pct in results['missing_values']['missing_percentage'].items():
426
+ if miss_pct > 10:
427
+ summary['missing_values']['features_with_high_missingness'].append({
428
+ 'feature': col,
429
+ 'missing_percentage': f"{miss_pct:.2f}%"
430
+ })
431
+
432
+ # Analyze numerical features
433
+ for col in summary['feature_types']['numerical']:
434
+ if col in results['feature_analysis']:
435
+ analysis = results['feature_analysis'][col]
436
+
437
+ if 'skewness' in analysis and abs(analysis['skewness']) > 1:
438
+ summary['numerical_features']['highly_skewed'].append({
439
+ 'feature': col,
440
+ 'skewness': analysis['skewness']
441
+ })
442
+
443
+ if 'normality_tests' in analysis and isinstance(analysis['normality_tests'], dict):
444
+ for test, res in analysis['normality_tests'].items():
445
+ if isinstance(res, dict) and 'is_normal' in res and not res['is_normal']:
446
+ summary['numerical_features']['potentially_non_normal'].append({
447
+ 'feature': col,
448
+ 'test': test
449
+ })
450
+ break
451
+
452
+ # Analyze categorical features
453
+ for col in summary['feature_types']['categorical']:
454
+ if col in results['feature_analysis']:
455
+ analysis = results['feature_analysis'][col]
456
+
457
+ if 'unique_values' in analysis:
458
+ if analysis['unique_values'] > 20:
459
+ summary['categorical_features']['high_cardinality'].append({
460
+ 'feature': col,
461
+ 'unique_values': analysis['unique_values']
462
+ })
463
+ elif analysis['unique_values'] == 2:
464
+ summary['categorical_features']['binary'].append(col)
465
+
466
+ return summary
467
+
468
+ def univariate_analysis(data_path: str, categorical_threshold: int = 10):
469
+ """
470
+ Perform a comprehensive univariate analysis on a dataset.
471
+
472
+ Args:
473
+ data_path: Path to a CSV data file
474
+ categorical_threshold: Maximum unique values to consider a feature categorical
475
+ Returns:
476
+ A dictionary containing all analysis results
477
+ """
478
+ try:
479
+ if isinstance(data_path, str):
480
+ data = pd.read_csv(data_path)
481
+ else:
482
+ logger.error(f"Unsupported file format: {data_path}")
483
+ return {"error": f"Unsupported file format: {data_path}"}
484
+
485
+ if not isinstance(data, pd.DataFrame):
486
+ logger.error(f"Input is not a valid pandas DataFrame")
487
+ return {"error": "Input is not a valid pandas DataFrame"}
488
+
489
+ logger.warn("Staring Analysis.....")
490
+
491
+ copy_data = data.copy()
492
+
493
+ results = {}
494
+
495
+ results['data_overview'] = data_overview(
496
+ data=copy_data,
497
+ categorical_threshold=categorical_threshold
498
+ )
499
+
500
+ results['missing_values'] = missing_values_analysis(
501
+ data=copy_data
502
+ )
503
+
504
+ results['feature_analysis'] = analyze_features(
505
+ data=copy_data,
506
+ results=results
507
+ )
508
+
509
+ analysis_summary = generate_summary_report(results)
510
+
511
+ return json.dumps(analysis_summary)
512
+
513
+ except Exception as e:
514
+ logger.error(f"Error in analyze_dataset: {str(e)}")
515
+ return {"error": str(e)}
516
+
517
+
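A minimal usage sketch for the entry point above; the CSV path is hypothetical and the keys follow the summary structure built by generate_summary_report:

    import json
    from src.app.pipelines.eda.tools.analysis_tools import univariate_analysis

    summary = json.loads(univariate_analysis("data/raw/example.csv", categorical_threshold=10))
    print(summary["dataset_summary"])                      # dimensions and completeness
    print(summary["numerical_features"]["highly_skewed"])  # features with |skewness| > 1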
src/app/pipelines/eda/tools/data_cleaning_tools/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .handle_missing_values import handle_missing_values
2
+ from .handle_outliers import handle_outliers
src/app/pipelines/eda/tools/data_cleaning_tools/handle_missing_values.py ADDED
@@ -0,0 +1,64 @@
1
+ import pandas as pd
2
+ from agno.utils.log import logger
3
+ import json
4
+
5
+ def handle_missing_values(file_path: str) -> str:
6
+ """
7
+ Loads a CSV file using pandas, handles missing values using statistical methods,
8
+ and saves the processed file to "src/core/cache/dataset_logs/".
9
+
10
+ Args:
11
+ file_path (str): Path to the CSV file.
12
+ Returns:
13
+ A JSON string describing the status of the process
14
+ """
15
+ try:
16
+ df = pd.read_csv(file_path)
17
+ logger.info(f"Started Missing Values Handler. CSV Loaded with shape: {df.shape} ")
18
+
19
+ missing_before = df.isnull().sum().to_dict()
20
+
21
+ result_df = df.copy()
22
+
23
+ numeric_cols = result_df.select_dtypes(include=['number']).columns.tolist()
24
+ categorical_cols = result_df.select_dtypes(include=['category']).columns.tolist()
25
+ text_cols = result_df.select_dtypes(include=['object']).columns.tolist()
26
+
27
+
28
+ for col in numeric_cols:
29
+ if result_df[col].isnull().any():
30
+ median_value = result_df[col].median()
31
+ result_df[col] = result_df[col].fillna(median_value)
32
+ logger.info(f"Filled {col} missing values with median: {median_value}")
33
+
34
+ for col in categorical_cols:
35
+ if result_df[col].isnull().any():
36
+ if not result_df[col].mode().empty:
37
+ mode_value = result_df[col].mode()[0]
38
+ result_df[col] = result_df[col].fillna(mode_value)
39
+ logger.info(f"Filled {col} missing values with mode: {mode_value}")
40
+ else:
41
+ logger.warning(f"Column {col} has no mode. Missing values remain.")
42
+
43
+ for col in text_cols:
44
+ if result_df[col].isnull().any():
45
+ if not result_df[col].mode().empty:
46
+ mode_value = result_df[col].mode()[0]
47
+ result_df[col] = result_df[col].fillna(mode_value)
48
+ logger.info(f"Filled text column {col} missing values with most frequent value")
49
+ else:
50
+ result_df[col] = result_df[col].fillna("")
51
+ logger.info(f"Filled text column {col} missing values with empty string")
52
+
53
+ output_dir = "src/core/cache/dataset_logs"
54
+
55
+ missing_after = result_df.isnull().sum().to_dict()
56
+ result_df.to_csv(f"{output_dir}/handle_missing_values_output.csv", index=False)
57
+
58
+ logger.info(f"CSV output stored with shape: {result_df.shape} ")
59
+
60
+ return json.dumps({'status': 'success'})
61
+
62
+ except Exception as e:
63
+ logger.warning(f"Failed to process CSV file: {e}")
64
+ return json.dumps({'status': 'error', 'message': str(e)})
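A hedged usage sketch for the imputation helper above; the input path is hypothetical, while the output location is the one hard-coded in the function:

    import json
    from src.app.pipelines.eda.tools.data_cleaning_tools import handle_missing_values

    status = json.loads(handle_missing_values("data/raw/example.csv"))
    print(status)   # {'status': 'success'} once median/mode fills are done
    # imputed copy: src/core/cache/dataset_logs/handle_missing_values_output.csv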
src/app/pipelines/eda/tools/data_cleaning_tools/handle_outliers.py ADDED
@@ -0,0 +1,83 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from agno.tools import Toolkit
4
+ from agno.utils.log import logger
5
+ import json
6
+
7
+ def _detect_outliers(df: pd.DataFrame, threshold: float = 3.5, remove_outliers: bool = True) -> pd.DataFrame:
8
+ """
9
+ Detect and handle outliers in numerical columns using the Modified Z-Score method.
10
+
11
+ Args:
12
+ df (pd.DataFrame): Input dataframe.
13
+ threshold (float): The threshold for detecting outliers (default is 3.5).
14
+ remove_outliers (bool): Whether to remove outliers (True) or just mark them (False).
15
+
16
+ Returns:
17
+ pd.DataFrame: DataFrame with outliers handled.
18
+ """
19
+ result_df = df.copy()
20
+ numeric_cols = result_df.select_dtypes(include=[np.number]).columns.tolist()
21
+
22
+ if not numeric_cols:
23
+ logger.warning("No numerical columns found. Skipping outlier handling.")
24
+ return result_df
25
+
26
+ outlier_flags = pd.DataFrame(index=result_df.index)
27
+
28
+ for col in numeric_cols:
29
+ median = result_df[col].median()
30
+ mad = np.median(np.abs(result_df[col] - median))
31
+
32
+ if mad == 0: # Prevent division by zero
33
+ continue
34
+
35
+ # Compute Modified Z-Score
36
+ mod_z_score = 0.6745 * (result_df[col] - median) / mad
37
+
38
+ # Mark outliers
39
+ outlier_flags[col + '_outlier'] = np.abs(mod_z_score) > threshold
40
+
41
+ result_df['is_outlier'] = outlier_flags.any(axis=1)
42
+
43
+ if remove_outliers:
44
+ cleaned_df = result_df[~result_df['is_outlier']].drop(columns=['is_outlier'])
45
+ logger.info(f"Removed {result_df['is_outlier'].sum()} outliers.")
46
+ else:
47
+ cleaned_df = result_df
48
+
49
+ return cleaned_df
50
+
51
+ def handle_outliers(file_path: str, threshold: float = 3.5, remove_outliers: bool = True) -> str:
52
+ """
53
+ Loads a CSV file, detects/removes outliers using the Modified Z-Score method, and saves the cleaned data to "src/core/cache/dataset_logs/".
54
+
55
+ Args:
56
+ file_path (str): Path to the CSV file.
57
+ threshold (float): The threshold for detecting outliers.
58
+ remove_outliers (bool): Whether to remove outliers or just mark them.
59
+
60
+ Returns:
61
+ str: A JSON string with the process status, outlier statistics, and output file path.
62
+ """
63
+ try:
64
+ df = pd.read_csv(file_path)
65
+ logger.warn(f"Started Outlier Detection. CSV Loaded with shape: {df.shape}")
66
+
67
+ rows_before = len(df)
68
+ df = _detect_outliers(df, threshold, remove_outliers)
69
+
70
+ rows_removed = rows_before - len(df)
71
+
72
+ output_dir = "src/core/cache/dataset_logs"
73
+
74
+ output_path = f"{output_dir}/outlier_detection_output.csv"
75
+ df.to_csv(output_path, index=False)
76
+
77
+ logger.info(f"CSV output stored at {output_path} with shape: {df.shape}")
78
+
79
+ return json.dumps(f"Total outliers before: {outliers_before}. Total outliers after: {outliers_after}. The threshold used for dealing with outliers is {threshold}. The file is stored in path: '{output_path}' ")
80
+
81
+ except Exception as e:
82
+ logger.warning(f"Failed to process CSV file: {e}")
83
+ return {"status": "Error", "message": str(e)}
src/app/pipelines/eda/tools/lib.py ADDED
@@ -0,0 +1,59 @@
1
+ from .analysis_tools import univariate_analysis, bivariate_analysis, multivariate_analysis
2
+ from .data_cleaning_tools import handle_outliers, handle_missing_values
3
+
4
+ tool_library = {
5
+ # "HandleMissingValues": {
6
+ # "name": "Missing Values Handler",
7
+ # "function": handle_missing_values,
8
+ # "metadata": '''
9
+ # 1. Fills missing values with the median of each column.
10
+ # 2. Fills missing values with the mode, if available; otherwise, logs a warning.
11
+ # 3. Fills missing values with the most frequent value or an empty string if mode is unavailable.
12
+ # ''',
13
+ # },
14
+ "handle_outliers": {
15
+ "name": "Outlier Handler",
16
+ "function": handle_outliers,
17
+ "metadata": '''
18
+ 1. Uses median and MAD (Median Absolute Deviation) to detect outliers.
19
+ 2. Identifies extreme values based on a set threshold and either excludes them from the dataset or keeps them marked for reference.
20
+ ''',
21
+ },
22
+ 'univariate_analysis': {
23
+ "name": "Univariate Analysis",
24
+ "function": univariate_analysis,
25
+ "metadata": '''
26
+ 1. Provides a high-level summary of dataset structure, data types, and missing value statistics.
27
+ 2. Analyzes missing values, their distribution, and correlation between missing columns.
28
+ 3. Performs feature-specific analysis based on detected data types
29
+ 4. Computes descriptive statistics, normality tests, and outlier detection for numerical columns.
30
+ 5. Analyzes categorical distributions, entropy, and category frequencies with top values.
31
+ 6. Extracts patterns, ranges, and trends from datetime columns.
32
+ ''',
33
+ },
34
+ 'bivariate_analysis': {
35
+ "name": "Bivariate Analysis",
36
+ "function": bivariate_analysis,
37
+ "metadata": '''
38
+ 1. Uses Pearson, Spearman, and Kendall correlations for numerical variables, chi-square/Cramér’s V for categorical associations, and statistical tests like ANOVA for numerical vs. categorical analysis. Identifies best-fit relationships (linear, polynomial, etc.) for numerical pairs.
39
+ 2. Provides a detailed bivariate analysis of all variable pairs in a dataframe, summarizing key correlations, associations, and insights. Optionally generates and saves visualizations like scatterplots and heatmaps.
40
+ 3. Uses Chi-square tests and Cramer's V to assess categorical feature associations, calculates Phi coefficient for 2x2 tables, and computes Goodman & Kruskal’s Lambda for predictive strength.
41
+ 4. Identifies statistically significant relationships between categorical variables, ranks them by strength, and optionally visualizes contingency tables as heatmaps.
42
+ 5. The function performs ANOVA (One-Way & Welch’s ANOVA), Point-Biserial Correlation (for binary categories), and Levene’s test to analyze relationships between numerical and categorical features, calculating effect sizes (eta-squared, omega-squared) for significance testing.
43
+ ''',
44
+ },
45
+ 'multivariate_analysis': {
46
+ "name": "Multivariate Analysis",
47
+ "function": multivariate_analysis,
48
+ "metadata": '''
49
+ 1. Calculates the pairwise correlation coefficients between all numerical columns in a given DataFrame, generating a correlation matrix.
50
+ 2. It identifies pairs of numerical features with absolute correlation values exceeding a threshold of 0.7, indicating strong linear relationships.
51
+ 3. Calculates the Variance Inflation Factor (VIF) for each numerical feature to detect multicollinearity, flagging features with VIF values greater than 10 as potential issues.
52
+ 4. Uses PCA, Factor Analysis, t-SNE, and MDS. Identifies principal components or latent factors, aiming for 80% variance retention in PCA.
53
+ 5. Finds optimal clusters using silhouette score, evaluates cluster quality. Density-based clustering for smaller datasets (<=5000 rows), identifies noise. Fits Gaussian mixture models, evaluates model fit.
54
+ 6. Statistical tests and mutual information to rank individual feature relevance. Random Forest models to determine feature contribution to prediction. Iterative feature removal to select top features (max 10).
55
+ 7. Detects outliers by isolating them in random partitions, using a contamination rate of 5%. Identifies local density deviations for smaller datasets (<= 5000 rows), also using a 5% contamination rate and 20 neighbors. Provides the number and percentage of detected outliers for each method.
56
+ 8. MANOVA: Tests mean differences across categorical target groups for multiple numerical features. LDA: Dimensionality reduction and classification for categorical targets.
57
+ '''
58
+ }
59
+ }
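A hedged sketch of how an entry in tool_library might be dispatched by a caller; the CSV path is hypothetical:

    entry = tool_library["handle_outliers"]
    print(entry["name"])   # "Outlier Handler"
    result = entry["function"]("data/raw/example.csv", threshold=3.5, remove_outliers=True)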
src/app/pipelines/modules/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .data_statistics import DataStatisticsWorkflow
2
+ from .data_quality_assessment import DataQualityAssessmentWorkflow
3
+ from .data_understanding_context import DataUnderstandingContextWorkflow
4
+ from .univariate_analysis import UnivariateAnalysisWorkflow
src/app/pipelines/modules/data_quality_assessment.py ADDED
@@ -0,0 +1,1657 @@
1
+ import re
2
+ import os
3
+ import ast
4
+ import json
5
+ import difflib
6
+ import numpy as np
7
+ import pandas as pd
8
+ from scipy import stats
9
+ from typing import Union
10
+ from collections import Counter
11
+ from src.core.utils import logger
12
+ from scipy.spatial.distance import pdist
13
+ from scipy.stats import chi2_contingency
14
+ from agno.agent import Agent, RunResponse
15
+ from agno.models.openai import OpenAIChat
16
+ from src.core.utils import KnowledgeBaseClass
17
+ from scipy.cluster.hierarchy import linkage, fcluster
18
+ from sklearn.metrics.pairwise import cosine_similarity
19
+ from typing import Dict, List, Union, Tuple, Any, Optional
20
+ from sklearn.feature_extraction.text import TfidfVectorizer
21
+
22
+ class DataQualityAssessmentWorkflow:
23
+ def __init__(
24
+ self, data_source: str,
25
+ llm_choice: str,
26
+ ml_task: str
27
+ ) -> None:
28
+ '''Initialize the workflow with a data source, LLM choice and ML task, and load the dataset.'''
29
+ self.data = None
30
+ self.data_source = data_source
31
+ self.llm_choice = llm_choice
32
+ self.ml_task = ml_task
33
+ self.llm = OpenAIChat(id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))
34
+ self.writer: Agent = Agent(
35
+ model=self.llm,
36
+ instructions=[
37
+ "You will be provided with lots of structured outputs. Your work is to display this"
38
+ "in a nicely formatted manner. You must analayze the results and output a comprehensive and insightful report"
39
+ ],
40
+ markdown=True,
41
+ )
42
+ _ = self.load_data(data_source=data_source)
43
+
44
+ def load_data(self, data_source: str) -> Union[None, bool]:
45
+ '''Load CSV into dataframe'''
46
+ try:
47
+ self.data = pd.read_csv(data_source)
48
+ return True
49
+ except Exception as e:
50
+ logger.error(
51
+ f"Failed to read the file from the data source with error: {e}", log_type="data_quality_assessment", console=True)
52
+ return False
53
+
54
+ def analyze_missing_data(self, verbose=False) -> Dict:
55
+ """Comprehensive analysis of missing data patterns in a CSV dataset"""
56
+ results = {
57
+ "status": "success",
58
+ "data_loaded": False,
59
+ "data_shape": None,
60
+ "missing_rates": None,
61
+ "little_mcar_test": None,
62
+ "mar_correlations": None,
63
+ "mutual_missingness": None,
64
+ "segment_analysis": None,
65
+ "summary": None,
66
+ "errors": []
67
+ }
68
+
69
+ try:
70
+ df = self.data
71
+ results["data_loaded"] = True
72
+ results["data_shape"] = df.shape
73
+ except Exception as e:
74
+ results["status"] = "failed"
75
+ results["errors"].append(f"Failed to load data: {str(e)}")
76
+ return results
77
+
78
+ try:
79
+ results["missing_rates"] = self.analyze_missing_rates(df, verbose=verbose)
80
+ except Exception as e:
81
+ results["errors"].append(f"Failed to analyze missing rates: {str(e)}")
82
+ logger.error(f"Failed to analyze missing rates: {str(e)}", log_type='data_quality_assessment', console=verbose)
83
+
84
+ try:
85
+ results["little_mcar_test"] = self.littles_mcar_test(df, verbose=verbose)
86
+ except Exception as e:
87
+ results["errors"].append(f"Failed to perform Little's MCAR test: {str(e)}")
88
+ logger.error(f"Failed to perform Little's MCAR test: {str(e)}", log_type='data_quality_assessment', console=verbose)
89
+
90
+ try:
91
+ results["mar_correlations"] = self.check_mar_correlations(df, verbose=verbose)
92
+ except Exception as e:
93
+ results["errors"].append(f"Failed to perform MAR correlation check: {str(e)}")
94
+ logger.error(f"Failed to perform MAR correlation check: {str(e)}", log_type='data_quality_assessment', console=verbose)
95
+
96
+ try:
97
+ results["mutual_missingness"] = self.analyze_mutual_missingness(df, verbose=verbose)
98
+ except Exception as e:
99
+ results["errors"].append(f"Failed to analyze mutual missingness: {str(e)}")
100
+ logger.error(f"Failed to analyze mutual missingness: {str(e)}", log_type='data_quality_assessment', console=verbose)
101
+
102
+ try:
103
+ results["segment_analysis"] = self.segment_based_analysis(df, verbose=verbose)
104
+ except Exception as e:
105
+ results["errors"].append(f"Failed to perform segment-based analysis: {str(e)}")
106
+ logger.error(f"Failed to perform segment-based analysis: {str(e)}", log_type='data_quality_assessment', console=verbose)
107
+
108
+ results["summary"] = self.generate_summary(results, verbose=verbose)
109
+
110
+ return results
111
+
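# A hedged usage sketch for the workflow above; the path and model id are
# assumptions, not part of the original API:
#
#   wf = DataQualityAssessmentWorkflow(
#       data_source="data/raw/example.csv",
#       llm_choice="gpt-4o-mini",
#       ml_task="classification",
#   )
#   report = wf.analyze_missing_data(verbose=True)
#   print(report["summary"])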
112
+ def analyze_missing_rates(self, df: pd.DataFrame, verbose=False) -> Dict:
113
+ """Calculate per-variable/column missing rates"""
114
+ try:
115
+ missing_count = df.isnull().sum()
116
+ missing_percentage = (missing_count / len(df)) * 100
117
+
118
+ missing_info = pd.DataFrame({
119
+ 'Column': missing_count.index,
120
+ 'Missing Count': missing_count.values,
121
+ 'Missing Percentage': missing_percentage.values
122
+ })
123
+
124
+ missing_info = missing_info.sort_values('Missing Percentage', ascending=False)
125
+
126
+ total_cells = df.shape[0] * df.shape[1]
127
+ total_missing = df.isnull().sum().sum()
128
+ overall_missing_percentage = (total_missing / total_cells) * 100
129
+
130
+ result = {
131
+ 'per_column': missing_info.to_dict('records'),
132
+ 'overall': {
133
+ 'total_cells': total_cells,
134
+ 'total_missing': total_missing,
135
+ 'overall_missing_percentage': overall_missing_percentage
136
+ }
137
+ }
138
+
139
+ return result
140
+ except Exception as e:
141
+ logger.error(f"Error in analyze_missing_rates: {str(e)}", log_type="data_quality_assessment", console=verbose)
142
+ return {'error': str(e)}
143
+
144
+ def littles_mcar_test(self, df: pd.DataFrame, verbose=False) -> Dict:
145
+ """Perform Little's MCAR test to check if data is Missing Completely At Random"""
146
+ try:
147
+ numeric_df = df.select_dtypes(include=[np.number])
148
+ if numeric_df.empty:
149
+ return {
150
+ 'status': 'skipped',
151
+ 'reason': 'No numeric columns available for Little\'s MCAR test'
152
+ }
153
+
154
+ if numeric_df.shape[0] < 3 or numeric_df.shape[1] < 2:
155
+ return {
156
+ 'status': 'skipped',
157
+ 'reason': 'Not enough data for meaningful Little\'s MCAR test'
158
+ }
159
+
160
+ results = []
161
+
162
+ for col in numeric_df.columns:
163
+ try:
164
+ missing_indicator = numeric_df[col].isnull()
165
+
166
+ if missing_indicator.sum() == 0 or missing_indicator.sum() == len(missing_indicator):
167
+ continue
168
+
169
+ for other_col in numeric_df.columns:
170
+ if col == other_col:
171
+ continue
172
+
173
+ if numeric_df[other_col].isnull().sum() == len(numeric_df):
174
+ continue
175
+
176
+ not_missing_values = numeric_df.loc[~missing_indicator, other_col].dropna()
177
+ missing_values = numeric_df.loc[missing_indicator, other_col].dropna()
178
+
179
+ if len(not_missing_values) > 5 and len(missing_values) > 5:
180
+ try:
181
+ not_missing_mean = not_missing_values.mean()
182
+ missing_mean = missing_values.mean()
183
+
184
+ t_stat, p_value = stats.ttest_ind(
185
+ not_missing_values,
186
+ missing_values,
187
+ equal_var=False,
188
+ nan_policy='omit'
189
+ )
190
+
191
+ results.append({
192
+ 'reference_col': col,
193
+ 'test_col': other_col,
194
+ 'not_missing_mean': not_missing_mean,
195
+ 'missing_mean': missing_mean,
196
+ 'difference': abs(not_missing_mean - missing_mean),
197
+ 't_statistic': t_stat,
198
+ 'p_value': p_value,
199
+ 'significant': p_value < 0.05
200
+ })
201
+ except Exception:
202
+ pass
203
+ except Exception:
204
+ continue
205
+ significant_tests = [test for test in results if test.get('significant', False)]
206
+
207
+ if not results:
208
+ is_mcar = None
209
+ evidence_strength = None
210
+ else:
211
+ prop_significant = len(significant_tests) / len(results)
212
+
213
+ is_mcar = prop_significant <= 0.05
214
+
215
+ if prop_significant == 0:
216
+ evidence_strength = "Strong evidence for MCAR"
217
+ elif prop_significant <= 0.05:
218
+ evidence_strength = "Moderate evidence for MCAR"
219
+ elif prop_significant <= 0.10:
220
+ evidence_strength = "Weak evidence against MCAR"
221
+ else:
222
+ evidence_strength = "Strong evidence against MCAR"
223
+
224
+ return {
225
+ 'status': 'completed',
226
+ 'is_mcar': is_mcar,
227
+ 'tests_conducted': len(results),
228
+ 'significant_tests': len(significant_tests),
229
+ 'proportion_significant': len(significant_tests) / len(results) if results else None,
230
+ 'evidence_strength': evidence_strength,
231
+ 'test_details': results[:10]
232
+ }
233
+ except Exception as e:
234
+ logger.error(f"Error in littles_mcar_test: {str(e)}", log_type="data_quality_assessment", console=verbose)
235
+ return {'status': 'error', 'error': str(e)}
236
+
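# The MCAR heuristic above reduces to Welch t-tests of this shape, comparing
# another column's values for rows where `col` is missing vs. observed
# (column names here are placeholders):
#
#   observed = df.loc[df["col"].notna(), "other_col"].dropna()
#   missing = df.loc[df["col"].isna(), "other_col"].dropna()
#   t_stat, p_value = stats.ttest_ind(observed, missing, equal_var=False)
#   # p_value < 0.05 counts as evidence against MCAR for that pair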
237
+ def check_mar_correlations(self, df: pd.DataFrame, verbose=False) -> Dict:
238
+ """Check if missingness in one variable depends on observed values in others (MAR)"""
239
+ try:
240
+ results = {}
241
+
242
+ for col in df.columns:
243
+ try:
244
+ missing_count = df[col].isnull().sum()
245
+ if missing_count == 0 or missing_count == len(df):
246
+ continue
247
+
248
+ missingness = df[col].isnull().astype(int)
249
+
250
+ col_results = {}
251
+
252
+ for other_col in df.columns:
253
+ if col == other_col or df[other_col].isnull().sum() == len(df):
254
+ continue
255
+
256
+ if pd.api.types.is_numeric_dtype(df[other_col]):
257
+ try:
258
+ present_values = df.loc[~df[col].isnull(), other_col].dropna()
259
+ missing_values = df.loc[df[col].isnull(), other_col].dropna()
260
+
261
+ if len(present_values) > 5 and len(missing_values) > 5:
262
+ present_mean = present_values.mean()
263
+ missing_mean = missing_values.mean()
264
+
265
+ t_stat, p_value = stats.ttest_ind(
266
+ present_values,
267
+ missing_values,
268
+ equal_var=False,
269
+ nan_policy='omit'
270
+ )
271
+
272
+ if p_value < 0.05:
273
+ col_results[other_col] = {
274
+ 'test': 't-test',
275
+ 'present_mean': present_mean,
276
+ 'missing_mean': missing_mean,
277
+ 'mean_difference': missing_mean - present_mean,
278
+ 't_statistic': t_stat,
279
+ 'p_value': p_value,
280
+ 'significant': True
281
+ }
282
+ except Exception as e:
283
+ pass
284
+
285
+ elif pd.api.types.is_object_dtype(df[other_col]) or pd.api.types.is_categorical_dtype(df[other_col]):
286
+ try:
287
+ value_counts = df[other_col].value_counts()
288
+ threshold = max(5, len(df) * 0.01)
289
+
290
+ temp_df = df.copy()
291
+
292
+ low_freq_cats = value_counts[value_counts < threshold].index.tolist()
293
+ if low_freq_cats:
294
+ temp_df[other_col] = temp_df[other_col].apply(
295
+ lambda x: 'Other' if x in low_freq_cats else x
296
+ )
297
+
298
+ contingency = pd.crosstab(
299
+ temp_df[col].isnull(),
300
+ temp_df[other_col].fillna('Missing')
301
+ )
302
+
303
+ if (contingency.shape[0] > 1 and contingency.shape[1] > 1 and
304
+ (contingency < 5).sum().sum() <= contingency.size * 0.2):
305
+
306
+ chi2, p, dof, expected = chi2_contingency(contingency)
307
+
308
+ if p < 0.05:
309
+ col_results[other_col] = {
310
+ 'test': 'chi-square',
311
+ 'chi2': chi2,
312
+ 'p_value': p,
313
+ 'dof': dof,
314
+ 'significant': True
315
+ }
316
+ except Exception as e:
317
+ pass
318
+
319
+ if col_results:
320
+ results[col] = col_results
321
+ except:
322
+ continue
323
+
324
+ is_mar = len(results) > 0
325
+
326
+ return {
327
+ 'is_mar': is_mar,
328
+ 'details': results,
329
+ 'columns_with_mar_evidence': list(results.keys())
330
+ }
331
+ except Exception as e:
332
+ logger.error(f"Error in check_mar_correlations: {str(e)}", log_type="data_quality_assessment", console=verbose)
333
+ return {'error': str(e)}
334
+
335
+ def analyze_mutual_missingness(self, df: pd.DataFrame, verbose=False) -> Dict:
336
+ """Check for mutual missingness - analyze if certain features tend to be missing together"""
337
+ try:
338
+ binary_missing = df.isnull().astype(int)
339
+
340
+ if binary_missing.sum().sum() == 0:
341
+ return {
342
+ 'status': 'skipped',
343
+ 'reason': 'No missing values found in dataset'
344
+ }
345
+
346
+ missingness_correlation = binary_missing.corr()
347
+
348
+ strong_correlations = []
349
+
350
+ for i in range(len(missingness_correlation.columns)):
351
+ try:
352
+ for j in range(i+1, len(missingness_correlation.columns)):
353
+ col1 = missingness_correlation.columns[i]
354
+ col2 = missingness_correlation.columns[j]
355
+ corr_value = missingness_correlation.iloc[i, j]
356
+
357
+ if abs(corr_value) > 0.5:
358
+ contingency = pd.crosstab(
359
+ binary_missing[col1],
360
+ binary_missing[col2]
361
+ )
362
+
363
+ try:
364
+ if contingency.values.min() < 5:
365
+ from scipy.stats import fisher_exact
366
+ _, p_value = fisher_exact(contingency)
367
+ else:
368
+ _, p_value, _, _ = chi2_contingency(contingency)
369
+
370
+ if p_value < 0.05:
371
+ strong_correlations.append({
372
+ 'column1': col1,
373
+ 'column2': col2,
374
+ 'correlation': corr_value,
375
+ 'p_value': p_value,
376
+ 'significant': True
377
+ })
378
+ except Exception:
379
+ strong_correlations.append({
380
+ 'column1': col1,
381
+ 'column2': col2,
382
+ 'correlation': corr_value,
383
+ 'significant': None
384
+ })
385
+ except:
386
+ continue
387
+
388
+ co_occurrence = {}
389
+ for i in range(len(df.columns)):
390
+ try:
391
+ for j in range(i+1, len(df.columns)):
392
+ col1 = df.columns[i]
393
+ col2 = df.columns[j]
394
+
395
+ both_missing = (df[col1].isnull() & df[col2].isnull()).sum()
396
+
397
+ if both_missing > 0:
398
+ col1_missing = df[col1].isnull().sum()
399
+ col2_missing = df[col2].isnull().sum()
400
+
401
+ union_missing = (df[col1].isnull() | df[col2].isnull()).sum()
402
+ jaccard = both_missing / union_missing if union_missing > 0 else 0
403
+
404
+ co_occurrence[(col1, col2)] = {
405
+ 'both_missing': both_missing,
406
+ 'col1_missing': col1_missing,
407
+ 'col2_missing': col2_missing,
408
+ 'co_occurrence_ratio': both_missing / min(col1_missing, col2_missing) if min(col1_missing, col2_missing) > 0 else 0,
409
+ 'jaccard_coefficient': jaccard
410
+ }
411
+ except:
412
+ continue
413
+
414
+ co_occurrence_list = [
415
+ {
416
+ 'column1': cols[0],
417
+ 'column2': cols[1],
418
+ 'both_missing_count': data['both_missing'],
419
+ 'co_occurrence_ratio': data['co_occurrence_ratio'],
420
+ 'jaccard_coefficient': data['jaccard_coefficient']
421
+ }
422
+ for cols, data in co_occurrence.items()
423
+ ]
424
+ co_occurrence_list.sort(key=lambda x: x['co_occurrence_ratio'], reverse=True)
425
+
426
+ missingness_clusters = None
427
+ try:
428
+ if len(binary_missing.columns) > 1:
429
+
430
+
431
+ cols_with_missing = [col for col in binary_missing.columns if binary_missing[col].sum() > 0]
432
+ if len(cols_with_missing) > 1:
433
+ missing_data = binary_missing[cols_with_missing].T
434
+
435
+ dist_matrix = pdist(missing_data, metric='correlation')
436
+
437
+ linkage_matrix = linkage(dist_matrix, method='average')
438
+
439
+ clusters = fcluster(linkage_matrix, t=0.5, criterion='distance')
440
+
441
+ missingness_clusters = {}
442
+ for i, col in enumerate(cols_with_missing):
443
+ cluster_id = clusters[i]
444
+ if cluster_id not in missingness_clusters:
445
+ missingness_clusters[cluster_id] = []
446
+ missingness_clusters[cluster_id].append(col)
447
+
448
+ missingness_clusters = {k: v for k, v in missingness_clusters.items() if len(v) > 1}
449
+ except Exception as e:
450
+ logger.info(f"Warning: Clustering of missingness patterns failed: {str(e)}", log_type="data_quality_assessment", console=verbose)
451
+
452
+ return {
453
+ 'status': 'completed',
454
+ 'strong_correlations': strong_correlations,
455
+ 'co_occurrence': co_occurrence_list[:15],
456
+ 'missingness_clusters': missingness_clusters
457
+ }
458
+ except Exception as e:
459
+ logger.error(f"Error in analyze_mutual_missingness: {str(e)}", log_type='data_quality_assessment', console=verbose)
460
+ return {'status': 'error', 'error': str(e)}
461
+
462
+ def segment_based_analysis(self, df: pd.DataFrame, verbose=False) -> Dict:
463
+ """Analyze missingness grouped by categories"""
464
+ try:
465
+ results = {}
466
+
467
+ categorical_cols = [
468
+ col for col in df.columns
469
+ if pd.api.types.is_object_dtype(df[col]) or
470
+ pd.api.types.is_categorical_dtype(df[col]) or
471
+ (pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() < 10)
472
+ ]
473
+
474
+ if not categorical_cols:
475
+ return {
476
+ 'status': 'skipped',
477
+ 'reason': 'No suitable categorical columns found for segmentation'
478
+ }
479
+
480
+ for cat_col in categorical_cols:
481
+ try:
482
+ if df[cat_col].isnull().sum() > 0.5 * len(df):
483
+ continue
484
+
485
+ df_temp = df.copy()
486
+ df_temp[cat_col] = df_temp[cat_col].fillna('Missing')
487
+
488
+ categories = df_temp[cat_col].value_counts().head(10).index.tolist()
489
+
490
+ category_results = {}
491
+
492
+ for category in categories:
493
+ subset = df_temp[df_temp[cat_col] == category]
494
+
495
+ if len(subset) < 5:
496
+ continue
497
+
498
+ missing_percentages = subset.drop(columns=[cat_col]).isnull().mean() * 100
499
+
500
+ missing_percentages = missing_percentages[missing_percentages > 0]
501
+
502
+ if not missing_percentages.empty:
503
+ category_results[category] = {
504
+ 'sample_size': len(subset),
505
+ 'missing_percentages': missing_percentages.to_dict()
506
+ }
507
+
508
+ if category_results:
509
+ results[cat_col] = category_results
510
+
511
+ except Exception as e:
512
+ logger.error(f"Error in segment_based_analysis: {str(e)}", log_type='data_quality_assessment', console=verbose)
513
+ return {'status': 'error', 'error': str(e)}
514
+
515
+ return {
516
+ 'status': 'completed',
517
+ 'segments_analyzed': len(results),
518
+ 'details': results
519
+ }
520
+ except Exception as e:
521
+ logger.error(f"Error in segment_based_analysis: {str(e)}", log_type='data_quality_assessment', console=verbose)
522
+ return {'status': 'error', 'error': str(e)}
523
+
524
+ def generate_summary(self, results: Dict, verbose=False) -> str:
525
+ """Generate a human-readable summary of missing data analysis results"""
526
+ summary_lines = ["# Missing Data Analysis Summary"]
527
+
528
+ try:
529
+ if results["data_loaded"]:
530
+ summary_lines.append("\n## Dataset Overview")
531
+ summary_lines.append(f"- Successfully loaded dataset with shape: {results['data_shape'][0]} rows × {results['data_shape'][1]} columns")
532
+ else:
533
+ summary_lines.append("\n## Error Loading Dataset")
534
+ summary_lines.append("- Failed to load the dataset. Please check the file path and format.")
535
+ return "\n".join(summary_lines)
536
+ except Exception as e:
537
+ logger.error(f"Error in dataset loading section: {str(e)}", log_type='data_quality_assessment', console=verbose)
538
+
539
+ try:
540
+ if results.get("missing_rates") and "error" not in results["missing_rates"]:
541
+ missing_rates = results["missing_rates"]
542
+ summary_lines.append("\n## Missing Values Overview")
543
+ summary_lines.append(f"- Overall missing data: {missing_rates['overall']['overall_missing_percentage']:.2f}% "
544
+ f"({missing_rates['overall']['total_missing']} out of {missing_rates['overall']['total_cells']} cells)")
545
+
546
+ if missing_rates['per_column']:
547
+ summary_lines.append("\n### Top columns with missing values:")
548
+ top_missing = sorted(missing_rates['per_column'], key=lambda x: x['Missing Percentage'], reverse=True)[:5]
549
+ for col in top_missing:
550
+ if col['Missing Percentage'] > 0:
551
+ summary_lines.append(f"- {col['Column']}: {col['Missing Percentage']:.2f}% ({col['Missing Count']} values)")
552
+ except Exception as e:
553
+ logger.error(f"Error in missing values overview section: {str(e)}", log_type='data_quality_assessment', console=verbose)
554
+
555
+ try:
556
+ little_test = results.get("little_mcar_test", {})
557
+ if little_test.get("status") == "completed":
558
+ summary_lines.append("\n## Little's MCAR Test Results")
559
+ if little_test.get("is_mcar") is not None:
560
+ if little_test["is_mcar"]:
561
+ summary_lines.append("- **Data appears to be Missing Completely At Random (MCAR)**")
562
+ if "proportion_significant" in little_test:
563
+ summary_lines.append(f"- Proportion of significant tests: {little_test['proportion_significant']:.2f}")
564
+ summary_lines.append(f"- Evidence: {little_test.get('evidence_strength', 'Evidence supports MCAR')}")
565
+ else:
566
+ summary_lines.append("- **Data does not appear to be Missing Completely At Random (MCAR)**")
567
+ if "proportion_significant" in little_test:
568
+ summary_lines.append(f"- Proportion of significant tests: {little_test['proportion_significant']:.2f}")
569
+ summary_lines.append(f"- Evidence: {little_test.get('evidence_strength', 'Evidence against MCAR')}")
570
+ else:
571
+ summary_lines.append("- Could not determine MCAR status definitively")
572
+ elif little_test.get("status") == "skipped":
573
+ summary_lines.append("\n## Little's MCAR Test Results")
574
+ summary_lines.append(f"- Test skipped: {little_test.get('reason', 'Unknown reason')}")
575
+ except Exception as e:
576
+ logger.error(f"Error in Little's MCAR test section: {str(e)}", log_type='data_quality_assessment', console=verbose)
577
+
578
+ try:
579
+ mar_results = results.get("mar_correlations", {})
580
+ if mar_results and "error" not in mar_results:
581
+ summary_lines.append("\n## Missing At Random (MAR) Analysis")
582
+ if mar_results.get("is_mar"):
583
+ summary_lines.append("- **Evidence found that data is Missing At Random (MAR)**")
584
+ summary_lines.append("- Missingness in some variables depends on observed values in other variables")
585
+ mar_details = mar_results.get("details", {})
586
+ for col in list(mar_details.keys())[:3]:
587
+ related_cols = list(mar_details[col].keys())
588
+ summary_lines.append(f"- Missingness in '{col}' depends on values in: {', '.join(related_cols[:3])}"
589
+ + (f" and {len(related_cols) - 3} more columns" if len(related_cols) > 3 else ""))
590
+ else:
591
+ summary_lines.append("- No clear evidence that data is Missing At Random (MAR)")
592
+ except Exception as e:
593
+ logger.error(f"Error in MAR analysis section: {str(e)}", log_type='data_quality_assessment', console=verbose)
594
+
595
+ try:
596
+ mutual_results = results.get("mutual_missingness", {})
597
+ if mutual_results.get("status") == "completed":
598
+ summary_lines.append("\n## Mutual Missingness Analysis")
599
+ strong_corrs = mutual_results.get("strong_correlations", [])
600
+ if strong_corrs:
601
+ summary_lines.append("- **Some variables tend to be missing together**")
602
+ for corr in strong_corrs[:3]:
603
+ summary_lines.append(f"- '{corr['column1']}' and '{corr['column2']}' have strongly correlated missingness (r = {corr['correlation']:.2f})")
604
+ if len(strong_corrs) > 3:
605
+ summary_lines.append(f"- {len(strong_corrs) - 3} other pairs of columns with strongly correlated missingness")
606
+ else:
607
+ summary_lines.append("- No strong patterns of mutual missingness detected")
608
+
609
+ co_occurrence = mutual_results.get("co_occurrence", [])
610
+ if co_occurrence:
611
+ summary_lines.append("\n### Most common co-occurrences of missing values:")
612
+ for co in co_occurrence[:3]:
613
+ summary_lines.append(f"- '{co['column1']}' and '{co['column2']}' are missing together in {co['both_missing_count']} rows")
614
+ except Exception as e:
615
+ logger.error(f"Error in mutual missingness section: {str(e)}", log_type='data_quality_assessment', console=verbose)
616
+
617
+ try:
618
+ segment_results = results.get("segment_analysis", {})
619
+ if segment_results.get("status") == "completed":
620
+ summary_lines.append("\n## Segment-based Missing Value Analysis")
621
+ segments = segment_results.get("details", {})
622
+ if segments:
623
+ summary_lines.append(f"- Analyzed missingness patterns across {len(segments)} different segmentations")
624
+ example_segment = list(segments.keys())[0]
625
+ summary_lines.append(f"\n### Example: Missingness by '{example_segment}' categories")
626
+ for category in list(segments[example_segment].keys())[:3]:
627
+ cat_data = segments[example_segment][category]
628
+ sample_size = cat_data['sample_size']
629
+ missing_data = cat_data['missing_percentages']
630
+ top_missing = sorted(missing_data.items(), key=lambda x: x[1], reverse=True)[:2]
631
+ if top_missing:
632
+ summary_lines.append(f"- In category '{category}' (n={sample_size}):")
633
+ for col, pct in top_missing:
634
+ summary_lines.append(f" - '{col}' has {pct:.1f}% missing values")
635
+ else:
636
+ summary_lines.append("- No significant segment-based missingness patterns found")
637
+ except Exception as e:
638
+ logger.error(f"Error in segment-based analysis section: {str(e)}", log_type='data_quality_assessment', console=verbose)
639
+
640
+ try:
641
+ summary_lines.append("\n## Recommendations for Missing Data")
642
+ if results.get("missing_rates") and "error" not in results["missing_rates"]:
643
+ missing_rates = results["missing_rates"]
644
+ high_missing_cols = [col for col in missing_rates['per_column'] if col['Missing Percentage'] > 50]
645
+ if high_missing_cols:
646
+ summary_lines.append("- **Consider dropping columns** with high missing rates (>50%):")
647
+ for col in high_missing_cols[:3]:
648
+ summary_lines.append(f" - '{col['Column']}' ({col['Missing Percentage']:.1f}% missing)")
649
+ if len(high_missing_cols) > 3:
650
+ summary_lines.append(f" - and {len(high_missing_cols) - 3} other columns")
651
+
652
+ if results.get("little_mcar_test", {}).get("is_mcar") is True:
653
+ summary_lines.append("- Since data appears to be MCAR, **simple imputation** methods like mean/median/mode imputation are reasonable")
654
+
655
+ if results.get("mar_correlations", {}).get("is_mar") is True:
656
+ summary_lines.append("- Since evidence suggests data is MAR, consider **model-based imputation methods** like:")
657
+ summary_lines.append(" - Multiple Imputation by Chained Equations (MICE)")
658
+ summary_lines.append(" - K-Nearest Neighbors (KNN) imputation")
659
+ summary_lines.append(" - Regression-based imputation")
660
+
661
+ if results.get("mutual_missingness", {}).get("strong_correlations"):
662
+ summary_lines.append("- For variables that are missing together, consider **multivariate imputation** approaches")
663
+ except Exception as e:
664
+ logger.error(f"Error in recommendations section: {str(e)}", log_type='data_quality_assessment', console=verbose)
665
+
666
+ try:
667
+ if results.get("errors"):
668
+ summary_lines.append("\n## Analysis Issues")
669
+ summary_lines.append("The following issues were encountered during analysis:")
670
+ for error in results["errors"]:
671
+ summary_lines.append(f"- {error}")
672
+ except Exception as e:
673
+ logger.error(f"Error in final error reporting section: {str(e)}", log_type='data_quality_assessment', console=verbose)
674
+
675
+ return "\n".join(summary_lines)
676
+
677
+ def agent_mva(self, report: str, verbose=False):
678
+ try:
679
+ kbc = KnowledgeBaseClass()
680
+ kb = kbc.initialize_knowledge_base(task_type=self.ml_task)
681
+ agent = kbc.initialize_agent(
682
+ agent_name="missing_value_analysis_agent",
683
+ llm_choice=self.llm_choice,
684
+ knowledge_base=kb
685
+ )
686
+
687
+ input = f"Task type: {self.ml_task}\n\n\n{report}"
688
+ res: RunResponse = agent.run(input, stream=False)
689
+
690
+ formatted_code = self.post_process_code(code=res.content.code_generated, verbose=verbose)
691
+
692
+ with open("temp.py", "w") as f:
693
+ f.write(f'''{formatted_code}''')
694
+
695
+ from temp import main
696
+
697
+ results = main(self.data_source)
698
+
699
+ return results
700
+ except Exception as e:
701
+ logger.error(f"Failed to run agentic analysis with error: {e}", log_type="data_quality_assessment", console=verbose)
702
+ return ""
703
+
704
+ def post_process_code(self, code: str, verbose=False) -> str:
705
+ def fix_fstring_quotes(match):
706
+ inner = match.group(1)
707
+ fixed_inner = re.sub(r'\{([^{}]*?)\["([^"]+)"\]\}', r"{\1['\2']}", inner)
708
+ return f'f"{fixed_inner}"'
709
+
710
+ fstring_pattern = r"f'([^']*{[^}]+}[^']*)'"
711
+ code = re.sub(fstring_pattern, fix_fstring_quotes, code)
712
+
713
+ try:
714
+ ast.parse(code)
715
+ except SyntaxError as e:
716
+ logger.error(f"[Syntax Error after fix] Line {e.lineno}: {e.msg}", log_type="data_quality_assessment", console=verbose)
717
+ pass
718
+
719
+ return code
720
+
721
+ def detect_duplicates(
722
+ self,
723
+ key_columns: Optional[List[str]] = None,
724
+ similarity_columns: Optional[List[str]] = None,
725
+ similarity_threshold: float = 0.8,
726
+ numeric_threshold: float = 0.05,
727
+ sample_size: Optional[int] = None,
728
+ verbose = False
729
+ ) -> Dict[str, Any]:
730
+ """Comprehensive duplicate detection function that can analyze any CSV dataset"""
731
+
732
+ df = self.data
733
+
734
+ results = {
735
+ "dataset_info": {
736
+ "original_rows": len(df),
737
+ "original_columns": len(df.columns),
738
+ "column_dtypes": {col: str(df[col].dtype) for col in df.columns}
739
+ },
740
+ "exact_duplicates": {},
741
+ "key_based_duplicates": {},
742
+ "near_duplicates": {},
743
+ "distribution_impact": {},
744
+ "summary": ""
745
+ }
746
+
747
+ if sample_size and len(df) > sample_size:
748
+ try:
749
+ analysis_df = df.sample(sample_size, random_state=42)
750
+ results["dataset_info"]["sampled"] = True
751
+ results["dataset_info"]["sample_size"] = sample_size
752
+ except Exception as e:
753
+ analysis_df = df
754
+ results["dataset_info"]["sampling_error"] = str(e)
755
+ else:
756
+ analysis_df = df
757
+
758
+ try:
759
+ exact_duplicates = self.analyze_exact_duplicates(analysis_df, verbose=verbose)
760
+ results["exact_duplicates"] = exact_duplicates
761
+ except Exception as e:
762
+ results["exact_duplicates"]["error"] = str(e)
763
+
764
+ try:
765
+ key_duplicates = self.analyze_key_based_duplicates(analysis_df, key_columns, verbose=verbose)
766
+ results["key_based_duplicates"] = key_duplicates
767
+ except Exception as e:
768
+ results["key_based_duplicates"]["error"] = str(e)
769
+
770
+ try:
771
+ near_duplicates = self.analyze_near_duplicates(analysis_df, similarity_columns,
772
+ similarity_threshold, numeric_threshold, verbose=verbose)
773
+ results["near_duplicates"] = near_duplicates
774
+ except Exception as e:
775
+ results["near_duplicates"]["error"] = str(e)
776
+
777
+ try:
778
+ distribution_impact = self.analyze_distribution_impact(analysis_df, results, verbose=verbose)
779
+ results["distribution_impact"] = distribution_impact
780
+ except Exception as e:
781
+ results["distribution_impact"]["error"] = str(e)
782
+
783
+ try:
784
+ results["summary"] = self.generate_duplicate_analysis_summary(results, verbose=verbose)
785
+ except Exception as e:
786
+ results["summary"] = f"Error generating summary: {str(e)}"
787
+
788
+ return results
789
+
790
+ def analyze_exact_duplicates(self, df: pd.DataFrame, verbose=False) -> Dict[str, Any]:
791
+ """Detect and analyze exact duplicate rows"""
792
+ results = {}
793
+
794
+ try:
795
+ duplicate_mask = df.duplicated()
796
+ duplicated_rows = df[duplicate_mask]
797
+
798
+ unique_duplicate_patterns = df[df.duplicated(keep=False)].drop_duplicates()
799
+
800
+ results["total_exact_duplicates"] = int(duplicate_mask.sum())
801
+ results["unique_duplicate_patterns"] = len(unique_duplicate_patterns)
802
+ results["duplicate_percentage"] = round(results["total_exact_duplicates"] / len(df) * 100, 2)
803
+
804
+ if len(duplicated_rows) > 0:
805
+ dup_counts = Counter(map(tuple, df[df.duplicated(keep=False)].itertuples(index=False)))
806
+ most_common = [(str(k), v) for k, v in dup_counts.most_common(5)]
807
+ results["most_common_duplicates"] = most_common
808
+
809
+ if len(df.columns) <= 20:
810
+ column_duplication = {}
811
+ for col in df.columns:
812
+ dup_count = df.duplicated(subset=[col], keep=False).sum()
813
+ if dup_count > 0:
814
+ column_duplication[col] = int(dup_count)
815
+
816
+ results["column_duplication_counts"] = column_duplication
817
+
818
+ except Exception as e:
819
+ results["error"] = f"Error in exact duplicates analysis: {str(e)}"
820
+ logger.error(f"Error in exact duplicates analysis: {str(e)}", log_type="data_quality_assessment", console=verbose)
821
+
822
+ return results
823
+
824
+ def analyze_key_based_duplicates(self, df: pd.DataFrame, key_columns: Optional[List[str]] = None, verbose=False) -> Dict[str, Any]:
825
+ """Detect and analyze key-based duplicates"""
826
+ results = {}
827
+
828
+ try:
829
+ if key_columns is None:
830
+ key_columns = self.identify_key_candidates(df)
831
+ results["detected_key_candidates"] = key_columns
832
+
833
+ if not key_columns:
834
+ results["message"] = "No key columns identified or provided"
835
+ return results
836
+
837
+ dup_counts = {}
838
+ for key in key_columns:
839
+ try:
840
+ if key in df.columns:
841
+ dups = df.duplicated(subset=[key], keep=False)
842
+ dup_count = int(dups.sum())
843
+ dup_percentage = round(dup_count / len(df) * 100, 2)
844
+
845
+ if dup_count > 0:
846
+ dup_values = df[dups][key].value_counts().head(5).to_dict()
847
+ dup_values = {str(k): int(v) for k, v in dup_values.items()}
848
+ else:
849
+ dup_values = {}
850
+
851
+ dup_counts[key] = {
852
+ "duplicate_count": dup_count,
853
+ "duplicate_percentage": dup_percentage,
854
+ "top_duplicated_values": dup_values
855
+ }
856
+ except Exception as e:
857
+ dup_counts[key] = {"error": str(e)}
858
+
859
+ results["key_duplicate_analysis"] = dup_counts
860
+
861
+ if len(key_columns) > 1:
862
+ try:
863
+ multi_key_dups = df.duplicated(subset=key_columns, keep=False)
864
+ results["multi_key_duplicates"] = {
865
+ "count": int(multi_key_dups.sum()),
866
+ "percentage": round(multi_key_dups.sum() / len(df) * 100, 2)
867
+ }
868
+ except Exception as e:
869
+ results["multi_key_duplicates"] = {"error": str(e)}
870
+
871
+ except Exception as e:
872
+ results["error"] = f"Error in key-based duplicates analysis: {str(e)}"
873
+ logger.error(f"Error in key-based duplicates analysis: {str(e)}", log_type="data_quality_assessment", console=verbose)
874
+
875
+ return results
876
+
877
+ def identify_key_candidates(self, df: pd.DataFrame) -> List[str]:
878
+ """
879
+ Identify potential primary key columns in the dataframe
880
+ """
881
+ candidates = []
882
+
883
+ try:
884
+ for col in df.columns:
885
+ if df[col].nunique() == len(df) and df[col].notna().all():
886
+ candidates.append(col)
887
+
888
+ if not candidates:
889
+ for col in df.columns:
890
+ uniqueness_ratio = df[col].nunique() / len(df)
891
+ if uniqueness_ratio > 0.9 and df[col].notna().all():
892
+ candidates.append(col)
893
+
894
+ if not candidates:
895
+ for col in df.columns:
896
+ if any(key_term in col.lower() for key_term in ['id', 'key', 'code', 'num', 'uuid']):
897
+ candidates.append(col)
898
+
899
+ except Exception:
900
+ pass
901
+
902
+ return candidates
903
+
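The candidate search above falls back through three tiers: fully unique non-null columns, then columns that are more than 90% unique, then columns whose names hint at identifiers. A small hypothetical example of the first tier (the DataFrame and values are made up for illustration):

import pandas as pd

df = pd.DataFrame({
    "order_id": [1, 2, 3, 4],               # fully unique and non-null -> tier-1 candidate
    "customer_code": ["a", "a", "b", "c"],   # only the name-based tier would pick this up
    "amount": [10.0, 12.5, 10.0, 9.9],
})
# identify_key_candidates(df) would return ["order_id"] here; the later tiers
# only run when no earlier tier produced a candidate.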
904
+ def analyze_near_duplicates(
905
+ self,
906
+ df: pd.DataFrame,
907
+ similarity_columns: Optional[List[str]] = None,
908
+ similarity_threshold: float = 0.8,
909
+ numeric_threshold: float = 0.05,
910
+ verbose = False
911
+ ) -> Dict[str, Any]:
912
+ """
913
+ Detect and analyze near-duplicate rows using similarity metrics
914
+ """
915
+ results = {}
916
+
917
+ try:
918
+ if similarity_columns is None:
919
+ text_columns = [col for col in df.columns if df[col].dtype == 'object']
920
+ numeric_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
921
+
922
+ similarity_columns = text_columns[:5] if len(text_columns) > 5 else text_columns
923
+ results["auto_selected_text_columns"] = similarity_columns
924
+ results["available_numeric_columns"] = numeric_columns
925
+
926
+ if not similarity_columns:
927
+ results["message"] = "No text columns identified for similarity analysis"
928
+ return results
929
+
930
+ sample_size = min(1000, len(df))
931
+ if len(df) > sample_size:
932
+ sample_df = df.sample(sample_size, random_state=42)
933
+ results["sampled_for_similarity"] = True
934
+ results["similarity_sample_size"] = sample_size
935
+ else:
936
+ sample_df = df
937
+
938
+ text_similarity_results = {}
939
+ for col in similarity_columns:
940
+ try:
941
+ if col in df.columns and df[col].dtype == 'object':
942
+ col_data = sample_df[col].fillna("").astype(str)
943
+
944
+ if col_data.nunique() <= 1:
945
+ continue
946
+
947
+ similar_pairs = self.find_similar_text(col_data, similarity_threshold)
948
+
949
+ if similar_pairs:
950
+ text_similarity_results[col] = {
951
+ "similar_pairs_count": len(similar_pairs),
952
+ "examples": similar_pairs[:5] # Limit to first 5 examples
953
+ }
954
+ except Exception as e:
955
+ text_similarity_results[col] = {"error": str(e)}
956
+
957
+ results["text_similarity"] = text_similarity_results
958
+
959
+ numeric_similarity_results = {}
960
+ numeric_cols = [col for col in df.columns if col in similarity_columns and pd.api.types.is_numeric_dtype(df[col])]
961
+
962
+ for col in numeric_cols:
963
+ try:
964
+ if df[col].isna().sum() / len(df) > 0.3: # More than 30% missing
965
+ continue
966
+
967
+ col_data = sample_df[col].dropna()
968
+ similar_numeric_pairs = self.find_similar_numeric(col_data, numeric_threshold)
969
+
970
+ if similar_numeric_pairs:
971
+ numeric_similarity_results[col] = {
972
+ "similar_pairs_count": len(similar_numeric_pairs),
973
+ "examples": similar_numeric_pairs[:5] # Limit to first 5 examples
974
+ }
975
+ except Exception as e:
976
+ numeric_similarity_results[col] = {"error": str(e)}
977
+
978
+ results["numeric_similarity"] = numeric_similarity_results
979
+
980
+ try:
981
+ if len(similarity_columns) >= 2:
982
+ combined_text = sample_df[similarity_columns].fillna("").astype(str).apply(
983
+ lambda x: " ".join(x), axis=1
984
+ )
985
+
986
+ if len(combined_text) > 1: # Need at least 2 rows for comparison
987
+ vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
988
+ try:
989
+ tfidf_matrix = vectorizer.fit_transform(combined_text)
990
+ similar_doc_pairs = self.find_similar_vectors(tfidf_matrix, similarity_threshold)
991
+
992
+ if similar_doc_pairs:
993
+ results["multi_column_similarity"] = {
994
+ "similar_rows_count": len(similar_doc_pairs),
995
+ "examples": similar_doc_pairs[:5] # Limit to first 5 examples
996
+ }
997
+ except Exception as e:
998
+ results["multi_column_similarity"] = {"error": str(e)}
999
+ except Exception as e:
1000
+ results["multi_column_similarity_error"] = str(e)
1001
+
1002
+ except Exception as e:
1003
+ results["error"] = f"Error in near-duplicates analysis: {str(e)}"
1004
+ logger.error(f"Error in near-duplicates analysis: {str(e)}", log_type="data_quality_assessment", console=verbose)
1005
+
1006
+
1007
+ return results
1008
+
1009
+ def find_similar_text(self, series: pd.Series, threshold: float) -> List[Tuple[str, str, float]]:
1010
+ """
1011
+ Find similar text pairs in a series
1012
+ """
1013
+ similar_pairs = []
1014
+ values = series.tolist()
1015
+
1016
+ max_comparisons = 10000
1017
+ if len(values) > 200: # For large sets, sample comparisons
1018
+ import random
1019
+ from itertools import combinations
1020
+
1021
+ indices = list(range(len(values)))
1022
+ all_pairs = list(combinations(indices, 2))
1023
+
1024
+ if len(all_pairs) > max_comparisons:
1025
+ sampled_pairs = random.sample(all_pairs, max_comparisons)
1026
+ else:
1027
+ sampled_pairs = all_pairs
1028
+
1029
+ for i, j in sampled_pairs:
1030
+ try:
1031
+ if not values[i] or not values[j]:
1032
+ continue
1033
+
1034
+ similarity = difflib.SequenceMatcher(None, values[i], values[j]).ratio()
1035
+ if similarity >= threshold:
1036
+ similar_pairs.append((values[i], values[j], round(similarity, 2)))
1037
+ except:
1038
+ continue
1039
+ else:
1040
+ for i in range(len(values)):
1041
+ try:
1042
+ for j in range(i+1, len(values)):
1043
+ if not values[i] or not values[j]:
1044
+ continue
1045
+
1046
+ similarity = difflib.SequenceMatcher(None, values[i], values[j]).ratio()
1047
+ if similarity >= threshold:
1048
+ similar_pairs.append((values[i], values[j], round(similarity, 2)))
1049
+ except:
1050
+ continue
1051
+ return similar_pairs
1052
+
1053
+ def find_similar_numeric(self, series: pd.Series, threshold: float) -> List[Tuple[float, float, float]]:
1054
+ """
1055
+ Find similar numeric pairs in a series
1056
+ """
1057
+ similar_pairs = []
1058
+ values = series.tolist()
1059
+
1060
+ max_comparisons = 10000
1061
+ if len(values) > 200: # For large sets, sample comparisons
1062
+ import random
1063
+ from itertools import combinations
1064
+
1065
+ indices = list(range(len(values)))
1066
+ all_pairs = list(combinations(indices, 2))
1067
+
1068
+ if len(all_pairs) > max_comparisons:
1069
+ sampled_pairs = random.sample(all_pairs, max_comparisons)
1070
+ else:
1071
+ sampled_pairs = all_pairs
1072
+
1073
+ for i, j in sampled_pairs:
1074
+ try:
1075
+ if values[i] == 0 or values[j] == 0:
1076
+ continue
1077
+
1078
+ max_val = max(abs(values[i]), abs(values[j]))
1079
+ min_val = min(abs(values[i]), abs(values[j]))
1080
+
1081
+ if max_val == 0: # Both values are zero
1082
+ continue
1083
+
1084
+ rel_diff = (max_val - min_val) / max_val
1085
+
1086
+ if rel_diff <= threshold:
1087
+ similar_pairs.append((values[i], values[j], round(rel_diff, 3)))
1088
+ except:
1089
+ continue
1090
+ else:
1091
+ for i in range(len(values)):
1092
+ try:
1093
+ for j in range(i+1, len(values)):
1094
+ if values[i] == 0 or values[j] == 0:
1095
+ continue
1096
+
1097
+ max_val = max(abs(values[i]), abs(values[j]))
1098
+ min_val = min(abs(values[i]), abs(values[j]))
1099
+
1100
+ if max_val == 0: # Both values are zero
1101
+ continue
1102
+
1103
+ rel_diff = (max_val - min_val) / max_val
1104
+
1105
+ if rel_diff <= threshold:
1106
+ similar_pairs.append((values[i], values[j], round(rel_diff, 3)))
1107
+ except:
1108
+ continue
1109
+
1110
+ return similar_pairs
1111
+
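For orientation, the relative-difference rule above works out as follows for a concrete pair (values are illustrative):

a, b = 100.0, 103.0
rel_diff = (max(abs(a), abs(b)) - min(abs(a), abs(b))) / max(abs(a), abs(b))
print(round(rel_diff, 3))  # 0.029, below the default numeric_threshold of 0.05,
                           # so (100.0, 103.0, 0.029) would be reported as a near-duplicate pair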
1112
+ def find_similar_vectors(self, tfidf_matrix, threshold: float) -> List[Tuple[int, int, float]]:
1113
+ """
1114
+ Find similar document pairs based on TF-IDF vectors
1115
+ """
1116
+ similar_pairs = []
1117
+
1118
+ max_rows = 1000
1119
+ if tfidf_matrix.shape[0] > max_rows:
1120
+ import random
1121
+ indices = random.sample(range(tfidf_matrix.shape[0]), max_rows)
1122
+ sampled_matrix = tfidf_matrix[indices]
1123
+ else:
1124
+ sampled_matrix = tfidf_matrix
1125
+ indices = list(range(tfidf_matrix.shape[0]))
1126
+
1127
+ similarity_matrix = cosine_similarity(sampled_matrix)
1128
+
1129
+ rows, cols = np.where(similarity_matrix >= threshold)
1130
+ for i, j in zip(rows, cols):
1131
+ try:
1132
+ if i < j: # Only include each pair once
1133
+ similar_pairs.append((indices[i], indices[j], round(similarity_matrix[i, j], 2)))
1134
+ except:
1135
+ continue
1136
+
1137
+ return similar_pairs
1138
+
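The helper above expects a TF-IDF matrix such as the one built in analyze_near_duplicates. A stripped-down sketch of that pipeline, with toy strings and a deliberately low threshold for illustration:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["red cotton shirt", "red cotton shirts", "blue denim jeans"]
tfidf = TfidfVectorizer(min_df=1, stop_words="english").fit_transform(docs)
sim = cosine_similarity(tfidf)
rows, cols = np.where(sim >= 0.5)
pairs = [(i, j, round(float(sim[i, j]), 2)) for i, j in zip(rows, cols) if i < j]
print(pairs)  # roughly [(0, 1, 0.54)]; the exact score depends on the IDF weighting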
1139
+ def analyze_distribution_impact(self, df: pd.DataFrame, results: Dict[str, Any], verbose=False) -> Dict[str, Any]:
1140
+ """
1141
+ Assess impact of duplicates on distribution statistics
1142
+ """
1143
+ distribution_results = {}
1144
+
1145
+ try:
1146
+ df_deduped = df.drop_duplicates()
1147
+
1148
+ if len(df) == len(df_deduped):
1149
+ distribution_results["message"] = "No exact duplicates found for distribution impact analysis"
1150
+ return distribution_results
1151
+
1152
+ numeric_cols = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
1153
+
1154
+ col_impacts = {}
1155
+ for col in numeric_cols:
1156
+ try:
1157
+ if df[col].isna().sum() / len(df) > 0.5: # More than 50% missing
1158
+ continue
1159
+
1160
+ orig_stats = {
1161
+ "mean": float(df[col].mean()),
1162
+ "median": float(df[col].median()),
1163
+ "std": float(df[col].std()),
1164
+ "min": float(df[col].min()),
1165
+ "max": float(df[col].max())
1166
+ }
1167
+
1168
+ dedup_stats = {
1169
+ "mean": float(df_deduped[col].mean()),
1170
+ "median": float(df_deduped[col].median()),
1171
+ "std": float(df_deduped[col].std()),
1172
+ "min": float(df_deduped[col].min()),
1173
+ "max": float(df_deduped[col].max())
1174
+ }
1175
+
1176
+ pct_changes = {}
1177
+ for stat in orig_stats:
1178
+ try:
1179
+ if orig_stats[stat] != 0:
1180
+ pct_changes[stat] = round(
1181
+ (dedup_stats[stat] - orig_stats[stat]) / abs(orig_stats[stat]) * 100, 2
1182
+ )
1183
+ else:
1184
+ pct_changes[stat] = 0.0 if dedup_stats[stat] == 0 else float('inf')
1185
+ except:
1186
+ continue
1187
+
1188
+ col_impacts[col] = {
1189
+ "original_stats": orig_stats,
1190
+ "deduped_stats": dedup_stats,
1191
+ "percentage_changes": pct_changes,
1192
+ "significant_change": any(abs(chg) > 5 for chg in pct_changes.values())
1193
+ }
1194
+ except Exception as e:
1195
+ col_impacts[col] = {"error": str(e)}
1196
+
1197
+ distribution_results["numeric_columns_impact"] = col_impacts
1198
+
1199
+ cat_cols = [col for col in df.columns if df[col].dtype == 'object']
1200
+
1201
+ cat_cols = cat_cols[:10] if len(cat_cols) > 10 else cat_cols
1202
+
1203
+ cat_impacts = {}
1204
+ for col in cat_cols:
1205
+ try:
1206
+ orig_counts = df[col].value_counts(normalize=True).head(10).to_dict()
1207
+ dedup_counts = df_deduped[col].value_counts(normalize=True).head(10).to_dict()
1208
+
1209
+ orig_counts = {str(k): float(v) for k, v in orig_counts.items()}
1210
+ dedup_counts = {str(k): float(v) for k, v in dedup_counts.items()}
1211
+
1212
+ changed_cats = {}
1213
+ all_cats = set(list(orig_counts.keys()) + list(dedup_counts.keys()))
1214
+
1215
+ for cat in all_cats:
1216
+ orig_val = orig_counts.get(cat, 0)
1217
+ dedup_val = dedup_counts.get(cat, 0)
1218
+
1219
+ if orig_val > 0:
1220
+ pct_change = round((dedup_val - orig_val) / orig_val * 100, 2)
1221
+ if abs(pct_change) > 5: # 5% threshold for significant change
1222
+ changed_cats[cat] = pct_change
1223
+
1224
+ cat_impacts[col] = {
1225
+ "significant_category_changes": changed_cats,
1226
+ "has_significant_changes": len(changed_cats) > 0
1227
+ }
1228
+ except Exception as e:
1229
+ cat_impacts[col] = {"error": str(e)}
1230
+
1231
+ distribution_results["categorical_columns_impact"] = cat_impacts
1232
+
1233
+ try:
1234
+ significant_numeric_changes = sum(
1235
+ 1 for col in col_impacts if "significant_change" in col_impacts[col] and col_impacts[col]["significant_change"]
1236
+ )
1237
+
1238
+ significant_cat_changes = sum(
1239
+ 1 for col in cat_impacts if "has_significant_changes" in cat_impacts[col] and cat_impacts[col]["has_significant_changes"]
1240
+ )
1241
+
1242
+ distribution_results["overall_assessment"] = {
1243
+ "columns_with_significant_numeric_changes": significant_numeric_changes,
1244
+ "columns_with_significant_categorical_changes": significant_cat_changes,
1245
+ "total_columns_analyzed": len(col_impacts) + len(cat_impacts),
1246
+ "duplicates_impact_level": self.get_impact_level(
1247
+ significant_numeric_changes + significant_cat_changes,
1248
+ len(col_impacts) + len(cat_impacts)
1249
+ )
1250
+ }
1251
+ except Exception as e:
1252
+ distribution_results["overall_assessment"] = {"error": str(e)}
1253
+
1254
+ except Exception as e:
1255
+ distribution_results["error"] = f"Error in distribution impact analysis: {str(e)}"
1256
+ logger.error(f"Error in distribution impact analysis: {str(e)}", log_type="data_quality_assessment", console=verbose)
1257
+
1258
+ return distribution_results
1259
+
1260
+ def get_impact_level(self, significant_changes: int, total_cols: int) -> str:
1261
+ """
1262
+ Determine the impact level based on the proportion of columns with significant changes
1263
+ """
1264
+ if total_cols == 0:
1265
+ return "Unknown"
1266
+ try:
1267
+ proportion = significant_changes / total_cols
1268
+
1269
+ if proportion == 0:
1270
+ return "None"
1271
+ elif proportion < 0.1:
1272
+ return "Minimal"
1273
+ elif proportion < 0.3:
1274
+ return "Low"
1275
+ elif proportion < 0.5:
1276
+ return "Moderate"
1277
+ elif proportion < 0.7:
1278
+ return "High"
1279
+ else:
1280
+ return "Severe"
1281
+ except Exception:
1282
+ return "Unknown"
1283
+
1284
+ def generate_duplicate_analysis_summary(self, results: Dict[str, Any], verbose=False) -> str:
1285
+ """
1286
+ Generate a text summary of duplicate detection results
1287
+ """
1288
+ summary_parts = []
1289
+
1290
+ try:
1291
+ dataset_info = results.get("dataset_info", {})
1292
+ summary_parts.append(f"Dataset Summary: {dataset_info.get('original_rows', 'N/A')} rows, "
1293
+ f"{dataset_info.get('original_columns', 'N/A')} columns")
1294
+ if dataset_info.get("sampled"):
1295
+ summary_parts.append(f"Analysis performed on a sample of {dataset_info.get('sample_size', 'N/A')} rows")
1296
+ except Exception as e:
1297
+ summary_parts.append(f"Error summarizing dataset info: {e}")
1298
+
1299
+ try:
1300
+ exact_dups = results.get("exact_duplicates", {})
1301
+ if "error" in exact_dups:
1302
+ summary_parts.append(f"Error in exact duplicate analysis summary: {exact_dups['error']}")
1303
+ else:
1304
+ dup_count = exact_dups.get("total_exact_duplicates", 0)
1305
+ dup_pct = exact_dups.get("duplicate_percentage", 0)
1306
+ summary_parts.append(f"Exact Duplicates: {dup_count} rows ({dup_pct}% of dataset)")
1307
+ except Exception as e:
1308
+ summary_parts.append(f"Error summarizing exact duplicates: {e}")
1309
+
1310
+ try:
1311
+ key_dups = results.get("key_based_duplicates", {})
1312
+ if "error" in key_dups:
1313
+ summary_parts.append(f"Error in key-based duplicate analysis summary: {key_dups['error']}")
1314
+ else:
1315
+ if "detected_key_candidates" in key_dups:
1316
+ candidates = key_dups.get("detected_key_candidates", [])
1317
+ if candidates:
1318
+ summary_parts.append(f"Detected Key Candidates: {', '.join(candidates)}")
1319
+ else:
1320
+ summary_parts.append("No key candidates detected")
1321
+ key_analysis = key_dups.get("key_duplicate_analysis", {})
1322
+ if key_analysis:
1323
+ key_summary = []
1324
+ for key, info in key_analysis.items():
1325
+ if "error" in info:
1326
+ continue
1327
+ key_summary.append(f"{key}: {info.get('duplicate_count', 0)} duplicates "
1328
+ f"({info.get('duplicate_percentage', 0)}%)")
1329
+ if key_summary:
1330
+ summary_parts.append("Key-Based Duplicates Summary:\n- " + "\n- ".join(key_summary))
1331
+ multi_key_dups = key_dups.get("multi_key_duplicates", {})
1332
+ if multi_key_dups and "error" not in multi_key_dups:
1333
+ summary_parts.append(f"Multi-Column Key Duplicates: {multi_key_dups.get('count', 0)} rows "
1334
+ f"({multi_key_dups.get('percentage', 0)}%)")
1335
+ except Exception as e:
1336
+ summary_parts.append(f"Error summarizing key-based duplicates: {e}")
1337
+
1338
+ try:
1339
+ near_dups = results.get("near_duplicates", {})
1340
+ if "error" in near_dups:
1341
+ summary_parts.append(f"Error in near-duplicate analysis summary: {near_dups['error']}")
1342
+ else:
1343
+ text_similarity = near_dups.get("text_similarity", {})
1344
+ if text_similarity:
1345
+ text_similar_count = sum(info.get("similar_pairs_count", 0)
1346
+ for info in text_similarity.values()
1347
+ if "error" not in info)
1348
+ if text_similar_count > 0:
1349
+ summary_parts.append(f"Text Near-Duplicates: {text_similar_count} similar pairs identified")
1350
+
1351
+ numeric_similarity = near_dups.get("numeric_similarity", {})
1352
+ if numeric_similarity:
1353
+ numeric_similar_count = sum(info.get("similar_pairs_count", 0)
1354
+ for info in numeric_similarity.values()
1355
+ if "error" not in info)
1356
+ if numeric_similar_count > 0:
1357
+ summary_parts.append(f"Numeric Near-Duplicates: {numeric_similar_count} similar pairs identified")
1358
+
1359
+ multi_col = near_dups.get("multi_column_similarity", {})
1360
+ if multi_col and "error" not in multi_col and multi_col.get("similar_rows_count", 0) > 0:
1361
+ summary_parts.append(f"Multi-Column Near-Duplicates: {multi_col.get('similar_rows_count', 0)} similar row pairs")
1362
+ except Exception as e:
1363
+ summary_parts.append(f"Error summarizing near duplicates: {e}")
1364
+
1365
+ try:
1366
+ dist_impact = results.get("distribution_impact", {})
1367
+ if "error" in dist_impact:
1368
+ summary_parts.append(f"Error in distribution impact analysis summary: {dist_impact['error']}")
1369
+ else:
1370
+ overall = dist_impact.get("overall_assessment", {})
1371
+ if overall and "error" not in overall:
1372
+ impact_level = overall.get("duplicates_impact_level", "Unknown")
1373
+ sig_cols = overall.get("columns_with_significant_numeric_changes", 0) + \
1374
+ overall.get("columns_with_significant_categorical_changes", 0)
1375
+ total_cols = overall.get("total_columns_analyzed", 0)
1376
+ if total_cols > 0:
1377
+ summary_parts.append(f"Distribution Impact: {impact_level} "
1378
+ f"({sig_cols}/{total_cols} columns significantly affected)")
1379
+ except Exception as e:
1380
+ summary_parts.append(f"Error summarizing distribution impact: {e}")
1381
+
1382
+ try:
1383
+ if "exact_duplicates" in results and "total_exact_duplicates" in results["exact_duplicates"]:
1384
+ dup_pct = results["exact_duplicates"].get("duplicate_percentage", 0)
1385
+
1386
+ if dup_pct > 20:
1387
+ summary_parts.append("\nRECOMMENDATION: High duplicate percentage detected. "
1388
+ "Consider deduplicating the dataset before analysis.")
1389
+ elif dup_pct > 5:
1390
+ summary_parts.append("\nRECOMMENDATION: Moderate duplicate percentage detected. "
1391
+ "Consider the impact of duplicates on your analysis.")
1392
+ elif dup_pct > 0:
1393
+ summary_parts.append("\nRECOMMENDATION: Low duplicate percentage detected. "
1394
+ "Minimal impact expected on analysis.")
1395
+ else:
1396
+ summary_parts.append("\nRECOMMENDATION: No exact duplicates found.")
1397
+ else:
1398
+ summary_parts.append("\nRECOMMENDATION: Duplicate analysis complete.")
1399
+ except Exception as e:
1400
+ summary_parts.append(f"Error generating recommendation: {e}")
1401
+
1402
+ return "\n".join(summary_parts)
1403
+
1404
+ def perform_consistency_checks(self, verbose=False) -> Dict[str, Any]:
1405
+ """Performs various data consistency checks."""
1406
+ results = {}
1407
+
1408
+ try:
1409
+ results["cross_field_validation"] = self.check_cross_field_validity()
1410
+ except Exception as e:
1411
+ results["cross_field_validation"] = {"error": str(e)}
1412
+ logger.error(f"Error in cross-field validation: {e}", log_type="consistency_check", console=verbose)
1413
+
1414
+ try:
1415
+ results["logical_relationship"] = self.verify_logical_relationships()
1416
+ except Exception as e:
1417
+ results["logical_relationship"] = {"error": str(e)}
1418
+ logger.error(f"Error in logical relationship verification: {e}", log_type="consistency_check", console=verbose)
1419
+
1420
+ try:
1421
+ results["data_type_validation"] = self.validate_data_types()
1422
+ except Exception as e:
1423
+ results["data_type_validation"] = {"error": str(e)}
1424
+ logger.error(f"Error in data type validation: {e}", log_type="consistency_check", console=verbose)
1425
+
1426
+ try:
1427
+ results["value_transition_validity"] = self.check_value_transitions()
1428
+ except Exception as e:
1429
+ results["value_transition_validity"] = {"error": str(e)}
1430
+ logger.error(f"Error in value transition validity: {e}", log_type="consistency_check", console=verbose)
1431
+
1432
+ try:
1433
+ results["unit_consistency"] = self.check_unit_consistency()
1434
+ except Exception as e:
1435
+ results["unit_consistency"] = {"error": str(e)}
1436
+ logger.error(f"Error in unit consistency checks: {e}", log_type="consistency_check", console=verbose)
1437
+
1438
+ try:
1439
+ results["format_consistency"] = self.check_format_consistency()
1440
+ except Exception as e:
1441
+ results["format_consistency"] = {"error": str(e)}
1442
+ logger.error(f"Error in format consistency tests: {e}", log_type="consistency_check", console=verbose)
1443
+
1444
+ return results
1445
+
1446
+ def check_cross_field_validity(self) -> Dict[str, Any]:
1447
+ """Example: Check if discount is not greater than the price."""
1448
+ if self.data is None or 'price' not in self.data.columns or 'discount' not in self.data.columns:
1449
+ return {"warning": "Price or discount columns not found."}
1450
+ invalid_rows = self.data[self.data['discount'] > self.data['price']]
1451
+ return {"invalid_count": len(invalid_rows), "invalid_indices": invalid_rows.index.tolist()}
1452
+
1453
+ def verify_logical_relationships(self) -> Dict[str, Any]:
1454
+ """Example: If order_status is 'shipped', shipment_date should not be NaN."""
1455
+ if self.data is None or 'order_status' not in self.data.columns or 'shipment_date' not in self.data.columns:
1456
+ return {"warning": "order_status or shipment_date columns not found."}
1457
+ invalid_rows = self.data[(self.data['order_status'] == 'shipped') & (self.data['shipment_date'].isnull())]
1458
+ return {"invalid_count": len(invalid_rows), "invalid_indices": invalid_rows.index.tolist()}
1459
+
1460
+ def validate_data_types(self) -> Dict[str, Any]:
1461
+ """Check if declared data types match actual data types (basic check)."""
1462
+ if self.data is None:
1463
+ return {"warning": "No data loaded."}
1464
+ mismatches = {}
1465
+ for col in self.data.columns:
1466
+ try:
1467
+ inferred_type = pd.api.types.infer_dtype(self.data[col])
1468
+ mismatches[col] = {"inferred_type": inferred_type}
1469
+ except:
1470
+ continue
1471
+ return mismatches
1472
+
1473
+ def check_value_transitions(self) -> Dict[str, Any]:
1474
+ """Example: Check if a 'temperature' column generally increases over a 'time' column (simplistic)."""
1475
+ if self.data is None or 'temperature' not in self.data.columns or 'time' not in self.data.columns:
1476
+ return {"warning": "temperature or time columns not found."}
1477
+ diffs = self.data['temperature'].diff()
1478
+ decreasing_transitions = diffs[diffs < 0].count()
1479
+ increasing_transitions = diffs[diffs > 0].count()
1480
+ return {"decreasing_transitions": decreasing_transitions, "increasing_transitions": increasing_transitions}
1481
+
1482
+ def check_unit_consistency(self) -> Dict[str, Any]:
1483
+ """Example: Check if a 'measurement' column has consistent units (requires some form of unit identification)."""
1484
+ if self.data is None or 'measurement' not in self.data.columns:
1485
+ return {"warning": "measurement column not found."}
1486
+
1487
+ unique_values = self.data['measurement'].astype(str).unique()
1488
+ return {"unique_measurement_values": list(unique_values), "comment": "Implement more sophisticated unit parsing if needed."}
1489
+
1490
+ def check_format_consistency(self) -> Dict[str, Any]:
1491
+ """Example: Check if a 'date' column has a consistent date format."""
1492
+ if self.data is None or 'date' not in self.data.columns:
1493
+ return {"warning": "date column not found."}
1494
+ formats = set()
1495
+ for value in self.data['date'].astype(str).dropna().unique():
1496
+ try:
1497
+ pd.to_datetime(value)
1498
+ except:
1499
+ formats.add(f"Inconsistent format: '{value}'")
1500
+ return {"inconsistent_formats": list(formats)}
1501
+
1502
+ def perform_data_completeness_checks(self, verbose=False) -> Dict[str, Any]:
1503
+ """Performs various data completeness checks."""
1504
+ results = {}
1505
+
1506
+ try:
1507
+ results["coverage_analysis"] = self.analyze_coverage()
1508
+ except Exception as e:
1509
+ results["coverage_analysis"] = {"error": str(e)}
1510
+ logger.error(f"Error in coverage analysis: {e}", log_type="completeness_check", console=verbose)
1511
+
1512
+ try:
1513
+ results["time_period_completeness"] = self.check_time_completeness()
1514
+ except Exception as e:
1515
+ results["time_period_completeness"] = {"error": str(e)}
1516
+ logger.error(f"Error in time period completeness checks: {e}", log_type="completeness_check", console=verbose)
1517
+
1518
+ return results
1519
+
1520
+ def analyze_coverage(self) -> Dict[str, Any]:
1521
+ """Analyze the percentage of non-missing values for each column."""
1522
+ if self.data is None:
1523
+ return {"warning": "No data loaded."}
1524
+ coverage = {}
1525
+ for col in self.data.columns:
1526
+ try:
1527
+ coverage[col] = f"{self.data[col].count() / len(self.data) * 100:.2f}%"
1528
+ except:
1529
+ continue
1530
+ return coverage
1531
+
1532
+ def check_time_completeness(self) -> Dict[str, Any]:
1533
+ """Example: Check if there are missing dates in a 'timestamp' column (assuming sorted)."""
1534
+ if self.data is None or 'timestamp' not in self.data.columns:
1535
+ return {"warning": "timestamp column not found."}
1536
+ try:
1537
+ time_series = pd.to_datetime(self.data['timestamp']).sort_values()
1538
+ if not time_series.empty:
1539
+ first_date = time_series.iloc[0].date()
1540
+ last_date = time_series.iloc[-1].date()
1541
+ expected_days = (last_date - first_date).days + 1
1542
+ actual_days = len(time_series.dt.date.unique())
1543
+ return {"expected_unique_days": expected_days, "actual_unique_days": actual_days}
1544
+ else:
1545
+ return {"warning": "Timestamp column is empty."}
1546
+ except Exception as e:
1547
+ return {"error": f"Could not process timestamp column: {e}"}
1548
+
1549
+ def generate_consistency_analysis_summary(self, consistency_results: Dict[str, Any], completeness_results: Dict[str, Any]) -> str:
1550
+ """Generates a summary of the data quality assessment."""
1551
+ summary_lines = ["Data Quality Assessment Summary:\n"]
1552
+
1553
+ summary_lines.append("\nConsistency Checks:\n")
1554
+ if consistency_results:
1555
+ for check, result in consistency_results.items():
1556
+ try:
1557
+ summary_lines.append(f"- {check}: {result}\n")
1558
+ except:
1559
+ continue
1560
+ else:
1561
+ summary_lines.append("- No consistency checks performed or results available.\n")
1562
+
1563
+ summary_lines.append("\nData Completeness Checks:\n")
1564
+ if completeness_results:
1565
+ for check, result in completeness_results.items():
1566
+ try:
1567
+ summary_lines.append(f"- {check}: {result}\n")
1568
+ except:
1569
+ continue
1570
+ else:
1571
+ summary_lines.append("- No data completeness checks performed or results available.\n")
1572
+
1573
+ return "".join(summary_lines)
1574
+
1575
+ def generate_report_from_agent(self, input)->str:
1576
+ '''Transform the json output to a user-readable report'''
1577
+ try:
1578
+ input = f"ML Task: {self.ml_task}\n{input}"
1579
+ response: RunResponse = self.writer.run(input, stream=False)
1580
+ return response.content
1581
+ except Exception as e:
1582
+ return f"Failed to generate report with error: {e}"
1583
+
1584
+ def convert_numpy_types(self, obj):
1585
+ if isinstance(obj, dict):
1586
+ return {k: self.convert_numpy_types(v) for k, v in obj.items()}
1587
+ elif isinstance(obj, list):
1588
+ return [self.convert_numpy_types(item) for item in obj]
1589
+ elif isinstance(obj, np.integer):
1590
+ return int(obj)
1591
+ elif isinstance(obj, np.floating):
1592
+ return float(obj)
1593
+ elif isinstance(obj, np.bool_):
1594
+ return bool(obj)
1595
+ elif isinstance(obj, np.ndarray):
1596
+ return obj.tolist()
1597
+ else:
1598
+ return obj
1599
+
1600
+
1601
+ def run(self, verbose=False) -> Dict[str, dict]:
1602
+ '''Run the entire workflow'''
1603
+ logger.info("Starting missing value analysis...", log_type="data_quality_assessment", console=verbose)
1604
+ mva_results = self.analyze_missing_data(verbose=verbose)
1605
+ # logger.info("Starting agentic missing value analysis...", log_type="data_quality_assessment", console=verbose)
1606
+ # agentic_mva = self.agent_mva(report=mva_results['summary'], verbose=verbose)
1607
+
1608
+ logger.info("Starting Duplicate value analysis...", log_type="data_quality_assessment", console=verbose)
1609
+ duplicate_analysis_results = self.detect_duplicates(verbose=verbose)
1610
+
1611
+ logger.info("Starting Data quality & Consistency checks...", log_type="data_quality_assessment", console=verbose)
1612
+ consistency_results = self.perform_consistency_checks(verbose=verbose)
1613
+ completeness_results = self.perform_data_completeness_checks(verbose=verbose)
1614
+ data_quality_results = consistency_results | completeness_results
1615
+ data_quality_summary = self.generate_consistency_analysis_summary(consistency_results, completeness_results)
1616
+
1617
+ logger.info("Generating final reports....", log_type='data_quality_assessment', console=verbose)
1618
+
1619
+ final_result = {
1620
+ "missing_value_analysis": {
1621
+ 'dict': self.convert_numpy_types(mva_results),
1622
+ 'report': ""
1623
+ },
1624
+ "duplicate_analysis": {
1625
+ 'dict': self.convert_numpy_types(duplicate_analysis_results),
1626
+ 'report': ""
1627
+ },
1628
+ "data_quality_analysis": {
1629
+ 'dict': self.convert_numpy_types(data_quality_results),
1630
+ 'report': ""
1631
+ },
1632
+ }
1633
+
1634
+ try:
1635
+ mva_str = json.dumps(mva_results, indent=2, default=str)
1636
+ final_result['missing_value_analysis']['report'] = self.generate_report_from_agent(mva_str)
1637
+ except:
1638
+ logger.error("Failed to generate report for mva....", log_type='data_quality_assessment', console=verbose)
1639
+ pass
1640
+
1641
+ try:
1642
+ duplicate_analysis_results_str = json.dumps(duplicate_analysis_results, indent=2, default=str)
1643
+ final_result['duplicate_analysis']['report'] = self.generate_report_from_agent(duplicate_analysis_results_str)
1644
+ except:
1645
+ logger.error("Failed to generate report for duplicate analysis....", log_type='data_quality_assessment', console=verbose)
1646
+ pass
1647
+
1648
+ try:
1649
+ data_quality_results_str = json.dumps(data_quality_results, indent=2, default=str)
1650
+ data_quality_results_str = data_quality_results_str +'\n'+data_quality_summary
1651
+ final_result['data_quality_analysis']['report'] = self.generate_report_from_agent(data_quality_results_str)
1652
+ except:
1653
+ logger.error("Failed to generate report for data quality....", log_type='data_quality_assessment', console=verbose)
1654
+ pass
1655
+
1656
+
1657
+ return final_result
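For orientation, a minimal driver for this workflow might look like the sketch below. The class name DataQualityAssessmentWorkflow and the constructor arguments are assumptions inferred from the sibling DataStatisticsWorkflow module added further down; they are not confirmed by this diff, and the paths and model id are placeholders.

# Hypothetical usage sketch; class name, signature, paths and model id are assumptions.
from src.app.pipelines.modules.data_quality_assessment import DataQualityAssessmentWorkflow  # assumed name

workflow = DataQualityAssessmentWorkflow(
    data_source="data/train.csv",        # placeholder path
    llm_choice="gpt-4o-mini",            # placeholder model id
    ml_task="binary classification",     # placeholder task description
)
results = workflow.run(verbose=True)
print(results["duplicate_analysis"]["report"])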
src/app/pipelines/modules/data_statistics.py ADDED
@@ -0,0 +1,1270 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import psutil
6
+ import numpy as np
7
+ import pandas as pd
8
+ import dateutil.parser
9
+ from dotenv import load_dotenv
10
+ from dateutil import parser
11
+ from datetime import datetime
12
+ from collections import Counter
13
+ from src.core.utils import logger
14
+ from agno.agent import Agent, RunResponse
15
+ from agno.models.openai import OpenAIChat
16
+ from typing import Union, List, Dict, Any, Tuple
17
+
18
+ load_dotenv()
19
+
20
+ class DataStatisticsWorkflow:
21
+ def __init__(
22
+ self, data_source: str,
23
+ llm_choice: str,
24
+ ml_task: str
25
+ ) -> None:
26
+ '''Load the dataset and set up the LLM-backed report-writing agent.'''
27
+ self.data = None
28
+ self.llm = OpenAIChat(id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))
29
+ self.ml_task = ml_task
30
+ _ = self.load_data(data_source=data_source)
32
+ self.writer: Agent = Agent(
33
+ model=self.llm,
34
+ instructions=[
35
+ "You will be provided with lots of structured outputs. Your work is to display this"
36
+ "in a nicely formatted manner. You must analayze the results and output a comprehensive and insightful report"
37
+ ],
38
+ markdown=True,
39
+ )
40
+
41
+ def load_data(self, data_source: str) -> Union[None, bool]:
42
+ '''Load CSV into dataframe'''
43
+ try:
44
+ self.data = pd.read_csv(data_source)
45
+ return True
46
+ except Exception as e:
47
+ logger.error(
48
+ f"Failed to read the file from the data source with error: {e}", log_type="data_statistics", console=True)
49
+ return False
50
+
51
+ def format_json(self, results, indent: int = 4) -> str:
52
+ def convert_to_serializable(obj):
53
+ if isinstance(obj, (np.integer, np.floating, np.bool_)):
54
+ return obj.item()
55
+ elif isinstance(obj, np.ndarray):
56
+ return obj.tolist()
57
+ elif pd.isna(obj):
58
+ return None
59
+ return obj
60
+
61
+ def process_dict(d):
62
+ result = {}
63
+ for k, v in d.items():
64
+ if isinstance(v, dict):
65
+ result[k] = process_dict(v)
66
+ elif isinstance(v, list):
67
+ result[k] = [convert_to_serializable(item) for item in v]
68
+ else:
69
+ result[k] = convert_to_serializable(v)
70
+ return result
71
+
72
+ serializable_results = process_dict(results)
73
+
74
+ return json.dumps(serializable_results, indent=indent)
75
+
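The per-value conversion in format_json is needed because the standard json encoder rejects numpy integer (and some other numpy) scalars. A minimal reproduction of the failure mode it guards against:

import json
import numpy as np

stats = {"num_rows": np.int64(100), "mean": np.float64(3.5)}
# json.dumps(stats) raises TypeError: Object of type int64 is not JSON serializable
safe = {k: v.item() if isinstance(v, (np.integer, np.floating, np.bool_)) else v
        for k, v in stats.items()}
print(json.dumps(safe))  # {"num_rows": 100, "mean": 3.5}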
76
+ def build_statistical_summary(self, data_source: str = None, verbose=False) -> Dict[str, Any]:
77
+ '''Get the basic central tendency, dispersion, quantiles, distinct values, frequency distributions and sparsity'''
78
+ logger.info(f"Starting statistical analysis", log_type="data_statistics", console=verbose)
79
+ if data_source:
80
+ status = self.load_data(data_source=data_source)
81
+ if not status:
82
+ logger.info("Failed to load data. Can't build statistical summary",
83
+ log_type='data_statistics', console=True)
84
+
85
+ quantiles: List[float] = [0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
86
+ num_freq_values: int = 10
87
+
88
+ results = {
89
+ "dataset_info": {
90
+ "num_rows": len(self.data),
91
+ "num_columns": len(self.data.columns),
92
+ "memory_usage": self.data.memory_usage(deep=True).sum(),
93
+ "dtypes": {col: str(dtype) for col, dtype in self.data.dtypes.items()}
94
+ },
95
+ "columns": {}
96
+ }
97
+
98
+ for column in self.data.columns:
99
+ column_data = self.data[column]
100
+ column_type = str(self.data[column].dtype)
101
+ is_numeric = pd.api.types.is_numeric_dtype(column_data)
102
+ is_datetime = pd.api.types.is_datetime64_any_dtype(column_data)
103
+
104
+ col_results = {
105
+ "dtype": column_type,
106
+ "count": len(column_data),
107
+ "num_unique": column_data.nunique(),
108
+ "num_missing": column_data.isna().sum(),
109
+ "missing_percentage": (column_data.isna().sum() / len(column_data)) * 100,
110
+ "sparsity": {
111
+ "zeros": None,
112
+ "zeros_percentage": None,
113
+ "empty_strings": None,
114
+ "empty_strings_percentage": None
115
+ }
116
+ }
117
+
118
+ is_boolean = pd.api.types.is_bool_dtype(column_data)
119
+
120
+ if is_numeric and not is_boolean:
121
+ try:
122
+ num_zeros = (column_data == 0).sum()
123
+ col_results["sparsity"]["zeros"] = num_zeros
124
+ col_results["sparsity"]["zeros_percentage"] = (num_zeros / column_data.count()) * 100
125
+ except Exception as e:
126
+ logger.error(f"{e}", log_type='data_statistics', console=verbose)
127
+ elif column_type.startswith('object') or column_type.startswith('string'):
128
+ try:
129
+ empty_strings = column_data.fillna('').apply(lambda x: isinstance(x, str) and x.strip() == '').sum()
130
+ col_results["sparsity"]["empty_strings"] = empty_strings
131
+ col_results["sparsity"]["empty_strings_percentage"] = (empty_strings / column_data.count()) * 100
132
+ except Exception as e:
133
+ logger.error(f"{e}", log_type='data_statistics', console=verbose)
134
+
135
+ if is_numeric and not is_boolean:
136
+ try:
137
+ col_results["central_tendency"] = {
138
+ "mean": column_data.mean() if not all(column_data.isna()) else None,
139
+ "median": column_data.median() if not all(column_data.isna()) else None,
140
+ "mode": column_data.mode().iloc[0] if not column_data.mode().empty else None
141
+ }
142
+ except Exception as e:
143
+ logger.error(f"{e}", log_type='data_statistics', console=verbose)
144
+
145
+ try:
146
+ col_results["dispersion"] = {
147
+ "std": column_data.std() if not all(column_data.isna()) else None,
148
+ "variance": column_data.var() if not all(column_data.isna()) else None,
149
+ "range": {
150
+ "min": column_data.min() if not all(column_data.isna()) else None,
151
+ "max": column_data.max() if not all(column_data.isna()) else None
152
+ },
153
+ "iqr": (
154
+ column_data.quantile(0.75) - column_data.quantile(0.25)
155
+ if not pd.api.types.is_bool_dtype(column_data)
156
+ else None
157
+ )
158
+ }
159
+ except Exception as e:
160
+ logger.error(f"{e}", log_type='data_statistics', console=verbose)
161
+
162
+ if not all(column_data.isna()):
163
+ col_results["quantiles"] = {
164
+ f"q{int(q*100)}": column_data.quantile(q) for q in quantiles
165
+ }
166
+ else:
167
+ col_results["quantiles"] = {
168
+ f"q{int(q*100)}": None for q in quantiles}
169
+
170
+ value_counts = column_data.value_counts(
171
+ dropna=False).head(num_freq_values)
172
+ col_results["frequency_distribution"] = {
173
+ "values": value_counts.index.tolist(),
174
+ "counts": value_counts.tolist(),
175
+ "percentages": (value_counts / len(column_data) * 100).tolist()
176
+ }
177
+
178
+ if is_datetime:
179
+ try:
180
+ col_results["datetime_info"] = {
181
+ "min_date": column_data.min().strftime('%Y-%m-%d %H:%M:%S') if not all(column_data.isna()) else None,
182
+ "max_date": column_data.max().strftime('%Y-%m-%d %H:%M:%S') if not all(column_data.isna()) else None,
183
+ "date_range_days": (column_data.max() - column_data.min()).days if not all(column_data.isna()) else None
184
+ }
185
+ except:
186
+ col_results["datetime_info"] = "Error processing datetime information"
187
+
188
+ results["columns"][column] = col_results
189
+
190
+ numeric_cols = self.data.select_dtypes(include=['number']).columns.tolist()
191
+ if len(numeric_cols) > 1:
192
+ try:
193
+ correlations = self.data[numeric_cols].corr().round(
194
+ 3).to_dict()
195
+ results["correlations"] = correlations
196
+ except:
197
+ results["correlations"] = "Error computing correlations"
198
+
199
+ return results
200
+
201
+ def analyze_data_types(self, sample_size=None, verbose=False):
202
+ logger.info(f"Starting data-type analysis", log_type="data_statistics", console=verbose)
203
+
204
+ df = self.data
205
+
206
+ if sample_size and sample_size < len(self.data):
207
+ df = self.data.sample(sample_size, random_state=42)
208
+
209
+ df = df.replace('', np.nan)
210
+
211
+ results = {}
212
+
213
+ for column in df.columns:
214
+ if df[column].isna().all():
215
+ results[column] = {
216
+ "inferred_type": "empty",
217
+ "description": "Column is entirely empty"
218
+ }
219
+ continue
220
+
221
+ values = df[column].dropna().values
222
+ if len(values) == 0:
223
+ continue
224
+
225
+ column_analysis = self.analyze_column(values, column, df)
226
+ results[column] = column_analysis
227
+
228
+ results["__summary__"] = self.generate_summary(results, df)
229
+ return results
230
+
231
+ def analyze_column(self, values, column_name, df):
232
+ """Analyze a single column's values to determine data types and patterns (safe and error-resilient)"""
233
+ try:
234
+ sample_values = values[:5].tolist() if hasattr(values, 'tolist') else list(values)[:5]
235
+ except Exception:
236
+ sample_values = []
237
+
238
+ type_counts = {
239
+ "integer": 0,
240
+ "float": 0,
241
+ "boolean": 0,
242
+ "date": 0,
243
+ "text": 0
244
+ }
245
+
246
+ unique_values = set()
247
+ total_values = len(values)
248
+
249
+ for val in values:
250
+ try:
251
+ unique_values.add(val)
252
+ detected_type = self.detect_value_type(val)
253
+ if detected_type:
254
+ type_counts[detected_type] += 1
255
+ except Exception:
256
+ continue
257
+
258
+ try:
259
+ main_type = max(type_counts.items(), key=lambda x: x[1])[0]
260
+ mixed_types = sum(1 for count in type_counts.values() if count > 0) > 1
261
+ type_percentages = {
262
+ t: (count / total_values) * 100 for t, count in type_counts.items() if count > 0
263
+ }
264
+ except Exception:
265
+ main_type = "text"
266
+ mixed_types = False
267
+ type_percentages = {}
268
+
269
+ result = {
270
+ "inferred_type": main_type,
271
+ "has_mixed_types": mixed_types,
272
+ "type_percentages": type_percentages,
273
+ "unique_count": len(unique_values),
274
+ "unique_percentage": (len(unique_values) / total_values * 100) if total_values else 0,
275
+ "sample_values": sample_values
276
+ }
277
+
278
+ try:
279
+ if main_type in ["integer", "float"]:
280
+ numeric_analysis = self.analyze_numeric_column(values, result)
281
+ result.update(numeric_analysis)
282
+ except Exception:
283
+ pass
284
+
285
+ try:
286
+ if main_type == "date":
287
+ date_analysis = self.analyze_date_column(values)
288
+ result.update(date_analysis)
289
+ except Exception:
290
+ pass
291
+
292
+ if main_type == "text":
293
+ try:
294
+ numeric_as_text = self.check_numeric_as_text(values)
295
+ if numeric_as_text.get("is_numeric_as_text"):
296
+ result["numeric_as_text"] = True
297
+ result["numeric_as_text_details"] = numeric_as_text
298
+ except Exception:
299
+ pass
300
+
301
+ try:
302
+ date_as_text = self.check_dates_as_text(values)
303
+ if date_as_text.get("is_date_as_text"):
304
+ result["date_as_text"] = True
305
+ result["date_as_text_details"] = date_as_text
306
+ except Exception:
307
+ pass
308
+
309
+ try:
310
+ cardinality = self.analyze_cardinality(values, result["unique_count"], total_values)
311
+ result.update(cardinality)
312
+ except Exception:
313
+ pass
314
+
315
+ try:
316
+ if result["unique_count"] <= 2 and not result.get("is_id_like", False):
317
+ result["is_binary"] = True
318
+ result["binary_values"] = list(unique_values)
319
+ else:
320
+ result["is_binary"] = False
321
+ except Exception:
322
+ result["is_binary"] = False
323
+
324
+ try:
325
+ result["description"] = self.generate_column_description(result, column_name)
326
+ except Exception:
327
+ result["description"] = "Could not generate column description due to an error."
328
+
329
+ return result
330
+
331
+ def detect_value_type(self, value, verbose=False):
332
+ """Determine the data type of a single value"""
333
+ try:
334
+ value = str(value).strip()  # values may arrive as numpy/python scalars, not just strings
+ if value.lower() in ('true', 'false', 'yes', 'no', 'y', 'n', 't', 'f', '1', '0'):
335
+ if value.lower() in ('1', '0'):
336
+ pass
337
+ else:
338
+ return "boolean"
339
+
340
+ if re.match(r'^-?\d+$', value):
341
+ return "integer"
342
+
343
+ if re.match(r'^-?\d+\.\d+$', value) or re.match(r'^-?\d+,\d+$', value):
344
+ return "float"
345
+
346
+ try:
347
+ dateutil.parser.parse(value)
348
+ return "date"
349
+ except:
350
+ pass
351
+
352
+ return "text"
353
+ except Exception as e:
354
+ logger.error(f"Value type detection failed with error: {e}", log_type='data_statistics', console=verbose)
355
+ return ""
356
+
357
+ def analyze_date_column(self, values):
358
+ """Further analyze date columns safely and thoroughly"""
359
+ from datetime import datetime
360
+ formats = []
361
+ min_date = None
362
+ max_date = None
363
+
364
+ for val in values[:100]:
365
+ if not val:
366
+ continue
367
+ try:
368
+ date_val = val if isinstance(val, datetime) else dateutil.parser.parse(str(val))
369
+
370
+ if not min_date or date_val < min_date:
371
+ min_date = date_val
372
+ if not max_date or date_val > max_date:
373
+ max_date = date_val
374
+
375
+ val_str = str(val).strip()
376
+ if re.match(r'^\d{4}-\d{2}-\d{2}$', val_str):
377
+ formats.append("YYYY-MM-DD")
378
+ elif re.match(r'^\d{4}/\d{2}/\d{2}$', val_str):
379
+ formats.append("YYYY/MM/DD")
380
+ elif re.match(r'^\d{2}-\d{2}-\d{4}$', val_str):
381
+ formats.append("DD-MM-YYYY")
382
+ elif re.match(r'^\d{2}/\d{2}/\d{4}$', val_str):
383
+ formats.append("DD/MM/YYYY")
384
+ elif re.match(r'^\d{2}-\w{3}-\d{4}$', val_str):
385
+ formats.append("DD-MMM-YYYY")
386
+ elif re.match(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$', val_str):
387
+ formats.append("YYYY-MM-DD HH:MM:SS")
388
+ else:
389
+ formats.append("Other")
390
+ except Exception:
391
+ continue
392
+
393
+ format_counts = Counter(formats)
394
+ most_common = format_counts.most_common(2)
395
+
396
+ result = {
397
+ "date_format": [f[0] for f in most_common] if most_common else ["Unknown"],
398
+ "min_date": min_date.isoformat() if min_date else None,
399
+ "max_date": max_date.isoformat() if max_date else None,
400
+ "date_range_days": (max_date - min_date).days if min_date and max_date else None,
401
+ "confidence": (len(formats) / min(len(values), 100)) * 100
402
+ }
403
+
404
+ return result
405
+
406
+ def analyze_numeric_column(self, values, current_results):
407
+ """Further analyze numeric columns (safe version)"""
408
+ result = {}
409
+
410
+ try:
411
+ numeric_values = []
412
+ for val in values:
413
+ try:
414
+ if val is None:
415
+ continue
416
+ if isinstance(val, (int, float)):
417
+ numeric_values.append(float(val))
418
+ elif isinstance(val, str):
419
+ cleaned = val.replace(',', '.').strip()
420
+ numeric_values.append(float(cleaned))
421
+ except (ValueError, TypeError):
422
+ continue
423
+
424
+ if not numeric_values:
425
+ result["warning"] = "No valid numeric values found"
426
+ return result
427
+
428
+ numeric_values.sort()
429
+ n = len(numeric_values)
430
+ median = (
431
+ numeric_values[n // 2]
432
+ if n % 2 == 1
433
+ else (numeric_values[n // 2 - 1] + numeric_values[n // 2]) / 2
434
+ )
435
+
436
+ result.update({
437
+ "min": min(numeric_values),
438
+ "max": max(numeric_values),
439
+ "mean": sum(numeric_values) / n,
440
+ "median": median
441
+ })
442
+
443
+ if current_results.get("inferred_type") == "float":
444
+ integers_count = sum(1 for x in numeric_values if x == int(x))
445
+ result["integers_percentage"] = (integers_count / n) * 100
446
+
447
+ except Exception as e:
448
+ result["error"] = f"Error in analyze_numeric_column: {str(e)}"
449
+
450
+ return result
451
+
452
+ def check_numeric_as_text(self, values):
453
+ """Check if text values might actually be numeric values stored as text (safe version)"""
454
+ result = {
455
+ "is_numeric_as_text": False,
456
+ "confidence": 0.0,
457
+ "matched_patterns": {}
458
+ }
459
+
460
+ try:
461
+ if values is None or len(values) == 0:
462
+ result["warning"] = "No values provided"
463
+ return result
464
+
465
+ sample_size = min(100, len(values))
466
+ if sample_size == 0:
467
+ result["warning"] = "No valid sample size"
468
+ return result
469
+
470
+ numeric_patterns = [
471
+ r'^\d{1,3}(,\d{3})+(\.\d+)?$', # US format: 1,234.56
472
+ r'^\d{1,3}(\.\d{3})+(,\d+)?$', # EU format: 1.234,56
473
+ r'^-?\d+\s\d+(/\d+)?$', # Fractions: 1 1/2
474
+ r'^\$\d+(\.\d+)?$', # Dollars: $123.45
475
+ r'^€\d+(\.\d+)?$', # Euros: €123.45
476
+ r'^\d+(\.\d+)?%$', # Percentages: 12.3%
477
+ ]
478
+
479
+ matches = 0
480
+ pattern_matches = {pattern: 0 for pattern in numeric_patterns}
481
+
482
+ for val in values[:sample_size]:
483
+ try:
484
+ if not isinstance(val, str):
485
+ continue
486
+ for pattern in numeric_patterns:
487
+ if re.match(pattern, val.strip()):
488
+ matches += 1
489
+ pattern_matches[pattern] += 1
490
+ break
491
+ except Exception:
492
+ continue
493
+
494
+ percentage = (matches / sample_size) * 100
495
+ is_numeric_as_text = percentage > 80
496
+
497
+ result.update({
498
+ "is_numeric_as_text": is_numeric_as_text,
499
+ "confidence": percentage,
500
+ "matched_patterns": {p: c for p, c in pattern_matches.items() if c > 0}
501
+ })
502
+
503
+ except Exception as e:
504
+ result["error"] = f"Error in check_numeric_as_text: {str(e)}"
505
+
506
+ return result
507
+
508
+ def check_dates_as_text(self, values):
509
+ """Check if text values might actually be dates stored as text (safe version)"""
510
+ result = {
511
+ "is_date_as_text": False,
512
+ "confidence": 0.0,
513
+ "common_formats": []
514
+ }
515
+
516
+ try:
517
+ if values is None or len(values) == 0:
518
+ result["warning"] = "No values provided"
519
+ return result
520
+
521
+ sample_size = min(100, len(values))
522
+ dates_detected = 0
523
+ formats_detected = []
524
+
525
+ for val in values[:sample_size]:
526
+ try:
527
+ if not isinstance(val, str):
528
+ continue
529
+
530
+ parser.parse(val)
531
+ dates_detected += 1
532
+
533
+ if re.match(r'^\d{4}-\d{2}-\d{2}', val):
534
+ formats_detected.append("ISO")
535
+ elif re.match(r'^\d{2}/\d{2}/\d{4}', val):
536
+ formats_detected.append("US/UK")
537
+ elif re.match(r'^\d{2}-\w{3}-\d{4}', val):
538
+ formats_detected.append("DD-MMM-YYYY")
539
+ else:
540
+ formats_detected.append("Other")
541
+
542
+ except Exception:
543
+ continue
544
+
545
+ if sample_size > 0:
546
+ percentage = (dates_detected / sample_size) * 100
547
+ else:
548
+ percentage = 0.0
549
+
550
+ is_date_as_text = percentage > 80
551
+ format_counts = Counter(formats_detected)
552
+ common_formats = format_counts.most_common(2)
553
+
554
+ result.update({
555
+ "is_date_as_text": is_date_as_text,
556
+ "confidence": percentage,
557
+ "common_formats": common_formats if formats_detected else []
558
+ })
559
+
560
+ except Exception as e:
561
+ result["error"] = f"Error in check_dates_as_text: {str(e)}"
562
+
563
+ return result
564
+
565
+ def analyze_cardinality(self, values, unique_count, total_count):
566
+ """Analyze cardinality to determine if a column is categorical, continuous, or ID-like, safely"""
567
+ result = {}
568
+
569
+ try:
570
+ if values is None or len(values) == 0 or total_count == 0:
571
+ result["error"] = "Empty values or total_count is 0"
572
+ return result
573
+
574
+ uniqueness_ratio = unique_count / total_count if total_count else 0
575
+
576
+ try:
577
+ value_counts = Counter(values)
578
+ most_common = value_counts.most_common(5)
579
+ top_value_percentage = most_common[0][1] / total_count * 100 if most_common else 0
580
+ except Exception as e:
581
+ most_common = []
582
+ top_value_percentage = 0
583
+ result["warning"] = f"Error analyzing value counts: {str(e)}"
584
+
585
+ if uniqueness_ratio > 0.9:
586
+ result["is_id_like"] = True
587
+ result["cardinality_type"] = "id_like"
588
+ elif uniqueness_ratio > 0.5:
589
+ result["is_continuous"] = True
590
+ result["cardinality_type"] = "continuous"
591
+ else:
592
+ result["is_categorical"] = True
593
+ result["cardinality_type"] = "categorical"
594
+
595
+ if unique_count <= 5:
596
+ result["categorical_level"] = "low"
597
+ elif unique_count <= 20:
598
+ result["categorical_level"] = "medium"
599
+ else:
600
+ result["categorical_level"] = "high"
601
+
602
+ result["top_categories"] = [
603
+ {
604
+ "value": val,
605
+ "count": count,
606
+ "percentage": (count / total_count * 100 if total_count else 0)
607
+ }
608
+ for val, count in most_common
609
+ ]
610
+
611
+ result["uniqueness_ratio"] = uniqueness_ratio
612
+ result["top_value_percentage"] = top_value_percentage
613
+
614
+ except Exception as e:
615
+ result["error"] = f"Error in analyze_cardinality: {str(e)}"
616
+
617
+ return result
618
+
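The uniqueness-ratio cut-offs above (over 0.9 reads as ID-like, over 0.5 as continuous, otherwise categorical) can be checked with a tiny standalone helper that mirrors them:

def cardinality_type(unique_count: int, total_count: int) -> str:
    # Illustrative helper mirroring the thresholds in analyze_cardinality above.
    ratio = unique_count / total_count
    if ratio > 0.9:
        return "id_like"
    if ratio > 0.5:
        return "continuous"
    return "categorical"

print(cardinality_type(8, 8), cardinality_type(5, 8), cardinality_type(2, 8))
# -> id_like continuous categorical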
619
+ def generate_column_description(self, analysis, column_name):
620
+ """Generate a human-readable description of the column based on analysis, with error safety"""
621
+ descriptions = []
622
+
623
+ try:
624
+ if analysis.get("has_mixed_types", False):
625
+ inferred = analysis.get("inferred_type", "unknown")
626
+ descriptions.append(f"Mixed data types with {inferred} being most common")
627
+ else:
628
+ inferred = analysis.get("inferred_type", "unknown")
629
+ descriptions.append(f"{inferred.capitalize()} data type")
630
+ except Exception as e:
631
+ descriptions.append(f"[Error identifying type]: {str(e)}")
632
+
633
+ try:
634
+ if analysis.get("is_id_like", False):
635
+ descriptions.append("Likely an ID field (high uniqueness)")
636
+ elif analysis.get("is_binary", False):
637
+ binary_values = ", ".join(str(v) for v in analysis.get("binary_values", []))
638
+ descriptions.append(f"Binary field with values: {binary_values}")
639
+ elif analysis.get("is_categorical", False):
640
+ cardinality = analysis.get("categorical_level", "unknown")
641
+ unique_count = analysis.get("unique_count", "unknown")
642
+ descriptions.append(f"{cardinality.capitalize()} cardinality categorical field with {unique_count} unique values")
643
+ elif analysis.get("is_continuous", False):
644
+ descriptions.append("Continuous variable")
645
+ except Exception as e:
646
+ descriptions.append(f"[Error determining field category]: {str(e)}")
647
+
648
+ try:
649
+ if analysis.get("numeric_as_text", False):
650
+ descriptions.append("Numeric values stored as text")
651
+ except Exception:
652
+ pass
653
+
654
+ try:
655
+ if analysis.get("date_as_text", False):
656
+ descriptions.append("Date values stored as text")
657
+ except Exception:
658
+ pass
659
+
660
+ try:
661
+ if "min" in analysis and "max" in analysis:
662
+ descriptions.append(f"Range: {analysis['min']} to {analysis['max']}")
663
+ except Exception as e:
664
+ descriptions.append(f"[Error extracting range]: {str(e)}")
665
+
666
+ try:
667
+ if "date_format" in analysis:
668
+ descriptions.append(f"Date format: {analysis['date_format']}")
669
+ if "date_range_days" in analysis:
670
+ descriptions.append(f"Spans {analysis['date_range_days']} days")
671
+ except Exception as e:
672
+ descriptions.append(f"[Error in date format]: {str(e)}")
673
+
674
+ return ". ".join(descriptions)
675
+
676
+ def generate_summary(self, results, df):
677
+ """Generate an overall summary of the dataset with error safety"""
678
+ analysis_results = {k: v for k, v in results.items() if k != "__summary__"}
679
+
680
+ column_types = {}
681
+ mixed_types_count = numeric_as_text_count = date_as_text_count = 0
682
+ binary_count = id_like_count = categorical_count = continuous_count = 0
683
+
684
+ for col, analysis in analysis_results.items():
685
+ try:
686
+ col_type = analysis.get("inferred_type", "unknown")
687
+ column_types[col_type] = column_types.get(col_type, 0) + 1
688
+ except Exception as e:
689
+ column_types["unknown"] = column_types.get("unknown", 0) + 1
690
+
691
+ try:
692
+ if analysis.get("has_mixed_types", False):
693
+ mixed_types_count += 1
694
+ except Exception:
695
+ pass
696
+
697
+ try:
698
+ if analysis.get("numeric_as_text", False):
699
+ numeric_as_text_count += 1
700
+ except Exception:
701
+ pass
702
+
703
+ try:
704
+ if analysis.get("date_as_text", False):
705
+ date_as_text_count += 1
706
+ except Exception:
707
+ pass
708
+
709
+ try:
710
+ if analysis.get("is_binary", False):
711
+ binary_count += 1
712
+ except Exception:
713
+ pass
714
+
715
+ try:
716
+ if analysis.get("is_id_like", False):
717
+ id_like_count += 1
718
+ except Exception:
719
+ pass
720
+
721
+ try:
722
+ if analysis.get("is_categorical", False):
723
+ categorical_count += 1
724
+ except Exception:
725
+ pass
726
+
727
+ try:
728
+ if analysis.get("is_continuous", False):
729
+ continuous_count += 1
730
+ except Exception:
731
+ pass
732
+
733
+ recommendations = []
734
+ if mixed_types_count > 0:
735
+ recommendations.append(f"Review {mixed_types_count} columns with mixed data types")
736
+ if numeric_as_text_count > 0:
737
+ recommendations.append(f"Convert {numeric_as_text_count} columns with numeric values stored as text")
738
+ if date_as_text_count > 0:
739
+ recommendations.append(f"Convert {date_as_text_count} columns with date values stored as text")
740
+
741
+ try:
742
+ detailed_findings = self.generate_key_findings(analysis_results)
743
+ except Exception as e:
744
+ detailed_findings = [f"[Error generating key findings]: {str(e)}"]
745
+
746
+ return {
747
+ "total_rows": len(df),
748
+ "total_columns": len(analysis_results),
749
+ "column_type_counts": column_types,
750
+ "data_quality": {
751
+ "mixed_types_count": mixed_types_count,
752
+ "numeric_as_text_count": numeric_as_text_count,
753
+ "date_as_text_count": date_as_text_count
754
+ },
755
+ "structure": {
756
+ "binary_columns": binary_count,
757
+ "id_like_columns": id_like_count,
758
+ "categorical_columns": categorical_count,
759
+ "continuous_columns": continuous_count,
760
+ },
761
+ "recommendations": recommendations,
762
+ "detailed_findings": detailed_findings
763
+ }
764
+
765
+ def generate_key_findings(self, analysis_results):
766
+ """Generate detailed key findings from the analysis results with error safety"""
767
+ findings = []
768
+
769
+ for col, analysis in analysis_results.items():
770
+ try:
771
+ if analysis.get("has_mixed_types", False):
772
+ type_percentages = analysis.get("type_percentages", {})
773
+ finding = f"Column '{col}' has mixed data types: " + ", ".join(
774
+ f"{type_name} ({percentage:.1f}%)"
775
+ for type_name, percentage in type_percentages.items()
776
+ )
777
+ findings.append(finding)
778
+ except Exception as e:
779
+ findings.append(f"[Error analyzing mixed types for '{col}']: {e}")
780
+
781
+ try:
782
+ if analysis.get("numeric_as_text", False):
783
+ details = analysis.get("numeric_as_text_details", {})
784
+ confidence = details.get("confidence", 0.0)
785
+ findings.append(
786
+ f"Column '{col}' likely contains numeric values stored as text ({confidence:.1f}% confidence)"
787
+ )
788
+ except Exception as e:
789
+ findings.append(f"[Error analyzing numeric-as-text for '{col}']: {e}")
790
+
791
+ try:
792
+ if analysis.get("date_as_text", False):
793
+ details = analysis.get("date_as_text_details", {})
794
+ confidence = details.get("confidence", 0.0)
795
+ formats = details.get("common_formats", [])
796
+ format_str = ", ".join(f"{fmt[0]}" for fmt in formats[:2]) if formats else "various"
797
+ findings.append(
798
+ f"Column '{col}' likely contains dates in {format_str} format ({confidence:.1f}% confidence)"
799
+ )
800
+ except Exception as e:
801
+ findings.append(f"[Error analyzing date-as-text for '{col}']: {e}")
802
+
803
+ try:
804
+ if analysis.get("is_id_like", False):
805
+ uniqueness_ratio = analysis.get("uniqueness_ratio", 1.0)
806
+ if uniqueness_ratio < 1.0:
807
+ dupe_percentage = (1 - uniqueness_ratio) * 100
808
+ findings.append(
809
+ f"ID-like column '{col}' has {dupe_percentage:.1f}% duplicate values"
810
+ )
811
+ except Exception as e:
812
+ findings.append(f"[Error analyzing ID-like column '{col}']: {e}")
813
+
814
+ return findings
815
+
816
+ def analyze_dataset_dimensionality(self, sample_size=None, detailed=True, verbose=False):
817
+ """Analyze the dimensionality characteristics of a dataset"""
818
+ logger.info(f"Starting dataset dimensionality analysis", log_type="data_statistics", console=verbose)
819
+
820
+ results = {
821
+ "basic_info": {},
822
+ "aspect_ratio": {},
823
+ "sparsity": {},
824
+ "memory_usage": {},
825
+ "processing_complexity": {},
826
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
827
+ }
828
+
829
+ analysis_failures = []
830
+
831
+ try:
832
+ df = self.data
833
+
834
+ if sample_size and sample_size < len(df):
835
+ df = df.sample(sample_size, random_state=42)
836
+
837
+ results["basic_info"] = {
838
+ "rows": len(df),
839
+ "columns": len(df.columns),
840
+ "total_cells": len(df) * len(df.columns),
841
+ "column_names": list(df.columns)
842
+ }
843
+ except Exception as e:
844
+ results["basic_info"] = {
845
+ "status": "failed",
846
+ "error": str(e)
847
+ }
848
+ analysis_failures.append("basic_info")
849
+
850
+ results["status"] = "failed"
851
+ results["error"] = f"Failed to read CSV file: {str(e)}"
852
+ return results
853
+
854
+
855
+
856
+ if sample_size and sample_size < len(df):
857
+ df = df.sample(sample_size, random_state=42)
858
+
859
+ try:
860
+ rows = results["basic_info"]["rows"]
861
+ cols = results["basic_info"]["columns"]
862
+
863
+ results["aspect_ratio"] = {
864
+ "rows_to_columns_ratio": rows / cols if cols > 0 else float('inf'),
865
+ "columns_to_rows_ratio": cols / rows if rows > 0 else float('inf'),
866
+ "is_wide": cols > rows,
867
+ "is_tall": rows > cols,
868
+ "shape_description": self.get_shape_description(rows, cols)
869
+ }
870
+ except Exception as e:
871
+ results["aspect_ratio"] = {
872
+ "status": "failed",
873
+ "error": str(e)
874
+ }
875
+ analysis_failures.append("aspect_ratio")
876
+
877
+ try:
878
+ null_count = df.isna().sum().sum()
879
+ total_cells = results["basic_info"]["total_cells"]
880
+
881
+ empty_str_count = 0
882
+ for col in df.select_dtypes(include=['object']).columns:
883
+ try:
884
+ empty_str_count += (df[col] == "").sum()
885
+ except:
886
+ pass
887
+
888
+ zero_count = 0
889
+ for col in df.select_dtypes(include=['number']).columns:
890
+ try:
891
+ zero_count += (df[col] == 0).sum()
892
+ except:
893
+ pass
894
+
895
+ null_sparsity = null_count / total_cells if total_cells > 0 else 0
896
+ total_sparsity = (null_count + empty_str_count) / total_cells if total_cells > 0 else 0
897
+
898
+ results["sparsity"] = {
899
+ "null_count": int(null_count),
900
+ "empty_string_count": int(empty_str_count),
901
+ "zero_count_in_numeric": int(zero_count),
902
+ "null_sparsity": null_sparsity,
903
+ "total_sparsity": total_sparsity,
904
+ "sparsity_percentage": total_sparsity * 100,
905
+ "sparsity_level": self.get_sparsity_level(total_sparsity)
906
+ }
907
+
908
+ if detailed:
909
+ col_sparsity = {}
910
+ for col in df.columns:
911
+ null_pct = df[col].isna().mean() * 100
912
+
913
+ empty_pct = 0
914
+ if df[col].dtype == 'object':
915
+ try:
916
+ empty_pct = (df[col] == "").mean() * 100
917
+ except:
918
+ pass
919
+
920
+ col_sparsity[col] = {
921
+ "null_percentage": null_pct,
922
+ "empty_percentage": empty_pct,
923
+ "total_sparsity": null_pct + empty_pct
924
+ }
925
+ results["sparsity"]["column_sparsity"] = col_sparsity
926
+ except Exception as e:
927
+ results["sparsity"] = {
928
+ "status": "failed",
929
+ "error": str(e)
930
+ }
931
+ analysis_failures.append("sparsity")
932
+
933
+ try:
934
+ mem_usage = df.memory_usage(deep=True)
935
+ total_mem = mem_usage.sum()
936
+
937
+ mem_by_type = {}
938
+ for dtype in df.dtypes.value_counts().index:
939
+ dtype_cols = df.select_dtypes(include=[dtype]).columns
940
+ dtype_mem = sum(mem_usage[col] for col in dtype_cols if col in mem_usage)
941
+ mem_by_type[str(dtype)] = dtype_mem
942
+
943
+ full_dataset_mem = total_mem
944
+ if sample_size and sample_size < results["basic_info"]["rows"]:
945
+ full_rows = results["basic_info"]["rows"]
946
+ full_dataset_mem = total_mem * (full_rows / sample_size)
947
+
948
+ try:
949
+ system_mem = psutil.virtual_memory().total
950
+ except:
951
+ system_mem = None
952
+
953
+ results["memory_usage"] = {
954
+ "total_bytes": int(total_mem),
955
+ "total_mb": round(total_mem / (1024**2), 2),
956
+ "memory_by_type": {k: int(v) for k, v in mem_by_type.items()},
957
+ "average_bytes_per_row": int(total_mem / len(df)) if len(df) > 0 else 0,
958
+ "full_dataset_estimate_mb": round(full_dataset_mem / (1024**2), 2),
959
+ "system_memory_gb": round(system_mem / (1024**3), 2) if system_mem else None,
960
+ "memory_utilization_percentage": round(full_dataset_mem / system_mem * 100, 2) if system_mem else None
961
+ }
962
+ except Exception as e:
963
+ results["memory_usage"] = {
964
+ "status": "failed",
965
+ "error": str(e)
966
+ }
967
+ analysis_failures.append("memory_usage")
968
+
969
+ try:
970
+ rows = results["basic_info"]["rows"]
971
+ cols = results["basic_info"]["columns"]
972
+
973
+ basic_complexity = rows * cols # O(n*m) operations
974
+ sorting_complexity = rows * math.log2(rows) if rows > 0 else 0 # O(n log n)
975
+
976
+ unique_counts = {}
977
+ for col in df.select_dtypes(include=['object']).columns:
978
+ try:
979
+ unique_counts[col] = df[col].nunique()
980
+ except:
981
+ pass
982
+
983
+ num_numeric = len(df.select_dtypes(include=['number']).columns)
984
+
985
+ complexity_level = self.get_complexity_level(rows, cols)
986
+
987
+ results["processing_complexity"] = {
988
+ "basic_complexity": basic_complexity,
989
+ "sorting_complexity": sorting_complexity,
990
+ "unique_values_in_string_columns": unique_counts,
991
+ "numeric_columns_count": num_numeric,
992
+ "complexity_level": complexity_level,
993
+ "processing_recommendation": self.get_processing_recommendation(rows, cols)
994
+ }
995
+ except Exception as e:
996
+ results["processing_complexity"] = {
997
+ "status": "failed",
998
+ "error": str(e)
999
+ }
1000
+ analysis_failures.append("processing_complexity")
1001
+
1002
+ try:
1003
+ results["summary"] = self.generate_dimensionality_summary(results, analysis_failures)
1004
+ except Exception as e:
1005
+ results["summary"] = {
1006
+ "status": "failed",
1007
+ "error": str(e)
1008
+ }
1009
+
1010
+ if analysis_failures:
1011
+ results["status"] = "partial_success"
1012
+ results["failed_analyses"] = analysis_failures
1013
+ else:
1014
+ results["status"] = "success"
1015
+
1016
+ return results
1017
+
1018
+ def get_shape_description(self, rows, cols):
1019
+ """Get a descriptive term for the dataset shape based on rows and columns ratio"""
1020
+ ratio = rows / cols if cols > 0 else float('inf')
1021
+
1022
+ if ratio > 1000:
1023
+ return "extremely_tall"
1024
+ elif ratio > 100:
1025
+ return "very_tall"
1026
+ elif ratio > 10:
1027
+ return "tall"
1028
+ elif ratio > 3:
1029
+ return "moderately_tall"
1030
+ elif ratio > 0.33:
1031
+ return "balanced"
1032
+ elif ratio > 0.1:
1033
+ return "moderately_wide"
1034
+ elif ratio > 0.01:
1035
+ return "wide"
1036
+ elif ratio > 0.001:
1037
+ return "very_wide"
1038
+ else:
1039
+ return "extremely_wide"
1040
+
1041
+ def get_sparsity_level(self, sparsity_ratio):
1042
+ """Get a descriptive term for the dataset sparsity level"""
1043
+ if sparsity_ratio > 0.9:
1044
+ return "extremely_sparse"
1045
+ elif sparsity_ratio > 0.7:
1046
+ return "very_sparse"
1047
+ elif sparsity_ratio > 0.5:
1048
+ return "sparse"
1049
+ elif sparsity_ratio > 0.3:
1050
+ return "moderately_sparse"
1051
+ elif sparsity_ratio > 0.1:
1052
+ return "slightly_sparse"
1053
+ else:
1054
+ return "dense"
1055
+
1056
+ def get_complexity_level(self, rows, cols):
1057
+ """Get a descriptive term for processing complexity based on dataset size"""
1058
+ cells = rows * cols
1059
+
1060
+ if cells > 1_000_000_000: # 1 billion cells
1061
+ return "extremely_high"
1062
+ elif cells > 100_000_000: # 100 million cells
1063
+ return "very_high"
1064
+ elif cells > 10_000_000: # 10 million cells
1065
+ return "high"
1066
+ elif cells > 1_000_000: # 1 million cells
1067
+ return "moderate"
1068
+ elif cells > 100_000: # 100,000 cells
1069
+ return "low"
1070
+ else:
1071
+ return "very_low"
1072
+
1073
+ def get_processing_recommendation(self, rows, cols):
1074
+ """Get processing recommendations based on dataset size"""
1075
+ cells = rows * cols
1076
+
1077
+ if cells > 1_000_000_000: # 1 billion cells
1078
+ return "Distributed computing recommended (Spark, Dask). Consider data sampling or partitioning."
1079
+ elif cells > 100_000_000: # 100 million cells
1080
+ return "Optimized libraries recommended. Consider chunking data or using out-of-core processing."
1081
+ elif cells > 10_000_000: # 10 million cells
1082
+ return "Standard pandas may be slow. Consider optimizing memory usage or using more efficient libraries."
1083
+ elif cells > 1_000_000: # 1 million cells
1084
+ return "Standard pandas should work well with sufficient memory."
1085
+ else:
1086
+ return "Dataset can be easily processed with standard tools."
1087
+
1088
+ def generate_dimensionality_summary(self, results, analysis_failures):
1089
+ """Generate a summary of the dimensionality analysis"""
1090
+ summary = {}
1091
+
1092
+ try:
1093
+ rows = results["basic_info"]["rows"]
1094
+ cols = results["basic_info"]["columns"]
1095
+ shape_desc = results["aspect_ratio"]["shape_description"]
1096
+ summary["shape"] = f"Dataset has {rows:,} rows and {cols:,} columns ({shape_desc} shape)"
1097
+ except:
1098
+ summary["shape"] = "Could not determine dataset shape"
1099
+
1100
+ try:
1101
+ null_count = results["sparsity"]["null_count"]
1102
+ sparsity_pct = results["sparsity"]["sparsity_percentage"]
1103
+ sparsity_level = results["sparsity"]["sparsity_level"]
1104
+ summary["sparsity"] = f"Dataset is {sparsity_level} with {sparsity_pct:.1f}% missing values"
1105
+ except:
1106
+ summary["sparsity"] = "Could not determine dataset sparsity"
1107
+
1108
+ try:
1109
+ mem_mb = results["memory_usage"]["total_mb"]
1110
+ full_est_mb = results["memory_usage"]["full_dataset_estimate_mb"]
1111
+ if mem_mb == full_est_mb:
1112
+ summary["memory"] = f"Memory usage: {mem_mb:.1f} MB"
1113
+ else:
1114
+ summary["memory"] = f"Memory usage: {mem_mb:.1f} MB (estimated full dataset: {full_est_mb:.1f} MB)"
1115
+ except:
1116
+ summary["memory"] = "Could not determine memory usage"
1117
+
1118
+ try:
1119
+ complexity = results["processing_complexity"]["complexity_level"]
1120
+ summary["complexity"] = f"Processing complexity: {complexity}"
1121
+ except:
1122
+ summary["complexity"] = "Could not determine processing complexity"
1123
+
1124
+ insights = []
1125
+
1126
+ try:
1127
+ if results["aspect_ratio"]["is_wide"]:
1128
+ insights.append("Dataset has more columns than rows, which is unusual and may indicate a wide/transposed format")
1129
+
1130
+ if cols > 100:
1131
+ insights.append(f"High number of columns ({cols}) may indicate a need for dimensionality reduction")
1132
+ except:
1133
+ pass
1134
+
1135
+ try:
1136
+ if results["sparsity"]["sparsity_percentage"] > 50:
1137
+ insights.append(f"Dataset is quite sparse ({results['sparsity']['sparsity_percentage']:.1f}% missing values), consider handling missing data")
1138
+
1139
+ if "column_sparsity" in results["sparsity"]:
1140
+ sparse_cols = [col for col, data in results["sparsity"]["column_sparsity"].items()
1141
+ if data["total_sparsity"] > 80]
1142
+ if sparse_cols:
1143
+ col_count = len(sparse_cols)
1144
+ if col_count <= 3:
1145
+ insights.append(f"Consider dropping or imputing very sparse columns: {', '.join(sparse_cols)}")
1146
+ else:
1147
+ insights.append(f"{col_count} columns have >80% missing values and may be candidates for removal")
1148
+ except:
1149
+ pass
1150
+
1151
+ try:
1152
+ if results["memory_usage"].get("memory_utilization_percentage", 0) > 50:
1153
+ insights.append("Dataset is using a significant portion of system memory, consider chunking or sampling")
1154
+
1155
+ if "memory_by_type" in results["memory_usage"]:
1156
+ mem_by_type = results["memory_usage"]["memory_by_type"]
1157
+ if "object" in mem_by_type:
1158
+ object_mem_ratio = mem_by_type["object"] / results["memory_usage"]["total_bytes"]
1159
+ if object_mem_ratio > 0.7:
1160
+ insights.append("String/object columns use most memory; consider category type conversion")
1161
+ except:
1162
+ pass
1163
+
1164
+ try:
1165
+ complexity_level = results["processing_complexity"]["complexity_level"]
1166
+ if complexity_level in ["high", "very_high", "extremely_high"]:
1167
+ insights.append(results["processing_complexity"]["processing_recommendation"])
1168
+ except:
1169
+ pass
1170
+
1171
+ summary["key_insights"] = insights if insights else ["No specific insights identified"]
1172
+
1173
+ recommendations = []
1174
+
1175
+ try:
1176
+ if cols > 100:
1177
+ recommendations.append("Consider dimensionality reduction (PCA, feature selection)")
1178
+
1179
+ if results["aspect_ratio"]["is_wide"]:
1180
+ recommendations.append("Check if data is in correct format (might need transposing)")
1181
+ except:
1182
+ pass
1183
+
1184
+ try:
1185
+ if results["sparsity"]["sparsity_percentage"] > 30:
1186
+ recommendations.append("Implement appropriate missing value strategy (imputation or removal)")
1187
+ except:
1188
+ pass
1189
+
1190
+ try:
1191
+ if "memory_by_type" in results["memory_usage"] and "object" in results["memory_usage"]["memory_by_type"]:
1192
+ object_mem_ratio = results["memory_usage"]["memory_by_type"]["object"] / results["memory_usage"]["total_bytes"]
1193
+ if object_mem_ratio > 0.5:
1194
+ recommendations.append("Optimize memory by converting string columns to categories or using more efficient data types")
1195
+ except:
1196
+ pass
1197
+
1198
+ try:
1199
+ complexity_level = results["processing_complexity"]["complexity_level"]
1200
+ if complexity_level in ["very_high", "extremely_high"]:
1201
+ recommendations.append("Use chunking, sampling or distributed processing for this dataset")
1202
+ except:
1203
+ pass
1204
+
1205
+ summary["recommendations"] = recommendations if recommendations else ["No specific recommendations"]
1206
+
1207
+ if analysis_failures:
1208
+ summary["failures"] = f"Some analyses failed: {', '.join(analysis_failures)}"
1209
+
1210
+ return summary
1211
+
1212
+ def generate_report_from_agent(self, input) -> str:
1213
+ '''Transform the JSON output into a user-readable report'''
1214
+ try:
1215
+ input = f"ML Task: {self.ml_task}\n{input}"
1216
+ response: RunResponse = self.writer.run(input, stream=False)
1217
+ return response.content
1218
+ except Exception as e:
1219
+ return f"Failed to generate report with error: {e}"
1220
+
1221
+ def convert_numpy_types(self, obj):
1222
+ if isinstance(obj, dict):
1223
+ return {k: self.convert_numpy_types(v) for k, v in obj.items()}
1224
+ elif isinstance(obj, list):
1225
+ return [self.convert_numpy_types(item) for item in obj]
1226
+ elif isinstance(obj, np.integer):
1227
+ return int(obj)
1228
+ elif isinstance(obj, np.floating):
1229
+ return float(obj)
1230
+ elif isinstance(obj, np.bool_):
1231
+ return bool(obj)
1232
+ elif isinstance(obj, np.ndarray):
1233
+ return obj.tolist()
1234
+ else:
1235
+ return obj
1236
+
1237
+ def run(self, verbose=False)-> Dict[str, dict]:
1238
+ '''Run the entire workflow'''
1239
+ statistical_summary_dict = self.build_statistical_summary(verbose=verbose)
1240
+ data_type_analysis_dict = self.analyze_data_types(verbose=verbose)
1241
+ dataset_dimensionality_dict = self.analyze_dataset_dimensionality(verbose=verbose)
1242
+
1243
+ statistical_summary_str = self.format_json(statistical_summary_dict)
1244
+ data_type_analysis_str = self.format_json(data_type_analysis_dict)
1245
+ dataset_dimensionality_str = self.format_json(dataset_dimensionality_dict)
1246
+
1247
+ logger.info("Generating final reports....", log_type='data_statistics', console=verbose)
1248
+ statistical_report = self.generate_report_from_agent(input=statistical_summary_str)
1249
+ data_type_report = self.generate_report_from_agent(input=data_type_analysis_str)
1250
+ dataset_dimensionality_report = self.generate_report_from_agent(input=dataset_dimensionality_str)
1251
+
1252
+ final_result = {
1253
+ "statistical_analysis": {
1254
+ 'dict': self.convert_numpy_types(statistical_summary_dict),
1255
+ 'report': statistical_report
1256
+ },
1257
+ "data_type_analysis": {
1258
+ 'dict': self.convert_numpy_types(data_type_analysis_dict),
1259
+ 'report': data_type_report
1260
+ },
1261
+ "dataset_dimensionality_analysis": {
1262
+ 'dict': self.convert_numpy_types(dataset_dimensionality_dict),
1263
+ 'report': dataset_dimensionality_report
1264
+ },
1265
+ }
1266
+
1267
+ return final_result
1268
+
1269
+
1270
+
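Note for readers of this diff: the `run()` method above returns nested dicts that may still contain NumPy scalars, which is why the `convert_numpy_types` step exists before the results are serialized. A minimal standalone sketch of that idea is shown below; the helper name `to_builtin` and the sample values are illustrative and not part of this module.

```python
import json
import numpy as np

def to_builtin(obj):
    # Recursively convert NumPy containers/scalars to plain Python types,
    # mirroring the convert_numpy_types method in data_statistics.py.
    if isinstance(obj, dict):
        return {k: to_builtin(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [to_builtin(v) for v in obj]
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

summary = {"rows": np.int64(1000), "sparsity": np.float64(0.12), "is_wide": np.bool_(False)}
# json.dumps(summary) would raise TypeError on NumPy scalars; converting first makes it serializable.
print(json.dumps(to_builtin(summary)))
```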
src/app/pipelines/modules/data_understanding_context.py ADDED
@@ -0,0 +1,332 @@
1
+ import os
2
+ import random
3
+ import pandas as pd
4
+ from dotenv import load_dotenv
5
+ from src.core.utils import logger
6
+ from pydantic import BaseModel, Field
7
+ from agno.models.openai import OpenAIChat
8
+ from agno.agent import Agent, RunResponse
9
+ from typing import Optional, Union, List, Dict, Tuple
10
+ from agno.tools.duckduckgo import DuckDuckGoTools
11
+
12
+ load_dotenv()
13
+
14
+ # class BCAgentResponseSchema(BaseModel):
15
+ # executive_summary: Optional[str] = Field(
16
+ # default=None,
17
+ # description="Contains information about the industry recognized, business purpose, key insights and impact on analysis"
18
+ # )
19
+ # business_concept_mapping: Optional[str] = Field(
20
+ # default=None,
21
+ # description="Contains all the columns along with their business concept, definition, industry standard term and their metric"
22
+ # )
23
+ # business_dependencies: Optional[str] = Field(
24
+ # default=None,
25
+ # description="Contains all the columns along with their dependent variable(s), nature of dependency and business impact"
26
+ # )
27
+ # key_performance_indicators: Optional[str] = Field(
28
+ # default=None,
29
+ # description="Contains names of all the kpis along with their formula, business objective, target range and industry benchmark"
30
+ # )
31
+ # regulatory_compliance_considerations: Optional[List[str]] = Field(
32
+ # default=None,
33
+ # description="Contains list all the regulations along with their applicability, key requirements and affected variables"
34
+ # )
35
+ # business_rules: Optional[List[str]] = Field(
36
+ # default=None,
37
+ # description="Contains list of all the rules along with their description, affected variables, priority level"
38
+ # )
39
+ # impact_analysis: Optional[List[str]] = Field(
40
+ # default=None,
41
+ # description="Contains list of all the business context elements along with their data quality impact, analysis approach impact, modeling approach followed by their recommended approach"
42
+ # )
43
+ # references: Optional[List[str]] = Field(
44
+ # default=None,
45
+ # description="Contains the links for all the references and sources"
46
+ # )
47
+
48
+ # class SUAgentResponseSchema(BaseModel):
49
+ # variable_definitions: Optional[Dict[str, str]] = Field(
50
+ # default=None,
51
+ # description="Contains the concise definition for each variable along with its unit of measurement observed"
52
+ # )
53
+ # reference_codes: Optional[Dict[str, str]] = Field(
54
+ # default=None,
55
+ # description="Contains the meaning for various abbreviations"
56
+ # )
57
+ # heirarchical_relationships: Optional[Dict[str, str]] = Field(
58
+ # default=None,
59
+ # description="Contains some information about heirarchy between variables"
60
+ # )
61
+
62
+
63
+ # DATA SOURCE ASSESSMENT STILL INCOMPLETE
64
+
65
+ class DataUnderstandingContextWorkflow:
66
+ def __init__(
67
+ self, data_source: str,
68
+ llm_choice: str,
69
+ source_context: Union[str, None] = None,
70
+ business_context: Union[str, None] = None
71
+ ) -> None:
72
+ '''This is the first module executed for all EDA tasks.'''
73
+
74
+ self.data = None
75
+ self.source_context = source_context
76
+ self.business_context = business_context
77
+ self.llm = OpenAIChat(
78
+ id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))
79
+
80
+ try:
81
+ self.data = pd.read_csv(data_source)
82
+ except Exception as e:
83
+ logger.error(
84
+ f"Failed to read the file from the data source with error: {e}", log_type="data_understanding_context", console=True)
85
+
86
+ def extract_column_data(self, max_col_limit=15) -> Union[str, None]:
87
+ """
88
+ Extracts column names and samples rows from a dataset.
89
+
90
+ Rules:
91
+ - If the dataset has more than max_col_limit columns (default 15), sample 1 random row.
92
+ - Otherwise, sample 3 random rows.
93
+
94
+ Returns:
95
+ A formatted string containing column names and sampled data.
96
+ """
97
+
98
+ columns = self.data.columns.tolist()
99
+ num_columns = len(columns)
100
+
101
+ if num_columns > max_col_limit:
102
+ sampled_rows = self.data.sample(
103
+ 1, random_state=random.randint(0, 1000))
104
+ else:
105
+ sampled_rows = self.data.sample(
106
+ 3, random_state=random.randint(0, 1000))
107
+
108
+ doc = f"""
109
+ Dataset Summary
110
+ ===============
111
+
112
+ Total Columns: {num_columns}
113
+ Column Names:
114
+ {columns}
115
+
116
+ Sampled Data:
117
+ {sampled_rows.to_string(index=False)}
118
+ """
119
+
120
+ return doc.strip()
121
+
122
+ def build_business_context_integration(self, verbose=False) -> Union[RunResponse, None]:
123
+ logger.info(f"Starting business context integration", log_type='data_understanding_context', console=verbose)
124
+ if not self.business_context:
125
+ logger.info("No business context found. Skipping this section...",
126
+ log_type='data_understanding_context', console=verbose)
127
+ return None
128
+ else:
129
+
130
+ try:
131
+ bc_agent = Agent(
132
+ name="Businesss Context Agent",
133
+ model=self.llm,
134
+ markdown=True,
135
+ reasoning=True,
136
+ # response_model=BCAgentResponseSchema,
137
+ instructions="""
138
+ Business Context Integration Agent Prompt
139
+
140
+ Your objective is to serve as a Business Context Integration Agent. Given a dataset with column names and the type of machine learning task (e.g., classification, regression, clustering), your responsibility is to gather authoritative, relevant business context by performing extensive web-based research.
141
+
142
+ All claims must be supported with referenced links.
143
+
144
+ Use the dataset columns and ML task as the anchor to drive your research and reasoning.
145
+
146
+ -----------------------------------------------
147
+ Research Goals:
148
+ -----------------------------------------------
149
+
150
+ 1. executive_summary:
151
+ - Identify the broader industry and functional domain that the dataset likely belongs to based on the column names.
152
+ - Outline the business purpose for building the machine learning model.
153
+ - Explain how incorporating domain knowledge impacts the model’s relevance and usefulness in a real-world context.
154
+ - Highlight notable trends, challenges, or insights from the industry that are relevant to the problem space.
155
+
156
+ 2. business_concept_mapping:
157
+ - Map each dataset column to its corresponding business concept and provide a clear and formal definition.
158
+ - Identify industry-recognized terms and naming conventions related to the variables.
159
+ - Include units of measurement, and where applicable, mathematical formulas or logic behind computed values.
160
+ - Use standardized terminologies from domain-specific glossaries or data dictionaries to ensure consistency.
161
+
162
+ 3. business_dependencies:
163
+ - For each variable, identify direct and indirect relationships with other variables in the dataset.
164
+ - Define whether these relationships are statistical, causal, temporal, or logical.
165
+ - Reference literature or domain analyses that justify these dependencies and describe their business implications.
166
+ - Explain how these dependencies may affect downstream model features or interpretation.
167
+
168
+ 4. key_performance_indicators:
169
+ - Determine the key performance indicators (KPIs) that are either directly present in the dataset or can be derived from it.
170
+ - Provide detailed formulas, objectives behind each KPI, and their role in strategic or operational decision-making.
171
+ - Include target thresholds or performance ranges based on benchmarks from industry or regulatory sources.
172
+ - Document frameworks or methodologies used to calculate and track these KPIs.
173
+
174
+ 5. regulatory_compliance_considerations:
175
+ - Identify regulations, compliance requirements, or legal constraints that apply to the variables or their use in machine learning models.
176
+ - Summarize mandates from regulatory bodies, compliance frameworks, and data protection laws relevant to the domain.
177
+ - Detail how these regulations affect data collection, storage, analysis, and reporting for the columns involved.
178
+ - Note any enforcement actions or penalties associated with non-compliance, if applicable.
179
+
180
+ 6. business_rules:
181
+ - Collect formal and informal business rules that define the valid behavior, constraints, or validation criteria for each variable.
182
+ - Document rule sources such as process documentation, system specifications, industry best practices, and technical standards.
183
+ - Classify rules by type (validation, constraint, workflow, consistency) and specify affected variables.
184
+ - Assign a priority level (e.g., High, Medium, Low) to each rule based on its impact or frequency.
185
+
186
+ 7. impact_analysis:
187
+ - Assess the influence of the business context elements (e.g., dependencies, rules, KPIs, regulations) on data quality and model effectiveness.
188
+ - Highlight how they should inform exploratory analysis, feature selection, or modeling strategies.
189
+ - Recommend specific techniques or precautions to address potential risks or quality issues stemming from business constraints or misalignments.
190
+ - Discuss how different business-driven assumptions or constraints might affect model interpretability, performance, and deployment.
191
+
192
+ 8. references:
193
+ - Provide a list of URLs for all sources consulted or referenced in each of the above categories.
194
+ - Prioritize credible references including academic publications, regulatory websites, vendor documentation, whitepapers, and official data standards.
195
+
196
+
197
+ 9. metadata_analysis:
198
+ - Evaluate the naming conventions of each column in the dataset to determine if they follow consistent, domain-aligned patterns.
199
+ • Identify the naming style (e.g., snake_case, camelCase, abbreviations) and flag inconsistencies.
200
+ • Recommend improvements to align with industry-standard naming practices where needed.
201
+ • Justify suggestions based on business readability, clarity, and traceability.
202
+ - Categorize all dataset variables into the following types:
203
+ • Identifier Variables: Used to uniquely identify records (e.g., IDs, Ticket Numbers).
204
+ • Analytical Variables: Variables that contain information relevant to prediction, segmentation, or business insight generation.
205
+ • Target Variables: Outcome labels used in supervised machine learning tasks.
206
+ • Derived or Redundant Variables: Columns generated from transformations, combinations, or duplicates of others.
207
+ - Explain the business role and implications of each variable type:
208
+ • Why are certain fields considered identifiers and not used for analysis?
209
+ • Which analytical variables are central to decision-making and why?
210
+ • How does identifying target and redundant variables support data quality and model interpretability?
211
+ - Where applicable, reference industry data dictionaries or data modeling standards to support naming and classification.
212
+
213
+ -----------------------------------------------
214
+ Guidelines:
215
+ -----------------------------------------------
216
+
217
+ - Maintain a descriptive, business-friendly tone in all outputs.
218
+ - Avoid technical jargon unless it is relevant to business understanding or regulatory compliance.
219
+ - Do not make assumptions without a verifiable source; indicate low-confidence insights where applicable.
220
+ - Ensure all responses are grounded in real-world context and supported by credible, traceable information.
221
+ - Be comprehensive but concise—aim to deliver practical business value with each section of the output.
222
+ """,
223
+ tools=[DuckDuckGoTools(search=True, news=False)]
224
+ )
225
+
226
+ columns_data = self.extract_column_data()
227
+ prompt = (
228
+ f'Business Context: \n'
229
+ f'{self.business_context}\n'
230
+ f'{columns_data}'
231
+ )
232
+
233
+ response: RunResponse = bc_agent.run(prompt, stream=False)
234
+
235
+ logger.info(f"Business Context integration finished....",
236
+ log_type="data_understanding_context", console=verbose)
237
+ return response
238
+ except Exception as e:
239
+ logger.error(
240
+ f"Failed to build business context integration with error: {e}", log_type="data_understanding_context", console=verbose)
241
+ return None
242
+
243
+ def build_semantic_understanding(self, verbose=False) -> Union[RunResponse, None]:
244
+ logger.info(f"Starting to build semantic integration", log_type='data_understanding_context', console=verbose)
245
+ try:
246
+ bc_agent = Agent(
247
+ name="Semantic Understanding Agent",
248
+ model=self.llm,
249
+ markdown=True,
250
+ reasoning=True,
251
+ # response_model=SUAgentResponseSchema,
252
+ instructions="""
253
+ Objective:
254
+ You are a Semantic Understanding Agent responsible for analyzing a dataset with no prior documentation. Your goal is to enrich each variable (column) by extracting and constructing meaningful metadata that helps clarify the variable’s semantic role in the dataset.
255
+ This process is essential for enabling downstream tasks like data validation, feature engineering, explainability, and regulatory compliance.
256
+
257
+ You must generate detailed descriptions and metadata across the following four components for each column in the dataset:
258
+
259
+ -------------------------------------------------------------
260
+ 1. Create/Validate Variable Definitions
261
+ -------------------------------------------------------------
262
+ - Infer and construct a clear, concise, and human-readable definition for each variable based solely on the column name, data type, and observed value patterns.
263
+ - Where the column contains obvious domain terms, use domain-specific language to define them precisely.
264
+ - If a variable is ambiguous, document your reasoning and note the uncertainty.
265
+ - Ensure that the definition is useful for a business analyst or data scientist trying to understand the purpose of the variable.
266
+
267
+ -------------------------------------------------------------
268
+ 2. Document Units of Measurement for Each Variable
269
+ -------------------------------------------------------------
270
+ - Determine if the variable has an implicit or explicit unit of measurement.
271
+ - Use value ranges and patterns to infer the unit (e.g., numeric age values imply years).
272
+ - If no unit is applicable (e.g., categorical strings), indicate “N/A”.
273
+ - Standardize the unit using singular form and SI/non-SI conventions when applicable.
274
+
275
+ -------------------------------------------------------------
276
+ 3. Identify Reference Codes and Their Meanings
277
+ -------------------------------------------------------------
278
+ - Identify variables that represent coded values, whether single-letter, numeric, or short abbreviations.
279
+ - For such variables, list all unique values and attempt to map each one to a meaningful label.
280
+ - Use statistical reasoning (value distribution) and common domain knowledge to deduce what each code stands for.
281
+ - If meanings are uncertain, flag them with a confidence level.
282
+
283
+
284
+ -------------------------------------------------------------
285
+ 4. Document Any Known Hierarchical Relationships
286
+ -------------------------------------------------------------
287
+ - Identify columns that relate to one another hierarchically, structurally, or contextually.
288
+ - Describe the hierarchy and explain the nature of the relationship (e.g., parent-child, group-subgroup, categorical-numerical link).
289
+ - If patterns suggest derived or grouped relationships, explain your logic.
290
+
291
+ -------------------------------------------------------------
292
+ Output Guidelines:
293
+ -------------------------------------------------------------
294
+ - Be precise and neutral in language.
295
+ - If uncertain, include a note explaining ambiguity or assumption.
296
+ - Avoid inventing information—only reason from what's in the dataset.
297
+
298
+ -------------------------------------------------------------
299
+ Final Note:
300
+ -------------------------------------------------------------
301
+ You do not have access to external documentation or glossaries. All your outputs must be reasoned and extracted using the dataset column names and data types alone
302
+ """
303
+ )
304
+
305
+ columns_data = self.extract_column_data()
306
+
307
+ prompt = (
308
+ f"{columns_data}"
309
+ )
310
+
311
+ response: RunResponse = bc_agent.run(prompt, stream=False)
312
+
313
+ logger.info(f"Semantic Understanding Finished....",
314
+ log_type="data_understanding_context", console=verbose)
315
+ return response
316
+ except Exception as e:
317
+ logger.error(
318
+ f"Failed to build Semantic Understanding with error: {e}", log_type="data_understanding_context", console=verbose)
319
+ return None
320
+
321
+ def run(self, verbose=False) -> Dict[str, dict]:
322
+ bci_result = self.build_business_context_integration(verbose=verbose)
323
+ su_result = self.build_semantic_understanding(verbose=verbose)
324
+
325
+ return {
326
+ 'business_context_integration':{
327
+ "report": bci_result.content
328
+ },
329
+ 'semantic_understanding': {
330
+ "report": su_result.content
331
+ }
332
+ }
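A minimal usage sketch for the workflow above. The CSV path, model id, and business-context string are placeholders, and it assumes the repository root is on PYTHONPATH and OPENAI_API_KEY is set in the environment.

```python
from src.app.pipelines.modules.data_understanding_context import DataUnderstandingContextWorkflow

workflow = DataUnderstandingContextWorkflow(
    data_source="data/train.csv",      # hypothetical CSV path
    llm_choice="gpt-4o-mini",          # assumed OpenAI model id
    business_context="Telecom customer churn analysis",  # optional; None skips that step
)
result = workflow.run(verbose=True)
print(result["business_context_integration"]["report"])
print(result["semantic_understanding"]["report"])
```

When `business_context` is omitted, `build_business_context_integration` returns None and the corresponding report is None.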
src/app/pipelines/modules/univariate_analysis.py ADDED
@@ -0,0 +1,1437 @@
1
+ import os
2
+ import math
3
+ import json
4
+ import numpy as np
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+ import lmoments3 as lm
8
+ import scipy.stats as stats
9
+ from src.core.utils import logger
10
+ from agno.models.openai import OpenAIChat
11
+ from agno.agent import Agent, RunResponse
12
+ from sklearn.neighbors import KernelDensity
13
+ from sklearn.model_selection import GridSearchCV
14
+ from typing import Union, Tuple, Dict, Any, List, Optional
15
+ from scipy.stats import norm, shapiro, anderson, kstest, normaltest
16
+
17
+ class UnivariateAnalysisWorkflow:
18
+ def __init__(
19
+ self, data_source: str,
20
+ llm_choice: str,
21
+ ml_task: str
22
+ ) -> None:
23
+ '''Univariate analysis workflow: loads a CSV, analyzes variable distributions, and writes LLM-generated reports'''
24
+ self.data = None
25
+ self.data_source = data_source
26
+ self.llm_choice = llm_choice
27
+ self.llm = OpenAIChat(id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))
28
+ self.writer: Agent = Agent(
29
+ model=self.llm,
30
+ instructions=[
31
+ "You will be provided with lots of structured outputs. Your work is to display this"
32
+ "in a nicely formatted manner. You must analayze the results and output a comprehensive and insightful report"
33
+ ],
34
+ markdown=True,
35
+ )
36
+ self.ml_task = ml_task
37
+ _ = self.load_data(data_source=data_source)
38
+
39
+ def load_data(self, data_source: str) -> Union[None, bool]:
40
+ '''Load CSV into dataframe'''
41
+ try:
42
+ self.data = pd.read_csv(data_source)
43
+ return True
44
+ except Exception as e:
45
+ logger.error(f"Failed to read the file from the data source with error: {e}", log_type="data_quality_assessment", console=True)
46
+ return False
47
+
48
+ def detect_variable_types(self)->Tuple[Dict[str, str], Dict[str, str]]:
49
+ """Automatically detect continuous vs categorical variables"""
50
+ continuous_vars = {}
51
+ categorical_vars = {}
52
+
53
+ for col in self.data.columns:
54
+ col_data = self.data[col].dropna()
55
+
56
+ if len(col_data) == 0:
57
+ continue
58
+
59
+ dtype = str(self.data[col].dtype)
60
+ nunique = col_data.nunique()
61
+ sample_values = col_data.head(5).tolist()
62
+
63
+ if np.issubdtype(self.data[col].dtype, np.datetime64):
64
+ continue
65
+
66
+ if dtype == 'bool' or (nunique == 2 and set(col_data.unique()).issubset({0, 1, True, False})):
67
+ categorical_vars[col] = 'binary'
68
+ continue
69
+
70
+ if dtype == 'object':
71
+ try:
72
+ _ = pd.to_numeric(col_data)
73
+ col_data = pd.to_numeric(col_data)
74
+ except ValueError:
75
+ categorical_vars[col] = f'categorical (text, {nunique} unique values)'
76
+ continue
77
+
78
+ if np.issubdtype(col_data.dtype, np.number):
79
+ if nunique <= 20:
80
+ if (np.array_equal(col_data.unique(), np.arange(nunique)) or (nunique <= 10 and all(x in col_data.unique() for x in range(1, nunique+1)))):
81
+ categorical_vars[col] = f'categorical (ordinal, {nunique} levels)'
82
+ else:
83
+ categorical_vars[col] = f'categorical (nominal, {nunique} levels)'
84
+ else:
85
+ continuous_vars[col] = 'continuous'
86
+ else:
87
+ categorical_vars[col] = f'categorical (other, {nunique} unique values)'
88
+
89
+ return continuous_vars, categorical_vars
90
+
91
+ def analyze_distributions(self, verbose=False):
92
+ """Run distribution analysis for all continuous variables"""
93
+ logger.info("Starting to analyze distributions..", log_type='univariate_analysis', console=verbose)
94
+ continuous_vars, _ = self.detect_variable_types()
95
+ results = {}
96
+ for var in tqdm(continuous_vars):
97
+ try:
98
+ var_results = {
99
+ 'kde': self.kernel_density_estimation(var),
100
+ 'normality_tests': self.run_normality_tests(var),
101
+ # 'distribution_fit': self.fit_distributions(var),
102
+ # 'modality_tests': self.test_modality(var)
103
+ }
104
+ results[var] = var_results
105
+ except Exception as e:
106
+ if verbose:
107
+ logger.error(f"Error analyzing variable {var}: {str(e)}", log_type="data_quality_assessment", console=verbose)
108
+ results[var] = {'error': str(e)}
109
+ return results
110
+
111
+ def kernel_density_estimation(self, variable: str, verbose: bool = False) -> Dict[str, Any]:
112
+ """Perform comprehensive kernel density estimation with optimal bandwidth selection, multiple kernel types, and comparative analysis."""
113
+ if variable not in self.data.columns:
114
+ raise ValueError(f"Variable {variable} not found in dataset")
115
+
116
+ x = self.data[variable].dropna().values
117
+ results = {}
118
+
119
+ try:
120
+ results['basic_stats'] = {
121
+ 'n': len(x),
122
+ 'mean': np.mean(x),
123
+ 'std': np.std(x),
124
+ 'min': np.min(x),
125
+ 'max': np.max(x),
126
+ 'skewness': stats.skew(x),
127
+ 'kurtosis': stats.kurtosis(x)
128
+ }
129
+ except Exception as e:
130
+ logger.error(f"Failed to compute basic stats: {e}", log_type='univariate_analysis', console=verbose)
131
+
132
+ try:
133
+ silverman_bandwidth = (4 * np.std(x)**5 / (3 * len(x)))**(1/5)
134
+ scott_bandwidth = 1.06 * np.std(x) * len(x)**(-1/5)
135
+ except Exception as e:
136
+ silverman_bandwidth = scott_bandwidth = None
137
+ logger.error(f"Failed to compute Silverman/Scott bandwidths: {e}", log_type='univariate_analysis', console=verbose)
138
+
139
+ try:
140
+ grid = GridSearchCV(
141
+ KernelDensity(kernel='gaussian'),
142
+ {'bandwidth': np.linspace(0.1, 2, 30)},
143
+ cv=5
144
+ )
145
+ grid.fit(x.reshape(-1, 1))
146
+ cv_bandwidth = grid.best_params_['bandwidth']
147
+ except Exception as e:
148
+ cv_bandwidth = None
149
+ logger.error(f"Grid search for bandwidth failed: {e}", log_type='univariate_analysis', console=verbose)
150
+
151
+ results['bandwidth_selection'] = {
152
+ 'silverman': silverman_bandwidth,
153
+ 'scott': scott_bandwidth,
154
+ 'cross_validated': cv_bandwidth,
155
+ 'selected_bandwidth': cv_bandwidth or silverman_bandwidth or scott_bandwidth
156
+ }
157
+ bandwidth = results['bandwidth_selection']['selected_bandwidth']
158
+
159
+ kernels = ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']
160
+ kde_models = {}
161
+ for kernel in kernels:
162
+ try:
163
+ kde = KernelDensity(kernel=kernel, bandwidth=bandwidth)
164
+ kde.fit(x.reshape(-1, 1))
165
+ kde_models[kernel] = kde
166
+ except Exception as e:
167
+ logger.error(f"Failed to fit KDE with kernel {kernel}: {e}", log_type='univariate_analysis', console=verbose)
168
+
169
+ try:
170
+ x_grid = np.linspace(np.min(x), np.max(x), 1000)
171
+ log_likelihoods = {}
172
+ for kernel, model in kde_models.items():
173
+ log_likelihoods[kernel] = model.score_samples(x_grid.reshape(-1, 1)).sum()
174
+
175
+ results['kernel_comparison'] = {
176
+ 'log_likelihoods': log_likelihoods,
177
+ 'best_kernel': max(log_likelihoods, key=log_likelihoods.get)
178
+ }
179
+ best_kernel = results['kernel_comparison']['best_kernel']
180
+ best_kde = kde_models[best_kernel]
181
+ density = np.exp(best_kde.score_samples(x_grid.reshape(-1, 1)))
182
+ except Exception as e:
183
+ results['kernel_comparison'] = {}
184
+ density = None
185
+ x_grid = None
186
+ logger.error(f"Kernel comparison failed: {e}", log_type='univariate_analysis', console=verbose)
187
+
188
+
189
+ dist_names = ['norm', 'lognorm', 'expon', 'gamma', 'beta']
190
+ dist_results = []
191
+ for dist_name in dist_names:
192
+ try:
193
+ params = getattr(stats, dist_name).fit(x)
194
+ pdf = getattr(stats, dist_name).pdf(x_grid, *params)
195
+ kl_div = stats.entropy((density + 1e-10), (pdf + 1e-10))
196
+ dist_results.append({
197
+ 'distribution': dist_name,
198
+ 'params': params,
199
+ 'kl_divergence': kl_div
200
+ })
201
+ except Exception as e:
202
+ logger.error(f"Failed to fit and evaluate {dist_name}: {e}", log_type='univariate_analysis', console=verbose)
203
+
204
+
205
+ try:
206
+ results['parametric_comparison'] = sorted(dist_results, key=lambda x: x['kl_divergence'])
207
+ if results['parametric_comparison']:
208
+ best_dist = results['parametric_comparison'][0]
209
+ dist_pdf = getattr(stats, best_dist['distribution']).pdf(x_grid, *best_dist['params'])
210
+ except Exception as e:
211
+ results['parametric_comparison'] = []
212
+ logger.error(f"Parametric comparison failed: {e}", log_type='univariate_analysis', console=verbose)
213
+
214
+
215
+ try:
216
+ bandwidths = np.linspace(bandwidth * 0.1, bandwidth * 2, 20)
217
+ log_liks = []
218
+ for bw in bandwidths:
219
+ kde = KernelDensity(kernel=best_kernel, bandwidth=bw)
220
+ kde.fit(x.reshape(-1, 1))
221
+ log_liks.append(kde.score(x.reshape(-1, 1)))
222
+ results['bandwidth_sensitivity'] = {
223
+ 'bandwidths': bandwidths.tolist(),
224
+ 'log_likelihoods': log_liks
225
+ }
226
+ except Exception as e:
227
+ logger.error(f"Bandwidth sensitivity analysis failed: {e}", log_type='univariate_analysis', console=verbose)
228
+
229
+ results['summary'] = self.summarize_kde_results(kde_results=results, variable=variable)
230
+
231
+ return results
232
+
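For context on the bandwidth-selection step in `kernel_density_estimation` above, here is a minimal standalone sketch on synthetic data (the sample, grid range, and seed are illustrative) comparing the Silverman and Scott rules of thumb with the cross-validated bandwidth:

```python
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(42)
x = rng.normal(loc=0.0, scale=1.0, size=500)

# Rule-of-thumb bandwidths, as computed in kernel_density_estimation()
silverman = (4 * np.std(x) ** 5 / (3 * len(x))) ** (1 / 5)
scott = 1.06 * np.std(x) * len(x) ** (-1 / 5)

# Cross-validated bandwidth over a small grid (same pattern as the method above)
grid = GridSearchCV(KernelDensity(kernel="gaussian"),
                    {"bandwidth": np.linspace(0.1, 2, 30)}, cv=5)
grid.fit(x.reshape(-1, 1))

print(f"silverman={silverman:.3f}, scott={scott:.3f}, cv={grid.best_params_['bandwidth']:.3f}")
```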
233
+ def summarize_kde_results(self, kde_results: Dict[str, Any], variable: str) -> Dict[str, Any]:
234
+ """Generate a precise and insightful summary of kernel density estimation results"""
235
+
236
+ summary = {
237
+ 'variable': variable,
238
+ 'distribution_characteristics': {},
239
+ 'best_fit': {},
240
+ 'technical_details': {}
241
+ }
242
+
243
+ try:
244
+ if 'basic_stats' in kde_results:
245
+ stats = kde_results['basic_stats']
246
+ shape = "symmetric"
247
+ if stats.get('skewness') is not None:
248
+ if stats['skewness'] > 0.5:
249
+ shape = "right-skewed"
250
+ elif stats['skewness'] < -0.5:
251
+ shape = "left-skewed"
252
+ peakedness = "mesokurtic"
253
+ if stats.get('kurtosis') is not None:
254
+ if stats['kurtosis'] > 0.5:
255
+ peakedness = "leptokurtic (heavy-tailed)"
256
+ elif stats['kurtosis'] < -0.5:
257
+ peakedness = "platykurtic (light-tailed)"
258
+ range_width = stats.get('max', 0) - stats.get('min', 0)
259
+ std_ranges = range_width / stats.get('std', 1) if stats.get('std') else None
260
+ summary['distribution_characteristics'] = {
261
+ 'shape': shape,
262
+ 'peakedness': peakedness,
263
+ 'central_tendency': stats.get('mean'),
264
+ 'dispersion': stats.get('std'),
265
+ 'range_in_std_units': round(std_ranges, 2) if std_ranges else None,
266
+ 'sample_size': stats.get('n')
267
+ }
268
+ except Exception as e:
269
+ if variable:
270
+ summary['distribution_characteristics'] = {'error': str(e)}
271
+
272
+ try:
273
+ if 'kernel_comparison' in kde_results and kde_results['kernel_comparison']:
274
+ summary['best_fit']['nonparametric'] = {
275
+ 'method': 'kernel_density',
276
+ 'best_kernel': kde_results['kernel_comparison'].get('best_kernel')
277
+ }
278
+ except Exception as e:
279
+ summary['best_fit']['nonparametric'] = {'error': str(e)}
280
+
281
+ try:
282
+ if 'parametric_comparison' in kde_results and kde_results['parametric_comparison']:
283
+ best_dist = kde_results['parametric_comparison'][0]
284
+ kl_div = best_dist.get('kl_divergence', None)
285
+ summary['best_fit']['parametric'] = {
286
+ 'distribution': best_dist.get('distribution'),
287
+ 'kl_divergence': round(kl_div, 4) if kl_div else None
288
+ }
289
+ if kl_div is not None:
290
+ if kl_div < 0.05:
291
+ fit_quality = "excellent"
292
+ elif kl_div < 0.1:
293
+ fit_quality = "good"
294
+ elif kl_div < 0.3:
295
+ fit_quality = "moderate"
296
+ else:
297
+ fit_quality = "poor"
298
+ summary['best_fit']['parametric']['fit_quality'] = fit_quality
299
+ except Exception as e:
300
+ summary['best_fit']['parametric'] = {'error': str(e)}
301
+
302
+ try:
303
+ if 'bandwidth_selection' in kde_results:
304
+ bw = kde_results['bandwidth_selection']
305
+ selected = bw.get('selected_bandwidth')
306
+ method = next((k for k, v in bw.items()
307
+ if v == selected and k != 'selected_bandwidth'), None)
308
+ summary['technical_details']['bandwidth'] = {
309
+ 'selected': selected,
310
+ 'selection_method': method
311
+ }
312
+ except Exception as e:
313
+ summary['technical_details']['bandwidth'] = {'error': str(e)}
314
+
315
+ try:
316
+ if 'basic_stats' in kde_results and 'bandwidth_selection' in kde_results:
317
+ range_width = kde_results['basic_stats'].get('max', 0) - kde_results['basic_stats'].get('min', 0)
318
+ selected_bw = kde_results['bandwidth_selection'].get('selected_bandwidth', 0)
319
+ if selected_bw and range_width:
320
+ relative_bw = selected_bw / range_width
321
+ if relative_bw < 0.03:
322
+ multimodality = "highly likely"
323
+ elif relative_bw < 0.06:
324
+ multimodality = "possible"
325
+ else:
326
+ multimodality = "unlikely"
327
+ summary['distribution_characteristics']['multimodality'] = multimodality
328
+ except Exception as e:
329
+ summary['distribution_characteristics']['multimodality_error'] = str(e)
330
+
331
+ try:
332
+ recommendations = []
333
+ if 'basic_stats' in kde_results and kde_results['basic_stats'].get('n', 0) < 30:
334
+ recommendations.append("Sample size is small; interpret results with caution")
335
+ if ('basic_stats' in kde_results and
336
+ abs(kde_results['basic_stats'].get('skewness', 0)) > 1.5 or
337
+ abs(kde_results['basic_stats'].get('kurtosis', 0)) > 2):
338
+ recommendations.append("Distribution has extreme values; consider transformation or robust methods")
339
+ if ('parametric_comparison' in kde_results and kde_results['parametric_comparison'] and
340
+ kde_results['parametric_comparison'][0].get('kl_divergence', 1) < 0.1):
341
+ best_dist = kde_results['parametric_comparison'][0]['distribution']
342
+ recommendations.append(f"Consider using {best_dist} distribution for parametric modeling")
343
+ summary['recommendations'] = recommendations
344
+ except Exception as e:
345
+ summary['recommendations'] = [f"Failed to generate recommendations: {str(e)}"]
346
+
347
+ return summary
348
+
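
For reference, the KL-divergence thresholds above translate into labels as follows. The sketch below is a standalone rendering of that mapping; the helper name and the example value are illustrative and not part of this diff.

```python
# Standalone rendering of the KL-divergence -> fit-quality mapping above.
# `classify_fit_quality` is an illustrative helper, not part of the module.
def classify_fit_quality(kl_div: float) -> str:
    if kl_div < 0.05:
        return "excellent"
    if kl_div < 0.1:
        return "good"
    if kl_div < 0.3:
        return "moderate"
    return "poor"

print(classify_fit_quality(0.07))  # -> good
```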
349
+ def run_normality_tests(self, variable, verbose=False):
350
+ """Run battery of normality tests"""
351
+ try:
352
+ data = self.data[variable].dropna()
353
+ sample_size = len(data)
354
+
355
+ if len(self.data) > 10000:
356
+ sample_size = 1000
357
+ elif 5000 < len(self.data) <= 10000:
358
+ sample_size = int(0.2 * len(self.data))
359
+ else:
360
+ sample_size = len(self.data)
361
+
362
+ if sample_size < len(data):
363
+ data = data.sample(n=sample_size, random_state=42)
364
+
365
+ normality_tests = {}
366
+
367
+ try:
368
+ if sample_size < 30:
369
+ normality_tests['shapiro_wilk'] = self.shapiro_wilk_normality_test(data)
370
+ normality_tests['dagostino_pearson'] = self.dagostino_pearson_normality_test(data)
371
+ elif 30 <= sample_size <= 2000:
372
+ normality_tests['shapiro_wilk'] = self.shapiro_wilk_normality_test(data)
373
+ normality_tests['dagostino_pearson'] = self.dagostino_pearson_normality_test(data)
374
+ normality_tests['anderson_darling'] = self.anderson_darling_normality_test(data)
375
+ elif sample_size > 2000:
376
+ normality_tests['dagostino_pearson'] = self.dagostino_pearson_normality_test(data)
377
+ normality_tests['anderson_darling'] = self.anderson_darling_normality_test(data)
378
+ normality_tests['kolmogorov_smirnov'] = self.kolmogorov_smirnov_normality_test(data)
379
+ except Exception as e:
380
+ logger.error(f"Error running normality tests for {variable}: {str(e)}", log_type="data_quality_assessment", console=verbose)
381
+ normality_tests['error'] = str(e)
382
+
383
+ normality_tests['summary'] = self.generate_normality_test_summary(normality_tests, variable)
384
+
385
+ return normality_tests
386
+
387
+ except Exception as e:
388
+ logger.error(f"Error processing variable {variable}: {str(e)}", log_type="data_quality_assessment", console=verbose)
389
+ return {'error': str(e)}
390
+
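
The sample-size-driven battery above can be exercised standalone with SciPy roughly as follows; the synthetic sample is illustrative and the cut-offs simply mirror the method.

```python
import numpy as np
from scipy.stats import shapiro, normaltest, anderson, kstest, norm

rng = np.random.default_rng(42)
data = rng.normal(size=500)  # illustrative sample

tests = {}
n = len(data)
if n < 30:
    tests["shapiro_wilk"] = shapiro(data)
    tests["dagostino_pearson"] = normaltest(data)
elif n <= 2000:
    tests["shapiro_wilk"] = shapiro(data)
    tests["dagostino_pearson"] = normaltest(data)
    tests["anderson_darling"] = anderson(data, dist="norm")
else:
    mu, std = norm.fit(data)
    tests["dagostino_pearson"] = normaltest(data)
    tests["anderson_darling"] = anderson(data, dist="norm")
    tests["kolmogorov_smirnov"] = kstest(data, "norm", args=(mu, std))

for name, result in tests.items():
    print(name, result)
```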
391
+ def shapiro_wilk_normality_test(self, data):
392
+ """Perform Shapiro-Wilk test for normality"""
393
+ try:
394
+ stat, p = shapiro(data)
395
+ return {
396
+ 'test_statistic': stat,
397
+ 'p_value': p,
398
+ 'interpretation': 'Data looks normally distributed (fail to reject H0)' if p > 0.05 else 'Data does not look normally distributed (reject H0)'
399
+ }
400
+ except Exception as e:
401
+ return {
402
+ 'error': f"Shapiro-Wilk test failed: {str(e)}",
403
+ 'test_statistic': None,
404
+ 'p_value': None
405
+ }
406
+
407
+ def dagostino_pearson_normality_test(self, data):
408
+ """Perform D'Agostino-Pearson test for normality"""
409
+ try:
410
+ stat, p = normaltest(data)
411
+ return {
412
+ 'test_statistic': stat,
413
+ 'p_value': p,
414
+ 'interpretation': 'Data looks normally distributed (fail to reject H0)' if p > 0.05 else 'Data does not look normally distributed (reject H0)'
415
+ }
416
+ except Exception as e:
417
+ return {
418
+ 'error': f"D'Agostino-Pearson test failed: {str(e)}",
419
+ 'test_statistic': None,
420
+ 'p_value': None
421
+ }
422
+
423
+ def anderson_darling_normality_test(self, data):
424
+ """Perform Anderson-Darling test for normality"""
425
+ try:
426
+ result = anderson(data, dist='norm')
427
+ return {
428
+ 'test_statistic': result.statistic,
429
+ 'critical_values': result.critical_values,
430
+ 'significance_levels': result.significance_level,
431
+ 'interpretation': 'Data looks normally distributed at 5% level' if result.statistic < result.critical_values[2] else 'Data does not look normally distributed at 5% level'
432
+ }
433
+ except Exception as e:
434
+ return {
435
+ 'error': f"Anderson-Darling test failed: {str(e)}",
436
+ 'test_statistic': None,
437
+ 'critical_values': None
438
+ }
439
+
440
+ def kolmogorov_smirnov_normality_test(self, data):
441
+ """Perform Kolmogorov-Smirnov test for normality"""
442
+ try:
443
+ mu, std = norm.fit(data)
444
+
445
+ stat, p = kstest(data, 'norm', args=(mu, std))
446
+
447
+ return {
448
+ 'test_statistic': stat,
449
+ 'p_value': p,
450
+ 'interpretation': 'Data looks normally distributed (fail to reject H0)' if p > 0.05 else 'Data does not look normally distributed (reject H0)'
451
+ }
452
+ except Exception as e:
453
+ return {
454
+ 'error': f"Kolmogorov-Smirnov test failed: {str(e)}",
455
+ 'test_statistic': None,
456
+ 'p_value': None
457
+ }
458
+
459
+ def generate_normality_test_summary(self, normality_tests_results, variable_name):
460
+ """Generate a structured and insightful summary of normality test results"""
461
+ try:
462
+ if 'error' in normality_tests_results:
463
+ return {
464
+ 'variable': variable_name,
465
+ 'normality_assessment': 'Error in analysis',
466
+ 'error_message': normality_tests_results['error'],
467
+ 'recommendation': 'Check the data quality or transform the variable'
468
+ }
469
+ except Exception as e:
470
+ return {
471
+ 'variable': variable_name,
472
+ 'normality_assessment': 'Error in checking error flag',
473
+ 'error_message': str(e),
474
+ 'recommendation': 'Ensure error key structure is valid'
475
+ }
476
+
477
+ total_tests = 0
478
+ normal_tests = 0
479
+ tests_performed = []
480
+ test_details = {}
481
+
482
+ try:
483
+ for test_name, result in normality_tests_results.items():
484
+ try:
485
+ if 'error' in result:
486
+ test_details[test_name] = {
487
+ 'status': 'Failed',
488
+ 'reason': result['error']
489
+ }
490
+ continue
491
+
492
+ total_tests += 1
493
+ tests_performed.append(test_name)
494
+
495
+ is_normal = False
496
+
497
+ if test_name in ['shapiro_wilk', 'dagostino_pearson', 'kolmogorov_smirnov']:
498
+ is_normal = result.get('p_value', 0) > 0.05
499
+ test_details[test_name] = {
500
+ 'test_statistic': result.get('test_statistic'),
501
+ 'p_value': result.get('p_value'),
502
+ 'suggests_normality': is_normal
503
+ }
504
+
505
+ elif test_name == 'anderson_darling':
506
+ is_normal = result.get('test_statistic', float('inf')) < result.get('critical_values', [0, 0, 0])[2]
507
+ test_details[test_name] = {
508
+ 'test_statistic': result.get('test_statistic'),
509
+ 'critical_value_5pct': result.get('critical_values', [0, 0, 0])[2],
510
+ 'suggests_normality': is_normal
511
+ }
512
+
513
+ if is_normal:
514
+ normal_tests += 1
515
+
516
+ except Exception as e:
517
+ test_details[test_name] = {
518
+ 'status': 'Failed',
519
+ 'reason': f"Exception during processing: {str(e)}"
520
+ }
521
+ except Exception as e:
522
+ return {
523
+ 'variable': variable_name,
524
+ 'normality_assessment': 'Error in parsing test results',
525
+ 'error_message': str(e),
526
+ 'recommendation': 'Ensure the test result structure is consistent'
527
+ }
528
+
529
+ try:
530
+ normality_score = normal_tests / total_tests if total_tests > 0 else 0
531
+
532
+ if normality_score >= 0.75:
533
+ normality_assessment = 'Normal'
534
+ confidence = 'High'
535
+ recommendation = 'Proceed with parametric tests'
536
+ elif normality_score >= 0.5:
537
+ normality_assessment = 'Likely Normal'
538
+ confidence = 'Moderate'
539
+ recommendation = 'Consider parametric tests, but verify with visual inspection'
540
+ elif normality_score >= 0.25:
541
+ normality_assessment = 'Likely Non-Normal'
542
+ confidence = 'Moderate'
543
+ recommendation = 'Consider non-parametric alternatives or data transformation'
544
+ else:
545
+ normality_assessment = 'Non-Normal'
546
+ confidence = 'High'
547
+ recommendation = 'Use non-parametric tests or transform the data'
548
+ except Exception as e:
549
+ return {
550
+ 'variable': variable_name,
551
+ 'normality_assessment': 'Error in calculating summary statistics',
552
+ 'error_message': str(e),
553
+ 'recommendation': 'Check calculations or inputs for division errors'
554
+ }
555
+
556
+ try:
557
+ summary = {
558
+ 'variable': variable_name,
559
+ 'normality_assessment': normality_assessment,
560
+ 'confidence': confidence,
561
+ 'normal_tests_ratio': f"{normal_tests}/{total_tests}",
562
+ 'normality_score': normality_score,
563
+ 'tests_performed': tests_performed,
564
+ 'test_details': test_details,
565
+ 'recommendation': recommendation
566
+ }
567
+ except Exception as e:
568
+ return {
569
+ 'variable': variable_name,
570
+ 'normality_assessment': 'Error in creating summary',
571
+ 'error_message': str(e),
572
+ 'recommendation': 'Inspect result structure and try again'
573
+ }
574
+
575
+ return summary
576
+
577
+ def fit_distributions(self, variable, verbose=False):
578
+ """Fit common distributions and assess goodness-of-fit"""
579
+ pass
580
+
581
+ def test_modality(self, variable, verbose=False):
582
+ """ Test for unimodal vs multimodal distributions using Hartigan's Dip Test"""
583
+ try:
584
+ x = np.asarray(variable).flatten()
585
+
586
+ x = x[~np.isnan(x)]
587
+
588
+ if len(x) <= 3:
589
+ if verbose:
590
+ logger.info("Warning: Too few data points for reliable modality testing.", log_type='univariate_analysis', console=verbose)
591
+ return {
592
+ 'dip_statistic': None,
593
+ 'p_value': None,
594
+ 'conclusion': "Not enough data points for reliable testing (minimum 4 required)",
595
+ 'is_multimodal': None,
596
+ 'sample_size': len(x)
597
+ }
598
+
599
+ dip, p_value = self.hartigan_dip_test(x)
600
+
601
+ alpha = 0.05
602
+ if p_value < alpha:
603
+ conclusion = "Likely multimodal distribution (reject unimodality)"
604
+ is_multimodal = True
605
+ else:
606
+ conclusion = "Likely unimodal distribution (fail to reject unimodality)"
607
+ is_multimodal = False
608
+
609
+ results = {
610
+ 'dip_statistic': dip,
611
+ 'p_value': p_value,
612
+ 'conclusion': conclusion,
613
+ 'is_multimodal': is_multimodal,
614
+ 'sample_size': len(x)
615
+ }
616
+
617
+ results['summary'] = self.summarize_modality_test(results)
618
+
619
+ return results
620
+
621
+ except Exception as e:
622
+ logger.error(f"Failed to run modality test with error: {e}", log_type="univariate_analysis", console=verbose)
623
+ return {"error": str(e)}
624
+
625
+ def hartigan_dip_test(self, x: np.ndarray) -> Tuple[float, float]:
626
+ """Implement Hartigan's dip test for unimodality """
627
+
628
+ x = np.sort(x)
629
+ n = len(x)
630
+
631
+ ecdf = np.arange(1, n + 1) / n
632
+
633
+ dip = self.calculate_dip_statistic(x, ecdf)
634
+
635
+ p_value = self.calculate_dip_pvalue(dip, n)
636
+
637
+ return dip, p_value
638
+
639
+ def calculate_dip_statistic(self, x: np.ndarray, ecdf: np.ndarray) -> float:
640
+ """Calculate the Hartigan's dip statistic """
641
+
642
+ n = len(x)
643
+ gcm = self.compute_gcm(x, ecdf)
644
+
645
+ lcm = self.compute_lcm(x, ecdf)
646
+
647
+ diffs = np.maximum(
648
+ np.abs(ecdf - gcm),
649
+ np.abs(ecdf - lcm)
650
+ )
651
+
652
+ dip = np.max(diffs)
653
+
654
+ return dip
655
+
656
+ def compute_gcm(self, x: np.ndarray, ecdf: np.ndarray) -> np.ndarray:
657
+ """Compute the greatest convex minorant (GCM) of the empirical CDF"""
658
+ n = len(x)
659
+ gcm = np.zeros(n)
660
+
661
+ gcm[0] = ecdf[0]
662
+
663
+ for i in range(1, n):
664
+ slopes = (ecdf[i] - gcm[:i]) / (x[i] - x[:i])
665
+ max_slope_idx = np.argmax(slopes)
666
+ gcm[i] = gcm[max_slope_idx] + slopes[max_slope_idx] * (x[i] - x[max_slope_idx])
667
+
668
+ gcm[i] = min(gcm[i], ecdf[i])
669
+
670
+ return gcm
671
+
672
+ def compute_lcm(self, x: np.ndarray, ecdf: np.ndarray) -> np.ndarray:
673
+ """Compute the least concave majorant (LCM) of the empirical CDF"""
674
+ n = len(x)
675
+ lcm = np.zeros(n)
676
+
677
+ lcm[-1] = ecdf[-1]
678
+
679
+ for i in range(n-2, -1, -1):
680
+ slopes = (lcm[i+1:] - ecdf[i]) / (x[i+1:] - x[i])
681
+ min_slope_idx = np.argmin(slopes) + i + 1
682
+ lcm[i] = ecdf[i] + slopes[min_slope_idx - i - 1] * (x[min_slope_idx] - x[i])
683
+
684
+ lcm[i] = max(lcm[i], ecdf[i])
685
+
686
+ return lcm
687
+
688
+ def calculate_dip_pvalue(self, dip: float, n: int) -> float:
689
+ """Calculate the p-value for the dip statistic"""
690
+ adjusted_dip = dip * (np.sqrt(n) + 0.12 + 0.11 / np.sqrt(n))
691
+
692
+ if adjusted_dip < 0.01:
693
+ p_value = 1.0
694
+ elif adjusted_dip < 0.7:
695
+ p_value = np.exp(-4.0 * adjusted_dip * adjusted_dip)
696
+ elif adjusted_dip < 0.9:
697
+ p_value = 0.15
698
+ elif adjusted_dip < 1.0:
699
+ p_value = 0.10
700
+ elif adjusted_dip < 1.1:
701
+ p_value = 0.05
702
+ elif adjusted_dip < 1.2:
703
+ p_value = 0.025
704
+ elif adjusted_dip < 1.3:
705
+ p_value = 0.01
706
+ elif adjusted_dip < 1.5:
707
+ p_value = 0.005
708
+ else:
709
+ p_value = 0.001
710
+
711
+ return p_value
712
+
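
A worked example of the approximation above (the numbers are illustrative): for dip = 0.05 and n = 200 the adjusted statistic is about 0.713, which lands in the 0.7 to 0.9 band and is reported as p of roughly 0.15.

```python
import numpy as np

dip, n = 0.05, 200  # illustrative values
adjusted = dip * (np.sqrt(n) + 0.12 + 0.11 / np.sqrt(n))
print(round(adjusted, 3))  # ~0.713 -> 0.7 <= adjusted < 0.9 -> p ~= 0.15
```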
713
+ def summarize_modality_test(self, test_results: Dict) -> Dict:
714
+ """Generate a precise and insightful summary of modality test results"""
715
+
716
+ if test_results.get('dip_statistic') is None:
717
+ return {
718
+ 'summary': "Insufficient data for modality testing.",
719
+ 'recommendation': "Collect more data points (minimum 4 required).",
720
+ 'reliability': "Not applicable"
721
+ }
722
+
723
+ dip = test_results['dip_statistic']
724
+ p_value = test_results['p_value']
725
+ n = test_results['sample_size']
726
+ is_multimodal = test_results['is_multimodal']
727
+
728
+ confidence_level = 1 - p_value
729
+
730
+ if p_value < 0.001:
731
+ strength = "Very strong"
732
+ reliability = "High"
733
+ elif p_value < 0.01:
734
+ strength = "Strong"
735
+ reliability = "High"
736
+ elif p_value < 0.05:
737
+ strength = "Moderate"
738
+ reliability = "Moderate"
739
+ elif p_value < 0.1:
740
+ strength = "Weak"
741
+ reliability = "Low"
742
+ else:
743
+ strength = "Very weak"
744
+ reliability = "Low"
745
+
746
+ if n < 20:
747
+ reliability = "Low (small sample size)"
748
+ elif n < 50:
749
+ reliability = reliability + " (moderate sample size)"
750
+ else:
751
+ reliability = reliability + " (large sample size)"
752
+
753
+ if is_multimodal:
754
+ conclusion = f"{strength} evidence of multimodality (dip={dip:.6f}, p={p_value:.6f})"
755
+ recommendation = "Consider mixture modeling or clustering approaches."
756
+ else:
757
+ conclusion = f"{strength} evidence for unimodality (dip={dip:.6f}, p={p_value:.6f})"
758
+ recommendation = "Parametric distributional analyses may be appropriate."
759
+
760
+ return {
761
+ 'summary': conclusion,
762
+ 'dip_statistic_interpretation': f"Dip statistic of {dip:.6f} indicates " +
763
+ ("significant" if is_multimodal else "non-significant") +
764
+ " deviation from unimodality.",
765
+ 'confidence_level': f"{confidence_level:.1%}",
766
+ 'evidence_strength': strength,
767
+ 'reliability': reliability,
768
+ 'recommendation': recommendation,
769
+ 'sample_size_adequacy': "Adequate" if n >= 50 else "Limited" if n >= 20 else "Inadequate",
770
+ 'technical_details': {
771
+ 'dip_statistic': dip,
772
+ 'p_value': p_value,
773
+ 'sample_size': n,
774
+ 'critical_alpha': 0.05,
775
+ 'null_hypothesis': "The distribution is unimodal",
776
+ 'alternative_hypothesis': "The distribution is multimodal"
777
+ }
778
+ }
779
+
780
+ def calculate_shape_metrics(self, continuous_cols: List[str] = None, verbose=False):
781
+ """Calculate shape characteristics for all continuous variables"""
782
+ logger.info("Starting to calculate shape metrics..", log_type='univariate_analysis', console=verbose)
783
+ results = {
784
+ "skewness": {},
785
+ "kurtosis": {},
786
+ "l_moments": {},
787
+ "entropy": {},
788
+ "tail_weight": {},
789
+ "zero_variance": {},
790
+ "summary": {}
791
+ }
792
+
793
+ if continuous_cols is None:
794
+ try:
795
+ continuous_cols = self.data.select_dtypes(include=['number']).columns.tolist()
796
+ except Exception as e:
797
+ logger.error(f"Error detecting continuous columns: {str(e)}", log_type="data_quality_assessment", console=verbose)
798
+ continuous_cols = []
799
+
800
+ if not continuous_cols:
801
+ return {"error": "No continuous columns found or provided"}
802
+
803
+ for col in continuous_cols:
804
+ try:
805
+ values = self.data[col].dropna().values
806
+
807
+ if len(values) == 0:
808
+ results["summary"][col] = "All values are NA"
809
+ continue
810
+
811
+ col_results = {}
812
+
813
+ try:
814
+ skewness = stats.skew(values)
815
+ skewness_interpretation = self.interpret_skewness(skewness)
816
+ results["skewness"][col] = {
817
+ "value": float(skewness),
818
+ "interpretation": skewness_interpretation
819
+ }
820
+ col_results["skewness"] = skewness_interpretation
821
+ except Exception as e:
822
+ results["skewness"][col] = {"error": str(e)}
823
+ col_results["skewness"] = "Error calculating"
824
+
825
+ try:
826
+ kurtosis = stats.kurtosis(values)
827
+ kurtosis_interpretation = self.interpret_kurtosis(kurtosis)
828
+ results["kurtosis"][col] = {
829
+ "value": float(kurtosis),
830
+ "interpretation": kurtosis_interpretation
831
+ }
832
+ col_results["kurtosis"] = kurtosis_interpretation
833
+ except Exception as e:
834
+ results["kurtosis"][col] = {"error": str(e)}
835
+ col_results["kurtosis"] = "Error calculating"
836
+
837
+ try:
838
+ l_moments = self.calculate_l_moments(values)
839
+ results["l_moments"][col] = l_moments
840
+ col_results["l_moments"] = f"L-CV: {l_moments['l_cv']:.3f}, L-skewness: {l_moments['l_skewness']:.3f}, L-kurtosis: {l_moments['l_kurtosis']:.3f}"
841
+ except Exception as e:
842
+ results["l_moments"][col] = {"error": str(e)}
843
+ col_results["l_moments"] = "Error calculating"
844
+
845
+ try:
846
+ entropy_values = self.calculate_entropy(values)
847
+ results["entropy"][col] = entropy_values
848
+ col_results["entropy"] = f"Shannon: {entropy_values['shannon']:.3f}, Differential: {entropy_values['differential']:.3f}"
849
+ except Exception as e:
850
+ results["entropy"][col] = {"error": str(e)}
851
+ col_results["entropy"] = "Error calculating"
852
+
853
+ try:
854
+ tail_weights = self.assess_tail_weight(values)
855
+ results["tail_weight"][col] = tail_weights
856
+ col_results["tail_weight"] = f"Left: {tail_weights['left_tail']:.3f}, Right: {tail_weights['right_tail']:.3f}"
857
+ except Exception as e:
858
+ results["tail_weight"][col] = {"error": str(e)}
859
+ col_results["tail_weight"] = "Error calculating"
860
+
861
+ try:
862
+ zero_var_results = self.test_zero_variance(values)
863
+ results["zero_variance"][col] = zero_var_results
864
+ if zero_var_results["is_near_zero_variance"]:
865
+ col_results["zero_variance"] = "Near-zero variance detected"
866
+ else:
867
+ col_results["zero_variance"] = "Variable has sufficient variance"
868
+ except Exception as e:
869
+ results["zero_variance"][col] = {"error": str(e)}
870
+ col_results["zero_variance"] = "Error calculating"
871
+
872
+ results["summary"][col] = self.generate_column_summary(col, col_results)
873
+
874
+ except Exception as e:
875
+ results["summary"][col] = f"Error processing column {col}: {str(e)}"
876
+ if verbose:
877
+ print(f"Error processing column {col}: {str(e)}")
878
+
879
+ results["overall_summary"] = self.generate_overall_summary(results)
880
+
881
+ return results
882
+
883
+ def interpret_skewness(self, skewness: float) -> str:
884
+ """Interpret the skewness value."""
885
+ if abs(skewness) < 0.5:
886
+ return "Approximately symmetric"
887
+ elif skewness < -1:
888
+ return "Highly negatively skewed"
889
+ elif skewness < -0.5:
890
+ return "Moderately negatively skewed"
891
+ elif skewness > 1:
892
+ return "Highly positively skewed"
893
+ else: # skewness > 0.5
894
+ return "Moderately positively skewed"
895
+
896
+ def interpret_kurtosis(self, kurtosis: float) -> str:
897
+ """Interpret the kurtosis value."""
898
+ if abs(kurtosis) < 0.5:
899
+ return "Approximately mesokurtic (normal-like tails)"
900
+ elif kurtosis < -0.5:
901
+ return "Platykurtic (thinner tails than normal)"
902
+ else: # kurtosis > 0.5
903
+ return "Leptokurtic (heavier tails than normal)"
904
+
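
The two interpreters above can be sanity-checked against SciPy on a right-skewed sample; the sketch below uses an illustrative lognormal draw and the same thresholds.

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
values = rng.lognormal(mean=0.0, sigma=1.0, size=2000)  # right-skewed sample

skewness = stats.skew(values)
kurtosis = stats.kurtosis(values)  # excess (Fisher) kurtosis, as above
print(f"skewness={skewness:.2f}, kurtosis={kurtosis:.2f}")
# A lognormal(0, 1) draw typically falls in the "Highly positively skewed"
# and "Leptokurtic" buckets of interpret_skewness / interpret_kurtosis.
```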
905
+ def calculate_l_moments(self, values: np.ndarray) -> Dict:
906
+ """Calculate L-moments for a sample."""
907
+ try:
908
+ lmoms = lm.lmom_ratios(values, nmom=5)
909
+ return {
910
+ "l1": float(lmoms[0]), # L-mean
911
+ "l2": float(lmoms[1]), # L-scale
912
+ "l_cv": float(lmoms[1] / lmoms[0]) if lmoms[0] != 0 else float('nan'), # L-coefficient of variation
913
+ "l_skewness": float(lmoms[2]), # L-skewness
914
+ "l_kurtosis": float(lmoms[3]), # L-kurtosis
915
+ "tau5": float(lmoms[4]) if len(lmoms) > 4 else float('nan') # 5th L-moment ratio
916
+ }
917
+ except Exception:
918
+ try:
919
+ sorted_data = np.sort(values)
920
+ n = len(sorted_data)
921
+
922
+ l1 = np.mean(sorted_data)
923
+
924
+ cumsum = np.cumsum(sorted_data)
925
+ indices = np.arange(n)
926
+ l2 = np.mean((indices / (n - 1)) * sorted_data - cumsum / (n - 1))
927
+
928
+ return {
929
+ "l1": float(l1),
930
+ "l2": float(l2),
931
+ "l_cv": float(l2 / l1) if l1 != 0 else float('nan'),
932
+ "l_skewness": float('nan'),
933
+ "l_kurtosis": float('nan'),
934
+ "tau5": float('nan')
935
+ }
936
+ except Exception as e:
937
+ logger.error(f"Error calculating L-moments: {str(e)}", log_type="data_quality_assessment", console=True)
938
+
939
+ def calculate_entropy(self, values: np.ndarray) -> Dict:
940
+ """Calculate entropy measures for a sample."""
941
+ try:
942
+ hist, bin_edges = np.histogram(values, bins='auto', density=True)
943
+ hist = hist[hist > 0]
944
+ shannon_entropy = -np.sum(hist * np.log(hist)) * (bin_edges[1] - bin_edges[0])
945
+
946
+ n = len(values)
947
+ std = np.std(values)
948
+ diff_entropy = 0.5 * np.log(2 * np.pi * np.e * std**2) if std > 0 else float('-inf')
949
+
950
+ return {
951
+ "shannon": float(shannon_entropy),
952
+ "differential": float(diff_entropy),
953
+ "normalized_shannon": float(shannon_entropy / np.log(len(hist))) if len(hist) > 1 else 0.0
954
+ }
955
+ except Exception as e:
956
+ logger.error(f"Error calculating entropy: {str(e)}", log_type="data_quality_assessment", console=True)
957
+
958
+ def assess_tail_weight(self, values: np.ndarray) -> Dict:
959
+ """Assess tail weight of a distribution."""
960
+ try:
961
+ q = np.quantile(values, [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])
962
+
963
+ iqr = q[4] - q[2] # 75th - 25th
964
+
965
+ left_tail = (q[2] - q[0]) / iqr if iqr > 0 else float('inf')
966
+ right_tail = (q[6] - q[4]) / iqr if iqr > 0 else float('inf')
967
+
968
+ left_sorted = np.sort(values - np.min(values) + 1e-10)
969
+ left_sorted = left_sorted[:int(0.2 * len(left_sorted))]
970
+ left_tail_index = 1 / np.mean(np.log(left_sorted[-1] / left_sorted)) if len(left_sorted) > 1 else float('nan')
971
+
972
+ right_sorted = np.sort(-values + np.max(values) + 1e-10)
973
+ right_sorted = right_sorted[:int(0.2 * len(right_sorted))]
974
+ right_tail_index = 1 / np.mean(np.log(right_sorted[-1] / right_sorted)) if len(right_sorted) > 1 else float('nan')
975
+
976
+ interpretation = "Symmetric tails"
977
+ if abs(left_tail - right_tail) > 0.5:
978
+ if left_tail > right_tail:
979
+ interpretation = "Heavier left tail"
980
+ else:
981
+ interpretation = "Heavier right tail"
982
+
983
+ return {
984
+ "left_tail": float(left_tail),
985
+ "right_tail": float(right_tail),
986
+ "left_tail_index": float(left_tail_index),
987
+ "right_tail_index": float(right_tail_index),
988
+ "interpretation": interpretation
989
+ }
990
+ except Exception as e:
991
+ logger.error(f"Error assessing tail weight: {str(e)}", log_type="data_quality_assessment", console=True)
992
+
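
As a quick check of the quantile-ratio idea above, an exponential sample should come out with a clearly heavier right tail; the draw below is illustrative.

```python
import numpy as np

rng = np.random.default_rng(1)
values = rng.exponential(scale=1.0, size=10_000)

q01, q25, q75, q99 = np.quantile(values, [0.01, 0.25, 0.75, 0.99])
iqr = q75 - q25
left_tail = (q25 - q01) / iqr    # ~0.25 for an exponential sample
right_tail = (q99 - q75) / iqr   # ~2.9 for an exponential sample
print(round(left_tail, 2), round(right_tail, 2))
# The difference exceeds 0.5, so assess_tail_weight reports "Heavier right tail".
```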
993
+ def test_zero_variance(self, values: np.ndarray) -> Dict:
994
+ """Test for zero or near-zero variance."""
995
+ try:
996
+ n = len(values)
997
+ variance = np.var(values)
998
+
999
+ n_unique = len(np.unique(values))
1000
+ unique_ratio = n_unique / n if n > 0 else 0
1001
+
1002
+ if n_unique > 1:
1003
+ counts = np.bincount(np.digitize(values, np.unique(values)))
1004
+ sorted_counts = np.sort(counts[counts > 0])
1005
+ freq_ratio = sorted_counts[-1] / sorted_counts[-2] if sorted_counts[-2] > 0 else float('inf')
1006
+ else:
1007
+ freq_ratio = float('inf')
1008
+
1009
+ is_zero_variance = variance < 1e-10
1010
+ is_near_zero_variance = (is_zero_variance or
1011
+ unique_ratio < 0.1 or
1012
+ freq_ratio > 20)
1013
+
1014
+ mean = np.mean(values)
1015
+ cv = np.sqrt(variance) / mean if mean != 0 else float('inf')
1016
+
1017
+ return {
1018
+ "variance": float(variance),
1019
+ "unique_ratio": float(unique_ratio),
1020
+ "n_unique": int(n_unique),
1021
+ "freq_ratio": float(freq_ratio),
1022
+ "cv": float(cv),
1023
+ "is_zero_variance": bool(is_zero_variance),
1024
+ "is_near_zero_variance": bool(is_near_zero_variance)
1025
+ }
1026
+ except Exception as e:
1027
+ logger.error(f"Error testing zero variance: {str(e)}", log_type="data_quality_assessment", console=True)
1028
+
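
A small worked example of the near-zero-variance thresholds above (the column is made up): 990 zeros and 10 ones give a unique ratio of 0.002 and a frequency ratio of 99, so both cut-offs fire.

```python
import numpy as np

values = np.array([0.0] * 990 + [1.0] * 10)   # illustrative near-constant column
n_unique = len(np.unique(values))             # 2
unique_ratio = n_unique / len(values)         # 0.002 < 0.1
freq_ratio = 990 / 10                         # 99.0  > 20
print(unique_ratio, freq_ratio)               # flagged as near-zero variance
```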
1029
+ def generate_column_summary(self, col_name: str, col_results: Dict) -> str:
1030
+ """Generate a summary string for a single column."""
1031
+ summary = f"Column '{col_name}': "
1032
+
1033
+ points = []
1034
+
1035
+ if "skewness" in col_results:
1036
+ points.append(f"distribution is {col_results['skewness'].lower()}")
1037
+
1038
+ if "kurtosis" in col_results:
1039
+ points.append(f"{col_results['kurtosis'].lower()}")
1040
+
1041
+ if "tail_weight" in col_results and "tail_weight" in col_results:
1042
+ if "Heavier" in col_results["tail_weight"]:
1043
+ points.append(col_results["tail_weight"])
1044
+
1045
+ if "zero_variance" in col_results and "Near-zero variance" in col_results["zero_variance"]:
1046
+ points.append("WARNING: near-zero variance detected, may not be useful for modeling")
1047
+
1048
+ if points:
1049
+ summary += ", ".join(points)
1050
+ else:
1051
+ summary += "insufficient data to calculate shape metrics"
1052
+
1053
+ return summary
1054
+
1055
+ def generate_overall_summary(self, results: Dict) -> str:
1056
+ """Generate an overall summary of shape metrics for all columns."""
1057
+ summary_lines = []
1058
+
1059
+ skewed_cols = []
1060
+ heavy_tailed_cols = []
1061
+ near_zero_variance_cols = []
1062
+
1063
+ for col, summary in results["summary"].items():
1064
+ if "highly" in summary.lower() and "skew" in summary.lower():
1065
+ skewed_cols.append(col)
1066
+
1067
+ if "leptokurtic" in summary.lower() or "heavier tail" in summary.lower():
1068
+ heavy_tailed_cols.append(col)
1069
+
1070
+ if "zero variance" in summary.lower():
1071
+ near_zero_variance_cols.append(col)
1072
+
1073
+ summary_lines.append(f"Analyzed {len(results['summary'])} continuous variables.")
1074
+
1075
+ if skewed_cols:
1076
+ summary_lines.append(f"Found {len(skewed_cols)} highly skewed variables: {', '.join(skewed_cols[:5])}" +
1077
+ (f" and {len(skewed_cols) - 5} more" if len(skewed_cols) > 5 else ""))
1078
+
1079
+ if heavy_tailed_cols:
1080
+ summary_lines.append(f"Found {len(heavy_tailed_cols)} variables with heavy tails: {', '.join(heavy_tailed_cols[:5])}" +
1081
+ (f" and {len(heavy_tailed_cols) - 5} more" if len(heavy_tailed_cols) > 5 else ""))
1082
+
1083
+ if near_zero_variance_cols:
1084
+ summary_lines.append(f"WARNING: Found {len(near_zero_variance_cols)} variables with near-zero variance which may not be useful for modeling: {', '.join(near_zero_variance_cols[:5])}" +
1085
+ (f" and {len(near_zero_variance_cols) - 5} more" if len(near_zero_variance_cols) > 5 else ""))
1086
+
1087
+ if skewed_cols or heavy_tailed_cols:
1088
+ summary_lines.append("Recommendation: Consider applying transformations (log, Box-Cox, etc.) to heavily skewed or heavy-tailed variables.")
1089
+
1090
+ if near_zero_variance_cols:
1091
+ summary_lines.append("Recommendation: Consider removing or carefully reviewing near-zero variance variables.")
1092
+
1093
+ return "\n".join(summary_lines)
1094
+
1095
+ def detect_outliers(self, verbose=False):
1096
+ """Run multiple outlier detection methods"""
1097
+ pass
1098
+
1099
+ def analyze_categoricals(
1100
+ self,
1101
+ data: pd.DataFrame,
1102
+ categorical_columns: Optional[List[str]] = None,
1103
+ alpha: float = 0.05,
1104
+ rare_threshold: float = 0.05,
1105
+ verbose: bool = False) -> Dict[str, Any]:
1106
+ '''Perform comprehensive analysis on categorical variables in a dataset.'''
1107
+ results = {
1108
+ "frequency_tables": {},
1109
+ "prevalence_rates": {},
1110
+ "rare_categories": {},
1111
+ "entropy": {},
1112
+ "gini_impurity": {},
1113
+ "chi_square_results": {},
1114
+ "simpson_diversity": {},
1115
+ "shannon_diversity": {},
1116
+ "cardinality": {},
1117
+ "errors": [],
1118
+ "summary": ""
1119
+ }
1120
+
1121
+ try:
1122
+ if categorical_columns is None:
1123
+ categorical_columns = list(data.select_dtypes(include=['object', 'category']).columns)
1124
+
1125
+ for col in data.select_dtypes(include=['int64', 'float64']).columns:
1126
+ if data[col].nunique() <= 30 and data[col].nunique() / len(data) < 0.05:
1127
+ categorical_columns.append(col)
1128
+ except Exception as e:
1129
+ error_msg = f"Error identifying categorical columns: {str(e)}"
1130
+ logger.error(error_msg, log_type='univariate_analysis', console=verbose)
1131
+ results["errors"].append(error_msg)
1132
+ categorical_columns = []
1133
+
1134
+ if not categorical_columns:
1135
+ results["summary"] = "No categorical columns identified for analysis."
1136
+ return results
1137
+
1138
+ for col in categorical_columns:
1139
+ try:
1140
+ if col not in data.columns:
1141
+ continue
1142
+
1143
+ missing_rate = data[col].isna().mean()
1144
+ if missing_rate > 0.8:
1145
+ results["errors"].append(f"Column {col} has {missing_rate:.2%} missing values - skipping analysis")
1146
+ continue
1147
+
1148
+ column_data = data[col].dropna()
1149
+
1150
+ if len(column_data) == 0:
1151
+ results["errors"].append(f"Column {col} has no valid values after removing NA - skipping analysis")
1152
+ continue
1153
+
1154
+ try:
1155
+ freq_table = column_data.value_counts(dropna=False).reset_index()
1156
+ freq_table.columns = ['value', 'frequency']
1157
+ freq_table['percentage'] = (freq_table['frequency'] / len(column_data)) * 100
1158
+ # results["frequency_tables"][col] = freq_table.to_dict('records')
1159
+ except Exception as e:
1160
+ error_msg = f"Error calculating frequency table for {col}: {str(e)}"
1161
+ logger.error(error_msg, log_type='univariate_analysis', console=verbose)
1162
+ results["errors"].append(error_msg)
1163
+
1164
+ try:
1165
+ prevalence_rates = {}
1166
+ value_counts = column_data.value_counts(normalize=True)
1167
+ for value, rate in value_counts.items():
1168
+ prevalence_rates[str(value)] = float(rate)
1169
+ # results["prevalence_rates"][col] = prevalence_rates
1170
+ except Exception as e:
1171
+ error_msg = f"Error calculating prevalence rates for {col}: {str(e)}"
1172
+ logger.error(error_msg, log_type='univariate_analysis', console=verbose)
1173
+ results["errors"].append(error_msg)
1174
+
1175
+ try:
1176
+ rare_categories = {}
1177
+ for value, rate in value_counts.items():
1178
+ if rate < rare_threshold:
1179
+ rare_categories[str(value)] = float(rate)
1180
+ results["rare_categories"][col] = rare_categories
1181
+ except Exception as e:
1182
+ error_msg = f"Error detecting rare categories for {col}: {str(e)}"
1183
+ logger.error(error_msg, log_type='univariate_analysis', console=verbose)
1184
+ results["errors"].append(error_msg)
1185
+
1186
+ try:
1187
+ entropy_value = 0
1188
+ for prob in value_counts:
1189
+ if prob > 0:
1190
+ entropy_value -= prob * math.log2(prob)
1191
+ results["entropy"][col] = float(entropy_value)
1192
+ except Exception as e:
1193
+ error_msg = f"Error calculating entropy for {col}: {str(e)}"
1194
+ logger.error(error_msg, log_type='univariate_analysis', console=verbose)
1195
+ results["errors"].append(error_msg)
1196
+
1197
+ try:
1198
+ gini_impurity = 1 - sum(prob ** 2 for prob in value_counts)
1199
+ results["gini_impurity"][col] = float(gini_impurity)
1200
+ except Exception as e:
1201
+ error_msg = f"Error calculating Gini impurity for {col}: {str(e)}"
1202
+ logger.error(error_msg, log_type='univariate_analysis', console=verbose)
1203
+ results["errors"].append(error_msg)
1204
+
1205
+ try:
1206
+
1207
+ observed = column_data.value_counts().values
1208
+ n_categories = len(observed)
1209
+ expected = np.ones(n_categories) * len(column_data) / n_categories
1210
+
1211
+ if n_categories >= 2:
1212
+ chi2_stat, p_value = stats.chisquare(observed, expected)
1213
+ results["chi_square_results"][col] = {
1214
+ "chi2_statistic": float(chi2_stat),
1215
+ "p_value": float(p_value),
1216
+ "reject_null_hypothesis": p_value < alpha,
1217
+ "interpretation": "Distribution is not uniform" if p_value < alpha else "Distribution may be uniform"
1218
+ }
1219
+ else:
1220
+ results["chi_square_results"][col] = {
1221
+ "chi2_statistic": None,
1222
+ "p_value": None,
1223
+ "reject_null_hypothesis": None,
1224
+ "interpretation": "Not enough categories for chi-square test"
1225
+ }
1226
+ except Exception as e:
1227
+ error_msg = f"Error performing chi-square test for {col}: {str(e)}"
1228
+ logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
1229
+ results["errors"].append(error_msg)
1230
+
1231
+ try:
1232
+ simpson_index = sum(prob ** 2 for prob in value_counts)
1233
+ simpson_diversity = 1 - simpson_index
1234
+ results["simpson_diversity"][col] = float(simpson_diversity)
1235
+ except Exception as e:
1236
+ error_msg = f"Error calculating Simpson's diversity index for {col}: {str(e)}"
1237
+ logger.error(error_msg, log_type='univariate_analysis', console=verbose)
1238
+ results["errors"].append(error_msg)
1239
+
1240
+ try:
1241
+ shannon_diversity = -sum(prob * np.log(prob) for prob in value_counts if prob > 0)
1242
+ results["shannon_diversity"][col] = float(shannon_diversity)
1243
+ except Exception as e:
1244
+ error_msg = f"Error calculating Shannon's diversity index for {col}: {str(e)}"
1245
+ logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
1246
+ results["errors"].append(error_msg)
1247
+
1248
+ try:
1249
+ cardinality = len(value_counts)
1250
+ results["cardinality"][col] = {
1251
+ "unique_values": int(cardinality),
1252
+ "total_records": int(len(column_data)),
1253
+ "ratio": float(cardinality / len(column_data))
1254
+ }
1255
+ except Exception as e:
1256
+ error_msg = f"Error performing cardinality analysis for {col}: {str(e)}"
1257
+ logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
1258
+ results["errors"].append(error_msg)
1259
+
1260
+ except Exception as e:
1261
+ error_msg = f"Error analyzing column {col}: {str(e)}"
1262
+ logger.error(error_msg, log_type='univariate_analysis', console=verbose)
1263
+ results["errors"].append(error_msg)
1264
+
1265
+ try:
1266
+ summary_lines = []
1267
+ summary_lines.append(f"Categorical Analysis Summary for {len(categorical_columns)} columns:")
1268
+
1269
+ if results["cardinality"]:
1270
+ high_cardinality = [col for col, data in results["cardinality"].items()
1271
+ if data["ratio"] > 0.5 and data["unique_values"] > 10]
1272
+ if high_cardinality:
1273
+ summary_lines.append(f"- High cardinality columns ({len(high_cardinality)}): {', '.join(high_cardinality)}")
1274
+
1275
+ if results["rare_categories"]:
1276
+ cols_with_rare = [col for col, rare_cats in results["rare_categories"].items() if rare_cats]
1277
+ if cols_with_rare:
1278
+ summary_lines.append(f"- Columns with rare categories ({len(cols_with_rare)}): {', '.join(cols_with_rare)}")
1279
+
1280
+ if results["shannon_diversity"]:
1281
+ high_diversity = [col for col, value in results["shannon_diversity"].items() if value > 2.0]
1282
+ low_diversity = [col for col, value in results["shannon_diversity"].items() if value < 0.5]
1283
+ if high_diversity:
1284
+ summary_lines.append(f"- High diversity columns ({len(high_diversity)}): {', '.join(high_diversity)}")
1285
+ if low_diversity:
1286
+ summary_lines.append(f"- Low diversity columns ({len(low_diversity)}): {', '.join(low_diversity)}")
1287
+
1288
+ if results["chi_square_results"]:
1289
+ non_uniform = [col for col, result in results["chi_square_results"].items()
1290
+ if result.get("reject_null_hypothesis") is True]
1291
+ if non_uniform:
1292
+ summary_lines.append(f"- Columns with non-uniform distributions ({len(non_uniform)}): {', '.join(non_uniform)}")
1293
+
1294
+ if results["errors"]:
1295
+ summary_lines.append(f"- Analysis encountered {len(results['errors'])} errors during processing")
1296
+
1297
+ results["summary"] = "\n".join(summary_lines)
1298
+ except Exception as e:
1299
+ error_msg = f"Error generating summary: {str(e)}"
1300
+ logger.error(error_msg, log_type='univariate_analaysis', console=verbose)
1301
+ results["errors"].append(error_msg)
1302
+ results["summary"] = "Error generating summary."
1303
+
1304
+ return results
1305
+
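
For reference, the per-column diversity metrics computed above reduce to a few lines of pandas/numpy; the Series below is illustrative, and note that Gini impurity and Simpson diversity are the same quantity as implemented here.

```python
import math
import numpy as np
import pandas as pd

col = pd.Series(["a", "a", "a", "b", "b", "c"])   # illustrative categorical column
p = col.value_counts(normalize=True)

entropy = -sum(prob * math.log2(prob) for prob in p if prob > 0)          # bits
gini_impurity = 1 - sum(prob ** 2 for prob in p)
simpson_diversity = 1 - sum(prob ** 2 for prob in p)                      # identical formula
shannon_diversity = -sum(prob * np.log(prob) for prob in p if prob > 0)   # nats
print(round(entropy, 3), round(gini_impurity, 3), round(shannon_diversity, 3))
```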
1306
+ def analyze_all_categoricals(self, verbose=False):
1307
+ """Analyze all categorical variables in the datase"""
1308
+ logger.info("Starting to analyze all categories..", log_type='univariate_analysis', console=verbose)
1309
+ try:
1310
+ if self.data is None:
1311
+ return {"error": "No data loaded", "summary": "Error: No data loaded"}
1312
+
1313
+ categorical_columns = list(self.data.select_dtypes(include=['object', 'category']).columns)
1314
+
1315
+ for col in self.data.select_dtypes(include=['int64', 'float64']).columns:
1316
+ if self.data[col].nunique() <= 30 and self.data[col].nunique() / len(self.data) < 0.05:
1317
+ categorical_columns.append(col)
1318
+
1319
+ results = self.analyze_categoricals(
1320
+ data=self.data,
1321
+ categorical_columns=categorical_columns,
1322
+ verbose=verbose
1323
+ )
1324
+
1325
+ return results
1326
+
1327
+ except Exception as e:
1328
+ error_msg = f"Categorical analysis failed with error: {str(e)}"
1329
+ logger.error(error_msg, log_type="univariate_analysis", console=True)
1330
+ return {"error": error_msg, "summary": f"Error in categorical analysis: {str(e)}"}
1331
+
1332
+ def generate_report_from_agent(self, input) -> str:
1333
+ '''Transform the json output to a user-readable report'''
1334
+ try:
1335
+ input = f"ML Task: {self.ml_task}\n{input}"
1336
+ response: RunResponse = self.writer.run(input, stream=False)
1337
+ return response.content
1338
+ except Exception as e:
1339
+ return f"Failed to generate report with error: {e}"
1340
+
1341
+ def convert_numpy_types(self, obj):
1342
+ if isinstance(obj, dict):
1343
+ return {k: self.convert_numpy_types(v) for k, v in obj.items()}
1344
+ elif isinstance(obj, list):
1345
+ return [self.convert_numpy_types(item) for item in obj]
1346
+ elif isinstance(obj, np.integer):
1347
+ return int(obj)
1348
+ elif isinstance(obj, np.floating):
1349
+ val = float(obj)
1350
+ if np.isnan(val) or np.isinf(val):
1351
+ return None
1352
+ return val
1353
+ elif isinstance(obj, float):
1354
+ if math.isnan(obj) or math.isinf(obj):
1355
+ return None
1356
+ return obj
1357
+ elif isinstance(obj, np.bool_):
1358
+ return bool(obj)
1359
+ elif isinstance(obj, np.ndarray):
1360
+ return self.convert_numpy_types(obj.tolist())
1361
+ else:
1362
+ return obj
1363
+
1364
+
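
convert_numpy_types exists because json.dumps rejects numpy integer scalars and emits the non-standard NaN token for missing floats; a minimal illustration (the payload is made up):

```python
import json
import numpy as np

payload = {"n": np.int64(3), "std": float("nan")}
try:
    json.dumps(payload)   # TypeError: numpy int64 is not JSON serializable
except TypeError as exc:
    print(exc)
# After convert_numpy_types the payload becomes {"n": 3, "std": None},
# which serialises to valid JSON without complaint.
```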
1365
+ def run(self, verbose=False) -> Dict[str, dict]:
1366
+ '''Trigger point of the entire pipeline'''
1367
+ distribution_analysis_results = self.analyze_distributions(verbose=verbose)
1368
+ continuous_vars, _ = self.detect_variable_types()
1369
+ shape_metrics_analysis_results = self.calculate_shape_metrics(continuous_cols=continuous_vars, verbose=verbose)
1370
+ categorical_analysis_results = self.analyze_all_categoricals(verbose=verbose)
1371
+
1372
+ kde_results = {}
1373
+ normality_results = {}
1374
+ kde_summary = {}
1375
+ normality_summary = {}
1376
+
1377
+ for column in distribution_analysis_results:
1378
+ try:
1379
+ kde_results[column] = distribution_analysis_results[column]['kde']
1380
+ normality_results[column] = distribution_analysis_results[column]['normality_tests']
1381
+ kde_summary[column] = distribution_analysis_results[column]['kde']['summary']
1382
+ normality_summary[column] = distribution_analysis_results[column]['normality_tests']['summary']
1383
+ except Exception:
1384
+ continue
1385
+
1386
+ logger.info("Generating final reports....", log_type='univariate_analysis', console=verbose)
1387
+
1388
+ final_result = {
1389
+ "kde_analysis": {
1390
+ 'dict': self.convert_numpy_types(kde_results),
1391
+ 'report': ""
1392
+ },
1393
+ "normality_analysis": {
1394
+ 'dict': self.convert_numpy_types(normality_results),
1395
+ 'report': ""
1396
+ },
1397
+ "shape_metrics_analysis": {
1398
+ "dict": self.convert_numpy_types(shape_metrics_analysis_results),
1399
+ "report": ""
1400
+ },
1401
+ "categorical_analysis": {
1402
+ # "dict": categorical_analysis_results,
1403
+ "report": ""
1404
+ }
1405
+ }
1406
+
1407
+ try:
1408
+ kde_results_str = json.dumps(kde_summary, indent=2, default=str, allow_nan=True)
1409
+ final_result['kde_analysis']['report'] = self.generate_report_from_agent(kde_results_str)
1410
+ except Exception:
1411
+ logger.error("Failed to generate report for kde....", log_type='univariate_analysis', console=verbose)
1412
+ pass
1413
+
1414
+ try:
1415
+ normality_results_str = json.dumps(normality_summary, indent=2, default=str, allow_nan=True)
1416
+ final_result['normality_analysis']['report'] = self.generate_report_from_agent(normality_results_str)
1417
+ except Exception:
1418
+ logger.error("Failed to generate report for normality....", log_type='univariate_analysis', console=verbose)
1419
+ pass
1420
+
1421
+ try:
1422
+ shape_metrics_analysis_results_str = json.dumps(shape_metrics_analysis_results, indent=2, default=str, allow_nan=True)
1423
+ final_result['shape_metrics_analysis']['report'] = self.generate_report_from_agent(shape_metrics_analysis_results_str)
1424
+ except Exception:
1425
+ logger.error("Failed to generate report for shape metrics....", log_type='univariate_analysis', console=verbose)
1426
+ pass
1427
+
1428
+ try:
1429
+ categorical_analysis_results_str = json.dumps(categorical_analysis_results, indent=2, default=str, allow_nan=True)
1430
+ final_result['categorical_analysis']['report'] = self.generate_report_from_agent(categorical_analysis_results_str)
1431
+ except Exception:
1432
+ logger.error("Failed to generate report for categorical analysis....", log_type='univariate_analysis', console=verbose)
1433
+ pass
1434
+
1435
+ return final_result
1436
+
1437
+
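
A hedged sketch of how a caller might consume run(): the enclosing class's name and constructor are not visible in this hunk, so `analyzer` below is a placeholder for an already-constructed instance.

```python
# `analyzer` is assumed to be an instance of the class defined in this file.
results = analyzer.run(verbose=True)
for section in ("kde_analysis", "normality_analysis",
                "shape_metrics_analysis", "categorical_analysis"):
    print(f"## {section}")
    print(results[section]["report"])   # LLM-written report; numeric payloads, where kept, sit under "dict"
```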
src/app/pipelines/task_analysis/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .ml_analysis_workflow import MLAnalysisWorkflow
2
+ from .ml_implementation_planner_workflow import MLImplementationPlannerWorkflow
src/app/pipelines/task_analysis/ml_analysis_workflow.py ADDED
@@ -0,0 +1,202 @@
1
+ from agno.models.openai import OpenAIChat # type: ignore
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from src.core.utils import logger
5
+ from typing import Optional
6
+ from .model import RequirementsAnalysis, TechnicalResearch, ModelResponseStatus
7
+ from agno.agent import Agent, RunResponse # type: ignore
8
+ from agno.tools.duckduckgo import DuckDuckGoTools # type: ignore
9
+ from typing import Iterator, List
10
+ import json
11
+
12
+ load_dotenv()
13
+
14
+ class MLAnalysisWorkflow:
15
+ def __init__(self, user_prompt: str):
16
+ self.user_prompt = user_prompt
17
+ self.llm = OpenAIChat(id="gpt-4o", api_key=os.getenv('OPENAI_API_KEY'))
18
+
19
+ def analyze_requirements(self, verbose=False) -> Optional[RequirementsAnalysis]:
20
+ """Stream requirements analysis"""
21
+
22
+ logger.info("Analyzing requirements...", log_type="pipeline: task_analysis", console=verbose)
23
+
24
+ prompt = f"Analyze this business problem and provide initial technical specifications: {self.user_prompt}"
25
+
26
+ self.requirements_analyst: Agent = Agent(
27
+ name="ML Requirements Analyst",
28
+ model=self.llm,
29
+ description="Expert ML Solutions Architect specialized in analyzing business requirements",
30
+ instructions=[
31
+ "Analyze business problems and translate them into technical ML specifications.",
32
+ "1. Understand the core business problem and objectives",
33
+ "2. Identify the type of ML task required",
34
+ "3. Determine data requirements and constraints",
35
+ "4. List unclear points that need clarification",
36
+ "5. Specify areas that need technical research",
37
+ "Be precise in identifying what information is missing or needs validation."
38
+ ],
39
+ response_model=RequirementsAnalysis,
40
+ structured_outputs=True,
41
+ reasoning=True,
42
+ )
43
+
44
+ analyse_stream = self.requirements_analyst.run(prompt, stream=False)
45
+ return analyse_stream.content
46
+
47
+ def write_requirements_post(self, requirements_results: RequirementsAnalysis, verbose=False) -> Iterator[RunResponse]:
48
+ """
49
+ Write a formatted summary of the requirements analysis.
50
+ :param requirements_results: requirements_analyst response
51
+ :return: iterator for the workflow response
52
+ """
53
+ logger.info("Writing requirements analysis...", log_type="pipeline: task_analysis", console=verbose)
54
+
55
+ writer_input = {"model_response": requirements_results.model_response.model_dump(),
56
+ "unclear_points": requirements_results.unclear_points,
57
+ "search_queries": requirements_results.search_queries,
58
+ "business_understanding": requirements_results.business_understanding
59
+ }
60
+
61
+ # Define the formatting agent locally so this method does not depend on write_research_post having run first.
+ self.writer: Agent = Agent(
+ model=self.llm,
+ instructions=[
+ "You will be provided with lots of structured outputs. Your work is to display this "
+ "in a nicely formatted manner without changing any of the content. Present all the links "
+ "as they are, with explicitly mentioned hyperlinks. Do not change any content."
+ ],
+ markdown=True,
+ )
+
+ model_response = self.writer.run(json.dumps(writer_input, indent=4), stream=True)
62
+
63
+ return model_response
64
+
65
+ def write_research_post(self, research_results: TechnicalResearch, verbose=False) -> Iterator[RunResponse]:
66
+ """
67
+ Write a formatted summary of the technical research findings.
68
+ :param research_results: research content
69
+ :return: iterator for the workflow response
70
+ """
71
+ logger.info("Writing research findings...", log_type="pipeline: task_analysis", console=verbose)
72
+
73
+ writer_input = {"research_findings": research_results.research_findings,
74
+ "reference_implementations": research_results.reference_implementations,
75
+ "sources": research_results.sources
76
+ }
77
+
78
+ self.writer: Agent = Agent(
79
+ model=self.llm,
80
+ instructions=[
81
+ "You will be provided with lots of structured outputs. Your work is to display this"
82
+ "in a nicely formatted manner without changing any of the content. Present all the links"
83
+ "as they are, with explicitly mentioned hyperlinks. Do not change any content."
84
+ ],
85
+ markdown=True,
86
+ )
87
+
88
+ model_response = self.writer.run(json.dumps(writer_input, indent=4), stream=True)
89
+
90
+ return model_response
91
+
92
+ def validate_model_response(self, response: ModelResponseStatus, verbose=False) -> List[str]:
93
+ """Check for missing or incomplete fields in ModelResponseStatus"""
94
+ logger.info("Checking for missing or incomplete fields in ModelResponseStatus...", log_type="pipeline: task_analysis", console=verbose)
95
+
96
+ missing_fields = []
97
+ response_dict = response.model_dump()
98
+
99
+ for field, value in response_dict.items():
100
+ if value == "..." or value == ["..."]:
101
+ missing_fields.append(field)
102
+ elif isinstance(value, list) and not value:
103
+ missing_fields.append(field)
104
+
105
+ return missing_fields
106
+
107
+ def conduct_research(self, research_prompt: str, verbose=False) -> Optional[TechnicalResearch]:
108
+ """Stream technical research"""
109
+ logger.info("Conducting technical research...", log_type="pipeline: task_analysis", console=verbose)
110
+
111
+ self.technical_researcher: Agent = Agent(
112
+ name="ML Technical Researcher",
113
+ model=self.llm,
114
+ description="ML Expert specialized in researching technical implementations",
115
+ tools=[DuckDuckGoTools(search=True, news=False)],
116
+ instructions=[
117
+ "Research and validate technical aspects of ML solutions.",
118
+ "1. Search for similar ML implementations and best practices",
119
+ "2. Find recommended models and architectures",
120
+ "3. Research typical hyperparameters and evaluation metrics",
121
+ "4. Look for implementation constraints and requirements",
122
+ "5. Validate technical feasibility",
123
+ "Provide sources for all technical information.",
124
+ "Focus on recent and reliable technical sources."
125
+ ],
126
+ response_model=TechnicalResearch,
127
+ structured_outputs=True,
128
+ reasoning=True,
129
+ # debug_mode=True,
130
+ )
131
+
132
+ conduct_stream = self.technical_researcher.run(research_prompt)
133
+ return conduct_stream.content
134
+
135
+ def finalize_analysis(self, final_prompt: str, verbose=False) -> Optional[RequirementsAnalysis]:
136
+ """Stream final analysis"""
137
+ logger.info("Finalizing analysis...", log_type="pipeline: task_analysis", console=verbose)
138
+
139
+ finalise_stream = self.requirements_analyst.run(final_prompt)
140
+ return finalise_stream.content
141
+
142
+ def run(self, verbose=False):
143
+ """
144
+ Run the ML analysis workflow
145
+ Args:
146
+ user_query: Description of the business problem
147
+ """
148
+ try:
149
+ requirements_result: Optional[RequirementsAnalysis] = self.analyze_requirements(verbose=verbose)
150
+
151
+ # logger.info("Writing initial requirements analysis...", log_type="pipeline: task_analysis", console=verbose)
152
+ # yield from self.write_requirements_post(requirements_result, verbose=verbose)
153
+
154
+ '''Check what needs research'''
155
+ missing_fields = self.validate_model_response(requirements_result.model_response, verbose=verbose)
156
+ logger.info("Missing fields found!", log_type="pipeline: task_analysis", console=verbose)
157
+ search_queries = requirements_result.search_queries
158
+ logger.info("Search queries found!", log_type="pipeline: task_analysis", console=verbose)
159
+ unclear_points = requirements_result.unclear_points
160
+ logger.info("Unclear points found!", log_type="pipeline: task_analysis", console=verbose)
161
+
162
+ if missing_fields or search_queries:
163
+ '''Conduct technical research'''
164
+ logger.info("Researching technical specifications...", log_type="pipeline: task_analysis", console=verbose)
165
+
166
+ research_prompt = (
167
+ f"Research the following for this ML problem: {self.user_prompt}\n"
168
+ f"Missing information needed for: {', '.join(missing_fields)}\n"
169
+ f"Specific topics to research: {', '.join(search_queries)}\n"
170
+ f"Points needing clarification: {', '.join(unclear_points)}\n"
171
+ f"Current understanding: {requirements_result.business_understanding}"
172
+ )
173
+ logger.info("Conducting research...", log_type="pipeline: task_analysis", console=verbose)
174
+
175
+ research_result: Optional[TechnicalResearch] = self.conduct_research(research_prompt, verbose=verbose)
176
+
177
+ # logger.info("Sharing research findings...", log_type="pipeline: task_analysis", console=verbose)
178
+ # research_post = self.write_research_post(research_result, verbose=verbose)
179
+
180
+ final_prompt = (
181
+ f"Original problem: {self.user_prompt}\n"
182
+ f"Research findings: {research_result.research_findings}\n"
183
+ "Please provide final technical specifications incorporating this research."
184
+ )
185
+
186
+ logger.info("Obtaining final requirements", log_type="pipeline: task_analysis", console=verbose)
187
+ final_result: Optional[RequirementsAnalysis] = self.finalize_analysis(final_prompt, verbose=verbose)
188
+
189
+ # logger.info("Writing final requirements...", log_type="pipeline: task_analysis", console=verbose)
190
+ # requirement_post = self.write_requirements_post(final_result, verbose=verbose)
191
+
192
+ return (final_result, research_result)
193
+
194
+ except Exception as e:
195
+ logger.error(f"Workflow error: {str(e)}", log_type="pipeline: task_analysis", console=verbose)
196
+
197
+
198
+
199
+
200
+
201
+
202
+
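
A hedged usage sketch for the workflow above: the prompt text is illustrative and OPENAI_API_KEY must be set (see .env.template). When research is triggered, run() returns a (RequirementsAnalysis, TechnicalResearch) pair.

```python
from src.app.pipelines.task_analysis import MLAnalysisWorkflow

workflow = MLAnalysisWorkflow(
    user_prompt="Predict monthly churn for a telecom subscriber base"  # illustrative
)
final_requirements, research = workflow.run(verbose=True)
print(final_requirements.model_response.task)
print(final_requirements.model_response.models)
```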
src/app/pipelines/task_analysis/ml_implementation_planner_workflow.py ADDED
@@ -0,0 +1,132 @@
1
+ from agno.models.openai import OpenAIChat # type: ignore
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from src.core.utils import logger
5
+ from typing import Optional
6
+ from .model import RequirementsAnalysis, TechnicalResearch, ImplementationPlan
7
+ from agno.agent import Agent, RunResponse # type: ignore
8
+ from typing import Iterator
9
+ import json
10
+
11
+ load_dotenv()
12
+
13
+ class MLImplementationPlannerWorkflow:
14
+ def __init__(self, requirements_analysis: RequirementsAnalysis, technical_research: Optional[TechnicalResearch] = None):
15
+ self.requirements_analysis = requirements_analysis
16
+ self.technical_research = technical_research
17
+ self.llm = OpenAIChat(id="gpt-4o", api_key=os.getenv('OPENAI_API_KEY'))
18
+
19
+ def create_implementation_plan(self, planning_prompt: str, verbose=False) -> Optional[ImplementationPlan]:
20
+ """Stream implementation plan creation"""
21
+ logger.info("Creating implementation plan...", log_type="pipeline: task_analysis", console=verbose)
22
+
23
+ self.architect: Agent = Agent(
24
+ name="ML System Architect",
25
+ model=self.llm,
26
+ description="Expert ML System Architect specialized in detailed implementation planning",
27
+ instructions=[
28
+ "Create detailed technical implementation plans for ML systems.",
29
+ "1. Break down the system into logical components",
30
+ "2. Define detailed function specifications for each component",
31
+ "3. Specify clear interfaces between components",
32
+ "4. Consider error handling and edge cases",
33
+ "5. Plan testing and deployment strategies",
34
+ "Be extremely specific about function signatures and component interactions.",
35
+ "Focus on maintainability and scalability in the design."
36
+ ],
37
+ response_model=ImplementationPlan,
38
+ structured_outputs=True,
39
+ reasoning=True,
40
+ # debug_mode=True,
41
+ )
42
+
43
+ planning_stream = self.architect.run(planning_prompt)
44
+ return planning_stream.content
45
+
46
+ def validate_interfaces(self, validation_prompt: str, verbose=False) -> Optional[ImplementationPlan]:
47
+ """Stream interface validation"""
48
+ logger.info("Validating interfaces...", log_type="pipeline: task_analysis", console=verbose)
49
+
50
+ architect_stream = self.architect.run(validation_prompt)
51
+ return architect_stream.content
52
+
53
+ def write_implementation_post(self, implementation_results: ImplementationPlan, verbose=False) -> Iterator[RunResponse]:
54
+ """
55
+ Write a blog post on a topic.
56
+ :param implementation_results: implementation plan results
57
+ :return: iterator for the workflow response
58
+ """
59
+ logger.info("Writing implementation plan...", log_type="pipeline: task_analysis", console=verbose)
60
+
61
+ writer_input = {"components": [comp.model_dump() for comp in implementation_results.components],
62
+ "system_requirements": implementation_results.system_requirements,
63
+ "deployment_notes": implementation_results.deployment_notes,
64
+ "testing_strategy": implementation_results.testing_strategy,
65
+ "implementation_order": implementation_results.implementation_order
66
+ }
67
+
68
+ self.writer: Agent = Agent(
69
+ model=self.llm,
70
+ instructions=[
71
+ "You will be provided with lots of structured outputs. Your work is to display this"
72
+ "in a nicely formatted manner without changing any of the content."
73
+ ],
74
+ markdown=True,
75
+ )
76
+
77
+ model_response = self.writer.run(json.dumps(writer_input, indent=4), stream=True)
78
+
79
+ return model_response.content
80
+
81
+ def run(self, verbose=False):
82
+ """
83
+ Create implementation plan based on requirements analysis and research
84
+
85
+ Args:
86
+ requirements_analysis: Results from requirements analysis
87
+ technical_research: Optional results from technical research
88
+ """
89
+ try:
90
+ logger.info("Starting planning workflow...", log_type="pipeline: task_analysis", console=verbose)
91
+
92
+ '''Prepare comprehensive prompt for the architect'''
93
+ planning_prompt = (
94
+ f"Create a detailed implementation plan for this ML system.\n\n"
95
+ f"Business Understanding:\n{self.requirements_analysis.business_understanding}\n\n"
96
+ f"Technical Specifications:\n"
97
+ f"- Task Type: {self.requirements_analysis.model_response.task}\n"
98
+ f"- Models: {', '.join(self.requirements_analysis.model_response.models)}\n"
99
+ f"- Data Requirements: {self.requirements_analysis.model_response.data_source}\n"
100
+ f"- Technical Requirements: {self.requirements_analysis.model_response.technical_requirements}\n"
101
+ )
102
+
103
+ if self.technical_research:
104
+ logger.info("Technical Research found! Modifying context...", log_type="pipeline: task_analysis", console=verbose)
105
+
106
+ planning_prompt += (
107
+ f"\nResearch Findings:\n{self.technical_research.research_findings}\n"
108
+ f"Reference Implementations:\n"
109
+ f"{chr(10).join(self.technical_research.reference_implementations)}"
110
+ )
111
+
112
+ '''Stream implementation plan'''
113
+
114
+ logger.info("generating implementation plan...", log_type="pipeline: task_analysis", console=verbose)
115
+ plan_result: Optional[ImplementationPlan] = self.create_implementation_plan(planning_prompt, verbose=verbose)
116
+
117
+ if plan_result:
118
+ validation_prompt = (
119
+ "Validate the interfaces between these components "
120
+ "and ensure all dependencies are properly specified:\n"
121
+ f"{plan_result.components}"
122
+ )
123
+ logger.info("validating results...", log_type="pipeline: task_analysis", console=verbose)
124
+
125
+ validate_result: Optional[ImplementationPlan] = self.validate_interfaces(validation_prompt, verbose=verbose)
126
+ # logger.info("writing validated implementation plan...", log_type="pipeline: task_analysis", console=verbose)
127
+ # final_response = self.write_implementation_post(validate_result, verbose=verbose)
128
+
129
+ return validate_result
130
+
131
+ except Exception as e:
132
+ logger.error("Error in planning workflow".format(e), log_type="pipeline: task_analysis", console=verbose)
src/app/pipelines/task_analysis/model.py ADDED
@@ -0,0 +1,162 @@
+ from typing import List
+ from enum import Enum
+ from pydantic import BaseModel, Field
+
+ class MLTaskType(str, Enum):
+     CLASSIFICATION = "classification"
+     REGRESSION = "regression"
+     CLUSTERING = "clustering"
+     NLP = "natural_language_processing"
+     COMPUTER_VISION = "computer_vision"
+     TIME_SERIES = "time_series"
+     ANOMALY_DETECTION = "anomaly_detection"
+     RECOMMENDATION = "recommendation"
+     OTHER = "other"
+
+
+ class ModelResponseStatus(BaseModel):
+     """Technical specification for ML implementation"""
+     data_source: str = Field(
+         # default="...",
+         description="Required data sources and their characteristics"
+     )
+     data_format: str = Field(
+         # default="...",
+         description="Expected format of input data"
+     )
+     additional_data_requirement: bool = Field(
+         # default=False,
+         description="Whether additional data is needed"
+     )
+     constraints: str = Field(
+         # default="...",
+         description="Business and technical constraints"
+     )
+     task: MLTaskType = Field(
+         # default=MLTaskType.OTHER,
+         description="Type of ML task"
+     )
+     models: List[str] = Field(
+         # default=["..."],
+         description="Suggested ML models"
+     )
+     hyperparameters: List[str] = Field(
+         # default=["..."],
+         description="Key hyperparameters to consider"
+     )
+     eval_metrics: List[str] = Field(
+         # default=["..."],
+         description="Evaluation metrics for the solution"
+     )
+     technical_requirements: str = Field(
+         # default="...",
+         description="Technical implementation requirements"
+     )
+
+
+ class RequirementsAnalysis(BaseModel):
+     """Initial analysis of business requirements"""
+     model_response: ModelResponseStatus
+     unclear_points: List[str] = Field(
+         default_factory=list,
+         description="Points needing clarification"
+     )
+     search_queries: List[str] = Field(
+         default_factory=list,
+         description="Topics to research"
+     )
+     business_understanding: str = Field(
+         description="Summary of business problem understanding"
+     )
+
+
+ class TechnicalResearch(BaseModel):
+     """Results from technical research"""
+     model_response: ModelResponseStatus
+     research_findings: str = Field(
+         description="Key findings from research"
+     )
+     reference_implementations: List[str] = Field(
+         default_factory=list,
+         description="Similar implementation examples found"
+     )
+     sources: List[str] = Field(
+         default_factory=list,
+         description="Sources of information"
+     )
+
+
+ # Implementation Planning Models
+ class ComponentType(str, Enum):
+     DATA_PIPELINE = "data_pipeline"
+     PREPROCESSOR = "preprocessor"
+     MODEL = "model"
+     EVALUATOR = "evaluator"
+     INFERENCE = "inference"
+     MONITORING = "monitoring"
+     UTILITY = "utility"
+
+
+ class ParameterSpec(BaseModel):
+     """Specification for a single parameter"""
+     name: str = Field(description="Name of the parameter")
+     param_type: str = Field(description="Type of the parameter")
+     description: str = Field(description="Description of the parameter")
+     default_value: str = Field(description="Default value if any")
+     required: bool = Field(description="Whether the parameter is required")
+
+
+ class ConfigParam(BaseModel):
+     """Specification for a configuration parameter"""
+     name: str = Field(description="Name of the configuration parameter")
+     value_type: str = Field(description="Type of value expected")
+     description: str = Field(description="Description of the configuration parameter")
+     default: str = Field(description="Default value if any")
+
+
+ class FunctionSpec(BaseModel):
+     """Detailed specification for a single function"""
+     name: str = Field(description="Name of the function")
+     description: str = Field(description="Detailed description of function's purpose")
+     input_params: List[ParameterSpec] = Field(
+         description="List of input parameters and their specifications"
+     )
+     return_type: str = Field(description="Return type and description")
+     dependencies: List[str] = Field(
+         description="Required dependencies/imports"
+     )
+     error_handling: List[str] = Field(
+         description="Expected errors and handling strategies"
+     )
+
+
+ class ComponentSpec(BaseModel):
+     """Specification for a component (module) of the system"""
+     name: str = Field(description="Name of the component")
+     type: ComponentType = Field(description="Type of component")
+     description: str = Field(description="Detailed description of component's purpose")
+     functions: List[FunctionSpec] = Field(description="Functions within this component")
+     dependencies: List[str] = Field(
+         description="External package dependencies"
+     )
+     config_params: List[ConfigParam] = Field(
+         description="Configuration parameters needed"
+     )
+
+
+ class ImplementationPlan(BaseModel):
+     """Complete implementation plan for the ML system"""
+     components: List[ComponentSpec] = Field(description="System components")
+     system_requirements: List[str] = Field(
+         description="System-level requirements and dependencies"
+     )
+     deployment_notes: str = Field(
+         description="Notes on deployment and infrastructure"
+     )
+     testing_strategy: str = Field(
+         description="Strategy for testing components"
+     )
+     implementation_order: List[str] = Field(
+         description="Suggested order of implementation"
+     )
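Because every field in these planning models is required (no defaults), a small hand-built example helps show how the specs nest inside an `ImplementationPlan`; the component and all values below are illustrative only, not output from the planner.

```python
# A minimal ImplementationPlan built by hand to show how the nested specs compose.
from src.app.pipelines.task_analysis.model import (
    ComponentSpec, ComponentType, ConfigParam, FunctionSpec, ImplementationPlan, ParameterSpec,
)

load_fn = FunctionSpec(
    name="load_dataset",
    description="Read the raw CSV file into a DataFrame",
    input_params=[ParameterSpec(name="path", param_type="str",
                                description="Path to the CSV file",
                                default_value="", required=True)],
    return_type="pandas.DataFrame containing the raw rows",
    dependencies=["pandas"],
    error_handling=["FileNotFoundError -> fail fast with a clear message"],
)

plan = ImplementationPlan(
    components=[ComponentSpec(
        name="data_pipeline",
        type=ComponentType.DATA_PIPELINE,
        description="Loads and validates the raw dataset",
        functions=[load_fn],
        dependencies=["pandas"],
        config_params=[ConfigParam(name="csv_path", value_type="str",
                                   description="Location of the input file",
                                   default="data/input.csv")],
    )],
    system_requirements=["Python 3.11", "pandas>=1.5"],
    deployment_notes="Run as a batch job.",
    testing_strategy="Unit-test each function against a small fixture CSV.",
    implementation_order=["data_pipeline"],
)

print(plan.model_dump_json(indent=2))
```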
src/app/schemas/requests/__init__.py ADDED
@@ -0,0 +1 @@
+ from .task_analysis import TaskAnalysisRequestSchema
src/app/schemas/requests/eda.py ADDED
@@ -0,0 +1,25 @@
+ from pydantic import BaseModel, Field
+ from src.app.pipelines.task_analysis.model import ImplementationPlan, RequirementsAnalysis, TechnicalResearch
+ from typing import Optional
+
+ class EdaRequestSchema(BaseModel):
+     dataset_path: str = Field(
+         default=None,
+         description="Path of the dataset stored locally"
+     )
+     user_prompt: str = Field(
+         default=None,
+         description="Contains the initial prompt given by the user"
+     )
+     requirement_analysis: Optional[RequirementsAnalysis] = Field(
+         default=None,
+         description="Contains the analysis of the user task/prompt"
+     )
+     technical_research: Optional[TechnicalResearch] = Field(
+         default=None,
+         description="Contains the technical research of the user task/prompt"
+     )
+     implementation_plan: Optional[ImplementationPlan] = Field(
+         default=None,
+         description="Includes the detailed plan for what steps to take"
+     )
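For reference, a minimal sketch of how a caller would build this request: only `dataset_path` and `user_prompt` are set, while the optional upstream artifacts stay `None` until earlier pipeline stages supply them. The path and prompt are illustrative values.

```python
# Minimal request construction against EdaRequestSchema.
from src.app.schemas.requests.eda import EdaRequestSchema

request = EdaRequestSchema(
    dataset_path="data/train.csv",
    user_prompt="Profile this dataset and flag data-quality issues before modelling.",
)
print(request.model_dump_json(indent=2))  # the optional analyses serialize as null
```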
src/app/schemas/requests/task_analysis.py ADDED
@@ -0,0 +1,12 @@
+ from pydantic import BaseModel, Field
+
+ class TaskAnalysisRequestSchema(BaseModel):
+     '''Schema for accepting user instructions/prompts/tasks and a dataset (currently only CSV is supported)'''
+     user_prompt: str = Field(
+         default=None,
+         description="Defines the user prompt/instructions/task"
+     )
+     file_name: str = Field(
+         default=None,
+         description="Contains the filename of the dataset, stored in temporary storage"
+     )
src/app/schemas/responses/eda.py ADDED
@@ -0,0 +1,31 @@
+ from pydantic import BaseModel, Field
+ from typing import List
+ from src.app.pipelines.eda.agents.models import OrchestratorAgentResponseSchema, ExecuterAgentResponseSchema, AnalyzerAgentResponseSchema, JudgingAgentResponseSchema
+
+ class IterationDetails(BaseModel):
+     iteration_number: int = Field(
+         default=None,
+         description="Contains the iteration number"
+     )
+     orchestrator_response: OrchestratorAgentResponseSchema = Field(
+         default=None,
+         description="Contains orchestrator agent's response for this iteration"
+     )
+     executer_response: ExecuterAgentResponseSchema = Field(
+         default=None,
+         description="Contains executer agent's response for this iteration"
+     )
+     analyzer_response: AnalyzerAgentResponseSchema = Field(
+         default=None,
+         description="Contains analyzer agent's response for this iteration"
+     )
+     judge_response: JudgingAgentResponseSchema = Field(
+         default=None,
+         description="Contains judging agent's response for this iteration"
+     )
+
+ class IterationLogs(BaseModel):
+     logs: List[IterationDetails] = Field(
+         default=None,
+         description="Contains a list of logs for each iteration"
+     )
src/core/cache/redis_cache.py ADDED
@@ -0,0 +1,33 @@
+ import json
+ import redis.asyncio as redis  # type: ignore
+ from typing import Any, Optional
+ from src.core.utils import read_config
+
+
+ class RedisCache:
+     '''Reads the server settings from the config file. Can be imported from anywhere.'''
+     def __init__(self):
+         settings = read_config(config_path="config.yaml")
+
+         self._client = redis.Redis(
+             host=settings['redis_server']['host'],
+             port=settings['redis_server']['port'],
+             db=settings['redis_server']['db'],
+             decode_responses=True
+         )
+
+     async def set(self, key: str, value: Any) -> None:
+         await self._client.set(key, json.dumps(value))
+
+     async def get(self, key: str) -> Optional[Any]:
+         value = await self._client.get(key)
+         if value:
+             return json.loads(value)
+         return None
+
+     async def delete(self, key: str) -> None:
+         await self._client.delete(key)
+
+     async def clear(self) -> None:
+         await self._client.flushdb()
+
+ cache = RedisCache()
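A short async usage sketch of the module-level `cache` singleton, assuming a Redis instance is reachable at the host/port configured in config.yaml; the key and payload here are illustrative.

```python
# Illustrative async round-trip through the shared cache singleton.
import asyncio

from src.core.cache.redis_cache import cache

async def demo() -> None:
    await cache.set("eda:last_run", {"status": "complete", "iterations": 3})
    print(await cache.get("eda:last_run"))  # {'status': 'complete', 'iterations': 3}
    await cache.delete("eda:last_run")
    print(await cache.get("eda:last_run"))  # None

asyncio.run(demo())
```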
src/core/server.py ADDED
@@ -0,0 +1,23 @@
+ from fastapi import FastAPI
+ from src.api import router
+ from src.core.utils import logger, read_config
+
+ def init_routers(app_: FastAPI) -> None:
+     app_.include_router(router)
+
+ def create_app() -> FastAPI:
+     config = read_config("config.yaml")
+
+     app_ = FastAPI(
+         title="Franky API",
+         description="In Development",
+         version="1.0.0",
+     )
+
+     app_.state.config = config
+
+     init_routers(app_=app_)
+     logger.info("Server started successfully", log_type="server", console=True)
+     return app_
+
+ app = create_app()
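If you want to serve this app directly with uvicorn rather than through the project's own entrypoint, a rough sketch is below; the host and port values are assumptions for illustration, not taken from config.yaml.

```python
# Rough equivalent of serving the FastAPI app directly with uvicorn.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("src.core.server:app", host="0.0.0.0", port=8000)
```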
src/core/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .read_config import read_config
+ from .logger import logger
+ from .knowledge_base import KnowledgeBaseClass
src/core/utils/knowledge_base.py ADDED
@@ -0,0 +1,81 @@
+ import os
+ import chromadb
+ from typing import List
+ from agno.agent import Agent
+ from dotenv import load_dotenv
+ from pydantic import BaseModel, Field
+ from agno.models.openai import OpenAIChat
+ from llama_index.core import StorageContext
+ from llama_index.core import VectorStoreIndex
+ from llama_index.core.retrievers import VectorIndexRetriever
+ from agno.knowledge.llamaindex import LlamaIndexKnowledgeBase
+ from llama_index.vector_stores.chroma import ChromaVectorStore
+ from .prompts import missing_value_analysis_agent_desc, missing_value_analysis_agent_instructions
+
+ load_dotenv()
+
+ class ResponseSchema(BaseModel):
+     code_generated: str = Field(..., description="Python code generated for the identified statistical tests.")
+     libraries_necessary: List[str] = Field(..., description="List of necessary Python libraries with preferred versions (e.g., ['pandas>=1.5.0', 'numpy>=1.24.0', 'scipy>=1.10.0']).")
+     reasoning: str = Field(..., description="Detailed reasoning for choosing the specific statistical tests based on the input report and the knowledge base.")
+
+ class KnowledgeBaseClass:
+     def __init__(self) -> None:
+
+         self.knowlede_base_map = {
+             "classification": {
+                 "raw_data_path": r"knowledge_base\raw\classification",
+                 "vector_index_path": r"knowledge_base\vector\classification",
+                 "collection_name": "classification_db"
+             },
+             "regression": {
+                 "raw_data_path": r"knowledge_base\raw\regression",
+                 "vector_index_path": r"knowledge_base\vector\regression",
+                 "collection_name": "regression_db"
+             },
+             "time_series": {
+                 "raw_data_path": r"knowledge_base\raw\time_series",
+                 "vector_index_path": r"knowledge_base\vector\time_series",
+                 "collection_name": "time_series_db"
+             }
+         }
+
+         self.agent_map = {
+             'missing_value_analysis_agent': {
+                 'description': missing_value_analysis_agent_desc,
+                 'instructions': missing_value_analysis_agent_instructions
+             }
+         }
+
+     def initialize_knowledge_base(self, task_type: str) -> LlamaIndexKnowledgeBase:
+         selected_knowledge_base = self.knowlede_base_map[task_type]
+         db = chromadb.PersistentClient(path=selected_knowledge_base['vector_index_path'])
+         chroma_collection = db.get_or_create_collection(selected_knowledge_base['collection_name'])
+         vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+         storage_context = StorageContext.from_defaults(vector_store=vector_store)
+         index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)
+         retriever = VectorIndexRetriever(index)
+         knowledge_base = LlamaIndexKnowledgeBase(retriever=retriever)
+
+         return knowledge_base
+
+     def initialize_agent(self, agent_name: str, llm_choice: str, knowledge_base: LlamaIndexKnowledgeBase) -> Agent:
+         selected_agent = self.agent_map[agent_name]
+         llm = OpenAIChat(id=llm_choice, api_key=os.getenv('OPENAI_API_KEY'))
+
+         agent = Agent(
+             model=llm,
+             description=selected_agent['description'],
+             instructions=selected_agent['instructions'],
+             knowledge=knowledge_base,
+             search_knowledge=True,
+             response_model=ResponseSchema
+         )
+
+         return agent
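A sketch of wiring the knowledge base into an agent follows. It assumes the Chroma index under `knowledge_base/vector/classification` has already been built, `OPENAI_API_KEY` is set in the environment, and the model id passed as `llm_choice` is an arbitrary example; the query string and the shape of the returned response are illustrative.

```python
# Illustrative wiring of the classification knowledge base into the missing-value agent.
from src.core.utils import KnowledgeBaseClass

kb_helper = KnowledgeBaseClass()
kb = kb_helper.initialize_knowledge_base(task_type="classification")

agent = kb_helper.initialize_agent(
    agent_name="missing_value_analysis_agent",
    llm_choice="gpt-4o",  # assumed model id; any OpenAIChat-compatible id should work
    knowledge_base=kb,
)

result = agent.run("Suggest statistical tests for columns with more than 20% missing values.")
# With response_model=ResponseSchema, the structured fields are available on the content:
print(result.content.reasoning)
print(result.content.code_generated)
```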
src/core/utils/logger.py ADDED
@@ -0,0 +1,46 @@
+ import logging
+ from datetime import datetime
+
+ class LogManager:
+     def __init__(self, log_file_path: str):
+         self.logger = logging.getLogger("LogManager")
+         self.logger.setLevel(logging.DEBUG)
+
+         if not self.logger.handlers:
+             formatter = logging.Formatter('%(asctime)s - %(levelname)s - [%(log_type)s] - %(message)s')
+
+             file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
+             file_handler.setFormatter(formatter)
+             file_handler.setLevel(logging.DEBUG)
+             self.logger.addHandler(file_handler)
+
+             self.console_handler = logging.StreamHandler()
+             self.console_handler.setFormatter(formatter)
+             self.console_handler.setLevel(logging.DEBUG)
+
+         else:
+             self.console_handler = None
+
+     def log(self, level, message, log_type: str, console=False):
+         extra = {'log_type': log_type}
+
+         if console and self.console_handler:
+             self.logger.addHandler(self.console_handler)
+
+         self.logger.log(level, message, extra=extra)
+
+         if console and self.console_handler:
+             self.logger.removeHandler(self.console_handler)
+
+     def info(self, message, log_type: str, console=False):
+         self.log(logging.INFO, message, log_type, console)
+
+     def error(self, message, log_type: str, console=False):
+         self.log(logging.ERROR, message, log_type, console)
+
+     def debug(self, message, log_type: str, console=False):
+         self.log(logging.DEBUG, message, log_type, console)
+
+
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H")
+ logger = LogManager(log_file_path=f"src/core/logs/log_{timestamp}.log")
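Callers elsewhere in this codebase import the shared `logger` and use it as below: `log_type` tags the subsystem in each record, and `console=True` mirrors the line to stdout in addition to the hourly log file (which assumes the `src/core/logs/` directory exists). The messages shown are illustrative.

```python
# Typical calls against the shared LogManager instance.
from src.core.utils import logger

logger.info("Pipeline started", log_type="pipeline: eda", console=True)
logger.debug("Loaded 12 columns from the input CSV", log_type="pipeline: eda")
logger.error("Failed to load dataset: file not found", log_type="pipeline: eda", console=True)
```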