Commit badef87
Parent(s): 10635ce

fixed errors
Files changed:
- src/api/v1/eda_engine/data_quality.py +16 -4
- src/api/v1/eda_engine/data_statistics.py +19 -5
- src/api/v1/eda_engine/univariate_analysis.py +16 -5
- src/app/pipelines/modules/data_quality_assessment.py +3 -3
- src/app/pipelines/modules/data_statistics.py +1 -1
- src/app/pipelines/task_analysis/ml_analysis_workflow.py +1 -1
src/api/v1/eda_engine/data_quality.py

@@ -1,7 +1,9 @@
 import os
+import math
 import shutil
 from fastapi import APIRouter
 from src.core.utils import logger
+from fastapi.responses import JSONResponse
 from fastapi import APIRouter, UploadFile, File, HTTPException, Form
 from src.app.pipelines.modules import DataQualityAssessmentWorkflow
 
@@ -13,6 +15,17 @@ def delete_dir_contents(directory: str)->None:
         if os.path.isfile(file_path):
             os.remove(file_path)
 
+def sanitize_for_json(data):
+    if isinstance(data, dict):
+        return {k: sanitize_for_json(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [sanitize_for_json(v) for v in data]
+    elif isinstance(data, float):
+        if math.isinf(data) or math.isnan(data):
+            return None
+        return data
+    return data
+
 @data_quality_router.post('/')
 async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
     ''' ## This endpoint accepts a CSV file upload to initiate the Data Quality Workflow.
@@ -46,10 +59,9 @@ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
         ds_wf = DataQualityAssessmentWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
         results = ds_wf.run(verbose=True)
 
-
-
-
-        }
+        sanitized_data = sanitize_for_json(results)
+
+        return JSONResponse(content=sanitized_data)
 
     except Exception as e:
         logger.error(f"DataQualityAssessmentWorkflow failed with error: {e}", log_type='eda-engine/data_quality', console=True)
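The sanitizer matters because Starlette's JSONResponse serializes with json.dumps(..., allow_nan=False), so any NaN or infinity left in the workflow results raises a ValueError instead of producing a response. A minimal standalone sketch of the failure and the fix (the sample dict is illustrative, not taken from the workflow):

import json
import math

results = {"skewness": float("nan"), "max_value": float("inf"), "rows": [1, 2.5]}

# Strict JSON (what JSONResponse enforces) rejects non-finite floats:
try:
    json.dumps(results, allow_nan=False)
except ValueError as err:
    print(err)  # Out of range float values are not JSON compliant

# sanitize_for_json, as added above, maps NaN/inf to None, i.e. JSON null:
def sanitize_for_json(data):
    if isinstance(data, dict):
        return {k: sanitize_for_json(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [sanitize_for_json(v) for v in data]
    elif isinstance(data, float):
        if math.isinf(data) or math.isnan(data):
            return None
        return data
    return data

print(json.dumps(sanitize_for_json(results), allow_nan=False))
# {"skewness": null, "max_value": null, "rows": [1, 2.5]}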
src/api/v1/eda_engine/data_statistics.py

@@ -1,12 +1,26 @@
 import os
+import math
 import shutil
 from fastapi import APIRouter
 from src.core.utils import logger
-from fastapi import
+from fastapi.responses import JSONResponse
 from src.app.pipelines.modules import DataStatisticsWorkflow
+from fastapi import APIRouter, UploadFile, File, HTTPException, Form
+
 
 data_statistics_router = APIRouter()
 
+def sanitize_for_json(data):
+    if isinstance(data, dict):
+        return {k: sanitize_for_json(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [sanitize_for_json(v) for v in data]
+    elif isinstance(data, float):
+        if math.isinf(data) or math.isnan(data):
+            return None
+        return data
+    return data
+
 def delete_dir_contents(directory: str)->None:
     for filename in os.listdir(directory):
         file_path = os.path.join(directory, filename)
@@ -47,10 +61,10 @@ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
         ds_wf = DataStatisticsWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
         results = ds_wf.run(verbose=True)
 
-
-
-
-
+        sanitized_data = sanitize_for_json(results)
+
+        return JSONResponse(content=sanitized_data)
+
 
     except Exception as e:
         logger.error(f"DataStatisticsWorkflow failed with error: {e}", log_type='eda-engine/data_statistics', console=True)
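Besides wiring in the sanitizer, this hunk repairs the previously truncated `from fastapi import` line by re-importing APIRouter, UploadFile, File, HTTPException, and Form in full. A hedged client-side sketch of exercising the endpoint (the base URL and mount path below are assumptions, not taken from this commit):

import httpx

# Upload a CSV and read back the sanitized statistics; every NaN/inf the
# workflow produced arrives as JSON null.
with open("dataset.csv", "rb") as f:
    resp = httpx.post(
        "http://localhost:8000/data-statistics/",   # assumed mount path
        files={"file": ("dataset.csv", f, "text/csv")},
        data={"ml_task": "classification"},          # optional Form field
        timeout=300.0,
    )
resp.raise_for_status()
print(resp.json())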
src/api/v1/eda_engine/univariate_analysis.py

@@ -1,7 +1,8 @@
 import os
+import math
 import shutil
-from typing import Optional
 from src.core.utils import logger
+from fastapi.responses import JSONResponse
 from fastapi import APIRouter, UploadFile, File, HTTPException, Form
 from src.app.pipelines.modules import UnivariateAnalysisWorkflow
 
@@ -13,6 +14,17 @@ def delete_dir_contents(directory: str)->None:
         if os.path.isfile(file_path):
             os.remove(file_path)
 
+def sanitize_for_json(data):
+    if isinstance(data, dict):
+        return {k: sanitize_for_json(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [sanitize_for_json(v) for v in data]
+    elif isinstance(data, float):
+        if math.isinf(data) or math.isnan(data):
+            return None
+        return data
+    return data
+
 @univariate_analysis_router.post('/')
 async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
     ''' ## This endpoint accepts a CSV file upload to initiate the Univarite Analysis Workflow.
@@ -47,10 +59,9 @@ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
         ua_wf = UnivariateAnalysisWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
         results = ua_wf.run(verbose=True)
 
-
-
-
-        }
+        sanitized_data = sanitize_for_json(results)
+
+        return JSONResponse(content=sanitized_data)
 
     except Exception as e:
         logger.error(f"UnivariateAnalysisWorkflow failed with error: {e}", log_type='eda-engine/dataunivariate_analysis_statistics', console=True)
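A quick way to confirm the fix end to end is a strict-JSON round trip through FastAPI's TestClient. This is a sketch only: the `src.main` import and the mount path are hypothetical, not taken from this commit.

from fastapi.testclient import TestClient

from src.main import app  # hypothetical app module exposing the routers

client = TestClient(app)
with open("dataset.csv", "rb") as f:
    resp = client.post(
        "/univariate-analysis/",                    # assumed mount path
        files={"file": ("dataset.csv", f, "text/csv")},
        data={"ml_task": "regression"},
    )

assert resp.status_code == 200
assert "NaN" not in resp.text  # strict JSON never contains a bare NaN token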
src/app/pipelines/modules/data_quality_assessment.py

@@ -1632,21 +1632,21 @@ class DataQualityAssessmentWorkflow:
         }
 
         try:
-            mva_str = json.dumps(mva_results, indent=2, default=str)
+            mva_str = json.dumps(mva_results, indent=2, default=str, allow_nan=True)
             final_result['missing_value_analysis']['report'] = self.generate_report_from_agent(mva_str)
         except:
             logger.error("Failed to generate report for mva....", log_type='data_quality_assessment', console=verbose)
             pass
 
         try:
-            duplicate_analysis_results_str = json.dumps(duplicate_analysis_results, indent=2, default=str)
+            duplicate_analysis_results_str = json.dumps(duplicate_analysis_results, indent=2, default=str, allow_nan=True)
             final_result['duplicate_analysis']['report'] = self.generate_report_from_agent(duplicate_analysis_results_str)
         except:
             logger.error("Failed to generate report for duplicate analysis....", log_type='data_quality_assessment', console=verbose)
             pass
 
         try:
-            data_quality_results_str = json.dumps(data_quality_results, indent=2, default=str)
+            data_quality_results_str = json.dumps(data_quality_results, indent=2, default=str, allow_nan=True)
             data_quality_results_str = data_quality_results_str +'\n'+data_quality_summary
             final_result['data_quality_analysis']['report'] = self.generate_report_from_agent(data_quality_results_str)
         except:
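Passing allow_nan=True here is the opposite trade-off from the API layer: these strings feed self.generate_report_from_agent rather than a strict JSON response, so non-standard NaN/Infinity tokens are acceptable and preferable to a serialization crash. A small standalone illustration (the sample dict is invented):

import json
from datetime import datetime

mva_results = {
    "checked_at": datetime(2024, 1, 1),  # default=str handles non-JSON types
    "missing_ratio": float("nan"),       # allow_nan=True keeps this alive
}

print(json.dumps(mva_results, indent=2, default=str, allow_nan=True))
# {
#   "checked_at": "2024-01-01 00:00:00",
#   "missing_ratio": NaN
# }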
src/app/pipelines/modules/data_statistics.py

@@ -71,7 +71,7 @@ class DataStatisticsWorkflow:
 
         serializable_results = process_dict(results)
 
-        return json.dumps(serializable_results, indent=indent)
+        return json.dumps(serializable_results, indent=indent, allow_nan=True)
 
     def build_statistical_summary(self, data_source: str = None, verbose=False) -> Dict[str, Any]:
         '''Get the basic central tendancy, dispersion, quantiles, distinct values, frequency distributions and sparsity'''
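Worth noting: allow_nan=True is already the default for json.dumps, so this particular change documents intent rather than altering behavior, e.g.:

import json

# Both calls emit the non-standard token NaN instead of raising ValueError.
print(json.dumps({"x": float("nan")}))
print(json.dumps({"x": float("nan")}, allow_nan=True))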
src/app/pipelines/task_analysis/ml_analysis_workflow.py

@@ -138,7 +138,7 @@ class MLAnalysisWorkflow:
         try:
             return future.result(timeout=30)
         except concurrent.futures.TimeoutError:
-            logger.
+            logger.info("Technical research timed out after 30 seconds.", log_type="pipeline: timeout", console=verbose)
             return None
 
     def finalize_analysis(self, final_prompt: str, verbose=False) -> Optional[RequirementsAnalysis]:
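The repaired log line completes a standard future-with-timeout fallback: block on the result for a bounded time, log, and return None so the pipeline continues. A standalone sketch of the pattern, with shorter delays than the real 30-second budget and an invented stand-in task:

import concurrent.futures
import time

def technical_research():  # stand-in for the real long-running call
    time.sleep(2.0)
    return "report"

pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
future = pool.submit(technical_research)
try:
    result = future.result(timeout=0.5)  # the workflow uses timeout=30
except concurrent.futures.TimeoutError:
    print("Technical research timed out.")
    result = None
pool.shutdown(wait=False)  # note: the worker thread keeps running to completion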