architojha committed
Commit badef87 · 1 Parent(s): 10635ce

fixed errors: sanitize NaN/Infinity floats in the EDA endpoint responses, allow NaN in the report JSON dumps, and lower the research-timeout log level

src/api/v1/eda_engine/data_quality.py CHANGED
@@ -1,7 +1,9 @@
 import os
+import math
 import shutil
 from fastapi import APIRouter
 from src.core.utils import logger
+from fastapi.responses import JSONResponse
 from fastapi import APIRouter, UploadFile, File, HTTPException, Form
 from src.app.pipelines.modules import DataQualityAssessmentWorkflow
 
@@ -13,6 +15,17 @@ def delete_dir_contents(directory: str)->None:
     if os.path.isfile(file_path):
         os.remove(file_path)
 
+def sanitize_for_json(data):
+    if isinstance(data, dict):
+        return {k: sanitize_for_json(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [sanitize_for_json(v) for v in data]
+    elif isinstance(data, float):
+        if math.isinf(data) or math.isnan(data):
+            return None
+        return data
+    return data
+
 @data_quality_router.post('/')
 async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
     ''' ## This endpoint accepts a CSV file upload to initiate the Data Quality Workflow.
@@ -46,10 +59,9 @@ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
         ds_wf = DataQualityAssessmentWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
         results = ds_wf.run(verbose=True)
 
-        return {
-            "status": "Pipeline finished running",
-            "results": results
-        }
+        sanitized_data = sanitize_for_json(results)
+
+        return JSONResponse(content=sanitized_data)
 
     except Exception as e:
         logger.error(f"DataQualityAssessmentWorkflow failed with error: {e}", log_type='eda-engine/data_quality', console=True)
src/api/v1/eda_engine/data_statistics.py CHANGED
@@ -1,12 +1,26 @@
 import os
+import math
 import shutil
 from fastapi import APIRouter
 from src.core.utils import logger
-from fastapi import APIRouter, UploadFile, File, HTTPException, Form
+from fastapi.responses import JSONResponse
 from src.app.pipelines.modules import DataStatisticsWorkflow
+from fastapi import APIRouter, UploadFile, File, HTTPException, Form
+
 
 data_statistics_router = APIRouter()
 
+def sanitize_for_json(data):
+    if isinstance(data, dict):
+        return {k: sanitize_for_json(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [sanitize_for_json(v) for v in data]
+    elif isinstance(data, float):
+        if math.isinf(data) or math.isnan(data):
+            return None
+        return data
+    return data
+
 def delete_dir_contents(directory: str)->None:
     for filename in os.listdir(directory):
         file_path = os.path.join(directory, filename)
@@ -47,10 +61,10 @@ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
         ds_wf = DataStatisticsWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
         results = ds_wf.run(verbose=True)
 
-        return {
-            "status": "Pipeline finished running",
-            "results": results
-        }
+        sanitized_data = sanitize_for_json(results)
+
+        return JSONResponse(content=sanitized_data)
+
 
     except Exception as e:
         logger.error(f"DataStatisticsWorkflow failed with error: {e}", log_type='eda-engine/data_statistics', console=True)
src/api/v1/eda_engine/univariate_analysis.py CHANGED
@@ -1,7 +1,8 @@
 import os
+import math
 import shutil
-from typing import Optional
 from src.core.utils import logger
+from fastapi.responses import JSONResponse
 from fastapi import APIRouter, UploadFile, File, HTTPException, Form
 from src.app.pipelines.modules import UnivariateAnalysisWorkflow
 
@@ -13,6 +14,17 @@ def delete_dir_contents(directory: str)->None:
     if os.path.isfile(file_path):
         os.remove(file_path)
 
+def sanitize_for_json(data):
+    if isinstance(data, dict):
+        return {k: sanitize_for_json(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [sanitize_for_json(v) for v in data]
+    elif isinstance(data, float):
+        if math.isinf(data) or math.isnan(data):
+            return None
+        return data
+    return data
+
 @univariate_analysis_router.post('/')
 async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
     ''' ## This endpoint accepts a CSV file upload to initiate the Univarite Analysis Workflow.
@@ -47,10 +59,9 @@ async def main(file: UploadFile = File(...), ml_task: str = Form(None)):
         ua_wf = UnivariateAnalysisWorkflow(data_source=f'{downloads_path}/dataset.csv', llm_choice="gpt-4o-mini", ml_task=ml_task)
         results = ua_wf.run(verbose=True)
 
-        return {
-            "status": "Pipeline finished running",
-            "results": results
-        }
+        sanitized_data = sanitize_for_json(results)
+
+        return JSONResponse(content=sanitized_data)
 
     except Exception as e:
         logger.error(f"UnivariateAnalysisWorkflow failed with error: {e}", log_type='eda-engine/dataunivariate_analysis_statistics', console=True)
src/app/pipelines/modules/data_quality_assessment.py CHANGED
@@ -1632,21 +1632,21 @@ class DataQualityAssessmentWorkflow:
         }
 
         try:
-            mva_str = json.dumps(mva_results, indent=2, default=str)
+            mva_str = json.dumps(mva_results, indent=2, default=str, allow_nan=True)
             final_result['missing_value_analysis']['report'] = self.generate_report_from_agent(mva_str)
         except:
             logger.error("Failed to generate report for mva....", log_type='data_quality_assessment', console=verbose)
             pass
 
         try:
-            duplicate_analysis_results_str = json.dumps(duplicate_analysis_results, indent=2, default=str)
+            duplicate_analysis_results_str = json.dumps(duplicate_analysis_results, indent=2, default=str, allow_nan=True)
             final_result['duplicate_analysis']['report'] = self.generate_report_from_agent(duplicate_analysis_results_str)
         except:
             logger.error("Failed to generate report for duplicate analysis....", log_type='data_quality_assessment', console=verbose)
             pass
 
         try:
-            data_quality_results_str = json.dumps(data_quality_results, indent=2, default=str)
+            data_quality_results_str = json.dumps(data_quality_results, indent=2, default=str, allow_nan=True)
             data_quality_results_str = data_quality_results_str +'\n'+data_quality_summary
             final_result['data_quality_analysis']['report'] = self.generate_report_from_agent(data_quality_results_str)
         except:
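
For reference, allow_nan=True is already the default for json.dumps; passing it explicitly documents that the non-standard NaN/Infinity tokens are acceptable in these strings, which feed the report-generating agent rather than a strict JSON parser. A quick sketch of the difference, with a made-up stats dict:

    import json

    stats = {"std": float("nan"), "range": float("inf")}

    print(json.dumps(stats, indent=2, default=str, allow_nan=True))
    # {
    #   "std": NaN,
    #   "range": Infinity
    # }

    # The same call with allow_nan=False would instead raise:
    # ValueError: Out of range float values are not JSON compliant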
src/app/pipelines/modules/data_statistics.py CHANGED
@@ -71,7 +71,7 @@ class DataStatisticsWorkflow:
 
         serializable_results = process_dict(results)
 
-        return json.dumps(serializable_results, indent=indent)
+        return json.dumps(serializable_results, indent=indent, allow_nan=True)
 
     def build_statistical_summary(self, data_source: str = None, verbose=False) -> Dict[str, Any]:
         '''Get the basic central tendancy, dispersion, quantiles, distinct values, frequency distributions and sparsity'''
src/app/pipelines/task_analysis/ml_analysis_workflow.py CHANGED
@@ -138,7 +138,7 @@ class MLAnalysisWorkflow:
         try:
             return future.result(timeout=30)
         except concurrent.futures.TimeoutError:
-            logger.warning("Technical research timed out after 30 seconds.", log_type="pipeline: timeout", console=verbose)
+            logger.info("Technical research timed out after 30 seconds.", log_type="pipeline: timeout", console=verbose)
             return None
 
     def finalize_analysis(self, final_prompt: str, verbose=False) -> Optional[RequirementsAnalysis]:
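
The log-level change reflects that hitting the 30-second cap is an expected, recoverable outcome rather than a fault. A self-contained sketch of the same future-with-timeout pattern, using the standard library only; slow_research is a stand-in for the actual research call and the delays are shortened for the demo:

    import time
    import logging
    import concurrent.futures

    logger = logging.getLogger("pipeline")

    def slow_research() -> str:
        time.sleep(2)  # stand-in for a research call that can stall
        return "report"

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(slow_research)
        try:
            result = future.result(timeout=0.5)  # the module waits 30s
        except concurrent.futures.TimeoutError:
            # Expected, recoverable outcome -> logger.info, not logger.warning.
            logger.info("Technical research timed out.")
            result = None

    print(result)  # None; note the executor still waits for the worker thread on exit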