File size: 3,500 Bytes
a1e3335
 
 
 
 
 
 
 
 
 
 
 
7038265
11a46a2
 
73a688e
 
 
 
 
 
7038265
 
 
b5b1133
aaafc99
b5b1133
7038265
a1e3335
 
 
 
 
 
73a688e
 
3a2859f
73a688e
 
a1e3335
 
 
 
73a688e
a1e3335
 
 
 
 
 
 
 
 
 
73a688e
a1e3335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73a688e
 
a1e3335
 
 
 
 
 
73a688e
a1e3335
73a688e
 
a1e3335
 
73a688e
 
 
 
 
a1e3335
 
73a688e
a1e3335
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from analyze import analyze_csv
from plan import generate_cleaning_plan
from execute import execute_plan
from insight import generate_insights
from visual_insight import generate_visual_plan
from report import ReportBuilder

from transformers import AutoTokenizer

# Scratch locations under /tmp used by the app: the uploaded CSV, the cleaned
# CSV, the generated PDF report, and the directory chart images are saved to.
charts_dir = "/tmp/charts"
input_path, output_path = "/tmp/input.csv", "/tmp/output.csv"
report_path = "/tmp/final_report.pdf"

# Create the charts directory up front; a pre-existing directory is fine.
os.makedirs(charts_dir, exist_ok=True)

# Load the Gemma tokenizer at startup as an access check: the Hugging Face
# token is read from the HUGGINGFACE_TOKEN environment variable (None when it
# is unset) and downloads are cached under /tmp/hf_cache.
# NOTE(review): `tokenizer` is not referenced again in the visible part of
# this file — confirm whether the load is needed beyond verifying access.
hf_token = os.getenv("HUGGINGFACE_TOKEN")
cache_dir = "/tmp/hf_cache"
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-3n-E4B",
    cache_dir=cache_dir,
    token=hf_token,
)


# Page setup: wide layout plus the app title. set_page_config is kept as the
# first Streamlit call in the script.
st.set_page_config(page_title="Smart Data Cleaning Agent", layout="wide")
st.title("🧠 Smart Data Cleaning Agent")

# CSV upload widget. The result is falsy until the user has selected a file,
# so the rest of the script can gate on `uploaded_file`.
uploaded_file = st.file_uploader("📂 Upload a CSV file", type=["csv"])

def _chart_path_from_code(code, charts_root):
    """Return the image path a chart snippet passes to ``savefig``, or None.

    The path is taken from the last ``savefig('...')`` / ``savefig("...")``
    call in *code* and ends at the closing quote, so trailing keyword
    arguments (``dpi=...`` etc.) are not swallowed into the path. A path that
    is not absolute is re-rooted under *charts_root* using only its base name.

    Args:
        code: Source text of a generated plotting snippet.
        charts_root: Directory that relative chart paths are resolved against.

    Returns:
        The resolved path string, or ``None`` when the snippet contains no
        ``savefig`` call with a quoted path as its first argument.
    """
    for quote in ("'", '"'):
        marker = f"savefig({quote}"
        if marker not in code:
            continue
        path = code.split(marker)[-1].split(quote)[0]
        if not path.startswith("/"):
            path = os.path.join(charts_root, os.path.basename(path))
        return path
    return None


if uploaded_file:
    # Persist the upload to disk: analyze_csv below is given a path, not a
    # file object.
    with open(input_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # An empty or malformed upload should produce a readable message instead
    # of an unhandled traceback; st.stop() ends this script run.
    try:
        df = pd.read_csv(input_path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as e:
        st.error(f"❌ Could not read the uploaded CSV: {e}")
        st.stop()

    st.subheader("🔍 Original Data Preview")
    st.dataframe(df.head())

    with st.spinner("📊 Analyzing CSV..."):
        analysis = analyze_csv(input_path)

    with st.spinner("🧼 Generating Cleaning Plan..."):
        cleaning_plan, cleaning_summary = generate_cleaning_plan(analysis)
        st.subheader("🧹 Cleaning Plan")
        st.json(cleaning_plan)
        st.markdown("### ✅ Cleaning Summary")
        st.markdown(cleaning_summary)

    with st.spinner("🧪 Applying cleaning..."):
        # Work on a copy so the original preview frame is left untouched.
        cleaned_df = execute_plan(df.copy(), cleaning_plan)
        cleaned_df.to_csv(output_path, index=False)
        st.subheader("🧼 Cleaned Data Preview")
        st.dataframe(cleaned_df.head())
        st.download_button(
            "⬇️ Download Cleaned CSV",
            cleaned_df.to_csv(index=False),
            file_name="cleaned.csv",
            mime="text/csv",
        )

    with st.spinner("🧠 Deriving insights..."):
        insights = generate_insights(analysis["columns"])
        st.subheader("📄 EDA Insights")
        st.text(insights)

    with st.spinner("📈 Generating recommended plots..."):
        visuals = generate_visual_plan(analysis["columns"])
        for vis in visuals:
            st.markdown(f"#### {vis['title']}")
            st.markdown(vis['description'])
            try:
                # Redirect relative "charts/" save paths into the charts dir.
                chart_code = vis["code"].replace("charts/", f"{charts_dir}/")
                # SECURITY: this executes generated source text with full
                # interpreter privileges (the namespace even exposes `os`).
                # NOTE(review): the snippets presumably come from a model via
                # generate_visual_plan — confirm they are trusted or sandbox
                # this call before exposing the app to untrusted users.
                exec(chart_code, {"df": cleaned_df, "plt": plt, "sns": sns, "os": os})
                st.pyplot(plt.gcf())
            except Exception as e:
                # Best-effort rendering: one failing chart must not abort the
                # remaining ones.
                st.error(f"❌ Failed to render: {e}")
            finally:
                # Always discard figure state, including after a failure, so a
                # half-drawn figure cannot bleed into the next chart and
                # figures opened by the snippets do not accumulate.
                plt.close("all")

    # NOTE: a button click reruns the whole script, so the pipeline above is
    # executed again before this branch runs.
    if st.button("📝 Generate PDF Report"):
        report = ReportBuilder(output_path=report_path)
        report.add_title("📊 Smart Data Cleaning Report")
        report.add_section("Cleaning Summary", cleaning_summary)
        report.add_section("EDA Insights", insights)

        for vis in visuals:
            path = _chart_path_from_code(vis["code"], charts_dir)
            if path is None:
                continue
            if not os.path.exists(path):
                # The snippet failed to render or saved somewhere else.
                # NOTE(review): assumes ReportBuilder.add_plot needs an
                # existing image file — confirm against report.py.
                st.warning(f"⚠️ Chart image not found, skipped in report: {vis['title']}")
                continue
            report.add_plot(path, vis["description"])

        report.save()
        with open(report_path, "rb") as f:
            pdf_bytes = f.read()
        st.download_button(
            "⬇️ Download PDF Report",
            pdf_bytes,
            file_name="smart_data_report.pdf",
            mime="application/pdf",
        )