"""Streamlit app for uploading, cleaning, inspecting and exporting a dataset."""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

# PDF support is optional: without tabula the app still handles the other
# formats and reports PDFs as unsupported.
try:
    import tabula
    from tabula import read_pdf
except ImportError:
    read_pdf = None

# Words stripped from text cells by remove_noise (compared case-insensitively).
_NOISE_WORDS = frozenset({'the', 'is', 'an', 'a', 'in', 'of', 'to'})

# st.session_state keys. Streamlit re-runs the whole script on every widget
# interaction, so the working frame must live here to survive between clicks.
_SOURCE_KEY = "cleaner_source_id"
_ORIGINAL_KEY = "cleaner_original_df"
_WORKING_KEY = "cleaner_working_df"

_DEFAULT_DOWNLOAD_NAME = "cleaned_dataset.csv"


# ----------- File Upload Handler ----------- #
def file_upload(file):
    """Load an uploaded file into a DataFrame.

    The format is chosen from the extension of ``file.name``: CSV, Excel
    (.xls/.xlsx), JSON, or PDF (only when tabula could be imported; the first
    extracted table is used).

    Args:
        file: Uploaded file object exposing a ``name`` attribute and readable
            by the pandas/tabula readers.

    Returns:
        The loaded DataFrame, or ``None`` when the type is unsupported or
        loading fails (an error is shown in the UI in both cases).
    """
    file_ext = os.path.splitext(file.name)[1].lower()
    try:
        if file_ext == '.csv':
            return pd.read_csv(file)
        if file_ext in ('.xls', '.xlsx'):
            return pd.read_excel(file)
        if file_ext == '.json':
            return pd.read_json(file)
        if file_ext == '.pdf' and read_pdf:
            return read_pdf(file, pages='all', multiple_tables=False)[0]
        st.error("โŒ Unsupported file type or missing dependencies for PDF.")
        return None
    except Exception as e:
        # UI boundary: any reader failure is reported to the user, not raised.
        st.error(f"โš ๏ธ Error loading file: {e}")
        return None


# ----------- Cleaning Functions ----------- #
def remove_empty_rows(df):
    """Show per-column null counts, then return ``df`` without rows containing nulls."""
    st.info("๐Ÿ” Null values before cleaning:")
    st.write(df.isnull().sum())
    df_cleaned = df.dropna()
    st.success("โœ… Null values removed.")
    return df_cleaned


def replace_nulls(df, value):
    """Show per-column null counts, then return ``df`` with nulls replaced by ``value``."""
    st.info("๐Ÿ” Null values before replacement:")
    st.write(df.isnull().sum())
    df_filled = df.fillna(value)
    st.success("โœ… Null values replaced.")
    return df_filled


def remove_noise(df):
    """Return a copy of ``df`` with noise words removed from every string cell.

    Non-string cells are left untouched. Words are split on whitespace and
    re-joined with single spaces, so runs of whitespace are collapsed too.
    """

    def clean_text(val):
        if isinstance(val, str):
            return ' '.join(
                word for word in val.split() if word.lower() not in _NOISE_WORDS
            )
        return val

    # Column-wise Series.map instead of DataFrame.applymap, which is
    # deprecated since pandas 2.1; this form works on old and new versions.
    df_cleaned = df.apply(lambda col: col.map(clean_text))
    st.success("โœ… Noise words removed.")
    return df_cleaned


def remove_duplicates(df):
    """Return ``df`` without duplicated rows."""
    df_deduped = df.drop_duplicates()
    st.success("โœ… Duplicate rows removed.")
    return df_deduped


def convert_column_dtype(df, column, dtype):
    """Return ``df`` with ``column`` cast to ``dtype``.

    The input frame is not modified. If the cast fails, the error is shown in
    the UI and the input frame is returned unchanged.
    """
    try:
        converted = df[column].astype(dtype)
    except Exception as e:
        st.error(f"โš ๏ธ Conversion error: {e}")
        return df
    # Work on a copy so the caller's frame (e.g. the preserved original
    # preview) is never altered behind its back.
    df = df.copy()
    df[column] = converted
    st.success(f"โœ… Converted '{column}' to {dtype}")
    return df


def detect_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile.

    Returns:
        The outlier rows, or an empty DataFrame (with a UI warning) when
        ``column`` is not a numeric column of ``df``.
    """
    # "number" covers every numeric width (int8..int64, unsigned, floats).
    if column not in df.select_dtypes(include='number').columns:
        st.warning("โš ๏ธ Column must be numeric to detect outliers.")
        return pd.DataFrame()

    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower) | (df[column] > upper)]
    st.write(f"๐Ÿ” Found {len(outliers)} outliers in column '{column}'")
    return outliers


def plot_distributions(df):
    """Render a histogram with KDE overlay for every numeric column of ``df``."""
    st.subheader("๐Ÿ“Š Data Distributions")
    numeric_cols = df.select_dtypes(include='number').columns
    for col in numeric_cols:
        fig, ax = plt.subplots()
        sns.histplot(df[col].dropna(), kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        st.pyplot(fig)
        # Figures are otherwise kept alive by pyplot and pile up across reruns.
        plt.close(fig)


def plot_missing_data(df):
    """Render a heatmap marking which cells of ``df`` are null."""
    st.subheader("๐Ÿ“‰ Missing Data Heatmap")
    fig, ax = plt.subplots()
    # Draw explicitly on the axes created above rather than the "current" one.
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
    st.pyplot(fig)
    plt.close(fig)


def _clear_session_frames():
    """Forget any dataset kept from a previous upload."""
    for key in (_SOURCE_KEY, _ORIGINAL_KEY, _WORKING_KEY):
        if key in st.session_state:
            del st.session_state[key]


def _commit(df):
    """Store ``df`` as the working dataset for later reruns and return it."""
    st.session_state[_WORKING_KEY] = df
    return df


def main():
    """Build the page: upload, cleaning tools, visualisations, outliers, download.

    The working DataFrame is kept in ``st.session_state`` so that successive
    cleaning steps accumulate and the download reflects them; a new upload
    (different name or size) resets it.
    """
    st.set_page_config(page_title="๐Ÿงน Smart Dataset Cleaner", layout="wide")
    st.title("๐Ÿงน Smart Dataset Cleaner")
    st.caption("โœจ Clean, analyze, and preprocess your dataset with ease")

    uploaded_file = st.file_uploader(
        "๐Ÿ“‚ Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"]
    )
    if not uploaded_file:
        _clear_session_frames()
        st.warning("โš ๏ธ Please upload a supported file to begin.")
        return

    # Parse the file only when it changes; otherwise every button click would
    # reload it and throw away the cleaning done so far.
    source_id = (uploaded_file.name, getattr(uploaded_file, "size", None))
    if st.session_state.get(_SOURCE_KEY) != source_id:
        loaded = file_upload(uploaded_file)
        if loaded is None:
            return
        st.session_state[_SOURCE_KEY] = source_id
        st.session_state[_ORIGINAL_KEY] = loaded
        st.session_state[_WORKING_KEY] = loaded
    df = st.session_state[_WORKING_KEY]

    st.subheader("๐Ÿ“‹ Original Dataset Preview")
    st.dataframe(st.session_state[_ORIGINAL_KEY].head())

    st.markdown("## ๐Ÿงฐ Data Cleaning Tools")
    with st.expander("โž• Replace Null Values"):
        fill_value = st.text_input("Enter value to replace nulls with:")
        if st.button("Replace Nulls"):
            df = _commit(replace_nulls(df, fill_value))
            st.dataframe(df)

    if st.button("๐Ÿงผ Remove Empty Rows"):
        df = _commit(remove_empty_rows(df))
        st.dataframe(df)

    if st.button("๐Ÿงน Remove Duplicate Rows"):
        df = _commit(remove_duplicates(df))
        st.dataframe(df)

    if st.button("๐Ÿ“‰ Remove Noise Words from Text"):
        df = _commit(remove_noise(df))
        st.dataframe(df)

    with st.expander("๐Ÿ” Convert Column DataType"):
        selected_col = st.selectbox("Select column", df.columns)
        dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
        if st.button("Convert"):
            df = _commit(convert_column_dtype(df, selected_col, dtype))
            st.dataframe(df)

    st.markdown("## ๐Ÿ“Š Data Visualizations")
    if st.checkbox("๐Ÿ“ˆ Show Summary Stats"):
        st.write(df.describe(include='all'))
    if st.checkbox("๐Ÿ“Œ Plot Column Distributions"):
        plot_distributions(df)
    if st.checkbox("๐Ÿ“ Show Missing Data Heatmap"):
        plot_missing_data(df)

    st.markdown("## ๐Ÿšจ Outlier Detection")
    outlier_col = st.selectbox(
        "Select numeric column", df.select_dtypes(include='number').columns
    )
    if st.button("Detect Outliers"):
        outliers = detect_outliers(df, outlier_col)
        if not outliers.empty:
            st.write(outliers)

    st.markdown("## ๐Ÿ’พ Download Cleaned Dataset")
    file_name = st.text_input("Filename:", _DEFAULT_DOWNLOAD_NAME)
    # Rendered unconditionally: a download button nested inside st.button
    # disappears on the rerun triggered by clicking it.
    st.download_button(
        "๐Ÿ“„ Download",
        df.to_csv(index=False),
        file_name or _DEFAULT_DOWNLOAD_NAME,
        mime="text/csv",
    )


if __name__ == "__main__":
    main()