# Haseeb-001's picture
# Update app.py
# ac98ac9 verified
import pandas as pd
import numpy as np
import streamlit as st
import os
import matplotlib.pyplot as plt
import seaborn as sns
# tabula-py is an optional dependency: when it cannot be imported, read_pdf is
# set to None so callers can test it before attempting to parse a PDF.
try:
    import tabula  # noqa: F401  (kept so the module name stays bound as before)
    from tabula import read_pdf
except ImportError:
    # Only a missing/broken package is treated as "PDF support unavailable";
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    read_pdf = None
# ----------- File Upload Handler ----------- #
def file_upload(file):
    """Load an uploaded file into a pandas DataFrame.

    The parser is chosen from the extension of ``file.name`` (compared
    case-insensitively): ``.csv``, ``.xls``/``.xlsx``, ``.json`` or ``.pdf``.
    PDF is only attempted when the module-level ``read_pdf`` is not ``None``.

    Args:
        file: File-like object exposing a ``name`` attribute.

    Returns:
        The parsed DataFrame, or ``None`` when the extension is unsupported
        (or PDF support is missing) or when parsing raises. In both failure
        cases a message is shown with ``st.error``.
    """
    file_ext = os.path.splitext(file.name)[1].lower()
    try:
        # The same buffer can be handed to this function after it has already
        # been consumed; rewind it so the parser does not start at EOF.
        seekable = getattr(file, "seekable", None)
        if callable(seekable) and seekable():
            file.seek(0)

        if file_ext == '.csv':
            df = pd.read_csv(file)
        elif file_ext in ['.xls', '.xlsx']:
            df = pd.read_excel(file)
        elif file_ext == '.json':
            df = pd.read_json(file)
        elif file_ext == '.pdf' and read_pdf:
            # read_pdf returns a list of tables; only the first one is used.
            df = read_pdf(file, pages='all', multiple_tables=False)[0]
        else:
            st.error("❌ Unsupported file type or missing dependencies for PDF.")
            return None
        return df
    except Exception as e:
        # UI boundary: report any parser failure to the user instead of crashing.
        st.error(f"⚠️ Error loading file: {e}")
        return None
# ----------- Cleaning Functions ----------- #
def remove_empty_rows(df):
    """Drop every row that contains at least one null value.

    The per-column null counts of the input are written to the page first.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame without the rows that held any null.
    """
    st.info("πŸ” Null values before cleaning:")
    null_counts = df.isna().sum()
    st.write(null_counts)

    rows_without_nulls = df.dropna(how="any")
    st.success("βœ… Null values removed.")
    return rows_without_nulls
def replace_nulls(df, value):
    """Fill every null cell of ``df`` with ``value``.

    The per-column null counts of the input are written to the page first.

    Args:
        df: DataFrame to clean; it is not modified.
        value: Replacement passed straight to ``DataFrame.fillna``.

    Returns:
        A new DataFrame with nulls replaced.
    """
    st.info("πŸ” Null values before replacement:")
    null_counts = df.isna().sum()
    st.write(null_counts)

    filled = df.fillna(value)
    st.success("βœ… Null values replaced.")
    return filled
def remove_noise(df):
    """Strip a fixed set of filler words from every string cell.

    Each string is split on whitespace, words whose lowercase form is in the
    noise set are dropped, and the rest is re-joined with single spaces.
    Non-string cells are returned unchanged.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame with the noise words removed from its string cells.
    """
    noise_words = {'the', 'is', 'an', 'a', 'in', 'of', 'to'}

    def clean_text(val):
        if not isinstance(val, str):
            return val
        return ' '.join(word for word in val.split() if word.lower() not in noise_words)

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Look the method up on the class (not the instance) so a
    # column that happens to be called "map" cannot shadow it.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    df_cleaned = elementwise(df, clean_text)
    st.success("βœ… Noise words removed.")
    return df_cleaned
def remove_duplicates(df):
    """Return ``df`` without repeated rows and report success on the page.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        The result of ``DataFrame.drop_duplicates`` with default arguments.
    """
    unique_rows = df.drop_duplicates()
    st.success("βœ… Duplicate rows removed.")
    return unique_rows
def convert_column_dtype(df, column, dtype):
    """Cast one column to ``dtype`` and return the resulting DataFrame.

    The conversion is applied to a copy so the caller's DataFrame is left
    untouched, matching the other cleaning helpers in this file which all
    return new frames.

    Args:
        df: Source DataFrame; it is not modified.
        column: Label of the column to convert.
        dtype: Target type accepted by ``Series.astype``.

    Returns:
        A new DataFrame with the converted column, or ``df`` itself when the
        conversion fails (the failure is shown with ``st.error``).
    """
    try:
        # Convert first so a failing cast does not pay for a full copy.
        converted = df[column].astype(dtype)
        result = df.copy()
        result[column] = converted
    except Exception as e:
        # UI boundary: any cast/assignment failure is reported, not raised.
        st.error(f"⚠️ Conversion error: {e}")
        return df
    st.success(f"βœ… Converted '{column}' to {dtype}")
    return result
def detect_outliers(df, column):
    """Return the rows whose ``column`` value lies outside the 1.5 * IQR fences.

    Args:
        df: DataFrame to inspect.
        column: Column label; it must be among the columns selected by
            ``select_dtypes(include=['float', 'int'])``.

    Returns:
        The outlier rows of ``df``, or an empty DataFrame (after a warning on
        the page) when the column is not one of the selected numeric columns.
    """
    numeric_columns = df.select_dtypes(include=['float', 'int']).columns
    if column not in numeric_columns:
        st.warning("⚠️ Column must be numeric to detect outliers.")
        return pd.DataFrame()

    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower = first_quartile - 1.5 * spread
    upper = third_quartile + 1.5 * spread

    # Strict comparisons: values exactly on a fence are not outliers, and
    # nulls compare False on both sides so they are never selected.
    below_fence = values < lower
    above_fence = values > upper
    outliers = df[below_fence | above_fence]
    st.write(f"πŸ” Found {len(outliers)} outliers in column '{column}'")
    return outliers
def plot_distributions(df):
    """Render one histogram with a KDE curve per numeric column.

    Columns are those selected by ``select_dtypes(include=['float', 'int'])``;
    nulls are dropped from each column before plotting.

    Args:
        df: DataFrame whose numeric columns are plotted.
    """
    st.subheader("πŸ“Š Data Distributions")
    numeric_cols = df.select_dtypes(include=['float', 'int']).columns
    for col in numeric_cols:
        fig, ax = plt.subplots()
        try:
            sns.histplot(df[col].dropna(), kde=True, ax=ax)
            ax.set_title(f"Distribution of {col}")
            st.pyplot(fig)
        finally:
            # pyplot keeps a reference to every figure it creates; close each
            # one once rendered so they do not accumulate across calls.
            plt.close(fig)
def plot_missing_data(df):
    """Render a heatmap of ``df.isnull()`` showing where values are missing.

    Args:
        df: DataFrame whose null mask is plotted.
    """
    st.subheader("πŸ“‰ Missing Data Heatmap")
    fig, ax = plt.subplots()
    try:
        # Draw on the axes created above explicitly instead of relying on
        # whichever axes pyplot currently considers active.
        sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
        st.pyplot(fig)
    finally:
        # Release the figure once rendered so figures do not pile up.
        plt.close(fig)
def main():
    """Render the Streamlit page: upload, cleaning tools, charts, outliers, download.

    Streamlit re-executes this function on every widget interaction, and
    ``st.button`` is only true for the run triggered by its click. The working
    DataFrame is therefore kept in ``st.session_state`` (keyed to the current
    upload) so that cleaning steps accumulate and the downloaded CSV contains
    the cleaned data rather than a freshly re-read original. Removing the
    uploaded file, or uploading a different one, resets that state.
    """
    token_key = "cleaner_upload_token"
    preview_key = "cleaner_original_preview"
    working_key = "cleaner_working_df"

    st.set_page_config(page_title="🧹 Smart Dataset Cleaner", layout="wide")
    st.title("🧹 Smart Dataset Cleaner")
    st.caption("✨ Clean, analyze, and preprocess your dataset with ease")
    uploaded_file = st.file_uploader("πŸ“‚ Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"])

    if not uploaded_file:
        # No file selected: forget any previous working copy so the next
        # upload always starts from the file's own content.
        for key in (token_key, preview_key, working_key):
            st.session_state.pop(key, None)
        st.warning("⚠️ Please upload a supported file to begin.")
        return

    # Identify the upload so the file is parsed once per upload, not once per
    # rerun. NOTE(review): `size` and `file_id` are read defensively because
    # their availability depends on the Streamlit version.
    upload_token = (
        uploaded_file.name,
        getattr(uploaded_file, "size", None),
        getattr(uploaded_file, "file_id", None),
    )
    if st.session_state.get(token_key) != upload_token:
        loaded = file_upload(uploaded_file)
        if loaded is None:
            # file_upload has already shown the error.
            return
        st.session_state[token_key] = upload_token
        # Independent copy of the first rows, so later cleaning (even an
        # in-place one) cannot alter the "original" preview.
        st.session_state[preview_key] = loaded.head().copy()
        st.session_state[working_key] = loaded

    df = st.session_state[working_key]

    st.subheader("πŸ“‹ Original Dataset Preview")
    st.dataframe(st.session_state[preview_key])

    st.markdown("## 🧰 Data Cleaning Tools")
    with st.expander("βž• Replace Null Values"):
        fill_value = st.text_input("Enter value to replace nulls with:")
        if st.button("Replace Nulls"):
            df = replace_nulls(df, fill_value)
            st.dataframe(df)
    if st.button("🧼 Remove Empty Rows"):
        df = remove_empty_rows(df)
        st.dataframe(df)
    if st.button("🧹 Remove Duplicate Rows"):
        df = remove_duplicates(df)
        st.dataframe(df)
    if st.button("πŸ“‰ Remove Noise Words from Text"):
        df = remove_noise(df)
        st.dataframe(df)
    with st.expander("πŸ” Convert Column DataType"):
        selected_col = st.selectbox("Select column", df.columns)
        dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
        if st.button("Convert"):
            df = convert_column_dtype(df, selected_col, dtype)
            st.dataframe(df)

    # Persist whatever this run changed so the next rerun starts from it.
    st.session_state[working_key] = df

    st.markdown("## πŸ“Š Data Visualizations")
    if st.checkbox("πŸ“ˆ Show Summary Stats"):
        st.write(df.describe(include='all'))
    if st.checkbox("πŸ“Œ Plot Column Distributions"):
        plot_distributions(df)
    if st.checkbox("πŸ“ Show Missing Data Heatmap"):
        plot_missing_data(df)

    st.markdown("## 🚨 Outlier Detection")
    outlier_col = st.selectbox("Select numeric column", df.select_dtypes(include=['float', 'int']).columns)
    if st.button("Detect Outliers"):
        outliers = detect_outliers(df, outlier_col)
        if not outliers.empty:
            st.write(outliers)

    st.markdown("## πŸ’Ύ Download Cleaned Dataset")
    file_name = st.text_input("Filename:", "cleaned_dataset.csv")
    # The CSV is only serialized after an explicit request, so large frames
    # are not re-encoded on every rerun.
    if st.button("Download CSV"):
        st.download_button("πŸ“„ Download", df.to_csv(index=False), file_name, mime="text/csv")
# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()