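# Smart Dataset Cleaner: a Streamlit app for uploading a dataset (CSV, Excel,
# JSON, or a PDF table), applying basic cleaning steps, visualizing it, and
# downloading the cleaned result as CSV.
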
import os

import pandas as pd
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns

# tabula-py is optional: it is only needed for PDF table extraction and
# requires a Java runtime to be installed.
try:
    from tabula import read_pdf
except ImportError:
    read_pdf = None

# ----------- File Upload Handler ----------- #
def file_upload(file):
    # Pick a pandas reader based on the uploaded file's extension.
    file_ext = os.path.splitext(file.name)[1].lower()
    try:
        if file_ext == '.csv':
            df = pd.read_csv(file)
        elif file_ext in ['.xls', '.xlsx']:
            df = pd.read_excel(file)
        elif file_ext == '.json':
            df = pd.read_json(file)
        elif file_ext == '.pdf' and read_pdf:
            # Keep the first table that tabula extracts from the PDF.
            df = read_pdf(file, pages='all', multiple_tables=False)[0]
        else:
            st.error("❌ Unsupported file type or missing dependencies for PDF.")
            return None
        return df
    except Exception as e:
        st.error(f"⚠️ Error loading file: {e}")
        return None

# ----------- Cleaning Functions ----------- #
def remove_empty_rows(df):
    st.info("Null values before cleaning:")
    st.write(df.isnull().sum())
    # dropna() with default arguments drops every row containing at least one null.
    df_cleaned = df.dropna()
    st.success("✅ Null values removed.")
    return df_cleaned

def replace_nulls(df, value):
    st.info("Null values before replacement:")
    st.write(df.isnull().sum())
    df_filled = df.fillna(value)
    st.success("✅ Null values replaced.")
    return df_filled

def remove_noise(df):
    # Common stop words to strip from free-text cells.
    noise_words = {'the', 'is', 'an', 'a', 'in', 'of', 'to'}

    def clean_text(val):
        if isinstance(val, str):
            return ' '.join(word for word in val.split() if word.lower() not in noise_words)
        return val

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever element-wise method this pandas version provides.
    apply_elementwise = df.map if hasattr(df, "map") else df.applymap
    df_cleaned = apply_elementwise(clean_text)
    st.success("✅ Noise words removed.")
    return df_cleaned
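
# For example, remove_noise would turn the cell "The price of an item" into
# "price item"; only string cells are touched, other dtypes pass through unchanged.
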
def remove_duplicates(df):
    df_deduped = df.drop_duplicates()
    st.success("✅ Duplicate rows removed.")
    return df_deduped

def convert_column_dtype(df, column, dtype):
    try:
        df[column] = df[column].astype(dtype)
        st.success(f"✅ Converted '{column}' to {dtype}")
    except Exception as e:
        st.error(f"⚠️ Conversion error: {e}")
    return df
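
# Note: casting a column that still contains nulls to "int" raises an error,
# since pandas cannot represent NaN in a plain integer column; replace or drop
# nulls first, or convert to "float" instead.
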
def detect_outliers(df, column):
    if column in df.select_dtypes(include=['float', 'int']).columns:
        # Flag values outside the 1.5 * IQR fences around the quartiles.
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        outliers = df[(df[column] < lower) | (df[column] > upper)]
        st.write(f"Found {len(outliers)} outliers in column '{column}'")
        return outliers
    else:
        st.warning("⚠️ Column must be numeric to detect outliers.")
        return pd.DataFrame()
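
# A quick worked example of the IQR rule above (values chosen for illustration):
# for [1, 2, 3, 100], pandas gives Q1 = 1.75 and Q3 = 27.25, so IQR = 25.5 and
# the upper fence is 27.25 + 1.5 * 25.5 = 65.5; the value 100 exceeds it and
# would be reported as an outlier.
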
def plot_distributions(df):
    st.subheader("Data Distributions")
    numeric_cols = df.select_dtypes(include=['float', 'int']).columns
    for col in numeric_cols:
        fig, ax = plt.subplots()
        sns.histplot(df[col].dropna(), kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        st.pyplot(fig)

def plot_missing_data(df):
    st.subheader("Missing Data Heatmap")
    fig, ax = plt.subplots()
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
    st.pyplot(fig)

def main():
    st.set_page_config(page_title="🧹 Smart Dataset Cleaner", layout="wide")
    st.title("🧹 Smart Dataset Cleaner")
    st.caption("✨ Clean, analyze, and preprocess your dataset with ease")

    uploaded_file = st.file_uploader("Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"])

    if uploaded_file:
        # Keep the working DataFrame in session_state so that each cleaning
        # step persists across the reruns Streamlit triggers on button clicks.
        if st.session_state.get("source_name") != uploaded_file.name:
            st.session_state["df"] = file_upload(uploaded_file)
            st.session_state["source_name"] = uploaded_file.name
        df = st.session_state["df"]

        if df is not None:
            st.subheader("Original Dataset Preview")
            st.dataframe(df.head())

            st.markdown("## 🧰 Data Cleaning Tools")
            with st.expander("Replace Null Values"):
                fill_value = st.text_input("Enter value to replace nulls with:")
                if st.button("Replace Nulls"):
                    df = replace_nulls(df, fill_value)
                    st.session_state["df"] = df
                    st.dataframe(df)

            if st.button("🧼 Remove Empty Rows"):
                df = remove_empty_rows(df)
                st.session_state["df"] = df
                st.dataframe(df)

            if st.button("🧹 Remove Duplicate Rows"):
                df = remove_duplicates(df)
                st.session_state["df"] = df
                st.dataframe(df)

            if st.button("Remove Noise Words from Text"):
                df = remove_noise(df)
                st.session_state["df"] = df
                st.dataframe(df)

            with st.expander("Convert Column DataType"):
                selected_col = st.selectbox("Select column", df.columns)
                dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
                if st.button("Convert"):
                    df = convert_column_dtype(df, selected_col, dtype)
                    st.session_state["df"] = df
                    st.dataframe(df)
st.markdown("## π Data Visualizations") | |
if st.checkbox("π Show Summary Stats"): | |
st.write(df.describe(include='all')) | |
if st.checkbox("π Plot Column Distributions"): | |
plot_distributions(df) | |
if st.checkbox("π Show Missing Data Heatmap"): | |
plot_missing_data(df) | |
st.markdown("## π¨ Outlier Detection") | |
outlier_col = st.selectbox("Select numeric column", df.select_dtypes(include=['float', 'int']).columns) | |
if st.button("Detect Outliers"): | |
outliers = detect_outliers(df, outlier_col) | |
if not outliers.empty: | |
st.write(outliers) | |
st.markdown("## πΎ Download Cleaned Dataset") | |
file_name = st.text_input("Filename:", "cleaned_dataset.csv") | |
if st.button("Download CSV"): | |
st.download_button("π Download", df.to_csv(index=False), file_name, mime="text/csv") | |
else: | |
st.warning("β οΈ Please upload a supported file to begin.") | |
if __name__ == "__main__":
    main()
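
# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py
# Dependencies: streamlit, pandas, numpy, matplotlib, seaborn, and optionally
# tabula-py (which needs a Java runtime) for extracting tables from PDFs.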