# NOTE(review): stray export artifact ("Spaces: / Running / Running") — not code;
# commented out so the module parses.
import pandas as pd | |
import numpy as np | |
def get_dataset_info(df):
    """
    Get basic information about a dataset.

    Args:
        df: Pandas DataFrame

    Returns:
        Dictionary with dataset information: overall stats ('rows', 'columns',
        'missing_values', 'duplicate_rows', 'memory_usage' in MB,
        'column_types') plus a per-column 'column_info' list. Numeric columns
        additionally report min/max/mean/median/std; object columns report
        their five most frequent values and a 'likely_categorical' flag.
    """
    n_rows = len(df)
    info = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': df.isna().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
        'memory_usage': df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
        'column_types': df.dtypes.astype(str).value_counts().to_dict(),
        'column_info': []
    }

    # Per-column details. Hoist the Series, its missing count, and its unique
    # count so each is computed once instead of several times per column.
    for col in df.columns:
        series = df[col]
        missing = series.isna().sum()
        n_unique = series.nunique()
        col_info = {
            'name': col,
            'type': str(series.dtype),
            'missing': missing,
            # Guard: on an empty frame the division would yield NaN.
            'missing_pct': (missing / n_rows) * 100 if n_rows else 0.0,
            'unique_values': n_unique
        }
        if pd.api.types.is_numeric_dtype(series):
            # Summary statistics only make sense for numeric data.
            col_info.update({
                'min': series.min(),
                'max': series.max(),
                'mean': series.mean(),
                'median': series.median(),
                'std': series.std()
            })
        elif pd.api.types.is_object_dtype(series):
            # Top values give a quick feel for text/categorical content.
            col_info['top_values'] = series.value_counts().head(5).to_dict()
            # Heuristic: fewer than 10% unique values suggests a categorical
            # column. An empty frame gives no evidence (and would divide by
            # zero), so it defaults to False.
            col_info['likely_categorical'] = bool(
                n_rows and n_unique / n_rows < 0.1
            )
        info['column_info'].append(col_info)
    return info
def detect_dataset_format(df):
    """
    Try to detect the format/type of the dataset based on its structure.

    Args:
        df: Pandas DataFrame

    Returns:
        String indicating the likely format: 'text', 'time_series', 'mixed',
        'numeric', 'categorical', or 'generic'.
    """
    # A frame with no columns has no structure to classify — and would
    # otherwise cause a division by zero below.
    if len(df.columns) == 0:
        return "generic"

    # Mostly long-string columns -> free text (avg length > 100 chars).
    text_cols = sum(
        1 for col in df.columns
        if pd.api.types.is_string_dtype(df[col]) and df[col].str.len().mean() > 100
    )
    if text_cols / len(df.columns) > 0.5:
        return "text"

    # Any datetime column -> treat as time series.
    if any(pd.api.types.is_datetime64_dtype(df[col]) for col in df.columns):
        return "time_series"

    # Otherwise classify by the mix of numeric vs. categorical columns.
    numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
    categorical_cols = len(df.select_dtypes(include=['object', 'category']).columns)
    if numeric_cols > 0 and categorical_cols > 0:
        return "mixed"
    elif numeric_cols > 0:
        return "numeric"
    elif categorical_cols > 0:
        return "categorical"

    # Default
    return "generic"
def check_column_completeness(df, threshold=0.8):
    """
    Check if columns have good completeness (less than 20% missing values by default).

    Args:
        df: Pandas DataFrame
        threshold: Completeness threshold (0.8 = 80% complete)

    Returns:
        List of dicts ('Column', 'Completeness', 'Missing', 'Recommendation'),
        one per column whose completeness falls below the threshold.
    """
    n_rows = len(df)
    results = []
    for col in df.columns:
        # Guard: with no rows there is no evidence of missingness; dividing
        # by zero would yield NaN and silently skip the threshold check.
        missing_ratio = df[col].isna().sum() / n_rows if n_rows else 0.0
        completeness = 1 - missing_ratio
        if completeness < threshold:
            results.append({
                'Column': col,
                'Completeness': f"{completeness:.2%}",
                'Missing': f"{missing_ratio:.2%}",
                'Recommendation': 'Consider imputing or removing this column'
            })
    return results
def detect_outliers(series, method='iqr', factor=1.5):
    """
    Detect outliers in a pandas Series using the IQR or Z-score method.

    Args:
        series: Pandas Series with numeric values
        method: 'iqr' or 'zscore'
        factor: Multiplier for the IQR, or Z-score threshold

    Returns:
        Tuple of (outlier_indices, lower_bound, upper_bound)

    Raises:
        ValueError: If method is not 'iqr' or 'zscore'.
    """
    if method == 'iqr':
        # IQR method: flag values beyond factor * IQR outside the quartiles.
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
    elif method == 'zscore':
        # |z| > factor is equivalent to lying outside mean +/- factor * std,
        # so derive the bounds and share one selection path below. Using
        # series.std() (sample std, ddof=1) keeps the reported bounds
        # consistent with the outlier test; the previous implementation used
        # scipy's population z-scores (ddof=0) to pick outliers but sample
        # std for the bounds, so the two disagreed.
        mean = series.mean()
        std = series.std()
        lower_bound = mean - factor * std
        upper_bound = mean + factor * std
    else:
        # Previously any unknown method silently fell through to zscore.
        raise ValueError(f"Unknown method: {method!r}; expected 'iqr' or 'zscore'")
    # NaN compares False on both sides, so missing values are never flagged.
    outliers = series[(series < lower_bound) | (series > upper_bound)].index.tolist()
    return outliers, lower_bound, upper_bound