# DataHubHub/utils/dataset_utils.py
import pandas as pd
import numpy as np


def get_dataset_info(df):
    """
    Get basic information about a dataset.

    Args:
        df: Pandas DataFrame

    Returns:
        Dictionary with dataset information
    """
    info = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': df.isna().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
        'memory_usage': df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
        'column_types': df.dtypes.astype(str).value_counts().to_dict(),
        'column_info': []
    }

    # Get info for each column
    for col in df.columns:
        missing = df[col].isna().sum()
        col_info = {
            'name': col,
            'type': str(df[col].dtype),
            'missing': missing,
            'missing_pct': (missing / len(df)) * 100,
            'unique_values': df[col].nunique()
        }

        # Add additional info for numeric columns
        if pd.api.types.is_numeric_dtype(df[col]):
            col_info.update({
                'min': df[col].min(),
                'max': df[col].max(),
                'mean': df[col].mean(),
                'median': df[col].median(),
                'std': df[col].std()
            })
        # Add additional info for categorical/text columns
        elif pd.api.types.is_object_dtype(df[col]):
            # Get the five most frequent values
            col_info['top_values'] = df[col].value_counts().head(5).to_dict()
            # Heuristic: treat the column as categorical when its unique
            # values amount to less than 10% of the rows
            col_info['likely_categorical'] = df[col].nunique() / len(df) < 0.1

        info['column_info'].append(col_info)

    return info
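

# A minimal smoke test of get_dataset_info, assuming a small hand-made demo
# DataFrame (the values below are illustrative only, not part of the module):
if __name__ == "__main__":
    _demo = pd.DataFrame({"a": [1, 2, None, 2], "b": ["x", "y", "y", None]})
    _info = get_dataset_info(_demo)
    print(_info["rows"], _info["columns"], _info["missing_values"])  # 4 2 2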


def detect_dataset_format(df):
    """
    Try to detect the format/type of the dataset based on its structure.

    Args:
        df: Pandas DataFrame

    Returns:
        String indicating the likely format
    """
    # Guard against a DataFrame with no columns
    if len(df.columns) == 0:
        return "generic"

    # Check for text data: long string values in most columns
    text_cols = 0
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]) and df[col].str.len().mean() > 100:
            text_cols += 1
    if text_cols / len(df.columns) > 0.5:
        return "text"

    # Check for time series data: any datetime column
    if any(pd.api.types.is_datetime64_dtype(df[col]) for col in df.columns):
        return "time_series"

    # Otherwise classify by the mix of column types
    numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
    categorical_cols = len(df.select_dtypes(include=['object', 'category']).columns)
    if numeric_cols > 0 and categorical_cols > 0:
        return "mixed"
    elif numeric_cols > 0:
        return "numeric"
    elif categorical_cols > 0:
        return "categorical"

    # Default
    return "generic"


def check_column_completeness(df, threshold=0.8):
    """
    Flag columns whose completeness falls below the threshold
    (i.e. more than 20% missing values by default).

    Args:
        df: Pandas DataFrame
        threshold: Completeness threshold (0.8 = 80% complete)

    Returns:
        List of columns with poor completeness
    """
    results = []
    for col in df.columns:
        missing_ratio = df[col].isna().sum() / len(df)
        completeness = 1 - missing_ratio
        if completeness < threshold:
            results.append({
                'Column': col,
                'Completeness': f"{completeness:.2%}",
                'Missing': f"{missing_ratio:.2%}",
                'Recommendation': 'Consider imputing or removing this column'
            })
    return results
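

# Hedged usage sketch for check_column_completeness: a column that is half
# missing sits below the default 80% threshold and should be flagged
# (the demo data is hypothetical):
if __name__ == "__main__":
    _sparse = pd.DataFrame({"ok": [1, 2, 3, 4], "gappy": [1, None, None, 4]})
    for _row in check_column_completeness(_sparse):
        print(_row["Column"], _row["Completeness"])  # gappy 50.00%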


def detect_outliers(series, method='iqr', factor=1.5):
    """
    Detect outliers in a pandas Series using the IQR or Z-score method.

    Args:
        series: Pandas Series with numeric values
        method: 'iqr' or 'zscore'
        factor: Multiplier for the IQR or the Z-score threshold

    Returns:
        Tuple of (outlier_indices, lower_bound, upper_bound)
    """
    if method == 'iqr':
        # IQR method: flag values beyond factor * IQR from the quartiles
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        outliers = series[(series < lower_bound) | (series > upper_bound)].index.tolist()
    elif method == 'zscore':
        # Z-score method; scipy is imported lazily so it is only needed here
        from scipy import stats

        clean = series.dropna()
        # ddof=1 (sample std) keeps the scores consistent with series.std(),
        # which is used for the equivalent bounds below
        z_scores = stats.zscore(clean, ddof=1)
        outlier_indices = np.where(np.abs(z_scores) > factor)[0]
        outliers = clean.iloc[outlier_indices].index.tolist()
        # Compute equivalent bounds for consistency with the IQR branch
        mean = series.mean()
        std = series.std()
        lower_bound = mean - factor * std
        upper_bound = mean + factor * std
    else:
        raise ValueError(f"Unknown method: {method!r} (expected 'iqr' or 'zscore')")
    return outliers, lower_bound, upper_bound
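

# Hedged usage sketch for detect_outliers: the lone extreme value should fall
# outside the IQR fences and be reported by index (the demo data is
# hypothetical):
if __name__ == "__main__":
    _s = pd.Series([10, 11, 12, 11, 10, 100])
    _idx, _lo, _hi = detect_outliers(_s, method="iqr")
    print(_idx, _lo, _hi)  # [5] 8.0 14.0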