import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Optional


def rank_sample(
    df: pd.DataFrame,
    name_col: str = "name",
    category_col: str = "category",
    sentiment_col: str = "sentiment_score",
    groups: Optional[List[str]] = None,
    num_samples: int = 1000,
    temp: float = 1.0,
    target_value: float = 0.5,
) -> pd.DataFrame:
    """Pick one row per name so that group-level mean sentiment is as balanced as possible.

    Rows for each name are ranked by distance from ``target_value``; a softmax over the
    negated ranks (controlled by ``temp``) favors rows close to the target. Over
    ``num_samples`` random draws, the subset minimizing the maximum pairwise difference
    between group mean sentiments is kept and returned.
    """
    df = df.copy()

    # Validate required columns, then drop rows missing any of them.
    for col in [name_col, category_col, sentiment_col]:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")
    df = df.dropna(subset=[name_col, category_col, sentiment_col])
    # Optionally restrict to a user-supplied list of groups; fall back to all
    # available groups if fewer than two of the requested ones are present.
    if groups:
        available_groups = df[category_col].unique()
        valid_groups = [g for g in groups if g in available_groups]
        if len(valid_groups) < 2:
            print(f"Warning: Only {len(valid_groups)} groups available from {groups}")
            groups = None
        else:
            groups = valid_groups
            df = df[df[category_col].isin(groups)].copy()
    final_groups = df[category_col].unique()
    if len(final_groups) < 2:
        print(f"Error: Only {len(final_groups)} groups in data, need at least 2")
        return df.groupby(name_col).first().reset_index()

    print(f"Sampling with groups: {sorted(final_groups)}")
    print(f"Target value for deviation calculation: {target_value}")

    # Rank each name's rows by distance from the target sentiment; rank 1 is closest.
    df["sentiment_deviation"] = (df[sentiment_col] - target_value).abs()
    df["sentiment_rank"] = (
        df.groupby(name_col)["sentiment_deviation"].rank(method="first", ascending=True)
    )
    def softmax_weights(ranks: np.ndarray, temp: float) -> np.ndarray:
        """Turn ranks into sampling probabilities; lower ranks get more weight."""
        t = float(temp) if temp and temp > 1e-8 else 1e-8
        x = -ranks / t
        x = x - np.max(x)  # shift for numerical stability before exponentiating
        exps = np.exp(x)
        s = exps.sum()
        # Fall back to uniform weights if the softmax degenerates.
        return exps / s if np.isfinite(s) and s > 0 else np.ones_like(exps) / len(exps)
    def objective_max_pairwise_diff(frame: pd.DataFrame) -> float:
        """Largest absolute difference between any two group mean sentiments."""
        g = frame.groupby(category_col)[sentiment_col].mean().dropna()
        if len(g) < 2:
            return np.inf
        vals = g.values
        diffs = np.abs(vals[:, None] - vals[None, :])
        return float(np.max(diffs))
    best_subset = None
    best_obj = np.inf
    valid_samples = 0

    unique_names = df[name_col].nunique()
    print(f"Total unique names: {unique_names}")
    for i in tqdm(range(num_samples), desc="Sampling"):
        try:
            # Draw one row per name, weighted toward low sentiment deviation.
            sampled_rows = []
            for name, group in df.groupby(name_col):
                if len(group) == 0:
                    continue
                ranks = group["sentiment_rank"].to_numpy(dtype=float)
                if len(ranks) == 0:
                    continue
                w = softmax_weights(ranks, temp=temp)
                idx = np.random.choice(group.index, p=w)
                sampled_rows.append(df.loc[idx])
            if len(sampled_rows) == 0:
                continue

            subset = pd.DataFrame(sampled_rows)
            subset_groups = subset[category_col].unique()
            if len(subset_groups) < 2:
                continue

            # Keep the subset whose group means are closest together.
            obj = objective_max_pairwise_diff(subset)
            if np.isfinite(obj):
                valid_samples += 1
                if obj < best_obj:
                    best_obj = obj
                    best_subset = subset.copy()
                if valid_samples % 100 == 0 or valid_samples <= 10:
                    group_means = subset.groupby(category_col)[sentiment_col].mean()
                    print(f"Sample {valid_samples}: obj={obj:.4f}, groups={dict(group_means)}")
        except Exception as e:
            print(f"Error in sample {i}: {e}")
            continue
| print(f"Valid samples: {valid_samples}/{num_samples}") | |
| print(f"Best objective: {best_obj:.4f}") | |
| if best_subset is None or len(best_subset) == 0: | |
| print("Warning: No valid samples found, returning fallback subset") | |
| best_subset = df.groupby(name_col).first().reset_index() | |
| final_group_counts = best_subset[category_col].value_counts() | |
| print(f"Final subset group distribution: {dict(final_group_counts)}") | |
| return best_subset |
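

# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal, hypothetical example of calling rank_sample on synthetic data; the
# name/category labels and parameter values below are made up for demonstration
# and are not part of the original function.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "name": np.repeat([f"name_{i}" for i in range(20)], 5),
        "category": rng.choice(["A", "B"], size=100),
        "sentiment_score": rng.uniform(0.0, 1.0, size=100),
    })
    balanced = rank_sample(demo, num_samples=200, temp=0.5, target_value=0.5)
    print(balanced.groupby("category")["sentiment_score"].mean())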