Spaces:

seonglae
/

CorrSteer

Sleeping

App Files Files Community

seonglae committed on about 1 month ago

Commit

889f722

0 Parent(s):

feat: hf space corr-steer

Browse files

Files changed (15) hide show

.gitignore +3 -0
Dockerfile +57 -0
config.py +103 -0
corr_extract.py +253 -0
corr_steer/steer.py +65 -0
demo/.gitignore +24 -0
demo/README.md +71 -0
demo/components.json +21 -0
demo/eslint.config.js +28 -0
demo/index.html +13 -0
deploy.sh +49 -0
features/gpt2.emgsd.json +314 -0
requirements.txt +8 -0
server.py +195 -0
start.sh +30 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+wandb
+__pycache__
+.env

Dockerfile ADDED Viewed

	@@ -0,0 +1,57 @@

+# Use Node.js 18 as base image
+FROM node:18-bullseye
+# Install Python 3.9 and pip
+RUN apt-get update && apt-get install -y \
+    python3.9 \
+    python3-pip \
+    python3-dev \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Set up aliases for python
+RUN ln -s /usr/bin/python3.9 /usr/bin/python
+# Install pnpm
+RUN npm install -g pnpm
+# Set working directory
+WORKDIR /app
+# Copy Python requirements and install dependencies
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+# Copy Python files
+COPY server.py .
+COPY config.py .
+COPY features/ ./features/
+# Copy frontend files
+COPY demo/ ./demo/
+# Install frontend dependencies and build
+WORKDIR /app/demo
+RUN pnpm install
+RUN pnpm build
+# Verify build output
+RUN ls -la dist/
+# Go back to app directory
+WORKDIR /app
+# Copy startup script
+COPY start.sh .
+RUN chmod +x start.sh
+# Expose port 7860 for HF Spaces
+EXPOSE 7860
+# Set environment variables
+ENV NODE_ENV=production
+ENV FLASK_ENV=production
+ENV PYTHONPATH=/app
+# Start both services
+CMD ["./start.sh"]

config.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# =========
+# Configuration: Define dataset information.
+# =========
+datasets_config = {
+    "emgsd": {
+        "id": "holistic-ai/EMGSD",
+        "columns": ["category", "stereotype_type", "text"],
+        "filter": {"category": "stereotype"},
+        "text_field": "text",
+        "label_field": "stereotype_type",
+        "max_length": 512,
+        "category": {
+            "lgbtq+": {
+              "prompt": "The gay people are",
+            },
+            "nationality": {
+              "prompt": "The French people are",
+            },
+            "race": {
+              "prompt": "The Black people are",
+            }
+        }
+    },
+    "yahoo_question": {
+        "id": "community-datasets/yahoo_answers_topics",
+        "columns": ["topic", "question_title", "question_content"],
+        "text_field": "question_title",
+        "label_field": "topic",
+        "max_length": 512
+    },
+    "yahoo_answer": {
+        "id": "community-datasets/yahoo_answers_topics",
+        "columns": ["topic", "best_answer"],
+        "text_field": "best_answer",
+        "label_field": "topic",
+        "max_length": 512
+    },
+    "science": {
+        "id": "knowledgator/Scientific-text-classification",
+        "columns": ["text", "label"],
+        "text_field": "text",
+        "label_field": "label",
+        "max_length": 512
+    },
+    "wiki256": {
+        "id": "seonglae/wikipedia-256",
+        "columns": ["text", "title"],
+        "text_field": "text",
+        "label_field": "title",
+        "max_length": 512
+    },
+    "wiki512": {
+        "id": "seonglae/wikipedia-512",
+        "columns": ["text", "title"],
+        "text_field": "text",
+        "label_field": "title",
+        "max_length": 1024
+    }
+}
+# =========
+# Configuration: Define model-specific information.
+# For "gpt2", we specify the SAE source and the list of hooks to use.
+# f"{model}-{dataset}" is the key for trained models.
+# =========
+models_config = {
+    "gpt2": {
+        "id": "gpt2",
+        "sae": "jbloom/GPT2-Small-SAEs-Reformatted",
+        "hooks": [
+            "blocks.11.hook_resid_pre",
+            "blocks.10.hook_resid_pre",
+            "blocks.9.hook_resid_pre",
+            "blocks.8.hook_resid_pre",
+            "blocks.7.hook_resid_pre",
+            "blocks.6.hook_resid_pre",
+            "blocks.5.hook_resid_pre",
+            "blocks.4.hook_resid_pre",
+            "blocks.3.hook_resid_pre",
+            "blocks.2.hook_resid_pre",
+            "blocks.1.hook_resid_pre",
+            "blocks.0.hook_resid_pre"
+        ]
+    },
+    "gpt2-emgsd": {
+      "id": "holistic-ai/gpt2-EMGSD",
+      "sae": "jbloom/GPT2-Small-SAEs-Reformatted",
+        "hooks": [
+            "blocks.11.hook_resid_pre",
+            "blocks.10.hook_resid_pre",
+            "blocks.9.hook_resid_pre",
+            "blocks.8.hook_resid_pre",
+            "blocks.7.hook_resid_pre",
+            "blocks.6.hook_resid_pre",
+            "blocks.5.hook_resid_pre",
+            "blocks.4.hook_resid_pre",
+            "blocks.3.hook_resid_pre",
+            "blocks.2.hook_resid_pre",
+            "blocks.1.hook_resid_pre",
+            "blocks.0.hook_resid_pre"
+        ]
+    }
+}

corr_extract.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""
+Project: corr-steer
+This script loads a given dataset (using a datasets configuration), tokenizes the texts,
+runs an LLM forward pass to extract hidden states, and then - for each SAE hook defined
+for the model - computes binary feature activations. Activations are thresholded and aggregated
+(max over the sequence), and the point-biserial correlation is computed between each feature
+and each label category.
+For each hook, the script finds the top 10 features (sorted in descending order by absolute correlation)
+per category and saves that record to a JSON file:
+    features/{model}.{dataset}.{hook}.json
+Additionally, an aggregated JSON file
+    features/{model}.{dataset}.json
+is created that, for each category, merges records from all hooks (each record includes its hook)
+and retains the top 10 features overall.
+This script is callable via Fire, e.g.:
+    python corr_extract.py --dataset emgsd --model gpt2
+"""
+import os
+import json
+import math
+import torch
+import numpy as np
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from sae_lens import SAE
+from datasets import load_dataset, concatenate_datasets
+from tqdm import tqdm
+from sklearn.preprocessing import LabelBinarizer
+from scipy.stats import pointbiserialr
+import wandb
+import fire
+from config import datasets_config, models_config
+# =========
+# Load a dataset given the dataset configuration.
+#
+# Returns:
+#   texts, labels, and the maximum token length.
+# =========
+def load_custom_dataset(dataset_name, limit: int):
+    config = datasets_config[dataset_name]
+    # For this example, we use the "train" split.
+    dataset = load_dataset(config["id"], split="test")
+    # Shuffle dataset for extracting features to divide validation
+    dataset = dataset.shuffle(seed=42).select(range(int(dataset.num_rows / 2)))
+    # Select only the specified columns.
+    dataset = dataset.select_columns(config["columns"])
+    # Apply filtering if specified.
+    if "filter" in config:
+        for key, val in config["filter"].items():
+            dataset = dataset.filter(lambda ex, key=key, val=val: ex[key] == val)
+    texts = []
+    labels = []
+    text_field = config["text_field"]
+    label_field = config["label_field"]
+    if limit:
+      dataset = dataset.select(range(limit))
+    for ex in tqdm(dataset, desc="Loading dataset"):
+        texts.append(str(ex[text_field]))
+        labels.append(str(ex[label_field]))
+    return texts, labels, config["max_length"]
+# =========
+# Tokenize a list of texts.
+#
+# Returns a list of tokenized (and device-mapped) inputs.
+# =========
+def tokenize_texts(tokenizer, texts, max_length, device):
+    """Tokenize each text separately and move its tensors to ``device``.
+    Args:
+        tokenizer: Callable tokenizer; invoked once per text with
+            ``return_tensors="pt"``, ``truncation=True`` and ``max_length``.
+        texts: Iterable of input strings.
+        max_length: Truncation length passed to the tokenizer.
+        device: Target device for every tensor in each encoding.
+    Returns:
+        A list with one encoding per input text, in input order.
+    """
+    tokenized = []
+    for text in tqdm(texts, desc="Tokenizing texts"):
+        # One text per call, so no padding is needed.
+        encoding = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
+        # Replace each tensor in place with its copy on the target device.
+        for k, v in encoding.items():
+            encoding[k] = v.to(device)
+        tokenized.append(encoding)
+    return tokenized
+# =========
+# Extract aggregated binary feature activations for each SAE hook.
+#
+# For each sample, the model forward is run once to get all hidden states. Then for each hook,
+# the corresponding hidden state (parsed from the hook string) is passed through its SAE. The output
+# is thresholded (> 0) and aggregated via max pooling along the sequence dimension.
+#
+# Returns:
+#   A dictionary mapping hook names to a numpy array of shape (num_samples, num_features).
+# =========
+def extract_features(llm, model_name, tokens_list, hooks, device):
+    """Compute per-sample binary SAE feature activations for every hook.
+    Args:
+        llm: Model called as ``llm(**encoding, output_hidden_states=True)``.
+        model_name: Key into ``models_config``; its ``"sae"`` entry names the
+            SAE release to load.
+        tokens_list: Iterable of tokenized inputs, one per sample.
+        hooks: Hook names of the form ``blocks.<layer>.<...>``.
+        device: Device on which the SAEs are loaded.
+    Returns:
+        Dict mapping each hook name to a numpy array with one row per sample.
+    """
+    # Preload SAE models for each hook.
+    sae_models = {}
+    for hook in hooks:
+        sae, _, _ = SAE.from_pretrained(models_config[model_name]["sae"], hook, device=device)
+        sae_models[hook] = sae
+    features_by_hook = {hook: [] for hook in hooks}
+    for encoding in tqdm(tokens_list, desc="Extracting activations"):
+        with torch.no_grad():
+            # A single forward pass per sample serves every hook.
+            outputs = llm(**encoding, output_hidden_states=True)
+            # For each hook, extract its corresponding hidden state.
+            for hook in hooks:
+                # Parse layer index from the hook string.
+                layer = int(hook.split(".")[1])
+                # NOTE(review): presumably hidden_states[layer] is the input to
+                # block `layer` (i.e. resid_pre) - confirm against the model API.
+                hidden_state = outputs.hidden_states[layer]  # shape: (batch_size, seq_len, hidden_dim)
+                activations = sae_models[hook].encode(hidden_state)
+                # Remove the batch dimension (assumes batch size = 1) and move to CPU.
+                activations = activations.squeeze(0).cpu().numpy()  # (seq_len, num_features)
+                # Threshold activations.
+                binary_acts = (activations > 0).astype(int)
+                # Aggregate over the sequence (max pooling): 1 if the feature
+                # fired on any token of the sample.
+                aggregated = binary_acts.max(axis=0)  # (num_features,)
+                features_by_hook[hook].append(aggregated)
+    # Convert lists to numpy arrays.
+    for hook in hooks:
+        features_by_hook[hook] = np.array(features_by_hook[hook])
+    return features_by_hook
+# =========
+# Compute correlations per label category.
+#
+# Given a feature activation array of shape (n_samples, n_features) and binary labels of shape
+# (n_samples, n_categories) along with a list of category names, compute the point-biserial correlation
+# for each feature against each category. For each category, sort by absolute correlation and keep the top 10.
+#
+# Returns a dictionary keyed by category.
+# =========
+def compute_correlations_by_category(feature_activations, binary_labels, categories):
+    """Rank features by point-biserial correlation with each category.
+    Args:
+        feature_activations: 2-D array indexed as ``[sample, feature]``.
+        binary_labels: 2-D array indexed as ``[sample, category]``.
+        categories: Category names, in the column order of ``binary_labels``.
+    Returns:
+        Dict mapping each category to at most 10 records of the form
+        ``{"feature_index": int, "correlation": float}``, sorted by
+        descending absolute correlation.
+    """
+    results = {cat: [] for cat in categories}
+    n_features = feature_activations.shape[1]
+    for feat_idx in range(n_features):
+        feat_vec = feature_activations[:, feat_idx]
+        for cat_idx, cat in enumerate(categories):
+            lbl = binary_labels[:, cat_idx]
+            # The p-value returned alongside the correlation is discarded.
+            corr, _ = pointbiserialr(lbl, feat_vec)
+            results[cat].append({
+                "feature_index": feat_idx,
+                "correlation": corr
+            })
+    # For each category, sort and keep the top 10 records (by absolute correlation).
+    # NaN correlations are ranked as 0 so they sink to the bottom.
+    for cat in categories:
+        results[cat] = sorted(results[cat], key=lambda x: 0 if math.isnan(x["correlation"]) else abs(x["correlation"]), reverse=True)[:10]
+    return results
+# =========
+# Main function
+#
+# This function initializes wandb (project "corr-steer"), loads the specified dataset and LLM along with
+# the SAE hooks (from models_config), tokenizes the texts, extracts feature activations,
+# computes (per hook) the per-category top 10 correlation records, and writes out one JSON per hook as
+# well as one aggregated JSON file that combines (and sorts) records across hooks.
+#
+# Run via, for example:
+#     python corr_extract.py --dataset emgsd --model gpt2
+# =========
+def main(dataset="emgsd", model="gpt2", limit=1000):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "mps" if torch.backends.mps.is_available() else device
+    # Initialize wandb.
+    wandb.init(project="corr-steer", config={"model": model, "dataset": dataset})
+    # Load tokenizer and the LLM.
+    print("Loading tokenizer and model...")
+    model_id = models_config[model]["id"]
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    llm = AutoModelForCausalLM.from_pretrained(model_id).to(device)
+    llm.eval()
+    # Load dataset texts, labels, and max token length.
+    print("Loading dataset...")
+    texts, labels, max_length = load_custom_dataset(dataset, limit)
+    # Binarize labels.
+    lb = LabelBinarizer()
+    binary_labels = lb.fit_transform(labels)
+    # Ensure binary_labels is 2D.
+    if binary_labels.ndim == 1:
+        binary_labels = binary_labels.reshape(-1, 1)
+    # Tokenize texts.
+    tokens_list = tokenize_texts(tokenizer, texts, max_length, device)
+    # Determine hooks from models_config.
+    if model in models_config:
+        hooks = models_config[model]["hooks"]
+    else:
+        raise ValueError(f"Model {model} is not configured in models_config.")
+    # Extract features for all hooks.
+    print("Extracting features for all hooks...")
+    features_by_hook = extract_features(llm, model, tokens_list, hooks, device)
+    out_dir = "features"
+    os.makedirs(out_dir, exist_ok=True)
+    # For aggregated results across hooks.
+    categories = lb.classes_
+    aggregated_results = {cat: [] for cat in categories}
+    # Process each hook individually.
+    for hook in hooks:
+        print(f"Computing per-category correlations for hook {hook} ...")
+        feat = features_by_hook[hook]  # (n_samples, n_features)
+        hook_corrs = compute_correlations_by_category(feat, binary_labels, categories)
+        # Save per-hook file.
+        hook_filename = f"{model}.{dataset}.{hook}.json"
+        hook_filepath = os.path.join(out_dir, hook_filename)
+        with open(hook_filepath, "w", encoding="utf-8") as f:
+            json.dump(hook_corrs, f, indent=2, ensure_ascii=False)
+        print(f"Saved per-hook correlations to {hook_filepath}")
+        # Log each hook's results to wandb.
+        wandb.log({hook: hook_corrs})
+        # For aggregation, add hook info to each record.
+        for cat in categories:
+            for rec in hook_corrs[cat]:
+                rec_with_hook = rec.copy()
+                rec_with_hook["hook"] = hook
+                aggregated_results[cat].append(rec_with_hook)
+    # Now, for each category, sort aggregated records across hooks and retain top 10.
+    final_aggregated = {}
+    for cat in categories:
+        sorted_records = sorted(
+            aggregated_results[cat],
+            key=lambda x: 0 if math.isnan(x["correlation"]) else abs(x["correlation"]),
+            reverse=True
+        )[:10]
+        final_aggregated[cat] = sorted_records
+    agg_filename = f"{model}.{dataset}.json"
+    agg_filepath = os.path.join(out_dir, agg_filename)
+    with open(agg_filepath, "w", encoding="utf-8") as f:
+        json.dump(final_aggregated, f, indent=2, ensure_ascii=False)
+    print(f"Saved aggregated correlations to {agg_filepath}")
+    wandb.finish()
+if __name__ == "__main__":
+    fire.Fire(main)

corr_steer/steer.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# ------------------------------------
+# Modular Steering Hook Logic
+# ------------------------------------
+class SteeringHook:
+    """Adds scaled SAE decoder directions to a module's output via forward hooks.
+    Typical use: ``register_hooks`` attaches the hook to a block,
+    ``enable_steering`` supplies the (feature_index, coefficient) pairs, and
+    ``disable_steering`` switches steering off and detaches every hook.
+    """
+    def __init__(self, sae, device):
+        """
+        Initialize the SteeringHook with an SAE and device.
+        """
+        self.sae = sae
+        self.device = device
+        self.feature_coeffs = []  # List of (feature_index, coefficient)
+        self.hooks = []  # Store hook handles
+        self.steering_enabled = False
+    def enable_steering(self, feature_coeffs):
+        """
+        Enable steering by specifying feature coefficients.
+        Args:
+            feature_coeffs (list): List of (feature_index, coefficient).
+        """
+        self.feature_coeffs = feature_coeffs
+        self.steering_enabled = True
+    def disable_steering(self):
+        """
+        Disable steering and clear hooks.
+        Registered hooks are removed, so ``register_hooks`` must be called
+        again before steering can take effect.
+        """
+        self.steering_enabled = False
+        self.feature_coeffs = []
+        self.remove_hooks()
+    def generate_hook(self):
+        """
+        Create a steering hook function that modifies the residual output.
+        Returns:
+            A ``hook_fn(module, inputs, outputs)`` closure that reads this
+            instance's current state on every call.
+        """
+        def hook_fn(module, inputs, outputs):
+            # Pass outputs through untouched while steering is off.
+            if not self.steering_enabled:
+                return outputs
+            # NOTE(review): assumes outputs is a tuple whose first element is
+            # the hidden state - confirm for the hooked module type.
+            residual = outputs[0]  # Residual output of the module
+            for feature_index, coeff in self.feature_coeffs:
+                # Two leading singleton dims are added so the decoder row
+                # broadcasts against the residual.
+                steering_vector = self.sae.W_dec[feature_index].to(self.device).unsqueeze(0).unsqueeze(0)
+                residual = residual + coeff * steering_vector
+            # Keep every remaining output element unchanged.
+            return (residual, *outputs[1:])
+        return hook_fn
+    def register_hooks(self, model, block_idx):
+        """
+        Register the steering hook to the specified block.
+        Args:
+            model (nn.Module): The target model; must expose ``transformer.h``.
+            block_idx (int): The block index to attach the hook.
+        """
+        handle = model.transformer.h[block_idx].register_forward_hook(self.generate_hook())
+        self.hooks.append(handle)
+    def remove_hooks(self):
+        """
+        Remove all registered hooks.
+        """
+        for handle in self.hooks:
+            handle.remove()
+        self.hooks.clear()

demo/.gitignore ADDED Viewed

	@@ -0,0 +1,24 @@

+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+node_modules
+dist
+dist-ssr
+*.local
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?

demo/README.md ADDED Viewed

	@@ -0,0 +1,71 @@

+# CorrSteer Frontend
+## Overview
+CorrSteer demonstrates how text classification datasets can be used to steer large language models (LLMs) by correlating dataset labels with SAE (Sparse Autoencoder) features. This demo incorporates a modern tech stack for a seamless and efficient experience.
+---
+## How to Run the Demo
+1. **Set Environment Variables:**
+   Create a `.env` file in the `demo` directory and include the following:
+   ```env
+   VITE_API_BASE_URL=<your-api-url>
+   ```
+2. **Install Dependencies:**
+   ```bash
+   pnpm i
+   ```
+3. **Start the Development Server:**
+   ```bash
+   pnpm dev
+   ```
+   The application will be available at `http://localhost:5173` by default.
+4. **Build for Production (Optional):**
+   ```bash
+   pnpm build
+   pnpm preview
+   ```
+---
+## Key Features
+- **Dataset & Model Selection:**
+  Select datasets and models using dropdown menus.
+- **Streaming Outputs:**
+  Generate outputs from multiple models with live updates as data streams.
+- **Interactive Tabs:**
+  Switch between different categories for customized prompts.
+---
+## Technology Stack
+1. **Vite:**
+   - Development server and build tool.
+2. **React:**
+   - UI library for building components and managing state.
+3. **Tailwind CSS:**
+   - CSS framework for styling.
+4. **ShadCN/UI:**
+   - Pre-built component library for UI elements.
+---
+## License
+This project is licensed under the MIT License.

demo/components.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "$schema": "https://ui.shadcn.com/schema.json",
+  "style": "default",
+  "rsc": false,
+  "tsx": true,
+  "tailwind": {
+    "config": "tailwind.config.ts",
+    "css": "src/index.css",
+    "baseColor": "slate",
+    "cssVariables": true,
+    "prefix": ""
+  },
+  "aliases": {
+    "components": "@/components",
+    "utils": "@/lib/utils",
+    "ui": "@/components/ui",
+    "lib": "@/lib",
+    "hooks": "@/hooks"
+  },
+  "iconLibrary": "lucide"
+}

demo/eslint.config.js ADDED Viewed

	@@ -0,0 +1,28 @@

+import js from '@eslint/js'
+import globals from 'globals'
+import reactHooks from 'eslint-plugin-react-hooks'
+import reactRefresh from 'eslint-plugin-react-refresh'
+import tseslint from 'typescript-eslint'
+export default tseslint.config(
+  { ignores: ['dist'] },
+  {
+    extends: [js.configs.recommended, ...tseslint.configs.recommended],
+    files: ['**/*.{ts,tsx}'],
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser,
+    },
+    plugins: {
+      'react-hooks': reactHooks,
+      'react-refresh': reactRefresh,
+    },
+    rules: {
+      ...reactHooks.configs.recommended.rules,
+      'react-refresh/only-export-components': [
+        'warn',
+        { allowConstantExport: true },
+      ],
+    },
+  },
+)

demo/index.html ADDED Viewed

	@@ -0,0 +1,13 @@

+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/finn.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>CorrSteer</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>

deploy.sh ADDED Viewed

	@@ -0,0 +1,49 @@

+#!/bin/bash
+# Deployment script for Hugging Face Spaces
+# Usage: ./deploy.sh /path/to/your/huggingface/space
+if [ $# -eq 0 ]; then
+    echo "Usage: $0 <path-to-huggingface-space>"
+    echo "Example: $0 /home/user/corr-steer-space"
+    exit 1
+fi
+SPACE_PATH="$1"
+if [ ! -d "$SPACE_PATH" ]; then
+    echo "Error: Directory $SPACE_PATH does not exist"
+    exit 1
+fi
+echo "Copying files to Hugging Face Space: $SPACE_PATH"
+# Copy essential files
+cp Dockerfile "$SPACE_PATH/"
+cp README.md "$SPACE_PATH/"
+cp requirements.txt "$SPACE_PATH/"
+cp start.sh "$SPACE_PATH/"
+cp server.py "$SPACE_PATH/"
+cp config.py "$SPACE_PATH/"
+# Copy directories
+cp -r demo/ "$SPACE_PATH/"
+cp -r features/ "$SPACE_PATH/"
+# Make start.sh executable
+chmod +x "$SPACE_PATH/start.sh"
+echo "Files copied successfully!"
+echo ""
+echo "Architecture:"
+echo "- Single Docker container"
+echo "- Flask serves both API (/api/*) and frontend (/*)"
+echo "- Frontend built as static files during Docker build"
+echo "- Port 7860 for Hugging Face Spaces"
+echo ""
+echo "Next steps:"
+echo "1. cd $SPACE_PATH"
+echo "2. git add ."
+echo "3. git commit -m 'Deploy CorrSteer'"
+echo "4. git push origin main"
+echo "5. Monitor build logs in HF Spaces"

features/gpt2.emgsd.json ADDED Viewed

	@@ -0,0 +1,314 @@

+{
+  "gender": [
+    {
+      "feature_index": 446,
+      "correlation": 0.43646558254084783,
+      "p_value": 2.0078186521554014e-181
+    },
+    {
+      "feature_index": 18672,
+      "correlation": 0.39005401881798113,
+      "p_value": 3.990362653948118e-142
+    },
+    {
+      "feature_index": 7842,
+      "correlation": 0.38303316661317355,
+      "p_value": 1.0436891049906896e-136
+    },
+    {
+      "feature_index": 10097,
+      "correlation": 0.3566824371157726,
+      "p_value": 1.5738547988431885e-117
+    },
+    {
+      "feature_index": 20455,
+      "correlation": 0.35604794762595793,
+      "p_value": 4.3338061984594966e-117
+    },
+    {
+      "feature_index": 10963,
+      "correlation": 0.34987456956336604,
+      "p_value": 7.322243361005002e-113
+    },
+    {
+      "feature_index": 7636,
+      "correlation": 0.3480422410323139,
+      "p_value": 1.2628128346319322e-111
+    },
+    {
+      "feature_index": 12175,
+      "correlation": 0.3363904323430133,
+      "p_value": 5.940623346124206e-104
+    },
+    {
+      "feature_index": 23204,
+      "correlation": 0.32981003754016014,
+      "p_value": 9.175664688774817e-100
+    },
+    {
+      "feature_index": 12425,
+      "correlation": 0.315679171864645,
+      "p_value": 4.1092284867968456e-91
+    }
+  ],
+  "lgbtq+": [
+    {
+      "feature_index": 4957,
+      "correlation": 0.9278012434805367,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 821,
+      "correlation": 0.8621067279866375,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 23222,
+      "correlation": 0.7904532588108066,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 17542,
+      "correlation": 0.7578473931771549,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 1296,
+      "correlation": 0.7135903991331536,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 10181,
+      "correlation": 0.7069358305925515,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 630,
+      "correlation": 0.6970711811966723,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 7283,
+      "correlation": 0.6888782329392438,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 9957,
+      "correlation": 0.6806925884098848,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 14756,
+      "correlation": 0.6763654166758747,
+      "p_value": 0.0
+    }
+  ],
+  "nationality": [
+    {
+      "feature_index": 16401,
+      "correlation": 0.6201132151869269,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 9956,
+      "correlation": 0.5991624893732725,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 14205,
+      "correlation": 0.5218992817048258,
+      "p_value": 6.72102973572993e-272
+    },
+    {
+      "feature_index": 19268,
+      "correlation": -0.47079023450986934,
+      "p_value": 1.0316744103724932e-214
+    },
+    {
+      "feature_index": 12899,
+      "correlation": 0.4698551338221134,
+      "p_value": 9.37287400288644e-214
+    },
+    {
+      "feature_index": 14102,
+      "correlation": 0.46625178173229587,
+      "p_value": 4.3347117601426704e-210
+    },
+    {
+      "feature_index": 18643,
+      "correlation": 0.4645600803424083,
+      "p_value": 2.200542075099065e-208
+    },
+    {
+      "feature_index": 22237,
+      "correlation": 0.4627396121645557,
+      "p_value": 1.4696460283983834e-206
+    },
+    {
+      "feature_index": 19683,
+      "correlation": 0.4586563357732425,
+      "p_value": 1.6591498330413268e-202
+    },
+    {
+      "feature_index": 10387,
+      "correlation": 0.4464409913279595,
+      "p_value": 1.0445581068141617e-190
+    }
+  ],
+  "profession": [
+    {
+      "feature_index": 19268,
+      "correlation": 0.6632217362569325,
+      "p_value": 0.0
+    },
+    {
+      "feature_index": 12738,
+      "correlation": 0.49434419021176973,
+      "p_value": 7.31805847873234e-240
+    },
+    {
+      "feature_index": 12240,
+      "correlation": 0.4822464487435754,
+      "p_value": 1.0652308403711134e-226
+    },
+    {
+      "feature_index": 3833,
+      "correlation": 0.4303762555805077,
+      "p_value": 6.587675550958632e-176
+    },
+    {
+      "feature_index": 16688,
+      "correlation": 0.42313751736840466,
+      "p_value": 1.6989194964768673e-169
+    },
+    {
+      "feature_index": 3658,
+      "correlation": 0.42073411062723626,
+      "p_value": 2.1105890047528277e-167
+    },
+    {
+      "feature_index": 4610,
+      "correlation": 0.4148617376031915,
+      "p_value": 2.344446120591354e-162
+    },
+    {
+      "feature_index": 3428,
+      "correlation": 0.4141357335326178,
+      "p_value": 9.7020391684076e-162
+    },
+    {
+      "feature_index": 9956,
+      "correlation": -0.41240054790897485,
+      "p_value": 2.850570734597228e-160
+    },
+    {
+      "feature_index": 14205,
+      "correlation": -0.4053658871423048,
+      "p_value": 2.081224608034156e-154
+    }
+  ],
+  "race": [
+    {
+      "feature_index": 11047,
+      "correlation": 0.40880904254801465,
+      "p_value": 2.924658096595988e-157
+    },
+    {
+      "feature_index": 6520,
+      "correlation": 0.32754592044407255,
+      "p_value": 2.399662718793716e-98
+    },
+    {
+      "feature_index": 18320,
+      "correlation": 0.31448088732356977,
+      "p_value": 2.1188408928978433e-90
+    },
+    {
+      "feature_index": 22312,
+      "correlation": 0.30658660427290435,
+      "p_value": 8.652381542873663e-86
+    },
+    {
+      "feature_index": 22263,
+      "correlation": 0.2775426576622264,
+      "p_value": 5.101978921250436e-70
+    },
+    {
+      "feature_index": 18492,
+      "correlation": 0.21686259675294528,
+      "p_value": 8.530010364272566e-43
+    },
+    {
+      "feature_index": 9459,
+      "correlation": 0.20860841730681046,
+      "p_value": 1.1641254990477776e-39
+    },
+    {
+      "feature_index": 22936,
+      "correlation": 0.20146560425468682,
+      "p_value": 4.7105090636264745e-37
+    },
+    {
+      "feature_index": 7170,
+      "correlation": 0.1848603817962114,
+      "p_value": 2.287836815814386e-31
+    },
+    {
+      "feature_index": 4798,
+      "correlation": 0.18320283221180658,
+      "p_value": 7.917784359744477e-31
+    }
+  ],
+  "religion": [
+    {
+      "feature_index": 18754,
+      "correlation": 0.4919855460080574,
+      "p_value": 2.968070741341591e-237
+    },
+    {
+      "feature_index": 17056,
+      "correlation": 0.4754689353084256,
+      "p_value": 1.4912893628128467e-219
+    },
+    {
+      "feature_index": 14242,
+      "correlation": 0.4479290137573203,
+      "p_value": 4.048270907596597e-192
+    },
+    {
+      "feature_index": 17886,
+      "correlation": 0.4412481619097158,
+      "p_value": 7.77154112070949e-186
+    },
+    {
+      "feature_index": 7870,
+      "correlation": 0.4383254166163272,
+      "p_value": 3.938542029802173e-183
+    },
+    {
+      "feature_index": 903,
+      "correlation": 0.42866564467200124,
+      "p_value": 2.2285535850517167e-174
+    },
+    {
+      "feature_index": 21637,
+      "correlation": 0.42829559715114474,
+      "p_value": 4.7608700745146375e-174
+    },
+    {
+      "feature_index": 7699,
+      "correlation": 0.4279351187362136,
+      "p_value": 9.96388834015504e-174
+    },
+    {
+      "feature_index": 23932,
+      "correlation": 0.4146661267040663,
+      "p_value": 3.4386189140897016e-162
+    },
+    {
+      "feature_index": 18686,
+      "correlation": 0.4145648062343114,
+      "p_value": 4.1927743639700714e-162
+    }
+  ]
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+torch>=2.0.0
+transformers>=4.30.0
+flask>=2.3.0
+flask-cors>=4.0.0
+sae-lens>=3.0.0
+huggingface-hub>=0.16.0
+gradio>=4.0.0
+numpy>=1.21.0

server.py ADDED Viewed

	@@ -0,0 +1,195 @@

+import torch
+from flask import Flask, Response, request, send_from_directory, send_file
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from threading import Thread
+from sae_lens import SAE
+from flask_cors import CORS
+from json import load
+import os
+# Example config reading
+from config import datasets_config, models_config
+# ------------------------------------
+# Global Setup: load tokenizer/models
+# ------------------------------------
+# Device selection: CUDA if available, otherwise CPU; MPS takes precedence
+# whenever it is available.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "mps" if torch.backends.mps.is_available() else device
+# Main tokenizer (GPT-2 style). Adjust if using different.
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+# Fall back to the EOS token id when the tokenizer defines no pad token.
+if tokenizer.pad_token_id is None:
+    tokenizer.pad_token_id = tokenizer.eos_token_id
+# Original GPT-2
+original_model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
+original_model.eval()
+# "Trained"/"biased" GPT-2 model
+trained_model = AutoModelForCausalLM.from_pretrained("holistic-ai/gpt2-EMGSD").to(device)
+trained_model.eval()
+# ------------------------------------
+# Steering Hook Setup (optional)
+# ------------------------------------
+# Example steering feature(s)
+hooks = []
+def generate_pre_hook(sae: SAE, index: int, coeff: float):
+    # Build a forward pre-hook that adds coeff * sae.W_dec[index] to the
+    # first positional input of the hooked module.
+    def steering_hook(module, inputs):
+        """
+        Simple version of a steering hook. Adds a weighted vector
+        to the residual. Customize if needed.
+        """
+        residual = inputs[0]
+        # Two leading singleton dims are added so the decoder row broadcasts
+        # against the residual.
+        steering_vector = sae.W_dec[index].to(device).unsqueeze(0).unsqueeze(0)
+        residual = residual + coeff * steering_vector
+        # The parentheses are grouping only: a single tensor is returned,
+        # not a 1-tuple.
+        return (residual)
+    return steering_hook
+def generate_post_hook(sae: SAE, index: int, coeff: float):
+    def steering_hook(module, inputs, outputs):
+        """
+        Simple version of a steering hook. Adds a weighted vector
+        to the residual. Customize if needed.
+        """
+        residual = outputs[0]
+        steering_vector = sae.W_dec[index].to(device).unsqueeze(0).unsqueeze(0)
+        residual = residual + coeff * steering_vector
+        return (residual, outputs[1], outputs[2])
+    return steering_hook
+def register_steering(model, model_key: str, gen_type: str, dataset_key: str, category_key: str):
+    file_path = f"features/{model_key}.{dataset_key}.json"
+    with open(file_path, "r") as f:
+        feature_map = load(f)
+        top_features = feature_map[category_key]
+        if "+" in gen_type:
+          coeff = 75
+        elif "-" in gen_type:
+          coeff = 50
+        if "+" in gen_type:
+          filtered_features = list(filter(lambda x: x["correlation"] > 0, top_features))
+        elif "-" in gen_type:
+          filtered_features = list(filter(lambda x: x["correlation"] < 0, top_features))
+        if len(filtered_features) == 0:
+          filtered_features = list(filter(lambda x: x["correlation"] > 0, top_features))
+          coeff = 75
+        top_feature = filtered_features[0]
+        hook_point = "blocks.11.hook_resid_pre"
+        block_idx = int(hook_point.split(".")[1])
+        index = top_feature["feature_index"]
+        sae, cfg_dict, sparsity = SAE.from_pretrained(
+            models_config[model_key]["sae"],
+            hook_point,
+            device=device,
+        )
+    module = model.transformer.h[block_idx]
+    if "pre" in hook_point:
+        handle = module.register_forward_pre_hook(generate_pre_hook(sae, index, coeff))
+    elif "post" in hook_point:
+        handle = module.register_forward_hook(generate_post_hook(sae, index, coeff))
+    hooks.append(handle)
+def remove_hooks():
+    for h in hooks:
+        h.remove()
+    hooks.clear()
+# ------------------------------------
+# Helper: streaming generator
+# ------------------------------------
+def stream_generate(model, prompt, max_new_tokens=50, temperature=1.0, top_p=0.1, repetition_penalty=10.0):
+    """
+    Yields tokens as they are generated in a separate thread.
+    """
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+    generation_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
+        pad_token_id=tokenizer.eos_token_id,
+        repetition_penalty=repetition_penalty,
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    for new_text in streamer:
+        yield new_text
+# ------------------------------------
+# Flask App
+# ------------------------------------
+# Single Flask application object; the route decorators below register on it.
+app = Flask(__name__)
+# Enable flask-cors on the whole app with its default settings
+# (presumably so a frontend served from another origin can call the API -- verify).
+CORS(app)
+# API routes first (to avoid conflicts with static serving)
+@app.route("/api/generate", methods=["POST"])
+def generate():
+    """
+    Expects JSON like:
+    {
+      "model": "gpt2",
+      "dataset": "emgsd",
+      "category": "lgbtq+",
+      "type": "original" | "origin+steer" | "trained" | "trained-steer"
+    }
+    Streams back the generated text token by token.
+    """
+    data = request.json
+    model_key = data["model"]
+    dataset_key = data["dataset"]
+    category_key = data["category"]
+    gen_type = data["type"]
+    # 1. Figure out prompt from config
+    try:
+      prompt_text = datasets_config[dataset_key]["category"][category_key]["prompt"]
+    except KeyError:
+      return Response("Invalid dataset/category combination.", status=400)
+    # 2. Select the model
+    if "trained" in gen_type:
+        chosen_model = trained_model
+    else:
+        chosen_model = original_model
+    # 3. Steering logic if "steer" in the request type
+    remove_hooks()
+    if "steer" in gen_type:
+        register_steering(chosen_model, model_key, gen_type, dataset_key, category_key)
+    # Return a streaming response of tokens
+    def token_stream():
+        for token in stream_generate(chosen_model, prompt_text):
+            yield token
+        remove_hooks()
+    return Response(token_stream(), mimetype="text/event-stream")
+# Serve static files for HF Spaces (after API routes)
+@app.route("/")
+def serve_frontend():
+    """Serve the frontend entry page (`demo/dist/index.html`) for the site root."""
+    return send_file("demo/dist/index.html")
+@app.route("/<path:path>")
+def serve_static(path):
+    if path.startswith('api/'):
+        return None
+    if os.path.exists(f"demo/dist/{path}"):
+        return send_from_directory("demo/dist", path)
+    return send_file("demo/dist/index.html")
+if __name__ == "__main__":
+    port = int(os.environ.get("PORT", 5174))
+    app.run(host="0.0.0.0", port=port, debug=True)

start.sh ADDED Viewed

	@@ -0,0 +1,30 @@

+#!/bin/bash
+# NOTE(review): this file appears to contain two versions of the start script
+# joined by a "->" line (see below) -- confirm which one is meant to ship.
+# Set environment variables for HF Spaces
+export VITE_API_BASE_URL=""
+# Start Flask backend in the background
+echo "Starting Flask backend..."
+python server.py &
+# Wait for backend to start
+sleep 10
+# Start frontend development server
+echo "Starting React frontend..."
+cd demo
+pnpm dev --host 0.0.0.0 --port 7860 &
+# Keep the container running
+wait
+# NOTE(review): the next line looks like a leftover "old -> new" edit marker,
+# not a shell command; everything after it reads as the intended replacement
+# (single Flask process on PORT=7860). The second shebang below is only a comment here.
+->
+#!/bin/bash
+# Set environment variables for HF Spaces
+export PORT=7860
+# Start Flask server (serves both API and frontend)
+echo "Starting CorrSteer application..."
+python server.py