Spaces:
Running
Running
""" | |
Scanpy tutorial for single-cell RNA sequencing preprocessing and clustering analysis. | |
This MCP Server provides 7 tools: | |
1. quality_control: Calculate and visualize QC metrics, filter cells and genes, detect doublets | |
2. normalize_data: Normalize count data with median total counts and log transformation | |
3. select_features: Identify highly variable genes for feature selection | |
4. reduce_dimensionality: Perform PCA analysis and variance visualization | |
5. build_neighborhood_graph: Construct nearest neighbor graph and UMAP embedding | |
6. cluster_cells: Perform Leiden clustering with visualization | |
7. annotate_cell_types: Multi-resolution clustering, marker gene analysis, and differential expression | |
All tools extracted from `https://github.com/scverse/scanpy/tree/main/docs/tutorials/basics/clustering.ipynb`. | |
""" | |
# Standard imports | |
from typing import Annotated, Literal, Any | |
import pandas as pd | |
import numpy as np | |
from pathlib import Path | |
import os | |
from fastmcp import FastMCP | |
from datetime import datetime | |
import matplotlib.pyplot as plt | |
# Scanpy and related imports | |
import scanpy as sc | |
import anndata as ad | |
# Root of the persistent storage (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")
DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Environment variables, when set, take precedence over the defaults above
INPUT_DIR = Path(os.getenv("CLUSTERING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.getenv("CLUSTERING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Create both working directories up front (no error if they already exist)
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp used in default output prefixes; evaluated once, when the module is imported
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# MCP server instance
clustering_mcp = FastMCP(name="clustering")

# Figure defaults applied to every scanpy plot
sc.settings.set_figure_params(dpi=300, facecolor="white")
def quality_control(
    # Primary data inputs
    data_path: Annotated[str | None, "Path to h5ad file or directory with 10X data. The h5ad file should contain raw count data in AnnData format."] = None,
    # Analysis parameters with tutorial defaults
    mt_prefix: Annotated[str, "Prefix for mitochondrial genes"] = "MT-",
    ribo_prefixes: Annotated[list, "Prefixes for ribosomal genes"] = ["RPS", "RPL"],
    hb_pattern: Annotated[str, "Pattern for hemoglobin genes"] = "^HB[^(P)]",
    min_genes: Annotated[int, "Minimum number of genes expressed per cell"] = 100,
    min_cells: Annotated[int, "Minimum number of cells expressing a gene"] = 3,
    batch_key: Annotated[str | None, "Column name in adata.obs for batch information"] = None,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Calculate quality control metrics, visualize QC distributions, and filter low-quality cells and genes.
    Input is single-cell count data in AnnData format and output is QC plots, filtered data, and doublet scores.

    Returns:
        dict with a "message", the tutorial "reference" URL and an "artifacts" list
        (description + absolute path) for the violin plot, scatter plot, processed
        h5ad file and QC summary CSV.

    Raises:
        ValueError: if data_path is None, or is neither a directory nor an .h5/.h5ad file.
    """
    if data_path is None:
        raise ValueError("Path to h5ad file or 10X data directory must be provided")

    # Stamp the default prefix at call time: the module-level timestamp is fixed at
    # import, so reusing it would make repeated calls overwrite earlier outputs.
    if out_prefix is None:
        call_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"qc_{call_stamp}"

    # Load data: a directory is read as 10X mtx, .h5 as 10X HDF5, .h5ad as AnnData
    data_path = Path(data_path)
    if data_path.is_dir():
        adata = sc.read_10x_mtx(data_path)
        adata.var_names_make_unique()
    elif data_path.suffix == '.h5':
        adata = sc.read_10x_h5(data_path)
        adata.var_names_make_unique()
    elif data_path.suffix == '.h5ad':
        adata = ad.read_h5ad(data_path)
    else:
        raise ValueError("data_path must be a directory with 10X data or h5/h5ad file")

    # Flag gene categories by name (ribo_prefixes is only read, never mutated)
    adata.var["mt"] = adata.var_names.str.startswith(mt_prefix)
    adata.var["ribo"] = adata.var_names.str.startswith(tuple(ribo_prefixes))
    adata.var["hb"] = adata.var_names.str.contains(hb_pattern)

    # Calculate QC metrics
    sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
    )

    # QC violin plots. show=False keeps scanpy from calling plt.show(), so the
    # figure is saved explicitly below.
    plt.figure(figsize=(12, 4))
    sc.pl.violin(
        adata,
        ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
        jitter=0.4,
        multi_panel=True,
        show=False,
    )
    violin_path = OUTPUT_DIR / f"{out_prefix}_qc_violin.png"
    plt.savefig(violin_path, dpi=300, bbox_inches='tight')
    # Close every open figure, not just the current one: scanpy may draw on a
    # figure of its own, which would leave the placeholder created above open.
    plt.close("all")

    # QC scatter plot
    plt.figure(figsize=(8, 6))
    sc.pl.scatter(
        adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt", show=False
    )
    scatter_path = OUTPUT_DIR / f"{out_prefix}_qc_scatter.png"
    plt.savefig(scatter_path, dpi=300, bbox_inches='tight')
    plt.close("all")

    # Filter cells and genes
    print(f"Before filtering: {adata.n_obs} cells, {adata.n_vars} genes")
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    print(f"After filtering: {adata.n_obs} cells, {adata.n_vars} genes")

    # Doublet detection; batch_key is used only when that column actually exists
    if batch_key and batch_key in adata.obs.columns:
        sc.pp.scrublet(adata, batch_key=batch_key)
    else:
        sc.pp.scrublet(adata)

    # Save processed data
    output_file = OUTPUT_DIR / f"{out_prefix}_qc_processed.h5ad"
    adata.write_h5ad(output_file)

    # Save QC metrics summary (computed on the filtered cells)
    qc_summary = pd.DataFrame({
        'metric': ['n_obs', 'n_vars', 'mean_n_genes_by_counts', 'mean_total_counts', 'mean_pct_counts_mt', 'doublet_rate'],
        'value': [
            adata.n_obs,
            adata.n_vars,
            adata.obs['n_genes_by_counts'].mean(),
            adata.obs['total_counts'].mean(),
            adata.obs['pct_counts_mt'].mean(),
            adata.obs['predicted_doublet'].sum() / adata.n_obs,
        ],
    })
    qc_summary_path = OUTPUT_DIR / f"{out_prefix}_qc_summary.csv"
    qc_summary.to_csv(qc_summary_path, index=False)

    return {
        "message": f"Quality control completed for {adata.n_obs} cells and {adata.n_vars} genes",
        "reference": "https://github.com/scverse/scanpy/tree/main/docs/tutorials/basics/clustering.ipynb",
        "artifacts": [
            {
                "description": "QC violin plots",
                "path": str(violin_path.resolve()),
            },
            {
                "description": "QC scatter plot",
                "path": str(scatter_path.resolve()),
            },
            {
                "description": "QC processed data",
                "path": str(output_file.resolve()),
            },
            {
                "description": "QC metrics summary",
                "path": str(qc_summary_path.resolve()),
            },
        ],
    }
def normalize_data(
    # Primary data inputs
    data_path: Annotated[str, "Path to h5ad file with QC-processed single-cell data. Should be output from quality_control tool."],
    # Analysis parameters with tutorial defaults
    target_sum: Annotated[float | None, "Target sum for normalization. None uses median total counts"] = None,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Normalize count data using median total counts scaling followed by log1p transformation.
    Input is quality-controlled AnnData object and output is normalized expression data.

    The untouched matrix is kept in ``adata.layers["counts"]`` before normalization.

    Returns:
        dict with a "message", the tutorial "reference" URL and an "artifacts" list
        (description + absolute path) for the normalized h5ad file and the summary CSV.

    Raises:
        ValueError: if data_path is None.
    """
    if data_path is None:
        raise ValueError("Path to h5ad file must be provided")

    # Stamp the default prefix at call time: the module-level timestamp is fixed at
    # import, so reusing it would make repeated calls overwrite earlier outputs.
    if out_prefix is None:
        call_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"normalized_{call_stamp}"

    # Load data
    adata = ad.read_h5ad(data_path)

    # Keep a copy of the input matrix before it is transformed in place
    adata.layers["counts"] = adata.X.copy()

    # Normalizing to median total counts (or target_sum if specified)
    sc.pp.normalize_total(adata, target_sum=target_sum)
    # Logarithmize the data
    sc.pp.log1p(adata)

    # Save normalized data
    output_file = OUTPUT_DIR / f"{out_prefix}_normalized.h5ad"
    adata.write_h5ad(output_file)

    # Summarize both layers with the same statistic
    counts_mean, counts_std = _matrix_mean_std(adata.layers["counts"])
    x_mean, x_std = _matrix_mean_std(adata.X)
    norm_summary = pd.DataFrame({
        'layer': ['raw_counts', 'normalized_log1p'],
        'mean_expression': [counts_mean, x_mean],
        'std_expression': [counts_std, x_std],
    })
    summary_path = OUTPUT_DIR / f"{out_prefix}_normalization_summary.csv"
    norm_summary.to_csv(summary_path, index=False)

    return {
        "message": f"Data normalized with log1p transformation for {adata.n_obs} cells",
        "reference": "https://github.com/scverse/scanpy/tree/main/docs/tutorials/basics/clustering.ipynb",
        "artifacts": [
            {
                "description": "Normalized data",
                "path": str(output_file.resolve()),
            },
            {
                "description": "Normalization summary",
                "path": str(summary_path.resolve()),
            },
        ],
    }


def _matrix_mean_std(matrix) -> tuple[float, float]:
    """Return (mean, population standard deviation) over all entries of a dense or sparse matrix."""
    from scipy import sparse

    if sparse.issparse(matrix):
        # Stay sparse: std is derived from E[x^2] - mean^2 without densifying
        mean = float(matrix.mean())
        second_moment = float(matrix.multiply(matrix).mean())
        # Rounding can push the difference slightly below zero, which would make
        # np.sqrt return NaN; clamp at zero.
        variance = max(second_moment - mean**2, 0.0)
        return mean, float(np.sqrt(variance))
    return float(np.mean(matrix)), float(np.std(matrix))
def select_features(
    # Primary data inputs
    data_path: Annotated[str, "Path to h5ad file with normalized single-cell data. Should be output from normalize_data tool."],
    # Analysis parameters with tutorial defaults
    n_top_genes: Annotated[int, "Number of highly variable genes to select"] = 2000,
    batch_key: Annotated[str | None, "Column name in adata.obs for batch correction"] = None,
    flavor: Annotated[Literal["seurat", "cell_ranger", "seurat_v3"], "Method for highly variable gene selection"] = "seurat",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Identify highly variable genes for feature selection using specified method.
    Input is normalized AnnData object and output is feature selection plot and filtered data.

    Returns:
        dict with a "message", the tutorial "reference" URL and an "artifacts" list
        (description + absolute path) for the HVG plot, the annotated h5ad file and
        the summary CSV.

    Raises:
        ValueError: if data_path is None.
    """
    if data_path is None:
        raise ValueError("Path to h5ad file must be provided")

    # Stamp the default prefix at call time: the module-level timestamp is fixed at
    # import, so reusing it would make repeated calls overwrite earlier outputs.
    if out_prefix is None:
        call_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"features_{call_stamp}"

    # Load data
    adata = ad.read_h5ad(data_path)

    # Assemble the optional arguments once instead of duplicating the call
    hvg_kwargs = {"n_top_genes": n_top_genes, "flavor": flavor}
    # batch_key is used only when that column actually exists
    if batch_key and batch_key in adata.obs.columns:
        hvg_kwargs["batch_key"] = batch_key
    # The seurat_v3 flavor expects raw counts rather than log-normalized values;
    # use the "counts" layer when the input file carries one.
    if flavor == "seurat_v3" and "counts" in adata.layers:
        hvg_kwargs["layer"] = "counts"
    sc.pp.highly_variable_genes(adata, **hvg_kwargs)

    # Plot highly variable genes. show=False keeps scanpy from calling plt.show(),
    # so the figure is saved explicitly below.
    plt.figure(figsize=(10, 6))
    sc.pl.highly_variable_genes(adata, show=False)
    plot_path = OUTPUT_DIR / f"{out_prefix}_highly_variable_genes.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    # Close every open figure, not just the current one: scanpy may draw on a
    # figure of its own, which would leave the placeholder created above open.
    plt.close("all")

    # Save data with feature selection (genes are flagged, not removed)
    output_file = OUTPUT_DIR / f"{out_prefix}_feature_selected.h5ad"
    adata.write_h5ad(output_file)

    # Create feature selection summary
    n_highly_var = adata.var['highly_variable'].sum()
    feature_summary = pd.DataFrame({
        'metric': ['total_genes', 'highly_variable_genes', 'selection_fraction'],
        'value': [
            adata.n_vars,
            n_highly_var,
            n_highly_var / adata.n_vars,
        ],
    })
    summary_path = OUTPUT_DIR / f"{out_prefix}_feature_summary.csv"
    feature_summary.to_csv(summary_path, index=False)

    return {
        "message": f"Selected {n_highly_var} highly variable genes from {adata.n_vars} total genes",
        "reference": "https://github.com/scverse/scanpy/tree/main/docs/tutorials/basics/clustering.ipynb",
        "artifacts": [
            {
                "description": "Highly variable genes plot",
                "path": str(plot_path.resolve()),
            },
            {
                "description": "Feature selected data",
                "path": str(output_file.resolve()),
            },
            {
                "description": "Feature selection summary",
                "path": str(summary_path.resolve()),
            },
        ],
    }
def reduce_dimensionality(
    # Primary data inputs
    data_path: Annotated[str, "Path to h5ad file with feature-selected data. Should be output from select_features tool."],
    # Analysis parameters with tutorial defaults
    n_comps: Annotated[int, "Number of principal components to compute"] = 50,
    use_highly_variable: Annotated[bool, "Whether to use only highly variable genes"] = True,
    n_pcs_plot: Annotated[int, "Number of PCs to show in variance plot"] = 50,
    color_vars: Annotated[list, "Variables to color PCA plot by"] = ["sample", "pct_counts_mt"],
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Perform principal component analysis for dimensionality reduction and visualization.
    Input is feature-selected AnnData object and output is PCA embeddings and variance plots.

    Returns:
        dict with a "message", the tutorial "reference" URL and an "artifacts" list
        (description + absolute path) for the variance plot, the h5ad file with PCA,
        the summary CSV and, when any of color_vars exists in adata.obs, the colored
        PCA scatter plot.

    Raises:
        ValueError: if data_path is None.
    """
    if data_path is None:
        raise ValueError("Path to h5ad file must be provided")

    # Stamp the default prefix at call time: the module-level timestamp is fixed at
    # import, so reusing it would make repeated calls overwrite earlier outputs.
    if out_prefix is None:
        call_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"pca_{call_stamp}"

    # Load data
    adata = ad.read_h5ad(data_path)

    # Perform PCA. scanpy deprecated `use_highly_variable` in favour of `mask_var`:
    # "highly_variable" restricts the fit to flagged genes, None uses every gene.
    sc.tl.pca(
        adata,
        n_comps=n_comps,
        mask_var="highly_variable" if use_highly_variable else None,
    )
    variance_ratio = adata.uns['pca']['variance_ratio']
    n_computed = len(variance_ratio)

    # Plot PCA variance ratio. show=False keeps scanpy from calling plt.show(),
    # so the figure is saved explicitly below.
    plt.figure(figsize=(10, 6))
    sc.pl.pca_variance_ratio(adata, n_pcs=n_pcs_plot, log=True, show=False)
    variance_path = OUTPUT_DIR / f"{out_prefix}_pca_variance.png"
    plt.savefig(variance_path, dpi=300, bbox_inches='tight')
    # Close every open figure, not just the current one: scanpy may draw on a
    # figure of its own, which would leave the placeholder created above open.
    plt.close("all")

    # Plot PCA colored by the requested variables that exist in adata.obs
    # (color_vars is only read, never mutated)
    available_vars = [var for var in color_vars if var in adata.obs.columns]
    # Only request component pairs that were actually computed, so a small
    # n_comps does not break the (2, 3) panel
    dim_pairs = [pair for pair in ((0, 1), (2, 3)) if max(pair) < n_computed]
    pca_artifacts = []
    if available_vars and dim_pairs:
        plot_colors = []
        plot_dims = []
        for var in available_vars[:2]:  # Limit to 2 variables to match tutorial
            for pair in dim_pairs:
                plot_colors.append(var)
                plot_dims.append(pair)
        plt.figure(figsize=(12, 8))
        sc.pl.pca(
            adata,
            color=plot_colors,
            dimensions=plot_dims,
            ncols=2,
            size=2,
            show=False,
        )
        pca_path = OUTPUT_DIR / f"{out_prefix}_pca_colored.png"
        plt.savefig(pca_path, dpi=300, bbox_inches='tight')
        plt.close("all")
        pca_artifacts.append(
            {"description": "PCA colored by variables", "path": str(pca_path.resolve())}
        )

    # Save data with PCA
    output_file = OUTPUT_DIR / f"{out_prefix}_pca.h5ad"
    adata.write_h5ad(output_file)

    # Create PCA summary for the leading (at most 10) components; labels follow the
    # number of values actually stored so both columns always have equal length
    leading_ratios = variance_ratio[:10]
    pca_summary = pd.DataFrame({
        'PC': [f'PC{i+1}' for i in range(len(leading_ratios))],
        'variance_ratio': leading_ratios,
    })
    summary_path = OUTPUT_DIR / f"{out_prefix}_pca_summary.csv"
    pca_summary.to_csv(summary_path, index=False)

    artifacts = [
        {
            "description": "PCA variance plot",
            "path": str(variance_path.resolve()),
        },
        {
            "description": "PCA processed data",
            "path": str(output_file.resolve()),
        },
        {
            "description": "PCA summary",
            "path": str(summary_path.resolve()),
        },
    ] + pca_artifacts

    return {
        "message": f"PCA completed with {n_comps} components explaining {variance_ratio.sum():.2%} variance",
        "reference": "https://github.com/scverse/scanpy/tree/main/docs/tutorials/basics/clustering.ipynb",
        "artifacts": artifacts,
    }
def build_neighborhood_graph(
    # Primary data inputs
    data_path: Annotated[str, "Path to h5ad file with PCA data. Should be output from reduce_dimensionality tool."],
    # Analysis parameters with tutorial defaults
    n_neighbors: Annotated[int, "Number of neighbors for graph construction"] = 15,
    n_pcs: Annotated[int | None, "Number of principal components to use"] = None,
    color_by: Annotated[str, "Variable to color UMAP by"] = "sample",
    point_size: Annotated[float, "Point size for UMAP plot"] = 2,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Build nearest neighbor graph from PCA space and compute UMAP embedding for visualization.
    Input is PCA-processed AnnData object and output is neighbor graph, UMAP embedding, and visualization.

    Returns:
        dict with a "message", the tutorial "reference" URL and an "artifacts" list
        (description + absolute path) for the UMAP plot, the h5ad file with the
        graph and embedding, and the summary CSV.

    Raises:
        ValueError: if data_path is None.
    """
    if data_path is None:
        raise ValueError("Path to h5ad file must be provided")

    # Stamp the default prefix at call time: the module-level timestamp is fixed at
    # import, so reusing it would make repeated calls overwrite earlier outputs.
    if out_prefix is None:
        call_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"neighbors_{call_stamp}"

    # Load data
    adata = ad.read_h5ad(data_path)

    # Compute the neighborhood graph, then the UMAP embedding built on it
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
    sc.tl.umap(adata)

    # Plot UMAP; fall back to an uncolored plot when color_by is not an obs column.
    # show=False keeps scanpy from calling plt.show(), so the figure is saved
    # explicitly below.
    umap_color = color_by if color_by in adata.obs.columns else None
    plt.figure(figsize=(8, 6))
    sc.pl.umap(adata, color=umap_color, size=point_size, show=False)
    umap_path = OUTPUT_DIR / f"{out_prefix}_umap.png"
    plt.savefig(umap_path, dpi=300, bbox_inches='tight')
    # Close every open figure, not just the current one: scanpy may draw on a
    # figure of its own, which would leave the placeholder created above open.
    plt.close("all")

    # Save data with neighborhood graph and UMAP
    output_file = OUTPUT_DIR / f"{out_prefix}_neighbors.h5ad"
    adata.write_h5ad(output_file)

    # Create neighborhood summary (n_pcs_used records the requested value, which
    # may be None)
    neighbor_summary = pd.DataFrame({
        'metric': ['n_neighbors', 'n_pcs_used', 'umap_dimensions'],
        'value': [n_neighbors, n_pcs, adata.obsm['X_umap'].shape[1]],
    })
    summary_path = OUTPUT_DIR / f"{out_prefix}_neighbor_summary.csv"
    neighbor_summary.to_csv(summary_path, index=False)

    return {
        "message": f"Neighborhood graph and UMAP completed for {adata.n_obs} cells",
        "reference": "https://github.com/scverse/scanpy/tree/main/docs/tutorials/basics/clustering.ipynb",
        "artifacts": [
            {
                "description": "UMAP visualization",
                "path": str(umap_path.resolve()),
            },
            {
                "description": "Neighborhood graph data",
                "path": str(output_file.resolve()),
            },
            {
                "description": "Neighborhood summary",
                "path": str(summary_path.resolve()),
            },
        ],
    }
def cluster_cells(
    # Primary data inputs
    data_path: Annotated[str, "Path to h5ad file with neighborhood graph. Should be output from build_neighborhood_graph tool."],
    # Analysis parameters with tutorial defaults
    resolution: Annotated[float, "Resolution parameter for Leiden clustering"] = 0.5,
    flavor: Annotated[Literal["igraph", "leidenalg"], "Leiden algorithm implementation"] = "igraph",
    n_iterations: Annotated[int, "Number of iterations for clustering"] = 2,
    cluster_key: Annotated[str, "Key name for storing clusters in adata.obs"] = "leiden",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Perform Leiden clustering on the neighborhood graph and visualize results.
    Input is AnnData with neighborhood graph and output is clustered data with UMAP visualization.

    Returns:
        dict with a "message", the tutorial "reference" URL and an "artifacts" list
        (description + absolute path) for the cluster UMAP plot, the clustered h5ad
        file and the per-cluster summary CSV.

    Raises:
        ValueError: if data_path is None.
    """
    if data_path is None:
        raise ValueError("Path to h5ad file must be provided")

    # Stamp the default prefix at call time: the module-level timestamp is fixed at
    # import, so reusing it would make repeated calls overwrite earlier outputs.
    if out_prefix is None:
        call_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"clusters_{call_stamp}"

    # Load data
    adata = ad.read_h5ad(data_path)

    # Perform Leiden clustering; labels are stored in adata.obs[cluster_key]
    sc.tl.leiden(
        adata,
        resolution=resolution,
        flavor=flavor,
        n_iterations=n_iterations,
        key_added=cluster_key,
    )

    # Plot UMAP colored by clusters. show=False keeps scanpy from calling
    # plt.show(), so the figure is saved explicitly below.
    plt.figure(figsize=(8, 6))
    sc.pl.umap(adata, color=[cluster_key], show=False)
    cluster_path = OUTPUT_DIR / f"{out_prefix}_clusters_umap.png"
    plt.savefig(cluster_path, dpi=300, bbox_inches='tight')
    # Close every open figure, not just the current one: scanpy may draw on a
    # figure of its own, which would leave the placeholder created above open.
    plt.close("all")

    # Save clustered data
    output_file = OUTPUT_DIR / f"{out_prefix}_clustered.h5ad"
    adata.write_h5ad(output_file)

    # Create clustering summary: cell count and share of all cells per cluster
    n_clusters = len(adata.obs[cluster_key].unique())
    cluster_counts = adata.obs[cluster_key].value_counts().sort_index()
    cluster_summary = pd.DataFrame({
        'cluster': cluster_counts.index,
        'n_cells': cluster_counts.values,
        'fraction': cluster_counts.values / adata.n_obs,
    })
    summary_path = OUTPUT_DIR / f"{out_prefix}_cluster_summary.csv"
    cluster_summary.to_csv(summary_path, index=False)

    return {
        "message": f"Leiden clustering identified {n_clusters} clusters at resolution {resolution}",
        "reference": "https://github.com/scverse/scanpy/tree/main/docs/tutorials/basics/clustering.ipynb",
        "artifacts": [
            {
                "description": "Clusters UMAP plot",
                "path": str(cluster_path.resolve()),
            },
            {
                "description": "Clustered data",
                "path": str(output_file.resolve()),
            },
            {
                "description": "Cluster summary",
                "path": str(summary_path.resolve()),
            },
        ],
    }
def annotate_cell_types(
    # Primary data inputs
    data_path: Annotated[str, "Path to h5ad file with clustered data. Should be output from cluster_cells tool."],
    # Analysis parameters with tutorial defaults
    resolutions: Annotated[list, "List of resolutions for multi-resolution clustering"] = [0.02, 0.5, 2.0],
    groupby_key: Annotated[str, "Clustering key to use for marker analysis"] = "leiden_res_0.50",
    method: Annotated[Literal["wilcoxon", "t-test", "logreg"], "Method for differential expression"] = "wilcoxon",
    n_genes: Annotated[int, "Number of top genes to show in plots"] = 5,
    marker_genes: Annotated[dict | None, "Dictionary of cell type marker genes"] = None,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Perform multi-resolution clustering, marker gene analysis, and differential expression for cell type annotation.
    Input is clustered AnnData object and output is multi-resolution plots, marker analysis, and differential expression results.

    Returns:
        dict with a "message", the tutorial "reference" URL and an "artifacts" list
        (description + absolute path) for the multi-resolution UMAP, the DE dotplot,
        the annotated h5ad file, the summary CSV and, when available, the marker
        dotplot and the per-cluster top-genes CSV.

    Raises:
        ValueError: if data_path is None or resolutions is empty.
    """
    if data_path is None:
        raise ValueError("Path to h5ad file must be provided")
    # Without at least one resolution there is no clustering to annotate; fail with
    # a clear message instead of an IndexError further down
    if not resolutions:
        raise ValueError("At least one clustering resolution must be provided")

    # Stamp the default prefix at call time: the module-level timestamp is fixed at
    # import, so reusing it would make repeated calls overwrite earlier outputs.
    if out_prefix is None:
        call_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"annotation_{call_stamp}"

    # Load data
    adata = ad.read_h5ad(data_path)

    # Define default marker genes if not provided
    if marker_genes is None:
        marker_genes = {
            "CD14+ Mono": ["FCN1", "CD14"],
            "CD16+ Mono": ["TCF7L2", "FCGR3A", "LYN"],
            "cDC2": ["CST3", "COTL1", "LYZ", "DMXL2", "CLEC10A", "FCER1A"],
            "Erythroblast": ["MKI67", "HBA1", "HBB"],
            "Proerythroblast": ["CDK6", "SYNGR1", "HBM", "GYPA"],
            "NK": ["GNLY", "NKG7", "CD247", "FCER1G", "TYROBP", "KLRG1", "FCGR3A"],
            "ILC": ["ID2", "PLCG2", "GNLY", "SYNE1"],
            "Naive CD20+ B": ["MS4A1", "IL4R", "IGHD", "FCRL1", "IGHM"],
            "B cells": ["MS4A1", "ITGB1", "COL4A4", "PRDM1", "IRF4", "PAX5", "BCL11A", "BLK", "IGHD", "IGHM"],
            "Plasma cells": ["MZB1", "HSP90B1", "FNDC3B", "PRDM1", "IGKC", "JCHAIN"],
            "Plasmablast": ["XBP1", "PRDM1", "PAX5"],
            "CD4+ T": ["CD4", "IL7R", "TRBC2"],
            "CD8+ T": ["CD8A", "CD8B", "GZMK", "GZMA", "CCL5", "GZMB", "GZMH", "GZMA"],
            "T naive": ["LEF1", "CCR7", "TCF7"],
            "pDC": ["GZMB", "IL3RA", "COBLL1", "TCF4"],
        }

    # Perform multi-resolution clustering; one obs column per resolution
    # (resolutions is only read, never mutated)
    cluster_keys = [f"leiden_res_{res:4.2f}" for res in resolutions]
    for res, key in zip(resolutions, cluster_keys):
        sc.tl.leiden(adata, key_added=key, resolution=res, flavor="igraph")

    # Plot multi-resolution clustering. show=False keeps scanpy from calling
    # plt.show(), so the figure is saved explicitly below.
    plt.figure(figsize=(15, 5))
    sc.pl.umap(
        adata,
        color=cluster_keys,
        legend_loc="on data",
        show=False,
    )
    multiresolution_path = OUTPUT_DIR / f"{out_prefix}_multiresolution_clusters.png"
    plt.savefig(multiresolution_path, dpi=300, bbox_inches='tight')
    # Close every open figure, not just the current one: scanpy may draw on a
    # figure of its own, which would leave the placeholder created above open.
    plt.close("all")

    # If groupby_key is not an obs column, fall back to the second resolution
    # when there is one, otherwise to the only resolution
    if groupby_key not in adata.obs.columns:
        groupby_key = cluster_keys[1] if len(cluster_keys) > 1 else cluster_keys[0]

    # Keep only marker genes present in the data; dict.fromkeys drops repeats
    # within a cell type (order preserved) so no gene is plotted twice per group
    available_markers = {}
    for cell_type, genes in marker_genes.items():
        present_genes = [g for g in dict.fromkeys(genes) if g in adata.var_names]
        if present_genes:
            available_markers[cell_type] = present_genes

    marker_artifacts = []
    if available_markers:
        plt.figure(figsize=(12, 8))
        sc.pl.dotplot(
            adata, available_markers, groupby=groupby_key, standard_scale="var", show=False
        )
        marker_path = OUTPUT_DIR / f"{out_prefix}_marker_genes.png"
        plt.savefig(marker_path, dpi=300, bbox_inches='tight')
        plt.close("all")
        marker_artifacts.append(
            {"description": "Marker genes dotplot", "path": str(marker_path.resolve())}
        )

    # Differential expression analysis between the groups of groupby_key
    sc.tl.rank_genes_groups(adata, groupby=groupby_key, method=method)

    # Plot top differentially expressed genes
    plt.figure(figsize=(10, 8))
    sc.pl.rank_genes_groups_dotplot(
        adata, groupby=groupby_key, standard_scale="var", n_genes=n_genes, show=False
    )
    de_path = OUTPUT_DIR / f"{out_prefix}_differential_expression.png"
    plt.savefig(de_path, dpi=300, bbox_inches='tight')
    plt.close("all")

    # Create manual cell type annotations for coarse resolution.
    # NOTE(review): this cluster-id -> label mapping is hard-coded; presumably it
    # only fits the tutorial dataset. Clusters outside the mapping become missing
    # values — confirm before relying on cell_type_lvl1 for other data.
    coarse_key = cluster_keys[0]
    if coarse_key in adata.obs.columns:
        adata.obs["cell_type_lvl1"] = adata.obs[coarse_key].map({
            "0": "Lymphocytes",
            "1": "Monocytes",
            "2": "Erythroid",
            "3": "B Cells",
        })

    # Save annotated data
    output_file = OUTPUT_DIR / f"{out_prefix}_annotated.h5ad"
    adata.write_h5ad(output_file)

    # Export the top n_genes per cluster; assign() returns a new frame instead of
    # writing a column onto a head() slice
    de_results = [
        sc.get.rank_genes_groups_df(adata, group=cluster).head(n_genes).assign(cluster=cluster)
        for cluster in adata.obs[groupby_key].unique()
    ]
    de_artifacts = []
    if de_results:
        de_df = pd.concat(de_results, ignore_index=True)
        de_path_csv = OUTPUT_DIR / f"{out_prefix}_differential_genes.csv"
        de_df.to_csv(de_path_csv, index=False)
        de_artifacts.append(
            {"description": "Differential expression genes", "path": str(de_path_csv.resolve())}
        )

    # Create annotation summary: number of clusters found at each resolution
    annotation_summary = pd.DataFrame({
        'resolution': resolutions,
        'n_clusters': [len(adata.obs[key].unique()) for key in cluster_keys],
    })
    summary_path = OUTPUT_DIR / f"{out_prefix}_annotation_summary.csv"
    annotation_summary.to_csv(summary_path, index=False)

    artifacts = [
        {
            "description": "Multi-resolution clustering",
            "path": str(multiresolution_path.resolve()),
        },
        {
            "description": "Differential expression plot",
            "path": str(de_path.resolve()),
        },
        {
            "description": "Annotated data",
            "path": str(output_file.resolve()),
        },
        {
            "description": "Annotation summary",
            "path": str(summary_path.resolve()),
        },
    ] + marker_artifacts + de_artifacts

    return {
        "message": f"Cell type annotation completed with {len(resolutions)} resolutions and marker analysis",
        "reference": "https://github.com/scverse/scanpy/tree/main/docs/tutorials/basics/clustering.ipynb",
        "artifacts": artifacts,
    }
def preprocess_and_cluster_scanpy(data_path: str) -> str:
    """
    Complete preprocessing and clustering pipeline for single-cell RNA sequencing data analysis.
    This comprehensive workflow performs all essential steps for analyzing scRNA-seq data from raw counts
    to cell type annotation, following the standard Scanpy tutorial for single-cell analysis.

    Args:
        data_path: Location of the dataset; it is interpolated verbatim into the prompt.

    Returns:
        The prompt text that walks through the seven pipeline tools in order.
    """
    # The mouse mitochondrial prefix is lowercase ("mt-"): quality_control matches
    # mt_prefix with a case-sensitive startswith, so "Mt-" would flag no genes.
    return f"""
Execute a complete single-cell RNA-seq preprocessing and clustering pipeline on {data_path}.
First inspect the data to understand:
- Dataset size and complexity
- Organism (human/mouse) from gene names
- Batch information in adata.obs (e.g., "sample", "batch", "donor", "experiment", "condition")
- Data quality distribution
IMPORTANT: Adapt parameters intelligently based on data characteristics.
Stick to the defaults if there is no strong reason (e.g. unchanged leads to false results) to change.
Then run the pipeline sequentially, making smart parameter choices:
1. **quality_control** - Examine data and adapt:
- data_path="{data_path}"
- batch_key: Set if batch columns exist (for batch-aware doublet detection)
- mt_prefix: "MT-" (human) or "mt-" (mouse) based on gene names
- min_genes/min_cells: Adjust based on quality distributions
- Review QC plots before proceeding
2. **normalize_data** - Use QC output:
- target_sum: None (median) or 10000 (CP10K)
3. **select_features** - Feature selection:
- batch_key: Use same as step 1 if batches present
- n_top_genes: 2000-3000 based on complexity
- flavor: "seurat" or "seurat_v3" for high dropout
4. **reduce_dimensionality** - PCA analysis:
- n_comps: 50 (or less for small datasets)
- Review variance plot for optimal PC selection
- color_vars: Include relevant metadata
5. **build_neighborhood_graph** - Graph construction:
- n_pcs: Based on elbow in variance plot (20-40)
- n_neighbors: 10-30 based on dataset size
- Check UMAP for batch effects
6. **cluster_cells** - Clustering:
- resolution: 0.1-0.4 (broad) or 0.6-1.5 (fine)
- Based on expected cell type diversity
7. **annotate_cell_types** - Annotation:
- resolutions: Test multiple [low, medium, high]
- marker_genes: Provide tissue-specific markers if known
- Validate with marker expression
KEY DECISIONS:
- Identify and consistently use batch_key throughout if batches exist
- Adjust all thresholds based on data quality
- Validate each step before proceeding
- Document any anomalies or batch effects
The pipeline produces a fully annotated dataset with QC metrics, embeddings, clusters, and cell type markers.
"""