Julien Blanchon committed
Commit
d0e893e
0 Parent(s):

Clean Space repo (code only, checkpoints in model repo)

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +39 -0
  2. .gitignore +183 -0
  3. .python-version +1 -0
  4. README.md +280 -0
  5. app.py +343 -0
  6. configs/c2i/tim_b_p4.yaml +78 -0
  7. configs/c2i/tim_xl_p1_512.yaml +85 -0
  8. configs/c2i/tim_xl_p1_512_mg.yaml +85 -0
  9. configs/c2i/tim_xl_p2_256.yaml +85 -0
  10. configs/c2i/tim_xl_p2_256_mg.yaml +85 -0
  11. configs/t2i/tim_xl_p1_t2i.yaml +81 -0
  12. pyproject.toml +31 -0
  13. requirements.txt +15 -0
  14. setup.py +12 -0
  15. tim/data/c2i_data.py +150 -0
  16. tim/data/sampler_utils.py +52 -0
  17. tim/data/t2i_data.py +126 -0
  18. tim/models/c2i/tim_model.py +406 -0
  19. tim/models/nvidia_radio/hubconf.py +192 -0
  20. tim/models/nvidia_radio/radio/__init__.py +17 -0
  21. tim/models/nvidia_radio/radio/adaptor_base.py +37 -0
  22. tim/models/nvidia_radio/radio/adaptor_generic.py +69 -0
  23. tim/models/nvidia_radio/radio/adaptor_mlp.py +174 -0
  24. tim/models/nvidia_radio/radio/adaptor_registry.py +37 -0
  25. tim/models/nvidia_radio/radio/block.py +54 -0
  26. tim/models/nvidia_radio/radio/cls_token.py +59 -0
  27. tim/models/nvidia_radio/radio/common.py +108 -0
  28. tim/models/nvidia_radio/radio/conv.py +65 -0
  29. tim/models/nvidia_radio/radio/dinov2_arch.py +1016 -0
  30. tim/models/nvidia_radio/radio/dual_hybrid_vit.py +213 -0
  31. tim/models/nvidia_radio/radio/enable_cpe_support.py +224 -0
  32. tim/models/nvidia_radio/radio/enable_damp.py +42 -0
  33. tim/models/nvidia_radio/radio/enable_spectral_reparam.py +277 -0
  34. tim/models/nvidia_radio/radio/eradio_model.py +1392 -0
  35. tim/models/nvidia_radio/radio/extra_models.py +206 -0
  36. tim/models/nvidia_radio/radio/extra_timm_models.py +206 -0
  37. tim/models/nvidia_radio/radio/feature_normalizer.py +111 -0
  38. tim/models/nvidia_radio/radio/forward_intermediates.py +138 -0
  39. tim/models/nvidia_radio/radio/hf_model.py +202 -0
  40. tim/models/nvidia_radio/radio/input_conditioner.py +49 -0
  41. tim/models/nvidia_radio/radio/open_clip_adaptor.py +41 -0
  42. tim/models/nvidia_radio/radio/radio_model.py +375 -0
  43. tim/models/nvidia_radio/radio/vision_transformer_xpos.py +357 -0
  44. tim/models/nvidia_radio/radio/vit_patch_generator.py +287 -0
  45. tim/models/nvidia_radio/radio/vitdet.py +188 -0
  46. tim/models/t2i/tim_model.py +493 -0
  47. tim/models/utils/funcs.py +53 -0
  48. tim/models/utils/norms.py +403 -0
  49. tim/models/utils/rope.py +305 -0
  50. tim/models/utils/text_encoders.py +63 -0
.gitattributes ADDED
@@ -0,0 +1,39 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoints/** filter=lfs diff=lfs merge=lfs -text
37
+ checkpoints/c2i_model_256.safetensors filter=lfs diff=lfs merge=lfs -text
38
+ checkpoints/c2i_model_512.safetensors filter=lfs diff=lfs merge=lfs -text
39
+ checkpoints/t2i_model.bin filter=lfs diff=lfs merge=lfs -text
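The commit message says the checkpoints now live in the model repos rather than in this Space, even though the LFS rules above still cover `checkpoints/**`. A minimal sketch, assuming the standard `huggingface_hub` client and the checkpoint filenames listed in the README, for pulling them into `checkpoints/` locally:

```python
from huggingface_hub import hf_hub_download  # standard Hub client; not part of this commit

# Fetch the released weights from the model repos referenced in the README.
hf_hub_download(repo_id="GoodEnough/TiM-T2I", filename="t2i_model.bin", local_dir="checkpoints")
hf_hub_download(repo_id="GoodEnough/TiM-C2I", filename="c2i_model_256.safetensors", local_dir="checkpoints")
hf_hub_download(repo_id="GoodEnough/TiM-C2I", filename="c2i_model_512.safetensors", local_dir="checkpoints")
```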
.gitignore ADDED
@@ -0,0 +1,183 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
175
+
176
+
177
+ *.json
178
+ *.svg
179
+ /workdir
180
+ /datasets
181
+ /wandb
182
+
183
+ samples/
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.10
README.md ADDED
@@ -0,0 +1,280 @@
1
+ ---
2
+ title: TiM
3
+ emoji: 🏆
4
+ colorFrom: blue
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.44.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ <h1 align="center">Transition Models: Rethinking the Generative Learning Objective</h1>
13
+
14
+
15
+
16
+ <div align="center">
17
+ <a href="https://github.com/WZDTHU" target="_blank">ZiDong&nbsp;Wang</a><sup>1,2,*</sup>
18
+ &ensp; <b>&middot;</b> &ensp;
19
+ <a href="https://invictus717.github.io" target="_blank">Yiyuan&nbsp;Zhang</a><sup>1,2,*,‡</sup>
20
+ &ensp; <b>&middot;</b> &ensp;
21
+ <a href="https://yuexy.github.io/" target="_blank">Xiaoyu&nbsp;Yue</a><sup>2,3</sup>
22
+ &ensp; <b>&middot;</b> &ensp;
23
+ <a href="https://xyue.io" target="_blank">Xiangyu&nbsp;Yue</a><sup>1</sup>
24
+ &ensp; <b>&middot;</b> &ensp;
25
+ <a href="https://yg256li.github.io" target="_blank">Yangguang&nbsp;Li</a><sup>1,†</sup>
26
+ &ensp; <b>&middot;</b> &ensp;
27
+ <a href="https://wlouyang.github.io" target="_blank">Wanli&nbsp;Ouyang</a><sup>1,2</sup>
28
+ &ensp; <b>&middot;</b> &ensp;
29
+ <a href="http://leibai.site" target="_blank">Lei&nbsp;Bai</a><sup>2,†</sup>
30
+
31
+ <sup>1</sup> MMLab CUHK &emsp; <sup>2</sup>Shanghai AI Lab &emsp; <sup>3</sup>USYD <br>
32
+ <sup>*</sup>Equal Contribution &emsp; <sup>‡</sup>Project Lead &emsp; <sup>†</sup>Corresponding Authors &emsp; <br>
33
+ </div>
34
+
35
+
36
+
37
+ <h3 align="center">
38
+ <!-- [<a href="https://wzdthu.github.io/NiT">project page</a>]&emsp; -->
39
+ [<a href="https://arxiv.org/abs/2509.04394">arXiv</a>]&emsp;
40
+ [<a href="https://huggingface.co/GoodEnough/TiM-T2I">Model</a>]&emsp;
41
+ [<a href="https://huggingface.co/datasets/GoodEnough/TiM-Toy-T2I-Dataset">Dataset</a>]&emsp;
42
+ </h3>
43
+ <br>
44
+
45
+ <b>Highlights</b>: We propose Transition Models (TiM), a novel generative model that learns to navigate the entire generative trajectory with unprecedented flexibility.
46
+ * Our Transition Models (TiM) are trained to master arbitrary state-to-state transitions. This approach allows TiM to learn the entire solution manifold of the generative process, unifying the few-step and many-step regimes within a single, powerful model.
47
+ ![Figure](./assets/illustration.png)
48
+ * Despite having only 865M parameters, TiM achieves state-of-the-art performance, surpassing leading models such as SD3.5 (8B parameters) and FLUX.1 (12B parameters) across all evaluated step counts on the GenEval benchmark. Importantly, unlike previous few-step generators, TiM demonstrates monotonic quality improvement as the sampling budget increases.
49
+ ![Figure](./assets/nfe_demo.png)
50
+ * Additionally, when employing our native-resolution strategy, TiM delivers exceptional fidelity at resolutions up to $4096\times4096$.
51
+ ![Figure](./assets/tim_demo.png)
52
+
53
+
54
+ ## 🚨 News
55
+
56
+ - `2025-9-5` We are delighted to introduce TiM, the first text-to-image generator to support any-step generation, trained entirely from scratch. We have released the code and pretrained models of TiM.
57
+
58
+
59
+
60
+ ## 1. Setup
61
+
62
+ First, clone the repo:
63
+ ```bash
64
+ git clone https://github.com/WZDTHU/TiM.git && cd TiM
65
+ ```
66
+
67
+ ### 1.1 Environment Setup
68
+
69
+ ```bash
70
+ conda create -n tim_env python=3.10
71
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu118
72
+ pip install flash-attn
73
+ pip install -r requirements.txt
74
+ pip install -e .
75
+ ```
76
+
77
+
78
+ ### 1.2 Model Zoo (WIP)
79
+
80
+
81
+ #### Text-to-Image Generation
82
+
83
+ A single TiM model can perform any-step generation (one-step, few-step, and multi-step) and demonstrate monotonic quality improvement as the sampling budget increases.
84
+ | Model | Model Zoo | Model Size | VAE | 1-NFE GenEval | 8-NFE GenEval | 128-NFE GenEval |
85
+ |---------------|------------|---------|------------|-------|-------|-------|
86
+ | TiM-T2I | [🤗 HF](https://huggingface.co/GoodEnough/TiM-T2I/blob/main/t2i_model.bin) | 865M | [DC-AE](https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers) | 0.67 | 0.76 | 0.83 |
87
+
88
+
89
+
90
+ ```bash
91
+ mkdir checkpoints
92
+ wget -c "https://huggingface.co/GoodEnough/TiM-T2I/resolve/main/t2i_model.bin" -O checkpoints/t2i_model.bin
93
+ ```
94
+
95
+
96
+ #### Class-guided Image Generation
97
+
98
+ | Model | Model Zoo | Model Size | VAE | 2-NFE FID | 500-NFE FID |
99
+ |---------------|------------|---------|------------|------------|------------|
100
+ | TiM-C2I-256 | [🤗 HF](https://huggingface.co/GoodEnough/TiM-C2I/blob/main/c2i_model_256.safetensors) | 664M | [SD-VAE](https://huggingface.co/stabilityai/sd-vae-ft-ema) | 6.14 | 1.65
101
+ | TiM-C2I-512 | [🤗 HF](https://huggingface.co/GoodEnough/TiM-C2I/blob/main/c2i_model_512.safetensors) | 664M | [DC-AE](https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers) | 4.79 | 1.69
102
+
103
+
104
+ ```bash
105
+ mkdir checkpoints
106
+ wget -c "https://huggingface.co/GoodEnough/TiM-C2I/resolve/main/c2i_model_256.safetensors" -O checkpoints/c2i_model_256.safetensors
107
+ wget -c "https://huggingface.co/GoodEnough/TiM-C2I/resolve/main/c2i_model_512.safetensors" -O checkpoints/c2i_model_512.safetensors
108
+ ```
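A quick sanity check after downloading, sketched here with `safetensors` (the count should land near the 664M reported in the table; the exact number depends on what the checkpoint stores):

```python
from safetensors.torch import load_file

state_dict = load_file("checkpoints/c2i_model_256.safetensors")
num_params = sum(t.numel() for t in state_dict.values())
print(f"{num_params / 1e6:.0f}M tensor elements in the checkpoint")
```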
109
+
110
+
111
+ ## 2. Sampling
112
+
113
+ #### Text-to-Image Generation
114
+
115
+ We provide sampling scripts for three benchmarks: GenEval, DPGBench, and MJHQ30K. You can specify the sampling steps, resolution, and CFG scale in the corresponding scripts.
116
+
117
+ Sampling with TiM-T2I model on GenEval benchmark:
118
+ ```bash
119
+ bash scripts/sample/t2i/sample_t2i_geneval.sh
120
+ ```
121
+
122
+ Sampling with TiM-T2I model on DPGBench benchmark:
123
+ ```bash
124
+ bash scripts/sample/t2i/sample_t2i_dpgbench.sh
125
+ ```
126
+
127
+ Sampling with TiM-T2I model on MJHQ30k benchmark:
128
+ ```bash
129
+ bash scripts/sample/t2i/sample_t2i_mjhq30k.sh
130
+ ```
131
+
132
+ #### Class-guided Image Generation
133
+
134
+ We provide the sampling scripts for ImageNet-256 and ImageNet-512.
135
+
136
+ Sampling with C2I model on $256\times256$ resolution:
137
+ ```bash
138
+ bash scripts/sample/c2i/sample_256x256.sh
139
+ ```
140
+
141
+ Sampling with C2I model on $512\times512$ resolution:
142
+ ```bash
143
+ bash scripts/sample/c2i/sample_512x512.sh
144
+ ```
145
+
146
+
147
+ ## 3. Evaluation
148
+
149
+
150
+ ### Text-to-Image Generation
151
+
152
+ #### GenEval
153
+
154
+ Please follow [GenEval](https://github.com/djghosh13/geneval) to set up the conda environment.
155
+
156
+ Given the generated-image directory `SAMPLING_DIR` and the object-detector folder `OBJECT_DETECTOR_FOLDER`, run the following command:
157
+ ```bash
158
+ python projects/evaluate/geneval/evaluation/evaluate_images.py $SAMPLING_DIR --outfile geneval_results.jsonl --model-path $OBJECT_DETECTOR_FOLDER
159
+ ```
160
+ This will result in a JSONL file with one line per image. Run the following command to obtain the GenEval score:
161
+ ```bash
162
+ python projects/evaluate/geneval/evaluation/summary_scores.py geneval_results.jsonl
163
+ ```
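For a quick look at the per-line results without the summary script, the sketch below tallies the JSONL directly; the `tag` and `correct` field names are assumptions about GenEval's output format, so prefer `summary_scores.py` for reported numbers:

```python
import json
from collections import Counter, defaultdict

totals, correct = Counter(), defaultdict(int)
with open("geneval_results.jsonl") as fp:
    for line in fp:
        rec = json.loads(line)
        totals[rec["tag"]] += 1                       # assumed field: task category
        correct[rec["tag"]] += bool(rec["correct"])   # assumed field: per-image pass/fail
for tag in totals:
    print(tag, correct[tag] / totals[tag])
```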
164
+
165
+
166
+ #### DPGBench
167
+ Please follow [DPGBench](https://github.com/TencentQQGYLab/ELLA) to set up the conda environment.
168
+ Given the generated-image directory `SAMPLING_DIR`, run the following command:
169
+ ```bash
170
+ python projects/evaluate/dpg_bench/compute_dpg_bench.py --image-root-path $SAMPLING_DIR --res-path dpgbench_results.txt --pic-num 4
171
+ ```
172
+
173
+ #### MJHQ30K
174
+ Please download [MJHQ30K](https://huggingface.co/datasets/playgroundai/MJHQ-30K) as the reference images.
175
+
176
+
177
+ Given the reference-image directory `REFERENCE_DIR` and the generated-image directory `SAMPLING_DIR`, run the following command to calculate the FID score:
178
+ ```bash
179
+ python projects/evaluate/mjhq30k/calculate_fid.py $REFERENCE_DIR $SAMPLING_DIR
180
+ ```
181
+
182
+ For the CLIP score, first compute the text features and save them in `MJHQ30K_TEXT_FEAT`:
183
+ ```bash
184
+ python projects/evaluate/mjhq30k/calculate_clip.py projects/evaluate/mjhq30k/meta_data.json $MJHQ30K_TEXT_FEAT/clip_feat.safetensors --save-stats
185
+ ```
186
+ Then run the following command to calculate the CLIP score:
187
+ ```bash
188
+ python projects/evaluate/mjhq30k/calculate_clip.py $MJHQ30K_TEXT_FEAT/clip_feat.safetensors $SAMPLING_DIR
189
+ ```
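For reference, the CLIP score is just a scaled, non-negative cosine similarity between CLIP image and text embeddings. A self-contained sketch with a `transformers` CLIP checkpoint is shown below; the repo's `calculate_clip.py` may use a different CLIP model and caches text features in a safetensors file instead:

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

image = Image.open("sample.png")  # hypothetical generated sample
prompt = "a majestic lion in a golden savanna at sunset"
inputs = processor(text=[prompt], images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    img_feat = model.get_image_features(pixel_values=inputs["pixel_values"])
    txt_feat = model.get_text_features(input_ids=inputs["input_ids"],
                                       attention_mask=inputs["attention_mask"])
clip_score = 100 * torch.nn.functional.cosine_similarity(img_feat, txt_feat).clamp(min=0)
print(clip_score.item())
```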
190
+
191
+
192
+
193
+ ### Class-guided Image Generation
194
+
195
+ The sampling scripts generate a folder of samples used to compute FID, Inception Score, and other metrics.
196
+ <b>Note that we do not pack the generated samples into a `.npz` file; this does not affect the calculation of FID or other metrics.</b>
197
+ Please follow the [ADM's TensorFlow
198
+ evaluation suite](https://github.com/openai/guided-diffusion/tree/main/evaluations)
199
+ to set up the conda environment and download the reference batch.
200
+
201
+ ```bash
202
+ wget -c "https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/classify_image_graph_def.pb" -O checkpoints/classify_image_graph_def.pb
203
+ ```
204
+
205
+
206
+ Given the reference-batch directory `REFERENCE_DIR` and the generated-image directory `SAMPLING_DIR`, run the following command:
207
+ ```bash
208
+ python projects/evaluate/adm_evaluator.py $REFERENCE_DIR $SAMPLING_DIR
209
+ ```
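If you ever do need an `.npz` batch (for example to reuse ADM tooling that expects one), packing the sample folder is straightforward; the `arr_0` key mirrors the layout of ADM's reference batches and is an assumption here:

```python
import numpy as np
from pathlib import Path
from PIL import Image

def pack_samples_to_npz(sample_dir: str, out_path: str) -> None:
    # Stack all PNG samples into one (N, H, W, 3) uint8 array and save it.
    images = [np.asarray(Image.open(p).convert("RGB"), dtype=np.uint8)
              for p in sorted(Path(sample_dir).glob("*.png"))]
    np.savez(out_path, arr_0=np.stack(images))

pack_samples_to_npz("samples/imagenet_256", "samples_256.npz")  # hypothetical paths
```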
210
+
211
+
212
+
213
+
214
+
215
+ ## 4. Training
216
+
217
+ ### 4.1 Dataset Setup
218
+
219
+ Currently, we provide the complete [preprocessed dataset](https://huggingface.co/datasets/GoodEnough/NiT-Preprocessed-ImageNet1K) for ImageNet1K. Please use the following commands to download the preprocessed latents.
220
+
221
+ ```bash
222
+ bash tools/download_imagenet_256x256.sh
223
+ bash tools/download_imagenet_512x512.sh
224
+ ```
225
+
226
+ For text-to-image generation, we provide a [toy dataset](https://huggingface.co/datasets/GoodEnough/TiM-Toy-T2I-Dataset). Please use the following command to download this dataset.
227
+ ```bash
228
+ bash tools/download_toy_t2i_dataset.sh
229
+ ```
230
+
231
+
232
+ ### 4.2 Download Image Encoder
233
+
234
+ We use RADIO-v2.5-b as the image encoder for the REPA loss.
235
+
236
+ ```bash
237
+ wget -c "https://huggingface.co/nvidia/RADIO/resolve/main/radio-v2.5-b_half.pth.tar" -O checkpoints/radio-v2.5-b_half.pth.tar
238
+ ```
239
+
240
+
241
+ ### 4.3 Training Scripts
242
+
243
+ Specify the `image_dir` in `configs/c2i/tim_b_p4.yaml` and train the base-model (131M) on ImageNet-256:
244
+ ```bash
245
+ bash scripts/train/c2i/train_tim_c2i_b.sh
246
+ ```
247
+
248
+ Specify the `image_dir` in `configs/c2i/tim_xl_p2_256.yaml` and train the XL-model (664M) on ImageNet-256:
249
+ ```bash
250
+ bash scripts/train/c2i/train_tim_c2i_xl_256.sh
251
+ ```
252
+
253
+ Specify the `image_dir` in `configs/c2i/tim_xl_p1_512.yaml` and train the XL-model (664M) on ImageNet-512:
254
+ ```bash
255
+ bash scripts/train/c2i/train_tim_c2i_xl_512.sh
256
+ ```
257
+
258
+ Specify the `root_dir` in `configs/t2i/tim_xl_p1_t2i.yaml` and train the T2I-model (865M) on Toy-T2I-Dataset:
259
+ ```bash
260
+ bash scripts/train/t2i/train_tim_t2i.sh
261
+ ```
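The training scripts drive everything through these YAML configs. As a rough illustration of how a config is consumed (the same `OmegaConf` + `instantiate_from_config` pattern appears in `app.py`), here is a sketch that loads the base C2I config, points it at a local ImageNet folder, and builds the network; the actual training loop is launched by the bash scripts above:

```python
from omegaconf import OmegaConf
from tim.utils.misc_utils import instantiate_from_config

config = OmegaConf.load("configs/c2i/tim_b_p4.yaml")
config.data.dataset.image_dir = "/data/imagenet1k/images/train"  # your local path
network = instantiate_from_config(config.model.network)
print(f"{sum(p.numel() for p in network.parameters()) / 1e6:.0f}M parameters")
```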
262
+
263
+
264
+
265
+
266
+ ## Citations
267
+ If you find the project useful, please kindly cite:
268
+ ```bibtex
269
+ @article{wang2025transition,
270
+ title={Transition Models: Rethinking the Generative Learning Objective},
271
+ author={Wang, Zidong and Zhang, Yiyuan and Yue, Xiaoyu and Yue, Xiangyu and Li, Yangguang and Ouyang, Wanli and Bai, Lei},
272
+ year={2025},
273
+ eprint={2509.04394},
274
+ archivePrefix={arXiv},
275
+ primaryClass={cs.LG}
276
+ }
277
+ ```
278
279
+ ## License
280
+ This project is licensed under the Apache-2.0 license.
app.py ADDED
@@ -0,0 +1,343 @@
1
+ import gradio as gr
2
+ import spaces # type: ignore - ZeroGPU spaces library
3
+ import numpy as np
4
+ import random
5
+ import torch
6
+ import functools
7
+ from pathlib import Path
8
+ from PIL import Image
9
+ from omegaconf import OmegaConf # type: ignore - YAML configuration library
10
+ from tim.schedulers.transition import TransitionSchedule
11
+ from tim.utils.misc_utils import instantiate_from_config, init_from_ckpt
12
+ from tim.models.vae import get_sd_vae, get_dc_ae, sd_vae_decode, dc_ae_decode
13
+ from tim.models.utils.text_encoders import load_text_encoder, encode_prompt
14
+
15
+ # Configuration
16
+ dtype = torch.bfloat16
17
+ device = "cuda" if torch.cuda.is_available() else "cpu"
18
+ MAX_SEED = np.iinfo(np.int32).max
19
+ MAX_IMAGE_SIZE = 2048
20
+
21
+ # Global variables to store loaded components
22
+ model = None
23
+ scheduler = None
24
+ text_encoder = None
25
+ tokenizer = None
26
+ decode_func = None
27
+ null_cap_feat = None
28
+ null_cap_mask = None
29
+ config = None
30
+
31
+
32
+ def load_model_components(device: str = "cuda"):
33
+ """Load all model components once at startup"""
34
+ global \
35
+ model, \
36
+ scheduler, \
37
+ text_encoder, \
38
+ tokenizer, \
39
+ decode_func, \
40
+ null_cap_feat, \
41
+ null_cap_mask, \
42
+ config
43
+
44
+ try:
45
+ # Load configuration
46
+ config_path = "configs/t2i/tim_xl_p1_t2i.yaml"
47
+ ckpt_path = "checkpoints/t2i_model.bin"
48
+
49
+ if not Path(config_path).exists():
50
+ raise FileNotFoundError(f"Config file not found: {config_path}")
51
+ if not Path(ckpt_path).exists():
52
+ raise FileNotFoundError(f"Checkpoint file not found: {ckpt_path}")
53
+
54
+ print("Loading configuration...")
55
+ config = OmegaConf.load(config_path)
56
+ model_config = config.model
57
+
58
+ print("Loading VAE...")
59
+ # Load VAE
60
+ if "dc-ae" in model_config.vae_dir:
61
+ dc_ae = get_dc_ae(model_config.vae_dir, dtype=torch.float32, device=device)
62
+ dc_ae.enable_tiling(2560, 2560, 2560, 2560)
63
+ decode_func = functools.partial(dc_ae_decode, dc_ae, slice_vae=True)
64
+ elif "sd-vae" in model_config.vae_dir:
65
+ sd_vae = get_sd_vae(
66
+ model_config.vae_dir, dtype=torch.float32, device=device
67
+ )
68
+ decode_func = functools.partial(sd_vae_decode, sd_vae, slice_vae=True)
69
+ else:
70
+ raise ValueError("Unsupported VAE type")
71
+
72
+ print("Loading text encoder...")
73
+ # Load text encoder
74
+ text_encoder, tokenizer = load_text_encoder(
75
+ text_encoder_dir=model_config.text_encoder_dir,
76
+ device=device,
77
+ weight_dtype=torch.bfloat16,
78
+ )
79
+
80
+ print("Encoding null caption...")
81
+ # Get null caption features
82
+ null_cap_feat, null_cap_mask = encode_prompt(
83
+ tokenizer,
84
+ text_encoder,
85
+ device,
86
+ torch.bfloat16,
87
+ [""],
88
+ model_config.use_last_hidden_state,
89
+ max_seq_length=model_config.max_seq_length,
90
+ )
91
+
92
+ print("Loading main model...")
93
+ # Load main model
94
+ model = instantiate_from_config(model_config.network).to(
95
+ device=device, dtype=dtype
96
+ )
97
+ init_from_ckpt(model, checkpoint_dir=ckpt_path, ignore_keys=None, verbose=True)
98
+ model.eval()
99
+
100
+ print("Loading scheduler...")
101
+ # Load scheduler
102
+ transport = instantiate_from_config(model_config.transport)
103
+ scheduler = TransitionSchedule(
104
+ transport=transport, **OmegaConf.to_container(model_config.transition_loss)
105
+ )
106
+
107
+ print("All components loaded successfully!")
108
+
109
+ except Exception as e:
110
+ print(f"Error loading model components: {e}")
111
+ raise e
112
+
113
+
114
+ @spaces.GPU(duration=60)
115
+ def generate_image(
116
+ prompt,
117
+ seed=42,
118
+ randomize_seed=False,
119
+ width=1024,
120
+ height=1024,
121
+ guidance_scale=2.5,
122
+ num_inference_steps=16,
123
+ progress=gr.Progress(track_tqdm=True),
124
+ ):
125
+ """Generate image from text prompt"""
126
+ try:
127
+ # Validate inputs
128
+ if not prompt or len(prompt.strip()) == 0:
129
+ raise ValueError("Please enter a valid prompt")
130
+
131
+ if model is None or scheduler is None:
132
+ raise RuntimeError("Model components not loaded. Please check the setup.")
133
+
134
+ # Validate dimensions
135
+ if (
136
+ width < 256
137
+ or width > MAX_IMAGE_SIZE
138
+ or height < 256
139
+ or height > MAX_IMAGE_SIZE
140
+ ):
141
+ raise ValueError(
142
+ f"Image dimensions must be between 256 and {MAX_IMAGE_SIZE}"
143
+ )
144
+
145
+ if width % 32 != 0 or height % 32 != 0:
146
+ raise ValueError("Image dimensions must be divisible by 32")
147
+
148
+ if randomize_seed:
149
+ seed = random.randint(0, MAX_SEED)
150
+
151
+ generator = torch.Generator(device=device).manual_seed(seed)
152
+
153
+ # Calculate latent dimensions
154
+ spatial_downsample = 32 if "dc-ae" in config.model.vae_dir else 8
155
+ latent_h = int(height / spatial_downsample)
156
+ latent_w = int(width / spatial_downsample)
157
+
158
+ progress(0.1, desc="Generating random latent...")
159
+
160
+ # Generate random latent
161
+ z = torch.randn(
162
+ (1, model.in_channels, latent_h, latent_w),
163
+ device=device,
164
+ dtype=dtype,
165
+ generator=generator,
166
+ )
167
+
168
+ progress(0.1, desc="Encoding prompt...")
169
+
170
+ # Encode prompt
171
+ cap_features, cap_mask = encode_prompt(
172
+ tokenizer,
173
+ text_encoder,
174
+ device,
175
+ dtype,
176
+ [prompt],
177
+ config.model.use_last_hidden_state,
178
+ max_seq_length=config.model.max_seq_length,
179
+ )
180
+
181
+ cur_max_seq_len = cap_mask.sum(dim=-1).max()
182
+ y = cap_features[:, :cur_max_seq_len]
183
+
184
+ y_null = null_cap_feat[:, :cur_max_seq_len]
185
+ y_null = y_null.expand(y.shape[0], cur_max_seq_len, null_cap_feat.shape[-1])
186
+
187
+ # Generate image
188
+ with torch.no_grad():
189
+ samples = scheduler.sample(
190
+ model,
191
+ y,
192
+ y_null,
193
+ z,
194
+ T_max=1.0,
195
+ T_min=0.0,
196
+ num_steps=num_inference_steps,
197
+ cfg_scale=guidance_scale,
198
+ cfg_low=0.0,
199
+ cfg_high=1.0,
200
+ stochasticity_ratio=0.0,
201
+ sample_type="transition",
202
+ step_callback=lambda step: progress(
203
+ 0.1 + 0.9 * (step / num_inference_steps), desc="Generating image..."
204
+ ),
205
+ )[-1]
206
+ samples = samples.to(torch.float32)
207
+
208
+ # Decode to image
209
+ images = decode_func(samples)
210
+ images = (
211
+ torch.clamp(127.5 * images + 128.0, 0, 255)
212
+ .permute(0, 2, 3, 1)
213
+ .to(torch.uint8)
214
+ .contiguous()
215
+ )
216
+ image = Image.fromarray(images[0].cpu().numpy())
217
+
218
+ progress(1.0, desc="Complete!")
219
+
220
+ return image, seed
221
+
222
+ except Exception as e:
223
+ print(f"Error during image generation: {e}")
224
+ # Return a placeholder image or error message
225
+ error_img = Image.new("RGB", (512, 512), color="red")
226
+ return error_img, seed
227
+
228
+
229
+ # Example prompts
230
+ examples = [
231
+ ["a tiny astronaut hatching from an egg on the moon"],
232
+ ["🐶 Wearing 🕶 flying on the 🌈"],
233
+ ["an anime illustration of a wiener schnitzel"],
234
+ ["a photorealistic landscape of mountains at sunset"],
235
+ ["a majestic lion in a golden savanna at sunset"],
236
+ ["a futuristic city with flying cars and neon lights"],
237
+ ["a cozy cabin in a snowy forest with smoke coming from the chimney"],
238
+ ["a beautiful mermaid swimming in crystal clear water"],
239
+ ]
240
+
241
+ # CSS styling
242
+ css = """
243
+ #col-container {
244
+ margin: 0 auto;
245
+ max-width: 520px;
246
+ }
247
+ """
248
+
249
+ # Initialize model components
250
+ try:
251
+ load_model_components(device)
252
+ print("Model components loaded successfully!")
253
+ except Exception as e:
254
+ print(f"Error loading model components: {e}")
255
+ print("Please ensure config and checkpoint files are available")
256
+
257
+ # Create Gradio interface
258
+ with gr.Blocks(css=css) as demo:
259
+ with gr.Column(elem_id="col-container"):
260
+ gr.Markdown("# TiM Text-to-Image Generator")
261
+ gr.Markdown(
262
+ "Generate high-quality images from text prompts using the TiM (Transition in Matching) model"
263
+ )
264
+
265
+ with gr.Row():
266
+ prompt = gr.Text(
267
+ label="Prompt",
268
+ show_label=False,
269
+ max_lines=1,
270
+ placeholder="Enter your prompt",
271
+ container=False,
272
+ )
273
+ run_button = gr.Button("Generate", scale=0)
274
+
275
+ result = gr.Image(label="Result", show_label=False)
276
+
277
+ with gr.Accordion("Advanced Settings", open=False):
278
+ seed = gr.Slider(
279
+ label="Seed",
280
+ minimum=0,
281
+ maximum=MAX_SEED,
282
+ step=1,
283
+ value=0,
284
+ )
285
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
286
+
287
+ with gr.Row():
288
+ width = gr.Slider(
289
+ label="Width",
290
+ minimum=256,
291
+ maximum=MAX_IMAGE_SIZE,
292
+ step=32,
293
+ value=1024,
294
+ )
295
+ height = gr.Slider(
296
+ label="Height",
297
+ minimum=256,
298
+ maximum=MAX_IMAGE_SIZE,
299
+ step=32,
300
+ value=1024,
301
+ )
302
+
303
+ with gr.Row():
304
+ guidance_scale = gr.Slider(
305
+ label="Guidance Scale",
306
+ minimum=1,
307
+ maximum=15,
308
+ step=0.1,
309
+ value=2.5,
310
+ )
311
+ num_inference_steps = gr.Slider(
312
+ label="Number of inference steps",
313
+ minimum=1,
314
+ maximum=50,
315
+ step=1,
316
+ value=16,
317
+ )
318
+
319
+ gr.Examples(
320
+ examples=examples,
321
+ fn=generate_image,
322
+ inputs=[prompt],
323
+ outputs=[result, seed],
324
+ cache_examples="lazy",
325
+ )
326
+
327
+ gr.on(
328
+ triggers=[run_button.click, prompt.submit],
329
+ fn=generate_image,
330
+ inputs=[
331
+ prompt,
332
+ seed,
333
+ randomize_seed,
334
+ width,
335
+ height,
336
+ guidance_scale,
337
+ num_inference_steps,
338
+ ],
339
+ outputs=[result, seed],
340
+ )
341
+
342
+ if __name__ == "__main__":
343
+ demo.launch()
configs/c2i/tim_b_p4.yaml ADDED
@@ -0,0 +1,78 @@
1
+ model:
2
+ transport:
3
+ target: tim.schedulers.transports.OT_FM
4
+ params:
5
+ P_mean: -0.4
6
+ P_std: 1.0
7
+ sigma_d: 1.0
8
+ transition_loss:
9
+ diffusion_ratio: 0.5
10
+ consistency_ratio: 0.1
11
+ derivative_type: dde
12
+ differential_epsilon: 0.005
13
+ weight_time_type: sqrt
14
+ weight_time_tangent: True
15
+ network:
16
+ target: tim.models.c2i.tim_model.TiM
17
+ params:
18
+ input_size: 32
19
+ patch_size: 4
20
+ in_channels: 4
21
+ class_dropout_prob: 0.1
22
+ num_classes: 1000
23
+ depth: 12
24
+ hidden_size: 768
25
+ num_heads: 12
26
+ encoder_depth: 4
27
+ qk_norm: True
28
+ z_dim: 768
29
+ new_condition: t-r
30
+ use_new_embed: True
31
+ distance_aware: True
32
+ lora_hidden_size: 256
33
+ # pretrained_vae:
34
+ vae_dir: stabilityai/sd-vae-ft-ema
35
+ # repa encoder
36
+ enc_dir: checkpoints/radio/radio-v2.5-b_half.pth.tar
37
+ proj_coeff: 1.0
38
+ # ema
39
+ use_ema: True
40
+ ema_decay: 0.9999
41
+
42
+ data:
43
+ data_type: latent
44
+ dataset:
45
+ latent_dir: datasets/imagenet1k/sd-vae-ft-ema-256x256
46
+ image_dir: datasets/imagenet1k/images/train
47
+ image_size: 256
48
+ dataloader:
49
+ num_workers: 16
50
+ batch_size: 256 # Batch size (per device) for the training dataloader.
51
+
52
+
53
+
54
+ training:
55
+ tracker: null
56
+ max_train_steps: 100000
57
+ checkpointing_steps: 2000
58
+ checkpoints_total_limit: 2
59
+ resume_from_checkpoint: latest
60
+ learning_rate: 1.0e-4
61
+ learning_rate_base_batch_size: 256
62
+ scale_lr: True
63
+ lr_scheduler: constant # "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
64
+ lr_warmup_steps: 0
65
+ gradient_accumulation_steps: 1
66
+ optimizer:
67
+ target: torch.optim.AdamW
68
+ params:
69
+ # betas: ${tuple:0.9, 0.999}
70
+ betas: [0.9, 0.95]
71
+ weight_decay: 1.0e-2
72
+ eps: 1.0e-6
73
+ max_grad_norm: 1.0
74
+ proportion_empty_prompts: 0.0
75
+ mixed_precision: bf16 # ["no", "fp16", "bf16"]
76
+ allow_tf32: True
77
+ validation_steps: 500
78
+ checkpoint_list: [100000, 200000, 300000]
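`scale_lr` together with `learning_rate_base_batch_size` suggests the learning rate is rescaled with the global batch size; the exact rule lives in the training script (not part of this commit), so the linear scaling below is an assumption:

```python
def effective_lr(base_lr: float = 1.0e-4, base_batch: int = 256,
                 per_device_batch: int = 256, num_devices: int = 1,
                 grad_accum: int = 1, scale_lr: bool = True) -> float:
    # Assumed linear-scaling reading of scale_lr / learning_rate_base_batch_size.
    global_batch = per_device_batch * num_devices * grad_accum
    return base_lr * global_batch / base_batch if scale_lr else base_lr

print(effective_lr(num_devices=4))  # 4e-4 on 4 GPUs under this assumption
```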
configs/c2i/tim_xl_p1_512.yaml ADDED
@@ -0,0 +1,85 @@
1
+ model:
2
+ transport:
3
+ target: tim.schedulers.transports.OT_FM
4
+ params:
5
+ P_mean: -0.4
6
+ P_std: 1.0
7
+ sigma_d: 1.0
8
+ T_max: 1.0
9
+ T_min: 0.0
10
+ enhance_target: False
11
+ w_gt: 1.0
12
+ w_cond: 0.0
13
+ w_start: 0.0
14
+ w_end: 0.0
15
+ transition_loss:
16
+ diffusion_ratio: 0.5
17
+ consistency_ratio: 0.1
18
+ derivative_type: dde
19
+ differential_epsilon: 0.005
20
+ weight_time_type: sqrt
21
+ weight_time_tangent: True
22
+ network:
23
+ target: tim.models.c2i.tim_model.TiM
24
+ params:
25
+ input_size: 16
26
+ patch_size: 1
27
+ in_channels: 32
28
+ class_dropout_prob: 0.1
29
+ num_classes: 1000
30
+ depth: 28
31
+ hidden_size: 1152
32
+ num_heads: 16
33
+ encoder_depth: 8
34
+ qk_norm: True
35
+ z_dim: 768
36
+ new_condition: t-r
37
+ use_new_embed: True
38
+ distance_aware: True
39
+ lora_hidden_size: 384
40
+ # pretrained_vae:
41
+ vae_dir: mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers
42
+ # repa encoder
43
+ enc_dir: checkpoints/radio/radio-v2.5-b_half.pth.tar
44
+ proj_coeff: 1.0
45
+ # ema
46
+ use_ema: True
47
+ ema_decay: 0.9999
48
+
49
+ data:
50
+ data_type: latent
51
+ dataset:
52
+ latent_dir: datasets/imagenet1k/dc-ae-f32c32-sana-1.1-diffusers-512x512
53
+ image_dir: datasets/imagenet1k/images/train
54
+ image_size: 512
55
+ dataloader:
56
+ num_workers: 4
57
+ batch_size: 64 # Batch size (per device) for the training dataloader.
58
+
59
+
60
+
61
+ training:
62
+ tracker: null
63
+ max_train_steps: 750000
64
+ checkpointing_steps: 2000
65
+ checkpoints_total_limit: 2
66
+ resume_from_checkpoint: latest
67
+ learning_rate: 1.0e-4
68
+ learning_rate_base_batch_size: 256
69
+ scale_lr: True
70
+ lr_scheduler: constant # "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
71
+ lr_warmup_steps: 0
72
+ gradient_accumulation_steps: 1
73
+ optimizer:
74
+ target: torch.optim.AdamW
75
+ params:
76
+ # betas: ${tuple:0.9, 0.999}
77
+ betas: [0.9, 0.95]
78
+ weight_decay: 1.0e-2
79
+ eps: 1.0e-6
80
+ max_grad_norm: 1.0
81
+ proportion_empty_prompts: 0.0
82
+ mixed_precision: bf16 # ["no", "fp16", "bf16"]
83
+ allow_tf32: True
84
+ validation_steps: 500
85
+ checkpoint_list: [100000, 250000, 500000]
configs/c2i/tim_xl_p1_512_mg.yaml ADDED
@@ -0,0 +1,85 @@
1
+ model:
2
+ transport:
3
+ target: tim.schedulers.transports.OT_FM
4
+ params:
5
+ P_mean: -0.4
6
+ P_std: 1.0
7
+ sigma_d: 1.0
8
+ T_max: 1.0
9
+ T_min: 0.0
10
+ enhance_target: True
11
+ w_gt: 1.0
12
+ w_cond: 0.75
13
+ w_start: 0.3
14
+ w_end: 0.8
15
+ transition_loss:
16
+ diffusion_ratio: 0.5
17
+ consistency_ratio: 0.1
18
+ derivative_type: dde
19
+ differential_epsilon: 0.005
20
+ weight_time_type: sqrt
21
+ weight_time_tangent: True
22
+ network:
23
+ target: tim.models.c2i.tim_model.TiM
24
+ params:
25
+ input_size: 16
26
+ patch_size: 1
27
+ in_channels: 32
28
+ class_dropout_prob: 0.1
29
+ num_classes: 1000
30
+ depth: 28
31
+ hidden_size: 1152
32
+ num_heads: 16
33
+ encoder_depth: 8
34
+ qk_norm: True
35
+ z_dim: 768
36
+ new_condition: t-r
37
+ use_new_embed: True
38
+ distance_aware: True
39
+ lora_hidden_size: 384
40
+ # pretrained_vae:
41
+ vae_dir: mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers
42
+ # repa encoder
43
+ enc_dir: checkpoints/radio/radio-v2.5-b_half.pth.tar
44
+ proj_coeff: 1.0
45
+ # ema
46
+ use_ema: True
47
+ ema_decay: 0.9999
48
+
49
+ data:
50
+ data_type: latent
51
+ dataset:
52
+ latent_dir: datasets/imagenet1k/dc-ae-f32c32-sana-1.1-diffusers-512x512
53
+ image_dir: datasets/imagenet1k/images/train
54
+ image_size: 512
55
+ dataloader:
56
+ num_workers: 4
57
+ batch_size: 64 # Batch size (per device) for the training dataloader.
58
+
59
+
60
+
61
+ training:
62
+ tracker: null
63
+ max_train_steps: 750000
64
+ checkpointing_steps: 2000
65
+ checkpoints_total_limit: 2
66
+ resume_from_checkpoint: latest
67
+ learning_rate: 1.0e-4
68
+ learning_rate_base_batch_size: 256
69
+ scale_lr: True
70
+ lr_scheduler: constant # "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
71
+ lr_warmup_steps: 0
72
+ gradient_accumulation_steps: 1
73
+ optimizer:
74
+ target: torch.optim.AdamW
75
+ params:
76
+ # betas: ${tuple:0.9, 0.999}
77
+ betas: [0.9, 0.95]
78
+ weight_decay: 1.0e-2
79
+ eps: 1.0e-6
80
+ max_grad_norm: 1.0
81
+ proportion_empty_prompts: 0.0
82
+ mixed_precision: bf16 # ["no", "fp16", "bf16"]
83
+ allow_tf32: True
84
+ validation_steps: 500
85
+ checkpoint_list: [100000, 250000, 500000]
configs/c2i/tim_xl_p2_256.yaml ADDED
@@ -0,0 +1,85 @@
1
+ model:
2
+ transport:
3
+ target: tim.schedulers.transports.OT_FM
4
+ params:
5
+ P_mean: -0.4
6
+ P_std: 1.0
7
+ sigma_d: 1.0
8
+ T_max: 1.0
9
+ T_min: 0.0
10
+ enhance_target: False
11
+ w_gt: 1.0
12
+ w_cond: 0.0
13
+ w_start: 0.0
14
+ w_end: 0.0
15
+ transition_loss:
16
+ diffusion_ratio: 0.5
17
+ consistency_ratio: 0.1
18
+ derivative_type: dde
19
+ differential_epsilon: 0.005
20
+ weight_time_type: sqrt
21
+ weight_time_tangent: True
22
+ network:
23
+ target: tim.models.c2i.tim_model.TiM
24
+ params:
25
+ input_size: 32
26
+ patch_size: 2
27
+ in_channels: 4
28
+ class_dropout_prob: 0.1
29
+ num_classes: 1000
30
+ depth: 28
31
+ hidden_size: 1152
32
+ num_heads: 16
33
+ encoder_depth: 8
34
+ qk_norm: True
35
+ z_dim: 768
36
+ new_condition: t-r
37
+ use_new_embed: True
38
+ distance_aware: True
39
+ lora_hidden_size: 384
40
+ # pretrained_vae:
41
+ vae_dir: stabilityai/sd-vae-ft-ema
42
+ # repa encoder
43
+ enc_dir: checkpoints/radio/radio-v2.5-b_half.pth.tar
44
+ proj_coeff: 1.0
45
+ # ema
46
+ use_ema: True
47
+ ema_decay: 0.9999
48
+
49
+ data:
50
+ data_type: latent
51
+ dataset:
52
+ latent_dir: datasets/imagenet1k/sd-vae-ft-ema-256x256
53
+ image_dir: datasets/imagenet1k/images/train
54
+ image_size: 256
55
+ dataloader:
56
+ num_workers: 4
57
+ batch_size: 64 # Batch size (per device) for the training dataloader.
58
+
59
+
60
+
61
+ training:
62
+ tracker: null
63
+ max_train_steps: 750000
64
+ checkpointing_steps: 2000
65
+ checkpoints_total_limit: 2
66
+ resume_from_checkpoint: latest
67
+ learning_rate: 1.0e-4
68
+ learning_rate_base_batch_size: 256
69
+ scale_lr: True
70
+ lr_scheduler: constant # "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
71
+ lr_warmup_steps: 0
72
+ gradient_accumulation_steps: 1
73
+ optimizer:
74
+ target: torch.optim.AdamW
75
+ params:
76
+ # betas: ${tuple:0.9, 0.999}
77
+ betas: [0.9, 0.95]
78
+ weight_decay: 1.0e-2
79
+ eps: 1.0e-6
80
+ max_grad_norm: 1.0
81
+ proportion_empty_prompts: 0.0
82
+ mixed_precision: bf16 # ["no", "fp16", "bf16"]
83
+ allow_tf32: True
84
+ validation_steps: 500
85
+ checkpoint_list: [100000, 250000, 500000]
configs/c2i/tim_xl_p2_256_mg.yaml ADDED
@@ -0,0 +1,85 @@
1
+ model:
2
+ transport:
3
+ target: tim.schedulers.transports.OT_FM
4
+ params:
5
+ P_mean: -0.4
6
+ P_std: 1.0
7
+ sigma_d: 1.0
8
+ T_max: 1.0
9
+ T_min: 0.0
10
+ enhance_target: True
11
+ w_gt: 1.0
12
+ w_cond: 0.75
13
+ w_start: 0.3
14
+ w_end: 0.8
15
+ transition_loss:
16
+ diffusion_ratio: 0.5
17
+ consistency_ratio: 0.1
18
+ derivative_type: dde
19
+ differential_epsilon: 0.005
20
+ weight_time_type: sqrt
21
+ weight_time_tangent: True
22
+ network:
23
+ target: tim.models.c2i.tim_model.TiM
24
+ params:
25
+ input_size: 32
26
+ patch_size: 2
27
+ in_channels: 4
28
+ class_dropout_prob: 0.1
29
+ num_classes: 1000
30
+ depth: 28
31
+ hidden_size: 1152
32
+ num_heads: 16
33
+ encoder_depth: 8
34
+ qk_norm: True
35
+ z_dim: 768
36
+ new_condition: t-r
37
+ use_new_embed: True
38
+ distance_aware: True
39
+ lora_hidden_size: 384
40
+ # pretrained_vae:
41
+ vae_dir: stabilityai/sd-vae-ft-ema
42
+ # repa encoder
43
+ enc_dir: checkpoints/radio/radio-v2.5-b_half.pth.tar
44
+ proj_coeff: 1.0
45
+ # ema
46
+ use_ema: True
47
+ ema_decay: 0.9999
48
+
49
+ data:
50
+ data_type: latent
51
+ dataset:
52
+ latent_dir: datasets/imagenet1k/sd-vae-ft-ema-256x256
53
+ image_dir: datasets/imagenet1k/images/train
54
+ image_size: 256
55
+ dataloader:
56
+ num_workers: 4
57
+ batch_size: 64 # Batch size (per device) for the training dataloader.
58
+
59
+
60
+
61
+ training:
62
+ tracker: null
63
+ max_train_steps: 750000
64
+ checkpointing_steps: 2000
65
+ checkpoints_total_limit: 2
66
+ resume_from_checkpoint: latest
67
+ learning_rate: 1.0e-4
68
+ learning_rate_base_batch_size: 256
69
+ scale_lr: True
70
+ lr_scheduler: constant # "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
71
+ lr_warmup_steps: 0
72
+ gradient_accumulation_steps: 1
73
+ optimizer:
74
+ target: torch.optim.AdamW
75
+ params:
76
+ # betas: ${tuple:0.9, 0.999}
77
+ betas: [0.9, 0.95]
78
+ weight_decay: 1.0e-2
79
+ eps: 1.0e-6
80
+ max_grad_norm: 1.0
81
+ proportion_empty_prompts: 0.0
82
+ mixed_precision: bf16 # ["no", "fp16", "bf16"]
83
+ allow_tf32: True
84
+ validation_steps: 500
85
+ checkpoint_list: [100000, 250000, 500000]
configs/t2i/tim_xl_p1_t2i.yaml ADDED
@@ -0,0 +1,81 @@
1
+ model:
2
+ transport:
3
+ target: tim.schedulers.transports.OT_FM
4
+ params:
5
+ P_mean: 0.0
6
+ P_std: 1.6
7
+ sigma_d: 1.0
8
+ transition_loss:
9
+ diffusion_ratio: 0.5
10
+ consistency_ratio: 0.1
11
+ derivative_type: dde
12
+ differential_epsilon: 0.005
13
+ weight_time_type: sqrt
14
+ weight_time_tangent: True
15
+ network:
16
+ target: tim.models.t2i.tim_model.TiM
17
+ params:
18
+ input_size: 16
19
+ patch_size: 1
20
+ in_channels: 32
21
+ depth: 28
22
+ hidden_size: 1152
23
+ cap_feat_dim: 1152
24
+ num_heads: 16
25
+ encoder_depth: 8
26
+ qk_norm: True
27
+ z_dim: 768
28
+ new_condition: t-r
29
+ use_new_embed: True
30
+ distance_aware: True
31
+ lora_hidden_size: 384
32
+ # pretrained_vae:
33
+ vae_dir: mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers
34
+ # text encoder
35
+ text_encoder_dir: google/gemma-3-1b-it
36
+ proportion_empty_prompts: 0.1
37
+ use_last_hidden_state: True
38
+ max_seq_length: 256
39
+ # repa encoder
40
+ enc_dir: checkpoints/radio/radio-v2.5-b_half.pth.tar
41
+ proj_coeff: 1.0
42
+ # ema
43
+ use_ema: True
44
+ ema_decay: 0.9999
45
+
46
+ data:
47
+ data_type: image_ms
48
+ dataset:
49
+ root_dir: datasets/t2i_toy_dataset
50
+ packed_json: datasets/t2i_toy_dataset/bucket_sampler.json
51
+ jsonl_dir: datasets/t2i_toy_dataset/data_info.jsonl
52
+ dataloader:
53
+ num_workers: 4
54
+ batch_size: 128 # Batch size (per device) for the training dataloader.
55
+
56
+
57
+ training:
58
+ tracker: null
59
+ max_train_steps: 500000
60
+ checkpointing_steps: 1000
61
+ checkpoints_total_limit: 2
62
+ resume_from_checkpoint: latest
63
+ learning_rate: 1.0e-4
64
+ learning_rate_base_batch_size: 512
65
+ scale_lr: True
66
+ lr_scheduler: constant # "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
67
+ lr_warmup_steps: 0
68
+ gradient_accumulation_steps: 1
69
+ optimizer:
70
+ target: torch.optim.AdamW
71
+ params:
72
+ # betas: ${tuple:0.9, 0.999}
73
+ betas: [0.9, 0.95]
74
+ weight_decay: 1.0e-2
75
+ eps: 1.0e-6
76
+ max_grad_norm: 1.0
77
+ proportion_empty_prompts: 0.0
78
+ mixed_precision: bf16 # ["no", "fp16", "bf16"]
79
+ allow_tf32: True
80
+ validation_steps: 500
81
+ checkpoint_list: [100000, 200000, 300000, 400000]
pyproject.toml ADDED
@@ -0,0 +1,31 @@
1
+ [project]
2
+ name = "tim"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "accelerate>=0.33.0",
9
+ "bitsandbytes>=0.47.0",
10
+ "diffusers==0.33.1",
11
+ "einops>=0.8.1",
12
+ "flash-attn>=2.8.3",
13
+ "gradio>=5.44.1",
14
+ "imageio==2.34.2",
15
+ "imageio-ffmpeg==0.5.1",
16
+ "moviepy==1.0.3",
17
+ "numpy==1.26.0",
18
+ "omegaconf>=2.3.0",
19
+ "pillow==9.5.0",
20
+ "safetensors>=0.6.2",
21
+ "sentencepiece>=0.2.0",
22
+ "spaces>=0.40.1",
23
+ "streamlit>=1.38.0",
24
+ "timm>=1.0.19",
25
+ "torch>=2.8.0",
26
+ "torchdiffeq>=0.2.5",
27
+ "torchvision>=0.23.0",
28
+ "transformers>=4.44.2",
29
+ "triton>=3.4.0",
30
+ "wandb>=0.21.3",
31
+ ]
requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ gradio>=4.0.0
2
+ spaces>=0.28.0
3
+ torch>=2.1.0
4
+ torchvision
5
+ diffusers
6
+ transformers>=4.25.0
7
+ omegaconf
8
+ einops
9
+ numpy
10
+ Pillow
11
+ safetensors
12
+ tqdm
13
+ flash-attn>=2.0.0
14
+ accelerate
15
+ -e .
setup.py ADDED
@@ -0,0 +1,12 @@
1
+ from setuptools import find_packages, setup
2
+
3
+ setup(
4
+ name="tim",
5
+ version="0.0.1",
6
+ description="",
7
+ packages=find_packages(),
8
+ install_requires=[
9
+ "torch",
10
+ "numpy",
11
+ ],
12
+ )
tim/data/c2i_data.py ADDED
@@ -0,0 +1,150 @@
1
+ import os
2
+ import json
3
+ import datetime
4
+ import torchvision
5
+ import numpy as np
6
+ import torch
7
+
8
+ from omegaconf import OmegaConf
9
+ from PIL import Image
10
+ from torch.utils.data import DataLoader, Dataset
11
+ from torchvision.datasets import ImageFolder
12
+ from torchvision import transforms
13
+ from torchvision.transforms.functional import hflip
14
+ from accelerate.logging import get_logger
15
+ from safetensors.torch import load_file
16
+ from .sampler_utils import get_train_sampler
17
+
18
+
19
+ logger = get_logger(__name__, log_level="INFO")
20
+
21
+
22
+ def center_crop_arr(pil_image, image_size):
23
+ """
24
+ Center cropping implementation from ADM.
25
+ https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
26
+ """
27
+ while min(*pil_image.size) >= 2 * image_size:
28
+ pil_image = pil_image.resize(
29
+ tuple(x // 2 for x in pil_image.size), resample=Image.Resampling.BOX
30
+ )
31
+
32
+ scale = image_size / min(*pil_image.size)
33
+ pil_image = pil_image.resize(
34
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.Resampling.BICUBIC
35
+ )
36
+
37
+ arr = np.array(pil_image)
38
+ crop_y = (arr.shape[0] - image_size) // 2
39
+ crop_x = (arr.shape[1] - image_size) // 2
40
+ return Image.fromarray(arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size])
41
+
42
+ class ImagenetDictWrapper(Dataset):
43
+ def __init__(self, dataset):
44
+ super().__init__()
45
+ self.dataset = dataset
46
+
47
+ def __getitem__(self, i):
48
+ x, y = self.dataset[i]
49
+ return {"image": x, "label": y}
50
+
51
+ def __len__(self):
52
+ return len(self.dataset)
53
+
54
+ class ImagenetLatentDataset(Dataset):
55
+ def __init__(self, latent_dir, image_dir, image_size):
56
+ super().__init__()
57
+ self.RandomHorizontalFlipProb = 0.5
58
+ self.transform = transforms.Compose([
59
+ transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, image_size)),
60
+ transforms.Lambda(lambda pil_image: (pil_image, hflip(pil_image))),
61
+ transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops])), # returns a 4D tensor
62
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
63
+ ])
64
+
65
+ self.dataset = []
66
+ for class_folder in os.listdir(image_dir):
67
+ if os.path.isfile(os.path.join(image_dir, class_folder)):
68
+ continue
69
+ latent_class_folder = os.path.join(latent_dir, class_folder)
70
+ image_class_folder = os.path.join(image_dir, class_folder)
71
+ for file in os.listdir(image_class_folder):
72
+ self.dataset.append(
73
+ dict(
74
+ latent=os.path.join(latent_class_folder, file.split('.')[0]+'.safetensors'),
75
+ image=os.path.join(image_class_folder, file)
76
+ )
77
+ )
78
+
79
+ def __len__(self):
80
+ return len(self.dataset)
81
+
82
+ def __getitem__(self, idx):
83
+ data_item = dict()
84
+ data = load_file(self.dataset[idx]['latent'])
85
+ image = self.transform(Image.open(self.dataset[idx]['image']).convert("RGB"))
86
+ if torch.rand(1) < self.RandomHorizontalFlipProb:
87
+ data_item['latent'] = data['latent'][0]
88
+ data_item['image'] = image[0]
89
+ else:
90
+ data_item['latent'] = data['latent'][1]
91
+ data_item['image'] = image[1]
92
+ data_item['label'] = data['label']
93
+ return data_item
94
+
95
+
96
+
97
+ class C2ILoader():
98
+ def __init__(self, data_config):
99
+ super().__init__()
100
+
101
+ self.batch_size = data_config.dataloader.batch_size
102
+ self.num_workers = data_config.dataloader.num_workers
103
+
104
+ self.data_type = data_config.data_type
105
+
106
+ if data_config.data_type == 'image':
107
+ self.train_dataset = ImagenetDictWrapper(**OmegaConf.to_container(data_config.dataset))
108
+ elif data_config.data_type == 'latent':
109
+ self.train_dataset = ImagenetLatentDataset(**OmegaConf.to_container(data_config.dataset))
110
+ else:
111
+ raise NotImplementedError
112
+
113
+
114
+ self.test_dataset = None
115
+ self.val_dataset = None
116
+
117
+ def train_len(self):
118
+ return len(self.train_dataset)
119
+
120
+ def train_dataloader(self, rank, world_size, global_batch_size, max_steps, resume_steps, seed):
121
+
122
+ sampler = get_train_sampler(
123
+ self.train_dataset, rank, world_size, global_batch_size, max_steps, resume_steps, seed
124
+ )
125
+ return DataLoader(
126
+ self.train_dataset,
127
+ batch_size=self.batch_size,
128
+ sampler=sampler,
129
+ num_workers=self.num_workers,
130
+ pin_memory=True,
131
+ drop_last=True,
132
+ prefetch_factor=2,
133
+ )
134
+
135
+ def test_dataloader(self):
136
+ return None
137
+
138
+ def val_dataloader(self):
139
+ return DataLoader(
140
+ self.train_dataset,
141
+ batch_size=self.batch_size,
142
+ shuffle=False,  # C2ILoader defines no `shuffle` attribute; validation does not need shuffling
143
+ num_workers=self.num_workers,
144
+ pin_memory=True,
145
+ drop_last=True
146
+ )
147
+
148
+
149
+
150
+
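`ImagenetLatentDataset` expects the latent directory to mirror the ImageNet class folders, with one `.safetensors` file per image holding two encodings (unflipped and horizontally flipped) plus the label, so that `__getitem__` can pick the encoding matching its own random flip of the raw image. A minimal stand-in file, with shapes taken from the SD-VAE 256x256 config:

```python
import torch
from safetensors.torch import save_file

entry = {
    "latent": torch.randn(2, 4, 32, 32),  # index 0: unflipped, index 1: hflipped
    "label": torch.tensor(207),           # ImageNet class id
}
save_file(entry, "n02099601_toy.safetensors")  # would live under <latent_dir>/<class_folder>/
```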
tim/data/sampler_utils.py ADDED
@@ -0,0 +1,52 @@
1
+ import torch
2
+ import json
3
+
4
+ # from https://github.com/Alpha-VLLM/LLaMA2-Accessory/blob/main/Large-DiT-ImageNet/train.py#L60
5
+ def get_train_sampler(dataset, rank, world_size, global_batch_size, max_steps,
6
+ resume_step, seed):
7
+ sample_indices = torch.empty([max_steps * global_batch_size // world_size],
8
+ dtype=torch.long)
9
+ epoch_id, fill_ptr, offs = 0, 0, 0
10
+ while fill_ptr < sample_indices.size(0):
11
+ g = torch.Generator()
12
+ g.manual_seed(seed + epoch_id)
13
+ epoch_sample_indices = torch.randperm(len(dataset), generator=g)
14
+ epoch_id += 1
15
+ epoch_sample_indices = epoch_sample_indices[
16
+ (rank + offs) % world_size::world_size
17
+ ]
18
+ offs = (offs + world_size - len(dataset) % world_size) % world_size
19
+ epoch_sample_indices = epoch_sample_indices[
20
+ :sample_indices.size(0) - fill_ptr
21
+ ]
22
+ sample_indices[fill_ptr: fill_ptr + epoch_sample_indices.size(0)] = \
23
+ epoch_sample_indices
24
+ fill_ptr += epoch_sample_indices.size(0)
25
+ return sample_indices[resume_step * global_batch_size // world_size:].tolist()
26
+
27
+
28
+
29
+
30
+ def get_packed_batch_sampler(
31
+ dataset, rank, world_size, max_steps, resume_step, seed
32
+ ):
33
+ sample_indices = [None for _ in range(max_steps)]
34
+ epoch_id, fill_ptr, offs = 0, 0, 0
35
+ while fill_ptr < len(sample_indices):
36
+ g = torch.Generator()
37
+ g.manual_seed(seed + epoch_id)
38
+ epoch_sample_indices = torch.randperm(len(dataset), generator=g)
39
+ epoch_id += 1
40
+ epoch_sample_indices = epoch_sample_indices[
41
+ (rank + offs) % world_size::world_size
42
+ ]
43
+ offs = (offs + world_size - len(dataset) % world_size) % world_size
44
+ epoch_sample_indices = epoch_sample_indices[
45
+ :len(sample_indices) - fill_ptr
46
+ ]
47
+ sample_indices[fill_ptr: fill_ptr + epoch_sample_indices.size(0)] = [
48
+ dataset[i] for i in epoch_sample_indices
49
+ ]
50
+ fill_ptr += epoch_sample_indices.size(0)
51
+ return sample_indices[resume_step:]
52
+
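`get_train_sampler` precomputes a deterministic index stream, so resuming from step `k` is just slicing off the first `k * global_batch_size // world_size` indices. A small check of that property on a toy dataset:

```python
from tim.data.sampler_utils import get_train_sampler

data = list(range(10))
full = get_train_sampler(data, rank=0, world_size=1, global_batch_size=4,
                         max_steps=5, resume_step=0, seed=0)
resumed = get_train_sampler(data, rank=0, world_size=1, global_batch_size=4,
                            max_steps=5, resume_step=2, seed=0)
assert resumed == full[2 * 4 // 1:]  # identical stream, minus the already-consumed steps
```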
tim/data/t2i_data.py ADDED
@@ -0,0 +1,126 @@
1
+ import torch
2
+ import csv
3
+ import json
4
+ import os
5
+ import random
6
+ import ast
7
+ import numpy as np
8
+ from omegaconf import OmegaConf
9
+ from torchvision import transforms
10
+ from torch.utils.data import DataLoader, Dataset
11
+ from PIL import Image
12
+ from tqdm import tqdm
13
+ from safetensors.torch import save_file, load_file
14
+ from .sampler_utils import get_train_sampler, get_packed_batch_sampler
15
+
16
+
17
+
18
+ def resize_arr(pil_image, height, width):
19
+ pil_image = pil_image.resize((width, height), resample=Image.Resampling.BICUBIC)
20
+
21
+ return pil_image
22
+
23
+
24
+ class T2IDatasetMS(Dataset):
25
+ def __init__(self, root_dir, packed_json, jsonl_dir) -> None:
26
+ super().__init__()
27
+ self.root_dir = root_dir
28
+ self.dataset = []
29
+ with open(packed_json, 'r') as fp:
30
+ self.packed_dataset = json.load(fp)
31
+
32
+ with open(jsonl_dir, 'r') as fp:
33
+ self.dataset = [json.loads(line) for line in fp]
34
+
35
+
36
+ def __len__(self):
37
+ return len(self.dataset)
38
+
39
+ def get_one_data(self, data_meta):
40
+ data_item = dict()
41
+ image_file = os.path.join(self.root_dir, data_meta['image_file'])
42
+
43
+ image = Image.open(image_file).convert("RGB")
44
+
45
+ bucket = data_meta['bucket']
46
+ resolutions = bucket.split('-')[-1].split('x')
47
+ height, width = int(int(resolutions[0])/32)*32, int(int(resolutions[1])/32)*32
48
+ transform = transforms.Compose([
49
+ transforms.Lambda(lambda pil_image: resize_arr(pil_image, height, width)),
50
+ transforms.ToTensor(),
51
+ transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True),
52
+ ])
53
+ image = transform(image)
54
+
55
+ data_item['image'] = image
56
+ data_item['caption'] = random.choice(data_meta['captions']).encode('unicode-escape').decode('utf-8')
57
+
58
+ return data_item
59
+
60
+ def __getitem__(self, index):
61
+ data_meta = self.dataset[index]
62
+ # data_item = self.get_one_data(data_meta)
63
+ try:
64
+ data_item = self.get_one_data(data_meta)
65
+ except Exception as e:
66
+ print(f"Warning: failed to load {data_meta['image_file']}: {e}", flush=True)
67
+ data_item = None
68
+
69
+ return data_item
70
+
71
+
72
+
73
+ def bucket_collate_fn(batch):
74
+ caption = []
75
+ image = []
76
+ for data in batch:
77
+ if data is None:
78
+ continue
79
+ caption.append(data['caption'])
80
+ image.append(data['image'])
81
+ image = torch.stack(image)
82
+ return dict(image=image, caption=caption)
83
+
84
+
85
+
86
+
87
+ class T2ILoader():
88
+ def __init__(self, data_config):
89
+ super().__init__()
90
+
91
+ self.batch_size = data_config.dataloader.batch_size
92
+ self.num_workers = data_config.dataloader.num_workers
93
+
94
+ self.data_type = data_config.data_type
95
+
96
+ if self.data_type == 'image_ms':
97
+ self.train_dataset = T2IDatasetMS(**OmegaConf.to_container(data_config.dataset))
98
+ else:
99
+ raise NotImplementedError(f"Unsupported data_type: {self.data_type}")
100
+ self.test_dataset = None
101
+ self.val_dataset = None
102
+
103
+ def train_len(self):
104
+ return len(self.train_dataset)
105
+
106
+ def train_dataloader(self, rank, world_size, global_batch_size, max_steps, resume_steps, seed):
107
+ batch_sampler = get_packed_batch_sampler(
108
+ self.train_dataset.packed_dataset, rank, world_size, max_steps, resume_steps, seed
109
+ )
110
+ return DataLoader(
111
+ self.train_dataset,
112
+ batch_sampler=batch_sampler,
113
+ collate_fn=bucket_collate_fn,
114
+ num_workers=self.num_workers,
115
+ pin_memory=True,
116
+ )
117
+
118
+ def test_dataloader(self):
119
+ return None
120
+
121
+ def val_dataloader(self):
122
+ return None
123
+
124
+
125
+
126
+
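`bucket_collate_fn` is the piece that tolerates failed image loads: `__getitem__` returns `None` on error and the collate silently drops it before stacking. A small synthetic sketch (the tensors and captions below are made up; real items come from `T2IDatasetMS`, and all images in one packed bucket share a resolution, which is why `torch.stack` is safe):

```python
import torch
from tim.data.t2i_data import bucket_collate_fn

batch = [
    {'image': torch.randn(3, 512, 384), 'caption': 'a photo of a cat'},
    None,  # a failed load propagates as None and is skipped by the collate
    {'image': torch.randn(3, 512, 384), 'caption': 'a photo of a dog'},
]
out = bucket_collate_fn(batch)
print(out['image'].shape)  # torch.Size([2, 3, 512, 384])
print(out['caption'])      # ['a photo of a cat', 'a photo of a dog']
```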
tim/models/c2i/tim_model.py ADDED
@@ -0,0 +1,406 @@
1
+ # This source code is licensed under the license found in the
2
+ # LICENSE file in the root directory of this source tree.
3
+ # --------------------------------------------------------
4
+ # References:
5
+ # GLIDE: https://github.com/openai/glide-text2im
6
+ # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
7
+ # --------------------------------------------------------
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import numpy as np
13
+ import math
14
+ from timm.layers.mlp import SwiGLU, Mlp
15
+ from timm.models.vision_transformer import PatchEmbed, Attention
16
+ from tim.models.utils.funcs import build_mlp, modulate, get_parameter_dtype
17
+ from tim.models.utils.rope import VisionRotaryEmbedding, rotate_half
18
+ from flash_attn import flash_attn_func
19
+
20
+
21
+ #################################################################################
22
+ # Embedding Layers for Timesteps and Class Labels #
23
+ #################################################################################
24
+ class TimestepEmbedder(nn.Module):
25
+ """
26
+ Embeds scalar timesteps into vector representations.
27
+ """
28
+ def __init__(self, hidden_size, frequency_embedding_size=256):
29
+ super().__init__()
30
+ self.mlp = nn.Sequential(
31
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
32
+ nn.SiLU(),
33
+ nn.Linear(hidden_size, hidden_size, bias=True),
34
+ )
35
+ self.frequency_embedding_size = frequency_embedding_size
36
+
37
+ @staticmethod
38
+ def positional_embedding(t, dim, max_period=10000):
39
+ """
40
+ Create sinusoidal timestep embeddings.
41
+ :param t: a 1-D Tensor of N indices, one per batch element.
42
+ These may be fractional.
43
+ :param dim: the dimension of the output.
44
+ :param max_period: controls the minimum frequency of the embeddings.
45
+ :return: an (N, D) Tensor of positional embeddings.
46
+ """
47
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
48
+ half = dim // 2
49
+ freqs = torch.exp(
50
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
51
+ ).to(device=t.device)
52
+ args = t[:, None].float() * freqs[None]
53
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
54
+ if dim % 2:
55
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
56
+ return embedding
57
+
58
+ def forward(self, t):
59
+ self.timestep_embedding = self.positional_embedding
60
+ t_freq = self.timestep_embedding(t, dim=self.frequency_embedding_size).to(t.dtype)
61
+ t_emb = self.mlp(t_freq)
62
+ return t_emb
63
+
64
+
65
+ class LabelEmbedder(nn.Module):
66
+ """
67
+ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
68
+ """
69
+ def __init__(self, num_classes, hidden_size, dropout_prob):
70
+ super().__init__()
71
+ use_cfg_embedding = dropout_prob > 0
72
+ self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
73
+ self.num_classes = num_classes
74
+ self.dropout_prob = dropout_prob
75
+
76
+
77
+ def forward(self, labels):
78
+ embeddings = self.embedding_table(labels)
79
+ return embeddings
80
+
81
+
82
+
83
+
84
+ #################################################################################
85
+ # Attention Block #
86
+ #################################################################################
87
+
88
+ class Attention(nn.Module):
89
+ def __init__(
90
+ self,
91
+ dim: int,
92
+ num_heads: int = 8,
93
+ qkv_bias: bool = False,
94
+ qk_norm: bool = False,
95
+ attn_drop: float = 0.,
96
+ proj_drop: float = 0.,
97
+ norm_layer: nn.Module = nn.LayerNorm,
98
+ distance_aware: bool = False,
99
+ ) -> None:
100
+ super().__init__()
101
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
102
+ self.num_heads = num_heads
103
+ self.head_dim = dim // num_heads
104
+ self.scale = self.head_dim ** -0.5
105
+
106
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
107
+ self.distance_aware = distance_aware
108
+ if distance_aware:
109
+ self.qkv_d = nn.Linear(dim, dim * 3, bias=False)
110
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
111
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
112
+ self.attn_drop = nn.Dropout(attn_drop)
113
+ self.proj = nn.Linear(dim, dim)
114
+ self.proj_drop = nn.Dropout(proj_drop)
115
+
116
+ def forward(self, x: torch.Tensor, freqs_cos, freqs_sin, attn_type='fused_attn', delta_t=None) -> torch.Tensor:
117
+ B, N, C = x.shape
118
+ if self.distance_aware:
119
+ qkv = self.qkv(x) + self.qkv_d(delta_t)
120
+ else:
121
+ qkv = self.qkv(x)
122
+ if attn_type == 'flash_attn': # q, k, v: (B, N, n_head, d_head)
123
+ qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 1, 3, 4)
124
+ else: # q, k, v: (B, n_head, N, d_head)
125
+ qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
126
+ ori_dtype = qkv.dtype
127
+ q, k, v = qkv.unbind(0)
128
+ q, k = self.q_norm(q), self.k_norm(k)
129
+
130
+ q = q * freqs_cos + rotate_half(q) * freqs_sin
131
+ k = k * freqs_cos + rotate_half(k) * freqs_sin
132
+ q, k = q.to(ori_dtype), k.to(ori_dtype)
133
+
134
+ if attn_type == 'flash_attn':
135
+ x = flash_attn_func(
136
+ q, k, v,
137
+ dropout_p=self.attn_drop.p if self.training else 0.,
138
+ )
139
+ x = x.reshape(B, N, C)
140
+ elif attn_type == 'fused_attn':
141
+ x = F.scaled_dot_product_attention(
142
+ q, k, v,
143
+ dropout_p=self.attn_drop.p if self.training else 0.,
144
+ )
145
+ x = x.transpose(1, 2).reshape(B, N, C)
146
+ else:
147
+ q = q * self.scale
148
+ attn = q @ k.transpose(-2, -1)
149
+ attn = attn.softmax(dim=-1)
150
+ attn = self.attn_drop(attn)
151
+ x = attn @ v
152
+ x = x.transpose(1, 2).reshape(B, N, C)
153
+
154
+ x = self.proj(x)
155
+ x = self.proj_drop(x)
156
+ return x
157
+
158
+
159
+
160
+
161
+
162
+
163
+ #################################################################################
164
+ # Core TiM Model #
165
+ #################################################################################
166
+
167
+ class TiMBlock(nn.Module):
168
+ """
169
+ A TiM block with adaptive layer norm zero (adaLN-Zero) conditioning.
170
+ """
171
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
172
+ super().__init__()
173
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
174
+ distance_aware = block_kwargs.get('distance_aware', False)
175
+ self.attn = Attention(
176
+ hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=block_kwargs["qk_norm"],
177
+ distance_aware=distance_aware
178
+ )
179
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
180
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
181
+ self.mlp = SwiGLU(
182
+ in_features=hidden_size, hidden_features=(mlp_hidden_dim*2)//3, bias=True
183
+ )
184
+ if block_kwargs.get('lora_hidden_size', None) is not None:
185
+ lora_hidden_size = block_kwargs['lora_hidden_size']
186
+ else:
187
+ lora_hidden_size = (hidden_size//4)*3
188
+ self.adaLN_modulation = SwiGLU(
189
+ in_features=hidden_size, hidden_features=lora_hidden_size, out_features=6*hidden_size, bias=True
190
+ )
191
+
192
+
193
+
194
+ def forward(self, x, c, freqs_cos, freqs_sin, attn_type, delta_t=None):
195
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
196
+ self.adaLN_modulation(c).chunk(6, dim=-1)
197
+ )
198
+ x = x + gate_msa * self.attn(modulate(self.norm1(x), shift_msa, scale_msa), freqs_cos, freqs_sin, attn_type, delta_t)
199
+ x = x + gate_mlp * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
200
+
201
+ return x
202
+
203
+
204
+ class FinalLayer(nn.Module):
205
+ """
206
+ The final layer of TiM.
207
+ """
208
+ def __init__(self, hidden_size, patch_size, out_channels):
209
+ super().__init__()
210
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
211
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
212
+ self.adaLN_modulation = SwiGLU(
213
+ in_features=hidden_size, hidden_features=hidden_size//2, out_features=2*hidden_size, bias=True
214
+ )
215
+
216
+
217
+ def forward(self, x, c):
218
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
219
+ x = modulate(self.norm_final(x), shift, scale)
220
+ x = self.linear(x)
221
+
222
+ return x
223
+
224
+
225
+ class TiM(nn.Module):
226
+ def __init__(
227
+ self,
228
+ input_size=32,
229
+ patch_size=2,
230
+ in_channels=4,
231
+ hidden_size=1152,
232
+ encoder_depth=8,
233
+ depth=28,
234
+ num_heads=16,
235
+ mlp_ratio=4.0,
236
+ class_dropout_prob=0.1,
237
+ num_classes=1000,
238
+ z_dim=768,
239
+ projector_dim=2048,
240
+ use_checkpoint: bool = False,
241
+ new_condition: str = 't-r',
242
+ use_new_embed: bool = False,
243
+ **block_kwargs # qk_norm
244
+ ):
245
+ super().__init__()
246
+ self.in_channels = in_channels
247
+ self.out_channels = in_channels
248
+ self.patch_size = patch_size
249
+ self.num_heads = num_heads
250
+ self.num_classes = num_classes
251
+ self.encoder_depth = encoder_depth
252
+ self.use_checkpoint = use_checkpoint
253
+ self.new_condition = new_condition
254
+ self.use_new_embed = use_new_embed
255
+
256
+ self.x_embedder = PatchEmbed(
257
+ input_size, patch_size, in_channels, hidden_size, bias=True, strict_img_size=False
258
+ )
259
+ self.t_embedder = TimestepEmbedder(hidden_size) # timestep embedding type
260
+ if use_new_embed:
261
+ self.delta_embedder = TimestepEmbedder(hidden_size)
262
+ self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
263
+ # Will use fixed sin-cos embedding:
264
+ self.rope = VisionRotaryEmbedding(head_dim=hidden_size//num_heads)
265
+
266
+ self.blocks = nn.ModuleList([
267
+ TiMBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, **block_kwargs) for _ in range(depth)
268
+ ])
269
+ self.projector = build_mlp(hidden_size, projector_dim, z_dim)
270
+ self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
271
+ self.initialize_weights()
272
+
273
+ def initialize_weights(self):
274
+ # Initialize transformer layers:
275
+ def _basic_init(module):
276
+ if isinstance(module, nn.Linear):
277
+ torch.nn.init.xavier_uniform_(module.weight)
278
+ if module.bias is not None:
279
+ nn.init.constant_(module.bias, 0)
280
+ self.apply(_basic_init)
281
+
282
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
283
+ w = self.x_embedder.proj.weight.data
284
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
285
+ nn.init.constant_(self.x_embedder.proj.bias, 0)
286
+
287
+ # Initialize label embedding table:
288
+ nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
289
+
290
+ # Initialize timestep embedding MLP:
291
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
292
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
293
+
294
+ # Zero-out adaLN modulation layers in TiM blocks:
295
+ for block in self.blocks:
296
+ nn.init.constant_(block.adaLN_modulation.fc2.weight, 0)
297
+ nn.init.constant_(block.adaLN_modulation.fc2.bias, 0)
298
+
299
+ # Zero-out output layers:
300
+ nn.init.constant_(self.final_layer.adaLN_modulation.fc2.weight, 0)
301
+ nn.init.constant_(self.final_layer.adaLN_modulation.fc2.bias, 0)
302
+
303
+ nn.init.constant_(self.final_layer.linear.weight, 0)
304
+ nn.init.constant_(self.final_layer.linear.bias, 0)
305
+
306
+ def unpatchify(self, x, H, W):
307
+ """
308
+ x: (N, T, patch_size**2 * C)
309
+ imgs: (N, H, W, C)
310
+ """
311
+ c = self.out_channels
312
+ p = self.patch_size
313
+ h, w = int(H/p), int(W/p)
314
+
315
+
316
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
317
+ x = torch.einsum('nhwpqc->nchpwq', x)
318
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
319
+ return imgs
320
+
321
+ def get_rope(self, h, w, attn_type):
322
+ grid_h = torch.arange(h)
323
+ grid_w = torch.arange(w)
324
+ grid = torch.meshgrid(grid_h, grid_w, indexing='xy')
325
+ grid = torch.stack(grid, dim=0).reshape(2, -1).unsqueeze(0)
326
+ freqs_cos, freqs_sin = self.rope.get_cached_2d_rope_from_grid(grid)
327
+ if attn_type == 'flash_attn': # (1, N, 1, d_head)
328
+ return freqs_cos.unsqueeze(2), freqs_sin.unsqueeze(2)
329
+ else: # (1, 1, N, d_head)
330
+ return freqs_cos.unsqueeze(1), freqs_sin.unsqueeze(1)
331
+
332
+
333
+ def forward(self, x, t, r, y, attn_type='flash_attn', return_zs=False, jvp=False):
334
+ """
335
+ Forward pass of TiM.
336
+ x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
337
+ t: (N,) tensor of diffusion timesteps
338
+ y: (N,) tensor of class labels
339
+ """
340
+ B, C, H, W = x.shape
341
+ x = self.x_embedder(x) # (N, T, D), where T = H * W / patch_size ** 2
342
+
343
+ # timestep and class embedding
344
+ t_embed = self.t_embedder(t).unsqueeze(1) # (N, 1, D)
345
+ delta_embed = self.get_delta_embed(t, r).unsqueeze(1) # (N, 1, D)
346
+ y = self.y_embedder(y).unsqueeze(1) # (N, 1, D)
347
+ c = t_embed + delta_embed + y # (N, 1, D)
348
+ freqs_cos, freqs_sin = self.get_rope(
349
+ int(H/self.patch_size), int(W/self.patch_size), attn_type
350
+ )
351
+
352
+ for i, block in enumerate(self.blocks):
353
+ if (not self.use_checkpoint) or jvp:
354
+ x = block(x, c, freqs_cos, freqs_sin, attn_type, delta_embed) # (N, T, D)
355
+ else:
356
+ x = torch.utils.checkpoint.checkpoint(
357
+ self.ckpt_wrapper(block), x, c, freqs_cos, freqs_sin, attn_type, delta_embed
358
+ )
359
+ if (i + 1) == self.encoder_depth:
360
+ h_proj = self.projector(x)
361
+
362
+ x = self.final_layer(x, c) # (N, T, patch_size ** 2 * out_channels)
363
+ x = self.unpatchify(x, H, W) # (N, out_channels, H, W)
364
+
365
+ if return_zs:
366
+ return x, h_proj
367
+ else:
368
+ return x
369
+
370
+ def get_delta_embed(self, t, r):
371
+ if self.use_new_embed:
372
+ delta_embedder = self.delta_embedder
373
+ else:
374
+ delta_embedder = self.t_embedder
375
+ if self.new_condition == 't-r':
376
+ delta_embed = delta_embedder(t-r)
377
+ elif self.new_condition == 'r':
378
+ delta_embed = delta_embedder(r)
379
+ elif self.new_condition == 't,r':
380
+ delta_embed = self.t_embedder(t) + delta_embedder(r)
381
+ elif self.new_condition == 't,t-r':
382
+ delta_embed = self.t_embedder(t) + delta_embedder(t-r)
383
+ elif self.new_condition == 'r,t-r':
384
+ delta_embed = self.t_embedder(r) + delta_embedder(t-r)
385
+ elif self.new_condition == 't,r,t-r':
386
+ delta_embed = self.t_embedder(t) + self.t_embedder(r) + delta_embedder(t-r)
387
+ else:
388
+ raise NotImplementedError
389
+ return delta_embed
390
+
391
+ def ckpt_wrapper(self, module):
392
+ def ckpt_forward(*inputs):
393
+ outputs = module(*inputs)
394
+ return outputs
395
+ return ckpt_forward
396
+
397
+
398
+ @property
399
+ def dtype(self) -> torch.dtype:
400
+ """
401
+ `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
402
+ """
403
+ return get_parameter_dtype(self)
404
+
405
+
406
+
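For orientation, a hedged smoke test of the class-conditional `TiM` defined above. The sizes are illustrative only; `qk_norm=True` is required because `TiMBlock` reads it from `block_kwargs`, and `attn_type='fused_attn'` avoids flash-attention kernels at call time (the module still imports `flash_attn` at import time, so that dependency must be present):

```python
import torch
from tim.models.c2i.tim_model import TiM

model = TiM(
    input_size=32, patch_size=2, in_channels=4,
    hidden_size=384, depth=12, encoder_depth=4, num_heads=6,
    num_classes=1000, qk_norm=True,
)
x = torch.randn(2, 4, 32, 32)        # latent images
t = torch.rand(2)                    # current timestep in [0, 1]
r = torch.rand(2) * t                # earlier timestep, r <= t
y = torch.randint(0, 1000, (2,))     # class labels
out, zs = model(x, t, r, y, attn_type='fused_attn', return_zs=True)
print(out.shape)   # torch.Size([2, 4, 32, 32])
print(zs.shape)    # projector features, (2, 256, z_dim) with the default z_dim=768
```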
tim/models/nvidia_radio/hubconf.py ADDED
@@ -0,0 +1,192 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ dependencies = ["torch", "timm", "einops"]
10
+
11
+ import os
12
+ from typing import Dict, Any, Optional, Union, List
13
+ import warnings
14
+
15
+ import torch
16
+ from torch.hub import load_state_dict_from_url
17
+
18
+ from timm.models import clean_state_dict
19
+
20
+ from .radio.adaptor_registry import adaptor_registry
21
+ from .radio.common import DEFAULT_VERSION, RadioResource, RESOURCE_MAP
22
+ from .radio.enable_damp import configure_damp_from_args
23
+ from .radio.enable_spectral_reparam import disable_spectral_reparam, configure_spectral_reparam_from_args
24
+ from .radio.feature_normalizer import FeatureNormalizer, IntermediateFeatureNormalizer
25
+ from .radio.radio_model import RADIOModel, create_model_from_args
26
+ from .radio.input_conditioner import get_default_conditioner
27
+ from .radio.vitdet import apply_vitdet_arch, VitDetArgs
28
+
29
+
30
+ def radio_model(
31
+ version: str = "",
32
+ progress: bool = True,
33
+ adaptor_names: Union[str, List[str]] = None,
34
+ vitdet_window_size: Optional[int] = None,
35
+ return_checkpoint: bool = False,
36
+ support_packing: bool=False,
37
+ **kwargs,
38
+ ) -> RADIOModel:
39
+ if not version:
40
+ version = DEFAULT_VERSION
41
+
42
+ if os.path.isfile(version):
43
+ chk = torch.load(version, map_location="cpu", weights_only=False)
44
+ resource = RadioResource(version, patch_size=None, max_resolution=None, preferred_resolution=None)
45
+ else:
46
+ resource = RESOURCE_MAP[version]
47
+ chk = load_state_dict_from_url(
48
+ resource.url, progress=progress, map_location="cpu", weights_only=False,
49
+ )
50
+
51
+ if "state_dict_ema" in chk:
52
+ state_dict = chk["state_dict_ema"]
53
+ chk['args'].spectral_reparam = False
54
+ else:
55
+ state_dict = chk["state_dict"]
56
+
57
+ args = chk["args"]
58
+ args.support_packing = support_packing
59
+ mod = create_model_from_args(args)
60
+
61
+ mod_state_dict = get_prefix_state_dict(state_dict, "base_model.")
62
+
63
+ if args.spectral_reparam:
64
+ configure_spectral_reparam_from_args(mod, args, state_dict_guidance=mod_state_dict)
65
+
66
+ if getattr(args, 'damp', None):
67
+ configure_damp_from_args(mod, args)
68
+
69
+ state_dict = clean_state_dict(state_dict)
70
+
71
+ key_warn = mod.load_state_dict(mod_state_dict, strict=False)
72
+ if key_warn.missing_keys:
73
+ warnings.warn(f'Missing keys in state dict: {key_warn.missing_keys}')
74
+ if key_warn.unexpected_keys:
75
+ warnings.warn(f'Unexpected keys in state dict: {key_warn.unexpected_keys}')
76
+
77
+ if chk['args'].spectral_reparam:
78
+ # Spectral reparametrization uses PyTorch's "parametrizations" API. The idea behind
79
+ # the method is that instead of there being a `weight` tensor for certain Linear layers
80
+ # in the model, we make it a dynamically computed function. During training, this
81
+ # helps stabilize the model. However, for downstream use cases, it shouldn't be necessary.
82
+ # Disabling it in this context means that instead of recomputing `w' = f(w)` on every forward pass, we compute it
83
+ # once, during this function call, and replace the parametrization with the realized weights.
84
+ # This makes the model run faster, and also use less memory.
85
+ disable_spectral_reparam(mod)
86
+ chk['args'].spectral_reparam = False
87
+
88
+ conditioner = get_default_conditioner()
89
+ conditioner.load_state_dict(get_prefix_state_dict(state_dict, "input_conditioner."))
90
+
91
+ dtype = getattr(chk['args'], 'dtype', torch.float32)
92
+ mod.to(dtype=dtype)
93
+ conditioner.dtype = dtype
94
+
95
+ cls_token_per_teacher = getattr(chk['args'], 'cls_token_per_teacher', True)
96
+ if cls_token_per_teacher:
97
+ name_to_idx_map = dict()
98
+ for i, t in enumerate(chk['args'].teachers):
99
+ if t.get('use_summary', True):
100
+ name = t['name']
101
+ if name not in name_to_idx_map:
102
+ name_to_idx_map[name] = i
103
+ summary_idxs = torch.tensor(sorted(name_to_idx_map.values()), dtype=torch.int64)
104
+ else:
105
+ summary_idxs = torch.tensor([0], dtype=torch.int64)
106
+
107
+ if adaptor_names is None:
108
+ adaptor_names = []
109
+ elif isinstance(adaptor_names, str):
110
+ adaptor_names = [adaptor_names]
111
+
112
+ teachers = chk["args"].teachers
113
+ adaptors = dict()
114
+ for adaptor_name in adaptor_names:
115
+ for tidx, tconf in enumerate(teachers):
116
+ if tconf["name"] == adaptor_name:
117
+ break
118
+ else:
119
+ raise ValueError(f'Unable to find the specified adaptor name. Known names: {list(t["name"] for t in teachers)}')
120
+
121
+ ttype = tconf["type"]
122
+
123
+ pf_idx_head = f'_heads.{tidx}'
124
+ pf_name_head = f'_heads.{adaptor_name}'
125
+ pf_idx_feat = f'_feature_projections.{tidx}'
126
+ pf_name_feat = f'_feature_projections.{adaptor_name}'
127
+
128
+ adaptor_state = dict()
129
+ for k, v in state_dict.items():
130
+ if k.startswith(pf_idx_head):
131
+ adaptor_state['summary' + k[len(pf_idx_head):]] = v
132
+ elif k.startswith(pf_name_head):
133
+ adaptor_state['summary' + k[len(pf_name_head):]] = v
134
+ elif k.startswith(pf_idx_feat):
135
+ adaptor_state['feature' + k[len(pf_idx_feat):]] = v
136
+ elif k.startswith(pf_name_feat):
137
+ adaptor_state['feature' + k[len(pf_name_feat):]] = v
138
+
139
+ adaptor = adaptor_registry.create_adaptor(ttype, chk["args"], tconf, adaptor_state)
140
+ adaptor.head_idx = tidx if cls_token_per_teacher else 0
141
+ adaptors[adaptor_name] = adaptor
142
+
143
+ feat_norm_sd = get_prefix_state_dict(state_dict, '_feature_normalizer.')
144
+ feature_normalizer = None
145
+ if feat_norm_sd:
146
+ feature_normalizer = FeatureNormalizer(feat_norm_sd['mean'].shape[0], dtype=dtype)
147
+ feature_normalizer.load_state_dict(feat_norm_sd)
148
+
149
+ inter_feat_norm_sd = get_prefix_state_dict(state_dict, '_intermediate_feature_normalizer.')
150
+ inter_feature_normalizer = None
151
+ if inter_feat_norm_sd:
152
+ inter_feature_normalizer = IntermediateFeatureNormalizer(
153
+ *inter_feat_norm_sd['means'].shape[:2],
154
+ rot_per_layer=inter_feat_norm_sd['rotation'].ndim == 3,
155
+ dtype=dtype
156
+ )
157
+ inter_feature_normalizer.load_state_dict(inter_feat_norm_sd)
158
+
159
+ radio = RADIOModel(
160
+ mod,
161
+ conditioner,
162
+ summary_idxs=summary_idxs,
163
+ patch_size=resource.patch_size,
164
+ max_resolution=resource.max_resolution,
165
+ window_size=vitdet_window_size,
166
+ preferred_resolution=resource.preferred_resolution,
167
+ adaptors=adaptors,
168
+ feature_normalizer=feature_normalizer,
169
+ inter_feature_normalizer=inter_feature_normalizer,
170
+ )
171
+
172
+ if vitdet_window_size is not None:
173
+ apply_vitdet_arch(
174
+ mod,
175
+ VitDetArgs(
176
+ vitdet_window_size,
177
+ radio.num_summary_tokens,
178
+ num_windowed=resource.vitdet_num_windowed,
179
+ num_global=resource.vitdet_num_global,
180
+ ),
181
+ )
182
+
183
+ if return_checkpoint:
184
+ return radio, chk
185
+ return radio
186
+
187
+
188
+ def get_prefix_state_dict(state_dict: Dict[str, Any], prefix: str):
189
+ mod_state_dict = {
190
+ k[len(prefix) :]: v for k, v in state_dict.items() if k.startswith(prefix)
191
+ }
192
+ return mod_state_dict
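The helper at the bottom is what carves the monolithic checkpoint into per-module state dicts; a tiny illustration (the keys and values below are fabricated, real values are tensors):

```python
state_dict = {
    "base_model.blocks.0.attn.qkv.weight": "w0",
    "base_model.blocks.0.attn.qkv.bias": "b0",
    "input_conditioner.norm_mean": "m",
}
base_sd = get_prefix_state_dict(state_dict, "base_model.")
# {'blocks.0.attn.qkv.weight': 'w0', 'blocks.0.attn.qkv.bias': 'b0'}
```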
tim/models/nvidia_radio/radio/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ # Register the adaptors
10
+ from .adaptor_registry import adaptor_registry
11
+ from . import open_clip_adaptor
12
+ from .adaptor_base import AdaptorInput, RadioOutput, AdaptorBase
13
+
14
+ # Enable support for other model types via the timm register_model mechanism
15
+ from . import extra_timm_models
16
+ from . import extra_models
17
+ from . import vision_transformer_xpos
tim/models/nvidia_radio/radio/adaptor_base.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+ from argparse import Namespace
9
+ from typing import NamedTuple, Optional
10
+
11
+ import torch
12
+ from torch import nn
13
+ import torch.nn.functional as F
14
+
15
+
16
+ class AdaptorInput(NamedTuple):
17
+ images: torch.Tensor
18
+ summary: torch.Tensor
19
+ features: torch.Tensor
20
+ feature_fmt: str
21
+ patch_size: int
22
+
23
+
24
+ class RadioOutput(NamedTuple):
25
+ summary: torch.Tensor
26
+ features: torch.Tensor
27
+
28
+ def to(self, *args, **kwargs):
29
+ return RadioOutput(
30
+ self.summary.to(*args, **kwargs) if self.summary is not None else None,
31
+ self.features.to(*args, **kwargs) if self.features is not None else None,
32
+ )
33
+
34
+
35
+ class AdaptorBase(nn.Module):
36
+ def forward(self, input: AdaptorInput) -> RadioOutput:
37
+ raise NotImplementedError("Subclasses must implement this!")
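To make the contract concrete, here is a hypothetical no-op adaptor (not part of this repo): an adaptor receives an `AdaptorInput` and returns a `RadioOutput` pair of summary and feature tensors.

```python
from tim.models.nvidia_radio.radio.adaptor_base import AdaptorBase, AdaptorInput, RadioOutput

class IdentityAdaptor(AdaptorBase):
    """Illustrative adaptor that passes the backbone outputs through unchanged."""
    def forward(self, input: AdaptorInput) -> RadioOutput:
        return RadioOutput(input.summary, input.features)
```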
tim/models/nvidia_radio/radio/adaptor_generic.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+ from argparse import Namespace
9
+
10
+ import torch
11
+ from torch import nn
12
+ import torch.nn.functional as F
13
+
14
+ from .adaptor_base import AdaptorBase, AdaptorInput, RadioOutput
15
+ from .adaptor_mlp import create_mlp_from_state, create_mlp_from_config
16
+
17
+
18
+ class GenericAdaptor(AdaptorBase):
19
+ def __init__(self, main_config: Namespace, adaptor_config, state, mlp_config=None):
20
+ super().__init__()
21
+
22
+ extra_args = dict()
23
+ ups = None
24
+ ups_rank = None
25
+ if adaptor_config is not None:
26
+ ups = adaptor_config.get('fd_upsample_factor', None)
27
+ ups_rank = adaptor_config.get('fd_upsample_rank', None)
28
+ elif mlp_config is not None:
29
+ ups = mlp_config["feature"].get('upsample_factor', None)
30
+ ups_rank = mlp_config["feature"].get('upsample_rank', None)
31
+ if ups is not None:
32
+ extra_args['upsample_factor'] = ups
33
+ extra_args['upsample_rank'] = ups_rank
34
+
35
+ if state is not None:
36
+ spectral_heads = getattr(main_config, 'spectral_heads', False)
37
+ self.head_mlp = create_mlp_from_state(main_config.mlp_version, state, 'summary.', spectral_weights=spectral_heads)
38
+ self.feat_mlp = create_mlp_from_state(main_config.mlp_version, state, 'feature.', spectral_weights=spectral_heads, **extra_args)
39
+ else:
40
+ assert mlp_config is not None, "Config must not be None if state is None"
41
+
42
+ self.head_mlp = create_mlp_from_config(
43
+ main_config.mlp_version,
44
+ mlp_config["summary"]["input_dim"],
45
+ mlp_config["summary"]["hidden_dim"],
46
+ mlp_config["summary"]["output_dim"],
47
+ mlp_config["summary"]["num_inner"],
48
+ )
49
+ self.feat_mlp = create_mlp_from_config(
50
+ main_config.mlp_version,
51
+ mlp_config["feature"]["input_dim"],
52
+ mlp_config["feature"]["hidden_dim"],
53
+ mlp_config["feature"]["output_dim"],
54
+ mlp_config["feature"]["num_inner"],
55
+ **extra_args
56
+ )
57
+
58
+ def forward(self, input: AdaptorInput) -> RadioOutput:
59
+ # Convert the input's dtype to the dtype of the adaptor's first parameter.
60
+ first_param = next(self.parameters())
61
+ summary = self.head_mlp(input.summary.to(dtype=first_param.dtype)).to(dtype=input.summary.dtype)
62
+ feat = self.feat_mlp(input.features.to(dtype=first_param.dtype), images=input.images, patch_size=input.patch_size).to(dtype=input.features.dtype)
63
+
64
+ if input.feature_fmt == 'NCHW':
65
+ feat = (feat.reshape(feat.shape[0], input.images.shape[-2] // input.patch_size * self.feat_mlp.upsample_factor, input.images.shape[-1] // input.patch_size * self.feat_mlp.upsample_factor, feat.shape[2])
66
+ .permute(0, 3, 1, 2)
67
+ )
68
+
69
+ return RadioOutput(summary, feat)
tim/models/nvidia_radio/radio/adaptor_mlp.py ADDED
@@ -0,0 +1,174 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+ import math
9
+ from typing import Dict, Optional
10
+
11
+ import torch
12
+ from torch import nn
13
+
14
+ from einops import rearrange
15
+ from timm.models.vision_transformer import Block
16
+
17
+ from .enable_spectral_reparam import disable_spectral_reparam, enable_spectral_reparam
18
+
19
+
20
+ class MLP(nn.Module):
21
+ def __init__(self, input_size: int, hidden_size: int, output_size: int,
22
+ num_inner: int = 0, device: torch.device = None, **kwargs):
23
+ super(MLP, self).__init__()
24
+ self.fc1 = nn.Linear(input_size, hidden_size, device=device)
25
+ self.norm = nn.LayerNorm(hidden_size, device=device)
26
+ self.relu = nn.ReLU()
27
+
28
+ inner = []
29
+ for _ in range(num_inner):
30
+ inner.extend([
31
+ nn.Linear(hidden_size, hidden_size, device=device),
32
+ nn.LayerNorm(hidden_size, device=device),
33
+ nn.ReLU(),
34
+ ])
35
+ if inner:
36
+ self.inner = nn.Sequential(*inner)
37
+ else:
38
+ self.inner = nn.Identity()
39
+
40
+ self.fc2 = nn.Linear(hidden_size, output_size, device=device)
41
+
42
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
43
+ x = self.fc1(x)
44
+ x = self.norm(x)
45
+ x = self.relu(x)
46
+ x = self.inner(x)
47
+ x = self.fc2(x)
48
+ return x
49
+
50
+
51
+ class MLP2(nn.Module):
52
+ def __init__(self, input_size: int, hidden_size: int, output_size: int,
53
+ num_inner: int = 0,
54
+ pre_norm: bool = False, device: torch.device = None,
55
+ upsample_factor: int = 1,
56
+ upsample_rank: int = None,
57
+ from_config: bool = False,
58
+ **kwargs):
59
+ super().__init__()
60
+
61
+ self.pre_norm = nn.Sequential(
62
+ nn.LayerNorm(input_size),
63
+ nn.GELU(),
64
+ ) if pre_norm else nn.Identity()
65
+
66
+ self.upsample_factor = upsample_factor
67
+ sq_ups = upsample_factor ** 2
68
+
69
+ self._real_output_dim = output_size // sq_ups
70
+
71
+ # hidden_size *= upsample_factor
72
+ # output_size *= (upsample_factor ** 2)
73
+
74
+ self.fc1 = nn.Linear(input_size, hidden_size, device=device)
75
+
76
+ blocks = []
77
+ for _ in range(num_inner):
78
+ blocks.append(nn.Sequential(
79
+ nn.LayerNorm(hidden_size, device=device),
80
+ nn.GELU(),
81
+ nn.Linear(hidden_size, hidden_size, device=device),
82
+ ))
83
+ self.blocks = nn.ModuleList(blocks)
84
+
85
+ self.final = nn.Sequential(
86
+ nn.LayerNorm(hidden_size, device=device),
87
+ nn.GELU(),
88
+ nn.Linear(hidden_size, output_size, device=device),
89
+ )
90
+
91
+ def forward(self, x: torch.Tensor, images: Optional[torch.Tensor] = None, patch_size: Optional[int] = None) -> torch.Tensor:
92
+ x = self.pre_norm(x)
93
+ x = self.fc1(x)
94
+ for block in self.blocks:
95
+ x = x + block(x)
96
+ x = self.final(x)
97
+
98
+ if self.upsample_factor > 1:
99
+ if images is None:
100
+ raise ValueError(f'`images` cannot be `None` when the head\'s `upsample_factor > 1`!')
101
+ if patch_size is None:
102
+ raise ValueError(f'`patch_size` cannot be `None` when the head\'s `upsample_factor > 1`!')
103
+ h, w = tuple(d // patch_size for d in images.shape[-2:])
104
+ x = rearrange(x, 'b (h w) (u1 u2 c) -> b (h u1 w u2) c',
105
+ h=h, w=w, u1=self.upsample_factor, u2=self.upsample_factor,
106
+ c=self._real_output_dim)
107
+
108
+ return x
109
+
110
+
111
+ MLP_FACTORY = {
112
+ 'v1': MLP,
113
+ 'v2': MLP2,
114
+ }
115
+
116
+
117
+ def strip_prefix(state: Dict[str, torch.Tensor], prefix: str):
118
+ state = {
119
+ k[len(prefix):]: v
120
+ for k, v in state.items()
121
+ if k.startswith(prefix)
122
+ }
123
+ return state
124
+
125
+
126
+ def get_mlp_info_from_state(version: str, state: Dict[str, torch.Tensor], prefix: str = '', spectral_weights: bool = False):
127
+ state = strip_prefix(state, prefix)
128
+
129
+ weight_suffix = 'weight' if not spectral_weights else 'parametrizations.weight.original'
130
+
131
+ if version == 'v1':
132
+ hidden_dim, input_dim = state[f'fc1.{weight_suffix}'].shape
133
+ output_dim = state[f'fc2.{weight_suffix}'].shape[0]
134
+
135
+ for num_inner in range(1000):
136
+ k = f'inner.{num_inner}.0.weight'
137
+ if k not in state:
138
+ break
139
+ elif version == 'v2':
140
+ hidden_dim, input_dim = state[f'fc1.{weight_suffix}'].shape
141
+ output_dim = state[f'final.2.{weight_suffix}'].shape[0]
142
+
143
+ for num_inner in range(1000):
144
+ k = f'blocks.{num_inner}.0.weight'
145
+ if k not in state:
146
+ break
147
+ else:
148
+ raise ValueError(f'Unsupported MLP version: {version}')
149
+
150
+ return input_dim, hidden_dim, output_dim, num_inner
151
+
152
+
153
+ def create_mlp_from_config(version: str, input_dim: int, hidden_dim: int, output_dim: int, num_inner: int, **kwargs):
154
+ ret: nn.Module = MLP_FACTORY[version](input_dim, hidden_dim, output_dim, num_inner, from_config=True, **kwargs)
155
+
156
+ return ret
157
+
158
+
159
+ def create_mlp_from_state(version: str, state: Dict[str, torch.Tensor], prefix: str = '', spectral_weights: bool = False, **kwargs):
160
+ state = strip_prefix(state, prefix)
161
+
162
+ input_dim, hidden_dim, output_dim, num_inner = get_mlp_info_from_state(version, state, spectral_weights=spectral_weights)
163
+
164
+ ret: nn.Module = create_mlp_from_config(version, input_dim, hidden_dim, output_dim, num_inner, **kwargs)
165
+
166
+ if spectral_weights:
167
+ enable_spectral_reparam(ret, init_norm_to_current=False, state_dict_guidance=state)
168
+
169
+ ret.load_state_dict(state)
170
+
171
+ if spectral_weights:
172
+ disable_spectral_reparam(ret)
173
+
174
+ return ret
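A hedged usage sketch for `create_mlp_from_config` with made-up dimensions; with the default `upsample_factor=1`, the v2 head can be called without `images`/`patch_size`:

```python
import torch
from tim.models.nvidia_radio.radio.adaptor_mlp import create_mlp_from_config

head = create_mlp_from_config('v2', input_dim=1280, hidden_dim=2048,
                              output_dim=1024, num_inner=2)
tokens = torch.randn(2, 196, 1280)   # (batch, patches, backbone dim)
out = head(tokens)
print(out.shape)                     # torch.Size([2, 196, 1024])
```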
tim/models/nvidia_radio/radio/adaptor_registry.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+ from argparse import Namespace
9
+ from typing import Dict, Any
10
+
11
+ import torch
12
+
13
+ from .adaptor_generic import GenericAdaptor, AdaptorBase
14
+
15
+ dict_t = Dict[str, Any]
16
+ state_t = Dict[str, torch.Tensor]
17
+
18
+
19
+ class AdaptorRegistry:
20
+ def __init__(self):
21
+ self._registry = {}
22
+
23
+ def register_adaptor(self, name):
24
+ def decorator(factory_function):
25
+ if name in self._registry:
26
+ raise ValueError(f"Model '{name}' already registered")
27
+ self._registry[name] = factory_function
28
+ return factory_function
29
+ return decorator
30
+
31
+ def create_adaptor(self, name, main_config: Namespace, adaptor_config: dict_t, state: state_t) -> AdaptorBase:
32
+ if name not in self._registry:
33
+ return GenericAdaptor(main_config, adaptor_config, state)
34
+ return self._registry[name](main_config, adaptor_config, state)
35
+
36
+ # Creating an instance of the registry
37
+ adaptor_registry = AdaptorRegistry()
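Unknown adaptor names fall back to `GenericAdaptor`, so explicit registration is only needed for special behaviour. A hypothetical registration (the `"my_teacher"` name is illustrative):

```python
from tim.models.nvidia_radio.radio.adaptor_generic import GenericAdaptor
from tim.models.nvidia_radio.radio.adaptor_registry import adaptor_registry

@adaptor_registry.register_adaptor("my_teacher")
def build_my_adaptor(main_config, adaptor_config, state):
    # For illustration, simply delegate to the generic MLP adaptor.
    return GenericAdaptor(main_config, adaptor_config, state)
```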
tim/models/nvidia_radio/radio/block.py ADDED
@@ -0,0 +1,54 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Block modules
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from timm.models.layers import DropPath
9
+
10
+ from .conv import Conv
11
+ # from .transformer import TransformerBlock
12
+
13
+ __all__ = ('C2f', 'Bottleneck',)
14
+
15
+ class C2f(nn.Module):
16
+ """Faster Implementation of CSP Bottleneck with 2 convolutions."""
17
+
18
+ def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, drop_path=None): # ch_in, ch_out, number, shortcut, groups, expansion
19
+ super().__init__()
20
+ if drop_path is None:
21
+ drop_path = [0.0] * n
22
+
23
+ self.c = int(c2 * e) # hidden channels
24
+ self.cv1 = Conv(c1, 2 * self.c, 1, 1)
25
+ self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2)
26
+ self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0, drop_path=drop_path[i]) for i in range(n))
27
+
28
+ def forward(self, x):
29
+ """Forward pass through C2f layer."""
30
+ y = list(self.cv1(x).chunk(2, 1))
31
+ y.extend(m(y[-1]) for m in self.m)
32
+ return self.cv2(torch.cat(y, 1))
33
+
34
+ def forward_split(self, x):
35
+ """Forward pass using split() instead of chunk()."""
36
+ y = list(self.cv1(x).split((self.c, self.c), 1))
37
+ y.extend(m(y[-1]) for m in self.m)
38
+ return self.cv2(torch.cat(y, 1))
39
+
40
+
41
+ class Bottleneck(nn.Module):
42
+ """Standard bottleneck."""
43
+
44
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5, drop_path=0.0): # ch_in, ch_out, shortcut, groups, kernels, expand
45
+ super().__init__()
46
+ c_ = int(c2 * e) # hidden channels
47
+ self.cv1 = Conv(c1, c_, k[0], 1)
48
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
49
+ self.add = shortcut and c1 == c2
50
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
51
+
52
+ def forward(self, x):
53
+ """'forward()' applies the YOLOv5 FPN to input data."""
54
+ return x + self.drop_path1(self.cv2(self.cv1(x))) if self.add else self.cv2(self.cv1(x))
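A quick shape check for the vendored `C2f` block, assuming `Conv` from the sibling `conv.py` is importable; channels in equals channels out here, so the bottleneck shortcuts are active:

```python
import torch
from tim.models.nvidia_radio.radio.block import C2f

blk = C2f(c1=64, c2=64, n=2, shortcut=True)
x = torch.randn(1, 64, 32, 32)
print(blk(x).shape)   # torch.Size([1, 64, 32, 32])
```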
tim/models/nvidia_radio/radio/cls_token.py ADDED
@@ -0,0 +1,59 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+ from typing import Optional
9
+
10
+ import torch
11
+ from torch import nn
12
+
13
+
14
+ class ClsToken(nn.Module):
15
+ def __init__(self, ndim: int,
16
+ num_tokens: int = 1,
17
+ enabled: bool = True,
18
+ register_multiple: Optional[int] = None,
19
+ num_registers: Optional[int] = None,
20
+ ):
21
+ super().__init__()
22
+
23
+ self.ndim = ndim
24
+ self.enabled = enabled
25
+ self.num_registers = 0
26
+ self.num_tokens = num_tokens
27
+ if enabled:
28
+ if num_registers:
29
+ self.num_registers = num_registers
30
+ elif register_multiple:
31
+ self.num_registers = register_multiple - (num_tokens % register_multiple)
32
+
33
+ scale = ndim ** -0.5
34
+ self.token = nn.Parameter(torch.randn(num_tokens + self.num_registers, ndim) * scale)
35
+ else:
36
+ self.token = None
37
+
38
+ self.num_patches = self.num_tokens + self.num_registers
39
+
40
+ def disable(self):
41
+ self.token = None
42
+ self.enabled = False
43
+
44
+ def forward(self, x: torch.Tensor):
45
+ if self.token is None:
46
+ return x
47
+
48
+ token = self.token.unsqueeze(0).expand(x.shape[0], -1, -1)
49
+ x = torch.cat([
50
+ token,
51
+ x,
52
+ ], dim=1)
53
+
54
+ return x
55
+
56
+ def no_weight_decay(self):
57
+ return [
58
+ 'token',
59
+ ]
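Shape-level sketch of `ClsToken`: with one class token and `register_multiple=8`, seven registers are added so the prepended block is eight tokens wide (numbers are illustrative):

```python
import torch
from tim.models.nvidia_radio.radio.cls_token import ClsToken

cls = ClsToken(ndim=768, num_tokens=1, register_multiple=8)
patches = torch.randn(2, 196, 768)
out = cls(patches)
print(cls.num_patches, out.shape)   # 8 torch.Size([2, 204, 768])
```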
tim/models/nvidia_radio/radio/common.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ from dataclasses import dataclass
10
+ from typing import Optional
11
+
12
+ from .radio_model import Resolution
13
+
14
+
15
+ @dataclass
16
+ class RadioResource:
17
+ url: str
18
+ patch_size: int
19
+ max_resolution: int
20
+ preferred_resolution: Resolution
21
+ vitdet_num_windowed: Optional[int] = None
22
+ vitdet_num_global: Optional[int] = None
23
+
24
+
25
+ RESOURCE_MAP = {
26
+ # RADIOv2.5
27
+ "radio_v2.5-b": RadioResource(
28
+ "https://huggingface.co/nvidia/RADIO/resolve/main/radio-v2.5-b_half.pth.tar?download=true",
29
+ patch_size=16,
30
+ max_resolution=2048,
31
+ preferred_resolution=(768, 768),
32
+ vitdet_num_global=4,
33
+ ),
34
+ "radio_v2.5-l": RadioResource(
35
+ "https://huggingface.co/nvidia/RADIO/resolve/main/radio-v2.5-l_half.pth.tar?download=true",
36
+ patch_size=16,
37
+ max_resolution=2048,
38
+ preferred_resolution=(768, 768),
39
+ vitdet_num_global=4,
40
+ ),
41
+ "radio_v2.5-h": RadioResource(
42
+ "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.5-h.pth.tar?download=true",
43
+ patch_size=16,
44
+ max_resolution=2048,
45
+ preferred_resolution=(768, 768),
46
+ vitdet_num_global=4,
47
+ ),
48
+ "radio_v2.5-h-norm": RadioResource(
49
+ "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.5-h-norm.pth.tar?download=true",
50
+ patch_size=16,
51
+ max_resolution=2048,
52
+ preferred_resolution=(768, 768),
53
+ vitdet_num_global=4,
54
+ ),
55
+ "radio_v2.5-g": RadioResource(
56
+ "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.5-g.pth.tar?download=true",
57
+ patch_size=14,
58
+ max_resolution=1792,
59
+ preferred_resolution=(896, 896),
60
+ vitdet_num_global=8,
61
+ ),
62
+ # RADIO
63
+ "radio_v2.1": RadioResource(
64
+ "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.1_bf16.pth.tar?download=true",
65
+ patch_size=16,
66
+ max_resolution=2048,
67
+ preferred_resolution=Resolution(432, 432),
68
+ vitdet_num_windowed=5,
69
+ ),
70
+ "radio_v2": RadioResource(
71
+ "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.pth.tar?download=true",
72
+ patch_size=16,
73
+ max_resolution=2048,
74
+ preferred_resolution=Resolution(432, 432),
75
+ vitdet_num_windowed=5,
76
+ ),
77
+ "radio_v1": RadioResource(
78
+ "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v1.pth.tar?download=true",
79
+ patch_size=14,
80
+ max_resolution=1050,
81
+ preferred_resolution=Resolution(378, 378),
82
+ ),
83
+ # E-RADIO
84
+ "e-radio_v2": RadioResource(
85
+ "https://huggingface.co/nvidia/RADIO/resolve/main/eradio_v2.pth.tar?download=true",
86
+ patch_size=16,
87
+ max_resolution=2048,
88
+ preferred_resolution=Resolution(512, 512),
89
+ ),
90
+ # C-RADIO
91
+ "c-radio_v2.5-g": RadioResource(
92
+ "https://huggingface.co/nvidia/C-RADIOv2-g/resolve/main/c-radio_v2-g_half.pth.tar",
93
+ patch_size=16,
94
+ max_resolution=2048,
95
+ preferred_resolution=(768, 768),
96
+ vitdet_num_global=8,
97
+ ),
98
+ "c-radio_v3-l": RadioResource(
99
+ # NOTE: Currently, this model cannot be loaded via TorchHub. Instead, use the transformers API at https://huggingface.co/nvidia/C-RADIOv3-L
100
+ # and accept the license terms.
101
+ "https://huggingface.co/nvidia/C-RADIOv3-L/resolve/main/c-radio-v3_l_half.pth.tar?download=true",
102
+ patch_size=16,
103
+ max_resolution=2048,
104
+ preferred_resolution=Resolution(512, 512),
105
+ ),
106
+ }
107
+
108
+ DEFAULT_VERSION = "radio_v2.5-h"
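The map above is only metadata; weights are downloaded lazily by `radio_model()`. A minimal lookup of the default entry (assuming the package and its dependencies import cleanly):

```python
from tim.models.nvidia_radio.radio.common import RESOURCE_MAP, DEFAULT_VERSION

res = RESOURCE_MAP[DEFAULT_VERSION]
print(DEFAULT_VERSION, res.patch_size, res.preferred_resolution)
# radio_v2.5-h 16 (768, 768)
```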
tim/models/nvidia_radio/radio/conv.py ADDED
@@ -0,0 +1,65 @@
1
+ # Ultralytics YOLO 🚀, AGPL-3.0 license
2
+ """
3
+ Convolution modules
4
+ """
5
+
6
+ import math
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ __all__ = ('Conv', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv',
13
+ 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv')
14
+
15
+
16
+ def autopad(k, p=None, d=1): # kernel, padding, dilation
17
+ """Pad to 'same' shape outputs."""
18
+ if d > 1:
19
+ k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size
20
+ if p is None:
21
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
22
+ return p
23
+
24
+ # Pavlo's implementation with switch to deploy
25
+ class Conv(nn.Module):
26
+ default_act = nn.SiLU() # default activation
27
+
28
+ def __init__(self, a, b, kernel_size=1, stride=1, padding=None, g=1, dilation=1, bn_weight_init=1, bias=False, act=True):
29
+ super().__init__()
30
+
31
+ self.conv = torch.nn.Conv2d(a, b, kernel_size, stride, autopad(kernel_size, padding, dilation), dilation, g, bias=False)
32
+ if 1:
33
+ self.bn = torch.nn.BatchNorm2d(b)
34
+ torch.nn.init.constant_(self.bn.weight, bn_weight_init)
35
+ torch.nn.init.constant_(self.bn.bias, 0)
36
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
37
+
38
+
39
+ def forward(self,x):
40
+ x = self.conv(x)
41
+ x = self.bn(x)
42
+ x = self.act(x)
43
+ return x
44
+
45
+ @torch.no_grad()
46
+ def switch_to_deploy(self):
47
+ if not isinstance(self.bn, nn.Identity):
48
+ # return 1
49
+ c, bn = self.conv, self.bn
50
+ w = bn.weight / (bn.running_var + bn.eps) ** 0.5
51
+ w = c.weight * w[:, None, None, None]
52
+ b = bn.bias - bn.running_mean * bn.weight / \
53
+ (bn.running_var + bn.eps)**0.5
54
+ # m = torch.nn.Conv2d(w.size(1) * c.groups,
55
+ # w.size(0),
56
+ # w.shape[2:],
57
+ # stride=c.stride,
58
+ # padding=c.padding,
59
+ # dilation=c.dilation,
60
+ # groups=c.groups)
61
+ self.conv.weight.data.copy_(w)
62
+ self.conv.bias = nn.Parameter(b)
63
+ # self.conv.bias.data.copy_(b)
64
+ # self.conv = m.to(c.weight.device)
65
+ self.bn = nn.Identity()
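A hedged sanity check for `Conv.switch_to_deploy()`: in eval mode, folding the BatchNorm statistics into the convolution should leave outputs unchanged up to floating-point error (the sizes below are arbitrary):

```python
import torch
from tim.models.nvidia_radio.radio.conv import Conv

m = Conv(16, 32, kernel_size=3).eval()
x = torch.randn(1, 16, 8, 8)
with torch.no_grad():
    before = m(x)
    m.switch_to_deploy()     # folds BN into the conv weight and bias
    after = m(x)
print(torch.allclose(before, after, atol=1e-5))   # True
```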
tim/models/nvidia_radio/radio/dinov2_arch.py ADDED
@@ -0,0 +1,1016 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ # Nvidia
11
+ # NOTE: We re-define this model architecture primarily so that we don't have to worry about version compatibility breaking,
12
+ # but also because Huggingface does a string replace of `gamma` to something else when loading the model state,
13
+ # and this breaks loading of this model.
14
+
15
+ from enum import Enum
16
+ from functools import partial
17
+ import logging
18
+ import math
19
+ import os
20
+ import sys
21
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
22
+ import warnings
23
+
24
+ import torch
25
+ from torch import nn
26
+ from torch.nn import functional as F
27
+ from torch.nn.init import trunc_normal_
28
+
29
+ _torch_has_sdpa = hasattr(F, 'scaled_dot_product_attention')
30
+
31
+
32
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
33
+ try:
34
+ if XFORMERS_ENABLED:
35
+ from xformers.ops import fmha, scaled_index_add, index_select_cat, SwiGLU, memory_efficient_attention, unbind
36
+
37
+ XFORMERS_AVAILABLE = True
38
+ else:
39
+ raise ImportError
40
+ except ImportError:
41
+ XFORMERS_AVAILABLE = False
42
+
43
+
44
+ def make_2tuple(x):
45
+ if isinstance(x, tuple):
46
+ assert len(x) == 2
47
+ return x
48
+
49
+ assert isinstance(x, int)
50
+ return (x, x)
51
+
52
+
53
+ class PatchEmbed(nn.Module):
54
+ """
55
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
56
+
57
+ Args:
58
+ img_size: Image size.
59
+ patch_size: Patch token size.
60
+ in_chans: Number of input image channels.
61
+ embed_dim: Number of linear projection output channels.
62
+ norm_layer: Normalization layer.
63
+ """
64
+
65
+ def __init__(
66
+ self,
67
+ img_size: Union[int, Tuple[int, int]] = 224,
68
+ patch_size: Union[int, Tuple[int, int]] = 16,
69
+ in_chans: int = 3,
70
+ embed_dim: int = 768,
71
+ norm_layer: Optional[Callable] = None,
72
+ flatten_embedding: bool = True,
73
+ ) -> None:
74
+ super().__init__()
75
+
76
+ image_HW = make_2tuple(img_size)
77
+ patch_HW = make_2tuple(patch_size)
78
+ patch_grid_size = (
79
+ image_HW[0] // patch_HW[0],
80
+ image_HW[1] // patch_HW[1],
81
+ )
82
+
83
+ self.img_size = image_HW
84
+ self.patch_size = patch_HW
85
+ self.patches_resolution = patch_grid_size
86
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
87
+
88
+ self.in_chans = in_chans
89
+ self.embed_dim = embed_dim
90
+
91
+ self.flatten_embedding = flatten_embedding
92
+
93
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
94
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
95
+
96
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
97
+ _, _, H, W = x.shape
98
+ patch_H, patch_W = self.patch_size
99
+
100
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
101
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
102
+
103
+ x = self.proj(x) # B C H W
104
+ H, W = x.size(2), x.size(3)
105
+ x = x.flatten(2).transpose(1, 2) # B HW C
106
+ x = self.norm(x)
107
+ if not self.flatten_embedding:
108
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
109
+ return x
110
+
111
+ def flops(self) -> float:
112
+ Ho, Wo = self.patches_resolution
113
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
114
+ if self.norm is not None:
115
+ flops += Ho * Wo * self.embed_dim
116
+ return flops
117
+
118
+
119
+ class Attention(nn.Module):
120
+ def __init__(
121
+ self,
122
+ dim: int,
123
+ num_heads: int = 8,
124
+ qkv_bias: bool = False,
125
+ proj_bias: bool = True,
126
+ attn_drop: float = 0.0,
127
+ proj_drop: float = 0.0,
128
+ ) -> None:
129
+ super().__init__()
130
+ self.num_heads = num_heads
131
+ head_dim = dim // num_heads
132
+ self.scale = head_dim**-0.5
133
+
134
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
135
+ self.attn_drop = nn.Dropout(attn_drop)
136
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
137
+ self.proj_drop = nn.Dropout(proj_drop)
138
+
139
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
140
+ B, N, C = x.shape
141
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
142
+
143
+ q, k, v = qkv[0], qkv[1], qkv[2]
144
+ if _torch_has_sdpa:
145
+ x = F.scaled_dot_product_attention(
146
+ q, k, v,
147
+ is_causal=False,
148
+ dropout_p=self.attn_drop.p if self.training else 0.,
149
+ scale=self.scale,
150
+ )
151
+ else:
152
+ q = q * self.scale
153
+ attn = q @ k.transpose(-2, -1)
154
+
155
+ attn = attn.softmax(dim=-1)
156
+ attn = self.attn_drop(attn)
157
+ x = attn @ v
158
+
159
+ x = x.transpose(1, 2).reshape(B, N, C)
160
+ x = self.proj(x)
161
+ x = self.proj_drop(x)
162
+ return x
163
+
164
+
165
+ class MemEffAttention(Attention):
166
+ def forward(self, x: torch.Tensor, attn_bias=None) -> torch.Tensor:
167
+ if not XFORMERS_AVAILABLE:
168
+ if attn_bias is not None:
169
+ raise AssertionError("xFormers is required for using nested tensors")
170
+ return super().forward(x)
171
+
172
+ B, N, C = x.shape
173
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
174
+
175
+ q, k, v = unbind(qkv, 2)
176
+
177
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
178
+ x = x.reshape([B, N, C])
179
+
180
+ x = self.proj(x)
181
+ x = self.proj_drop(x)
182
+ return x
183
+
184
+
185
+ class Mlp(nn.Module):
186
+ def __init__(
187
+ self,
188
+ in_features: int,
189
+ hidden_features: Optional[int] = None,
190
+ out_features: Optional[int] = None,
191
+ act_layer: Callable[..., nn.Module] = nn.GELU,
192
+ drop: float = 0.0,
193
+ bias: bool = True,
194
+ ) -> None:
195
+ super().__init__()
196
+ out_features = out_features or in_features
197
+ hidden_features = hidden_features or in_features
198
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
199
+ self.act = act_layer()
200
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
201
+ self.drop = nn.Dropout(drop)
202
+
203
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
204
+ x = self.fc1(x)
205
+ x = self.act(x)
206
+ x = self.drop(x)
207
+ x = self.fc2(x)
208
+ x = self.drop(x)
209
+ return x
210
+
211
+
212
+ class SwiGLUFFN(nn.Module):
213
+ def __init__(
214
+ self,
215
+ in_features: int,
216
+ hidden_features: Optional[int] = None,
217
+ out_features: Optional[int] = None,
218
+ act_layer: Callable[..., nn.Module] = None,
219
+ drop: float = 0.0,
220
+ bias: bool = True,
221
+ ) -> None:
222
+ super().__init__()
223
+ out_features = out_features or in_features
224
+ hidden_features = hidden_features or in_features
225
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
226
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
227
+
228
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
229
+ x12 = self.w12(x)
230
+ x1, x2 = x12.chunk(2, dim=-1)
231
+ hidden = F.silu(x1) * x2
232
+ return self.w3(hidden)
233
+
234
+
235
+ if not XFORMERS_AVAILABLE:
236
+ SwiGLU = SwiGLUFFN
237
+
238
+
239
+ class SwiGLUFFNFused(SwiGLU):
240
+ def __init__(
241
+ self,
242
+ in_features: int,
243
+ hidden_features: Optional[int] = None,
244
+ out_features: Optional[int] = None,
245
+ act_layer: Callable[..., nn.Module] = None,
246
+ drop: float = 0.0,
247
+ bias: bool = True,
248
+ ) -> None:
249
+ out_features = out_features or in_features
250
+ hidden_features = hidden_features or in_features
251
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
252
+ super().__init__(
253
+ in_features=in_features,
254
+ hidden_features=hidden_features,
255
+ out_features=out_features,
256
+ bias=bias,
257
+ )
258
+
259
+
260
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
261
+ if drop_prob == 0.0 or not training:
262
+ return x
263
+ keep_prob = 1 - drop_prob
264
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
265
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
266
+ if keep_prob > 0.0:
267
+ random_tensor.div_(keep_prob)
268
+ output = x * random_tensor
269
+ return output
270
+
271
+
272
+ class DropPath(nn.Module):
273
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
274
+
275
+ def __init__(self, drop_prob=None):
276
+ super(DropPath, self).__init__()
277
+ self.drop_prob = drop_prob
278
+
279
+ def forward(self, x):
280
+ return drop_path(x, self.drop_prob, self.training)
281
+
282
+
283
+ class LayerScale(nn.Module):
284
+ def __init__(
285
+ self,
286
+ dim: int,
287
+ init_values: Union[float, torch.Tensor] = 1e-5,
288
+ inplace: bool = False,
289
+ ) -> None:
290
+ super().__init__()
291
+ self.inplace = inplace
292
+ self.grandma = nn.Parameter(init_values * torch.ones(dim))
293
+
294
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
295
+ return x.mul_(self.grandma) if self.inplace else x * self.grandma
296
+
297
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
298
+ # Huggingface is absurd and it will rename strings that contain `gamma`, which means that the normal DINO implementation
299
+ # of LayerScale won't work with HFHub. So we rename the variable to 'grandma', and support loading checkpoints in either
300
+ # format
301
+ key_a = f'{prefix}gamma'
302
+ key_b = f'{prefix}grandma'
303
+ if key_a in state_dict:
304
+ gamma = state_dict[key_a]
305
+ elif key_b in state_dict:
306
+ gamma = state_dict[key_b]
307
+ else:
308
+ if strict:
309
+ raise KeyError(f"Couldn't find the key {key_a} nor {key_b} in the state dict!")
310
+ else:
311
+ missing_keys.append(key_a)
312
+ missing_keys.append(key_b)
313
+ unexpected_keys.extend(state_dict.keys())
314
+ gamma = None
315
+
316
+ if gamma is not None:
317
+ self.grandma.data.copy_(gamma)
318
+
319
+ # return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
320
+
321
+
322
+ class Block(nn.Module):
323
+ def __init__(
324
+ self,
325
+ dim: int,
326
+ num_heads: int,
327
+ mlp_ratio: float = 4.0,
328
+ qkv_bias: bool = False,
329
+ proj_bias: bool = True,
330
+ ffn_bias: bool = True,
331
+ drop: float = 0.0,
332
+ attn_drop: float = 0.0,
333
+ init_values=None,
334
+ drop_path: float = 0.0,
335
+ act_layer: Callable[..., nn.Module] = nn.GELU,
336
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
337
+ attn_class: Callable[..., nn.Module] = Attention,
338
+ ffn_layer: Callable[..., nn.Module] = Mlp,
339
+ ) -> None:
340
+ super().__init__()
341
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
342
+ self.norm1 = norm_layer(dim)
343
+ self.attn = attn_class(
344
+ dim,
345
+ num_heads=num_heads,
346
+ qkv_bias=qkv_bias,
347
+ proj_bias=proj_bias,
348
+ attn_drop=attn_drop,
349
+ proj_drop=drop,
350
+ )
351
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
352
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
353
+
354
+ self.norm2 = norm_layer(dim)
355
+ mlp_hidden_dim = int(dim * mlp_ratio)
356
+ self.mlp = ffn_layer(
357
+ in_features=dim,
358
+ hidden_features=mlp_hidden_dim,
359
+ act_layer=act_layer,
360
+ drop=drop,
361
+ bias=ffn_bias,
362
+ )
363
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
364
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
365
+
366
+ self.sample_drop_ratio = drop_path
367
+
368
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
369
+ def attn_residual_func(x: torch.Tensor) -> torch.Tensor:
370
+ return self.ls1(self.attn(self.norm1(x)))
371
+
372
+ def ffn_residual_func(x: torch.Tensor) -> torch.Tensor:
373
+ return self.ls2(self.mlp(self.norm2(x)))
374
+
375
+ if self.training and self.sample_drop_ratio > 0.1:
376
+ # the overhead is compensated only for a drop path rate larger than 0.1
377
+ x = drop_add_residual_stochastic_depth(
378
+ x,
379
+ residual_func=attn_residual_func,
380
+ sample_drop_ratio=self.sample_drop_ratio,
381
+ )
382
+ x = drop_add_residual_stochastic_depth(
383
+ x,
384
+ residual_func=ffn_residual_func,
385
+ sample_drop_ratio=self.sample_drop_ratio,
386
+ )
387
+ elif self.training and self.sample_drop_ratio > 0.0:
388
+ x = x + self.drop_path1(attn_residual_func(x))
389
+ x = x + self.drop_path2(ffn_residual_func(x))
390
+ else:
391
+ x = x + attn_residual_func(x)
392
+ x = x + ffn_residual_func(x)
393
+ return x
394
+
395
+
396
+ class NestedTensorBlock(Block):
397
+ def forward_nested(self, x_list: List[torch.Tensor]) -> List[torch.Tensor]:
398
+ """
399
+ x_list contains a list of tensors to nest together and run
400
+ """
401
+ assert isinstance(self.attn, MemEffAttention)
402
+
403
+ if self.training and self.sample_drop_ratio > 0.0:
404
+
405
+ def attn_residual_func(x: torch.Tensor, attn_bias=None) -> torch.Tensor:
406
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
407
+
408
+ def ffn_residual_func(x: torch.Tensor, attn_bias=None) -> torch.Tensor:
409
+ return self.mlp(self.norm2(x))
410
+
411
+ x_list = drop_add_residual_stochastic_depth_list(
412
+ x_list,
413
+ residual_func=attn_residual_func,
414
+ sample_drop_ratio=self.sample_drop_ratio,
415
+ scaling_vector=self.ls1.grandma if isinstance(self.ls1, LayerScale) else None,
416
+ )
417
+ x_list = drop_add_residual_stochastic_depth_list(
418
+ x_list,
419
+ residual_func=ffn_residual_func,
420
+ sample_drop_ratio=self.sample_drop_ratio,
421
+ scaling_vector=self.ls2.grandma if isinstance(self.ls2, LayerScale) else None,
422
+ )
423
+ return x_list
424
+ else:
425
+
426
+ def attn_residual_func(x: torch.Tensor, attn_bias=None) -> torch.Tensor:
427
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
428
+
429
+ def ffn_residual_func(x: torch.Tensor, attn_bias=None) -> torch.Tensor:
430
+ return self.ls2(self.mlp(self.norm2(x)))
431
+
432
+ attn_bias, x = get_attn_bias_and_cat(x_list)
433
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
434
+ x = x + ffn_residual_func(x)
435
+ return attn_bias.split(x)
436
+
437
+ def forward(self, x_or_x_list):
438
+ if isinstance(x_or_x_list, torch.Tensor):
439
+ return super().forward(x_or_x_list)
440
+ elif isinstance(x_or_x_list, list):
441
+ if not XFORMERS_AVAILABLE:
442
+ raise AssertionError("xFormers is required for using nested tensors")
443
+ return self.forward_nested(x_or_x_list)
444
+ else:
445
+ raise AssertionError
446
+
447
+
448
+ def drop_add_residual_stochastic_depth(
449
+ x: torch.Tensor,
450
+ residual_func: Callable[[torch.Tensor], torch.Tensor],
451
+ sample_drop_ratio: float = 0.0,
452
+ ) -> torch.Tensor:
453
+ # 1) extract subset using permutation
454
+ b, n, d = x.shape
455
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
456
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
457
+ x_subset = x[brange]
458
+
459
+ # 2) apply residual_func to get residual
460
+ residual = residual_func(x_subset)
461
+
462
+ x_flat = x.flatten(1)
463
+ residual = residual.flatten(1)
464
+
465
+ residual_scale_factor = b / sample_subset_size
466
+
467
+ # 3) add the residual
468
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
469
+ return x_plus_residual.view_as(x)
470
+
471
+
472
+ def get_branges_scales(x, sample_drop_ratio=0.0):
473
+ b, n, d = x.shape
474
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
475
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
476
+ residual_scale_factor = b / sample_subset_size
477
+ return brange, residual_scale_factor
478
+
479
+
480
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
481
+ if scaling_vector is None:
482
+ x_flat = x.flatten(1)
483
+ residual = residual.flatten(1)
484
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
485
+ else:
486
+ x_plus_residual = scaled_index_add(
487
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
488
+ )
489
+ return x_plus_residual
490
+
491
+
492
+ attn_bias_cache: Dict[Tuple, Any] = {}
493
+
494
+
495
+ def get_attn_bias_and_cat(x_list, branges=None):
496
+ """
497
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
498
+ """
499
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
500
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
501
+ if all_shapes not in attn_bias_cache.keys():
502
+ seqlens = []
503
+ for b, x in zip(batch_sizes, x_list):
504
+ for _ in range(b):
505
+ seqlens.append(x.shape[1])
506
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
507
+ attn_bias._batch_sizes = batch_sizes
508
+ attn_bias_cache[all_shapes] = attn_bias
509
+
510
+ if branges is not None:
511
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
512
+ else:
513
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
514
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
515
+
516
+ return attn_bias_cache[all_shapes], cat_tensors
517
+
518
+
519
+ def drop_add_residual_stochastic_depth_list(
520
+ x_list: List[torch.Tensor],
521
+ residual_func: Callable[[torch.Tensor, Any], torch.Tensor],
522
+ sample_drop_ratio: float = 0.0,
523
+ scaling_vector=None,
524
+ ) -> torch.Tensor:
525
+ # 1) generate random set of indices for dropping samples in the batch
526
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
527
+ branges = [s[0] for s in branges_scales]
528
+ residual_scale_factors = [s[1] for s in branges_scales]
529
+
530
+ # 2) get attention bias and index+concat the tensors
531
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
532
+
533
+ # 3) apply residual_func to get residual, and split the result
534
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
535
+
536
+ outputs = []
537
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
538
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
539
+ return outputs
540
+
541
+
542
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
543
+ if not depth_first and include_root:
544
+ fn(module=module, name=name)
545
+ for child_name, child_module in module.named_children():
546
+ child_name = ".".join((name, child_name)) if name else child_name
547
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
548
+ if depth_first and include_root:
549
+ fn(module=module, name=name)
550
+ return module
551
+
552
+
553
+ class BlockChunk(nn.ModuleList):
554
+ def forward(self, x):
555
+ for b in self:
556
+ x = b(x)
557
+ return x
558
+
559
+
560
+ class DinoVisionTransformer(nn.Module):
561
+ def __init__(
562
+ self,
563
+ img_size=224,
564
+ patch_size=16,
565
+ in_chans=3,
566
+ embed_dim=768,
567
+ depth=12,
568
+ num_heads=12,
569
+ mlp_ratio=4.0,
570
+ qkv_bias=True,
571
+ ffn_bias=True,
572
+ proj_bias=True,
573
+ drop_path_rate=0.0,
574
+ drop_path_uniform=False,
575
+ init_values=None, # for layerscale: None or 0 => no layerscale
576
+ embed_layer=PatchEmbed,
577
+ act_layer=nn.GELU,
578
+ block_fn=Block,
579
+ ffn_layer="mlp",
580
+ block_chunks=1,
581
+ num_register_tokens=0,
582
+ interpolate_antialias=False,
583
+ interpolate_offset=0.1,
584
+ ):
585
+ """
586
+ Args:
587
+ img_size (int, tuple): input image size
588
+ patch_size (int, tuple): patch size
589
+ in_chans (int): number of input channels
590
+ embed_dim (int): embedding dimension
591
+ depth (int): depth of transformer
592
+ num_heads (int): number of attention heads
593
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
594
+ qkv_bias (bool): enable bias for qkv if True
595
+ proj_bias (bool): enable bias for proj in attn if True
596
+ ffn_bias (bool): enable bias for ffn if True
597
+ drop_path_rate (float): stochastic depth rate
598
+ drop_path_uniform (bool): apply uniform drop rate across blocks
599
+ weight_init (str): weight init scheme
600
+ init_values (float): layer-scale init values
601
+ embed_layer (nn.Module): patch embedding layer
602
+ act_layer (nn.Module): MLP activation layer
603
+ block_fn (nn.Module): transformer block class
604
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
605
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
606
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
607
+ interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
608
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
609
+ """
610
+ super().__init__()
611
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
612
+
613
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
614
+ self.num_tokens = 1
615
+ self.n_blocks = depth
616
+ self.num_heads = num_heads
617
+ self.patch_size = patch_size
618
+ self.num_register_tokens = num_register_tokens
619
+ self.interpolate_antialias = interpolate_antialias
620
+ self.interpolate_offset = interpolate_offset
621
+
622
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
623
+ num_patches = self.patch_embed.num_patches
624
+
625
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
626
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
627
+ assert num_register_tokens >= 0
628
+ self.register_tokens = (
629
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
630
+ )
631
+
632
+ if drop_path_uniform is True:
633
+ dpr = [drop_path_rate] * depth
634
+ else:
635
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
636
+
637
+ if ffn_layer == "mlp":
638
+ ffn_layer = Mlp
639
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
640
+ ffn_layer = SwiGLUFFNFused
641
+ elif ffn_layer == "identity":
642
+ def f(*args, **kwargs):
643
+ return nn.Identity()
644
+
645
+ ffn_layer = f
646
+ else:
647
+ raise NotImplementedError
648
+
649
+ blocks_list = [
650
+ block_fn(
651
+ dim=embed_dim,
652
+ num_heads=num_heads,
653
+ mlp_ratio=mlp_ratio,
654
+ qkv_bias=qkv_bias,
655
+ proj_bias=proj_bias,
656
+ ffn_bias=ffn_bias,
657
+ drop_path=dpr[i],
658
+ norm_layer=norm_layer,
659
+ act_layer=act_layer,
660
+ ffn_layer=ffn_layer,
661
+ init_values=init_values,
662
+ )
663
+ for i in range(depth)
664
+ ]
665
+ if block_chunks > 0:
666
+ self.chunked_blocks = True
667
+ chunked_blocks = []
668
+ chunksize = depth // block_chunks
669
+ for i in range(0, depth, chunksize):
670
+ # this is to keep the block index consistent if we chunk the block list
671
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
672
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
673
+ else:
674
+ self.chunked_blocks = False
675
+ self.blocks = nn.ModuleList(blocks_list)
676
+
677
+ self.norm = norm_layer(embed_dim)
678
+ self.head = nn.Identity()
679
+
680
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
681
+
682
+ def interpolate_pos_encoding(self, x, w, h):
683
+ previous_dtype = x.dtype
684
+ npatch = x.shape[1] - 1
685
+ N = self.pos_embed.shape[1] - 1
686
+ if npatch == N and w == h:
687
+ return self.pos_embed
688
+ pos_embed = self.pos_embed.float()
689
+ class_pos_embed = pos_embed[:, 0]
690
+ patch_pos_embed = pos_embed[:, 1:]
691
+ dim = x.shape[-1]
692
+ w0 = w // self.patch_size
693
+ h0 = h // self.patch_size
694
+ M = int(math.sqrt(N)) # Recover the number of patches in each dimension
695
+ assert N == M * M
696
+ kwargs = {}
697
+ if self.interpolate_offset:
698
+ # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
699
+ # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
700
+ sx = float(w0 + self.interpolate_offset) / M
701
+ sy = float(h0 + self.interpolate_offset) / M
702
+ kwargs["scale_factor"] = (sx, sy)
703
+ else:
704
+ # Simply specify an output size instead of a scale factor
705
+ kwargs["size"] = (w0, h0)
706
+ patch_pos_embed = nn.functional.interpolate(
707
+ patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
708
+ mode="bicubic",
709
+ antialias=self.interpolate_antialias,
710
+ **kwargs,
711
+ )
712
+ assert (w0, h0) == patch_pos_embed.shape[-2:]
713
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
714
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
715
+
716
+ def prepare_tokens_with_masks(self, x, masks=None):
717
+ B, nc, w, h = x.shape
718
+ x = self.patch_embed(x)
719
+ if masks is not None:
720
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
721
+
722
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
723
+ x = x + self.interpolate_pos_encoding(x, w, h)
724
+
725
+ if self.register_tokens is not None:
726
+ x = torch.cat(
727
+ (
728
+ x[:, :1],
729
+ self.register_tokens.expand(x.shape[0], -1, -1),
730
+ x[:, 1:],
731
+ ),
732
+ dim=1,
733
+ )
734
+
735
+ return x
736
+
737
+ def forward_features_list(self, x_list, masks_list):
738
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
739
+ for blk in self.blocks:
740
+ x = blk(x)
741
+
742
+ all_x = x
743
+ output = []
744
+ for x, masks in zip(all_x, masks_list):
745
+ x_norm = self.norm(x)
746
+ output.append(
747
+ {
748
+ "x_norm_clstoken": x_norm[:, 0],
749
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
750
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
751
+ "x_prenorm": x,
752
+ "masks": masks,
753
+ }
754
+ )
755
+ return output
756
+
757
+ def forward_features(self, x, masks=None):
758
+ if isinstance(x, list):
759
+ return self.forward_features_list(x, masks)
760
+
761
+ x = self.prepare_tokens_with_masks(x, masks)
762
+
763
+ for blk in self.blocks:
764
+ x = blk(x)
765
+
766
+ x_norm = self.norm(x)
767
+ return {
768
+ "x_norm_clstoken": x_norm[:, 0],
769
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
770
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
771
+ "x_prenorm": x,
772
+ "masks": masks,
773
+ }
774
+
775
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
776
+ x = self.prepare_tokens_with_masks(x)
777
+ # If n is an int, take the n last blocks. If it's a list, take them
778
+ output, total_block_len = [], len(self.blocks)
779
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
780
+ for i, blk in enumerate(self.blocks):
781
+ x = blk(x)
782
+ if i in blocks_to_take:
783
+ output.append(x)
784
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
785
+ return output
786
+
787
+ def _get_intermediate_layers_chunked(self, x, n=1):
788
+ x = self.prepare_tokens_with_masks(x)
789
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
790
+ # If n is an int, take the n last blocks. If it's a list, take them
791
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
792
+ for block_chunk in self.blocks:
793
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
794
+ x = blk(x)
795
+ if i in blocks_to_take:
796
+ output.append(x)
797
+ i += 1
798
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
799
+ return output
800
+
801
+ def get_intermediate_layers(
802
+ self,
803
+ x: torch.Tensor,
804
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
805
+ reshape: bool = False,
806
+ return_class_token: bool = False,
807
+ norm=True,
808
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
809
+ if self.chunked_blocks:
810
+ outputs = self._get_intermediate_layers_chunked(x, n)
811
+ else:
812
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
813
+ if norm:
814
+ outputs = [self.norm(out) for out in outputs]
815
+ class_tokens = [out[:, 0] for out in outputs]
816
+ outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
817
+ if reshape:
818
+ B, _, w, h = x.shape
819
+ outputs = [
820
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
821
+ for out in outputs
822
+ ]
823
+ if return_class_token:
824
+ return tuple(zip(outputs, class_tokens))
825
+ return tuple(outputs)
826
+
827
+ def forward(self, *args, is_training=False, **kwargs):
828
+ ret = self.forward_features(*args, **kwargs)
829
+ if is_training:
830
+ return ret
831
+ else:
832
+ return self.head(ret["x_norm_clstoken"])
833
+
834
+
835
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
836
+ model = DinoVisionTransformer(
837
+ patch_size=patch_size,
838
+ embed_dim=384,
839
+ depth=12,
840
+ num_heads=6,
841
+ mlp_ratio=4,
842
+ block_fn=partial(Block, attn_class=MemEffAttention),
843
+ num_register_tokens=num_register_tokens,
844
+ **kwargs,
845
+ )
846
+ return model
847
+
848
+
849
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
850
+ model = DinoVisionTransformer(
851
+ patch_size=patch_size,
852
+ embed_dim=768,
853
+ depth=12,
854
+ num_heads=12,
855
+ mlp_ratio=4,
856
+ block_fn=partial(Block, attn_class=MemEffAttention),
857
+ num_register_tokens=num_register_tokens,
858
+ **kwargs,
859
+ )
860
+ return model
861
+
862
+
863
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
864
+ model = DinoVisionTransformer(
865
+ patch_size=patch_size,
866
+ embed_dim=1024,
867
+ depth=24,
868
+ num_heads=16,
869
+ mlp_ratio=4,
870
+ block_fn=partial(Block, attn_class=MemEffAttention),
871
+ num_register_tokens=num_register_tokens,
872
+ **kwargs,
873
+ )
874
+ return model
875
+
876
+
877
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
878
+ """
879
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
880
+ """
881
+ model = DinoVisionTransformer(
882
+ patch_size=patch_size,
883
+ embed_dim=1536,
884
+ depth=40,
885
+ num_heads=24,
886
+ mlp_ratio=4,
887
+ block_fn=partial(Block, attn_class=MemEffAttention),
888
+ num_register_tokens=num_register_tokens,
889
+ **kwargs,
890
+ )
891
+ return model
892
+
893
+
894
+ class Weights(Enum):
895
+ LVD142M = "LVD142M"
896
+
897
+
898
+ def _make_dinov2_model(
899
+ *,
900
+ arch_name: str = "vit_large",
901
+ img_size: int = 518,
902
+ patch_size: int = 14,
903
+ init_values: float = 1.0,
904
+ ffn_layer: str = "mlp",
905
+ block_chunks: int = 0,
906
+ num_register_tokens: int = 0,
907
+ interpolate_antialias: bool = False,
908
+ interpolate_offset: float = 0.1,
909
+ weights: Union[Weights, str] = Weights.LVD142M,
910
+ **kwargs,
911
+ ):
912
+ if isinstance(weights, str):
913
+ try:
914
+ weights = Weights[weights]
915
+ except KeyError:
916
+ raise AssertionError(f"Unsupported weights: {weights}")
917
+
918
+ vit_kwargs = dict(
919
+ img_size=img_size,
920
+ patch_size=patch_size,
921
+ init_values=init_values,
922
+ ffn_layer=ffn_layer,
923
+ block_chunks=block_chunks,
924
+ num_register_tokens=num_register_tokens,
925
+ interpolate_antialias=interpolate_antialias,
926
+ interpolate_offset=interpolate_offset,
927
+ )
928
+ vit_kwargs.update(**kwargs)
929
+ model = sys.modules[__name__].__dict__[arch_name](**vit_kwargs)
930
+
931
+ return model
932
+
933
+
934
+ def dinov2_vits14(**kwargs):
935
+ """
936
+ DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
937
+ """
938
+ return _make_dinov2_model(arch_name="vit_small", **kwargs)
939
+
940
+
941
+ def dinov2_vitb14(**kwargs):
942
+ """
943
+ DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
944
+ """
945
+ return _make_dinov2_model(arch_name="vit_base", **kwargs)
946
+
947
+
948
+ def dinov2_vitl14(**kwargs):
949
+ """
950
+ DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
951
+ """
952
+ return _make_dinov2_model(arch_name="vit_large", **kwargs)
953
+
954
+
955
+ def dinov2_vitg14(**kwargs):
956
+ """
957
+ DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
958
+ """
959
+ return _make_dinov2_model(
960
+ arch_name="vit_giant2",
961
+ ffn_layer="swiglufused",
962
+ **kwargs,
963
+ )
964
+
965
+
966
+ def dinov2_vits14_reg(**kwargs):
967
+ """
968
+ DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
969
+ """
970
+ return _make_dinov2_model(
971
+ arch_name="vit_small",
972
+ num_register_tokens=4,
973
+ interpolate_antialias=True,
974
+ interpolate_offset=0.0,
975
+ **kwargs,
976
+ )
977
+
978
+
979
+ def dinov2_vitb14_reg(**kwargs):
980
+ """
981
+ DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
982
+ """
983
+ return _make_dinov2_model(
984
+ arch_name="vit_base",
985
+ num_register_tokens=4,
986
+ interpolate_antialias=True,
987
+ interpolate_offset=0.0,
988
+ **kwargs,
989
+ )
990
+
991
+
992
+ def dinov2_vitl14_reg(**kwargs):
993
+ """
994
+ DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
995
+ """
996
+ return _make_dinov2_model(
997
+ arch_name="vit_large",
998
+ num_register_tokens=4,
999
+ interpolate_antialias=True,
1000
+ interpolate_offset=0.0,
1001
+ **kwargs,
1002
+ )
1003
+
1004
+
1005
+ def dinov2_vitg14_reg(**kwargs):
1006
+ """
1007
+ DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
1008
+ """
1009
+ return _make_dinov2_model(
1010
+ arch_name="vit_giant2",
1011
+ ffn_layer="swiglufused",
1012
+ num_register_tokens=4,
1013
+ interpolate_antialias=True,
1014
+ interpolate_offset=0.0,
1015
+ **kwargs,
1016
+ )
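
Editor's note: the factory functions above mirror the upstream DINOv2 hub entrypoints. A minimal usage sketch follows (illustrative only: the import path is assumed from this repo's layout, and the weights are random here since checkpoint loading is handled elsewhere in the RADIO code).

import torch
from tim.models.nvidia_radio.radio.dinov2_arch import dinov2_vitl14_reg  # assumed import path

model = dinov2_vitl14_reg().eval()      # ViT-L/14 with 4 register tokens, randomly initialized
x = torch.randn(1, 3, 224, 224)         # H and W must be multiples of the patch size (14)
with torch.no_grad():
    out = model.forward_features(x)

print(out["x_norm_clstoken"].shape)     # torch.Size([1, 1024])
print(out["x_norm_regtokens"].shape)    # torch.Size([1, 4, 1024])
print(out["x_norm_patchtokens"].shape)  # torch.Size([1, 256, 1024]), i.e. a 16x16 patch grid
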
tim/models/nvidia_radio/radio/dual_hybrid_vit.py ADDED
@@ -0,0 +1,213 @@
1
+ from logging import getLogger
2
+ from typing import Tuple
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from timm.models import register_model
9
+ from timm.models import vision_transformer as tvit
10
+ from timm.models import convnext as tconv
11
+
12
+ from einops import rearrange
13
+
14
+ from . import extra_timm_models as et
15
+
16
+
17
+ class Fuser(nn.Module):
18
+ def __init__(self, src_dim: int, tgt_dim: int, gated: bool = True):
19
+ super().__init__()
20
+ self.gated = gated
21
+
22
+ mid_dim = max(src_dim, tgt_dim) * 2
23
+
24
+ self.fwd = nn.Sequential(
25
+ nn.Conv2d(src_dim, mid_dim, kernel_size=3, stride=1, padding=1),
26
+ nn.GELU(),
27
+ nn.Conv2d(mid_dim, tgt_dim * (2 if gated else 1), kernel_size=3, stride=1, padding=1),
28
+ )
29
+
30
+ def forward(self, src: torch.Tensor, tgt: torch.Tensor) -> torch.Tensor:
31
+ if src.ndim == 3:
32
+ shape = tgt.shape[-2:]
33
+ else:
34
+ shape = src.shape[-2:]
35
+
36
+ nd = shape[0] * shape[1]
37
+
38
+ if src.ndim == 3:
39
+ src = src[:, -nd:].reshape(src.shape[0], src.shape[2], *shape)
40
+
41
+ if tgt.ndim == 3:
42
+ tgt_pre = tgt[:, :-nd]
43
+ tgt = tgt[:, -nd:].reshape(tgt.shape[0], tgt.shape[2], *shape)
44
+ else:
45
+ tgt_pre = None
46
+
47
+ pred = self.fwd(src)
48
+
49
+ if self.gated:
50
+ g, pred = torch.chunk(pred, 2, dim=1)
51
+
52
+ g = F.sigmoid(g)
53
+
54
+ pred = g * pred
55
+
56
+ tgt = tgt + pred
57
+
58
+ if tgt_pre is not None:
59
+ tgt = rearrange(tgt, 'b c h w -> b (h w) c')
60
+ tgt = torch.cat([tgt_pre, tgt], dim=1)
61
+
62
+ return tgt
63
+
64
+
65
+ class AttnDownsample(nn.Module):
66
+ def __init__(self, dim: int, window_size: int, num_heads: int = 16):
67
+ super().__init__()
68
+ self.q = nn.Parameter(torch.randn(1, num_heads, 1, dim // num_heads) * 0.01)
69
+ self.kv = nn.Linear(dim, dim * 2)
70
+ self.proj = nn.Linear(dim, dim)
71
+ self.window_size = window_size
72
+ self.num_heads = num_heads
73
+ self.head_dim = dim // num_heads
74
+ self.scale = self.head_dim ** -0.5
75
+
76
+ def forward(self, x: torch.Tensor, twod_shape: Tuple[int, int]) -> torch.Tensor:
77
+ ntok = twod_shape[0] * twod_shape[1]
78
+ x_pre = x[:, :-ntok]
79
+
80
+ B = x.shape[0]
81
+ ds_hw = tuple(s // self.window_size for s in twod_shape)
82
+
83
+ x_spat = rearrange(
84
+ x[:, -ntok:],
85
+ 'b (h d1 w d2) c -> (b h w) (d1 d2) c',
86
+ h=ds_hw[0], w=ds_hw[1],
87
+ d1=self.window_size, d2=self.window_size,
88
+ )
89
+
90
+ B, N, C = x_spat.shape
91
+
92
+ k, v = self.kv(x_spat).reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
93
+
94
+ q = (self.q * self.scale).expand(B, -1, -1, -1)
95
+ attn = q @ k.transpose(-2, -1)
96
+ attn = F.softmax(attn, dim=-1)
97
+ x = attn @ v
98
+
99
+ x = x.transpose(1, 2).reshape(B, C)
100
+ x = self.proj(x)
101
+
102
+ x = rearrange(x, '(b h w) c -> b (h w) c', b=x_pre.shape[0], h=ds_hw[0], w=ds_hw[1])
103
+
104
+ x = torch.cat([x_pre, x], dim=1)
105
+ return x
106
+
107
+
108
+ class HybridModel(nn.Module):
109
+ def __init__(self, vit: tvit.VisionTransformer, conv: tconv.ConvNeXt, pretrained: bool = False,
110
+ concatenate: bool = False, **kwargs):
111
+ super().__init__()
112
+ self.conv = conv
113
+ self.vit = vit
114
+ self.concatenate = concatenate
115
+
116
+ conv.stages = nn.ModuleList(conv.stages)
117
+ vit.blocks = nn.ModuleList(vit.blocks)
118
+
119
+ self._half_vit_idx = len(vit.blocks) // 2 + 1
120
+
121
+ self._half_conv_idx = None
122
+ x = torch.empty(1, 3, 256, 256)
123
+ x = self.conv.stem(x)
124
+ for i in range(len(conv.stages)):
125
+ x = conv.stages[i](x)
126
+ if self._half_conv_idx is None and x.shape[-2:] == (16, 16):
127
+ self._half_conv_idx = i + 1
128
+ half_conv_dim = x.shape[1]
129
+ final_conv_dim = x.shape[1]
130
+
131
+ self.vit_to_conv_fusion = Fuser(vit.embed_dim, half_conv_dim)
132
+ self.conv_to_vit_fusion = Fuser(half_conv_dim, vit.embed_dim)
133
+ self.vit_ds = AttnDownsample(vit.embed_dim, window_size=2)
134
+
135
+ embed_dim = vit.embed_dim + (final_conv_dim if concatenate else 0)
136
+ if not concatenate:
137
+ self.final_fuse = Fuser(final_conv_dim, vit.embed_dim, gated=False)
138
+ self.final_block = tvit.Block(embed_dim, num_heads=16)
139
+
140
+ self.embed_dim = embed_dim
141
+
142
+ @property
143
+ def patch_size(self):
144
+ return 32
145
+
146
+ @property
147
+ def no_fsdp_wrap_types(self):
148
+ return {tvit.VisionTransformer, tconv.ConvNeXt}
149
+
150
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
151
+ return self.forward_features(x)
152
+
153
+ def forward_features(self, x: torch.Tensor) -> torch.Tensor:
154
+ y_vit = self.vit.patch_generator(x)
155
+
156
+ for i in range(self._half_vit_idx):
157
+ y_vit = self.vit.blocks[i](y_vit)
158
+
159
+ y_conv = self.conv.stem(x)
160
+ for i in range(self._half_conv_idx):
161
+ y_conv = self.conv.stages[i](y_conv)
162
+
163
+ y_vit, y_conv = self.conv_to_vit_fusion(y_conv, y_vit), self.vit_to_conv_fusion(y_vit, y_conv)
164
+
165
+ y_vit = self.vit_ds(y_vit, y_conv.shape[-2:])
166
+
167
+ for i in range(self._half_vit_idx, len(self.vit.blocks)):
168
+ y_vit = self.vit.blocks[i](y_vit)
169
+
170
+ for i in range(self._half_conv_idx, len(self.conv.stages)):
171
+ y_conv = self.conv.stages[i](y_conv)
172
+
173
+ if self.concatenate:
174
+ y_conv = rearrange(y_conv, 'b c h w -> b (h w) c')
175
+ # Average pool across the board, and replicate for each cls/register token
176
+ conv_summary = y_conv.mean(dim=1, keepdim=True).expand(-1, self.vit.patch_generator.num_cls_patches, -1)
177
+ y_conv = torch.cat([conv_summary, y_conv], dim=1)
178
+ y = torch.cat([y_vit, y_conv], dim=2)
179
+ else:
180
+ y = self.final_fuse(y_conv, y_vit)
181
+ y = self.final_block(y)
182
+
183
+ summary = y[:, :self.vit.patch_generator.num_cls_tokens]
184
+ features = y[:, self.vit.patch_generator.num_cls_patches:]
185
+
186
+ return summary, features
187
+
188
+
189
+ @register_model
190
+ def hybrid_base(pretrained=False, concatenate: bool = False, weight_init: str = 'skip', **kwargs):
191
+ cfg = dict(num_classes=0, **kwargs)
192
+ conv = tconv.convnextv2_base(pretrained=pretrained, **cfg)
193
+ vit = tvit.vit_base_patch16_224(pretrained=pretrained, weight_init=weight_init, **cfg)
194
+
195
+ return HybridModel(vit, conv, pretrained, concatenate=concatenate)
196
+
197
+
198
+ @register_model
199
+ def hybrid_large(pretrained=False, concatenate: bool = False, weight_init: str = 'skip', **kwargs):
200
+ cfg = dict(num_classes=0, **kwargs)
201
+ conv = tconv.convnextv2_large(pretrained=pretrained, **cfg)
202
+ vit = tvit.vit_large_patch16_224(pretrained=pretrained, weight_init=weight_init, **cfg)
203
+
204
+ return HybridModel(vit, conv, pretrained, concatenate=concatenate)
205
+
206
+
207
+ @register_model
208
+ def hybrid_huge(pretrained=False, concatenate: bool = False, weight_init: str = 'skip', **kwargs):
209
+ cfg = dict(num_classes=0, **kwargs)
210
+ conv = tconv.convnextv2_huge(pretrained=pretrained, **cfg)
211
+ vit = et.vit_huge_patch16_224(pretrained=pretrained, weight_init=weight_init, **cfg)
212
+
213
+ return HybridModel(vit, conv, pretrained, concatenate=concatenate)
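
Editor's note: a construction-only sketch of the hybrid factories above, assuming timm provides the referenced convnextv2_* and vit_*_patch16_224 definitions. Note that forward_features() relies on vit.patch_generator, which is only installed once CPE support is enabled (see enable_cpe_support.py), so the freshly built model is not directly runnable.

from tim.models.nvidia_radio.radio.dual_hybrid_vit import hybrid_base  # assumed import path

model = hybrid_base(pretrained=False, concatenate=False)   # random weights
print(model.embed_dim)    # 768: ViT-B width; ConvNeXt channels are appended only when concatenate=True
print(model.patch_size)   # 32: effective stride of the fused feature map

# Because of the @register_model decorators, timm.create_model('hybrid_base', ...) resolves to the
# same factory once this module has been imported.
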
tim/models/nvidia_radio/radio/enable_cpe_support.py ADDED
@@ -0,0 +1,224 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ from typing import List, Optional, Set, Tuple, Union
10
+ from types import MethodType
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from timm.models import VisionTransformer, checkpoint_seq
16
+ from timm.models.vision_transformer import Attention, Block
17
+
18
+ from .feature_normalizer import IntermediateFeatureNormalizerBase, NullIntermediateFeatureNormalizer
19
+
20
+ from .extra_models import DinoWrapper
21
+ from .vit_patch_generator import ViTPatchGenerator
22
+ from .forward_intermediates import forward_intermediates
23
+ from .dual_hybrid_vit import HybridModel
24
+ from flash_attn import flash_attn_varlen_func
25
+
26
+
27
+ def _attn_forward_pack(self: Attention, x: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
28
+ N, C = x.shape
29
+ qkv = self.qkv(x).reshape(N, 3, self.num_heads, self.head_dim).permute(1, 0, 2, 3)
30
+ q, k, v = qkv.unbind(0)
31
+ q, k = self.q_norm(q), self.k_norm(k)
32
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
33
+
34
+ x = flash_attn_varlen_func(
35
+ q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen
36
+ ).reshape(N, -1)
37
+
38
+ x = self.proj(x)
39
+ x = self.proj_drop(x)
40
+ return x
41
+
42
+ def _block_forward_pack(self: Block, x: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
43
+ x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), cu_seqlens)))
44
+ x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
45
+ return x
46
+
47
+ def _forward_cpe_pack(self: VisionTransformer, images: List[torch.Tensor]) -> torch.Tensor:
48
+ device = images[0].device
49
+ x = []
50
+ seqlens = []
51
+ for image in images:
52
+ # image: [1, c, H, W] -> x: [n_cls+h*w, D], h=H/p and w=W/p
53
+ _image = self.patch_generator(image).squeeze(0)
54
+ x.append(_image)
55
+ seqlens.append(_image.shape[0])
56
+
57
+ x = torch.cat(x, dim=0)
58
+ seqlens = torch.tensor(seqlens, device=device, dtype=torch.int)
59
+
60
+ cu_seqlens = torch.cat([
61
+ torch.tensor([0], device=device, dtype=torch.int32),
62
+ torch.cumsum(seqlens, dim=0, dtype=torch.int32)
63
+ ])
64
+ if getattr(self, 'grad_checkpointing', False) and not torch.jit.is_scripting():
65
+ for block in self.blocks:
66
+ x = checkpoint_seq(block, x, cu_seqlens)
67
+ else:
68
+ for block in self.blocks:
69
+ x = block(x, cu_seqlens)
70
+ x = self.norm(x)
71
+ return x, cu_seqlens
72
+
73
+ def _forward_cpe(self: VisionTransformer, x: torch.Tensor) -> torch.Tensor:
74
+ x = self.patch_generator(x)
75
+ if getattr(self, 'grad_checkpointing', False) and not torch.jit.is_scripting():
76
+ x = checkpoint_seq(self.blocks, x)
77
+ else:
78
+ x = self.blocks(x)
79
+ x = self.norm(x)
80
+ return x
81
+
82
+
83
+ def _take_indices(
84
+ num_blocks: int,
85
+ n: Optional[Union[int, List[int], Tuple[int]]],
86
+ ) -> Tuple[Set[int], int]:
87
+ if isinstance(n, int):
88
+ assert n >= 0
89
+ take_indices = {x for x in range(num_blocks - n, num_blocks)}
90
+ else:
91
+ take_indices = {num_blocks + idx if idx < 0 else idx for idx in n}
92
+ return take_indices, max(take_indices)
93
+
94
+
95
+ def _forward_intermediates_cpe(
96
+ self,
97
+ x: torch.Tensor,
98
+ norm: bool = False,
99
+ **kwargs,
100
+ ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
101
+ return forward_intermediates(
102
+ self,
103
+ patch_extractor=self.patch_generator,
104
+ num_summary_tokens=self.patch_generator.num_skip,
105
+ num_cls_tokens=self.patch_generator.num_cls_tokens,
106
+ norm=self.norm if norm else lambda y: y,
107
+ x=x,
108
+ **kwargs,
109
+ )
110
+
111
+
112
+ def _forward_cpe_dinov2(self: DinoWrapper, x: torch.Tensor) -> torch.Tensor:
113
+ y = _forward_cpe(self.inner, x)
114
+
115
+ return y[:, 0], y[:, self.num_summary_tokens:]
116
+
117
+
118
+ def _forward_intermediates_cpe_dinov2(self: DinoWrapper, *args, **kwargs):
119
+ return _forward_intermediates_cpe(self.inner, *args, **kwargs)
120
+
121
+
122
+ def _enable_cpe_for_timm_vit(model: VisionTransformer,
123
+ max_img_size: Union[int, Tuple[int, int]] = 1024,
124
+ num_cls_tokens: int = 1,
125
+ pos_dropout: float = 0.1,
126
+ register_multiple: Optional[int] = None,
127
+ num_registers: Optional[int] = None,
128
+ support_packing: bool = False,
129
+ ):
130
+ if not isinstance(model, VisionTransformer):
131
+ raise ValueError("CPE is only supported for VisionTransformer models!")
132
+
133
+ patch_size = model.patch_embed.patch_size[0]
134
+ embed_dim = model.embed_dim
135
+ input_dims = model.patch_embed.img_size
136
+ normalize_patches = not isinstance(model.patch_embed.norm, nn.Identity)
137
+ cls_token = model.cls_token is not None
138
+
139
+ max_img_size = int(round(max_img_size / patch_size) * patch_size)
140
+
141
+ patch_generator = ViTPatchGenerator(
142
+ patch_size=patch_size,
143
+ embed_dim=embed_dim,
144
+ input_dims=input_dims,
145
+ normalize_patches=normalize_patches,
146
+ cls_token=cls_token,
147
+ max_input_dims=max_img_size,
148
+ pos_dropout=pos_dropout,
149
+ num_cls_tokens=num_cls_tokens,
150
+ register_multiple=register_multiple,
151
+ num_registers=num_registers,
152
+ )
153
+
154
+ model.patch_generator = patch_generator
155
+ model.patch_embed = None
156
+ model.cls_token = None
157
+ model.pos_embed = None
158
+ model.pos_drop = None
159
+ model.patch_size = patch_size
160
+ model.num_cls_tokens = num_cls_tokens
161
+ model.num_registers = patch_generator.num_registers
162
+
163
+ model.forward_features = MethodType(_forward_cpe, model)
164
+ model.forward_intermediates = MethodType(_forward_intermediates_cpe, model)
165
+ if support_packing:
166
+ model.forward_features = MethodType(_forward_cpe_pack, model)
167
+ for block in model.blocks:
168
+ block.forward = MethodType(_block_forward_pack, block)
169
+ block.attn.forward = MethodType(_attn_forward_pack, block.attn)
170
+
171
+
172
+ def _enable_cpe_for_dv2_reg_vit(model: DinoWrapper,
173
+ max_img_size: Union[int, Tuple[int, int]] = 1024,
174
+ num_cls_tokens: int = 1,
175
+ pos_dropout: float = 0.1,
176
+ register_multiple: Optional[int] = None,
177
+ num_registers: Optional[int] = None,
178
+ ):
179
+ patch_size = model.patch_size
180
+ embed_dim = model.embed_dim
181
+ input_dims = model.inner.patch_embed.patches_resolution
182
+ normalize_patches = not isinstance(model.inner.patch_embed.norm, nn.Identity)
183
+ cls_token = True
184
+
185
+ max_img_size = int(round(max_img_size / patch_size) * patch_size)
186
+
187
+ patch_generator = ViTPatchGenerator(
188
+ patch_size=patch_size,
189
+ embed_dim=embed_dim,
190
+ input_dims=input_dims,
191
+ normalize_patches=normalize_patches,
192
+ cls_token=cls_token,
193
+ max_input_dims=max_img_size,
194
+ pos_dropout=pos_dropout,
195
+ num_cls_tokens=num_cls_tokens,
196
+ register_multiple=register_multiple,
197
+ num_registers=num_registers,
198
+ patch_bias=True,
199
+ )
200
+
201
+ inner = model.inner
202
+ inner.patch_generator = patch_generator
203
+ inner.patch_embed = None
204
+ inner.cls_token = None
205
+ inner.pos_embed = None
206
+ inner.register_tokens = None
207
+ inner.patch_size = patch_size
208
+
209
+ model.forward_features = MethodType(_forward_cpe_dinov2, model)
210
+ model.forward_intermediates = MethodType(_forward_intermediates_cpe_dinov2, model)
211
+
212
+
213
+ def enable_cpe(model: nn.Module,
214
+ *args,
215
+ **kwargs,
216
+ ):
217
+ if isinstance(model, VisionTransformer):
218
+ _enable_cpe_for_timm_vit(model, *args, **kwargs)
219
+ elif isinstance(model, DinoWrapper):
220
+ _enable_cpe_for_dv2_reg_vit(model, *args, **kwargs)
221
+ elif isinstance(model, HybridModel):
222
+ _enable_cpe_for_timm_vit(model.vit, *args, **kwargs)
223
+ else:
224
+ raise ValueError(f'CPE not supported for this model type: {type(model)}')
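
Editor's note: a minimal sketch of retrofitting CPE onto a stock timm ViT via enable_cpe() above. The values for max_img_size and register_multiple are illustrative assumptions, the exact register handling is delegated to ViTPatchGenerator, and flash-attn must be installed because this module imports it unconditionally.

import timm
import torch

from tim.models.nvidia_radio.radio.enable_cpe_support import enable_cpe  # assumed import path

vit = timm.create_model('vit_base_patch16_224', pretrained=False, num_classes=0)
enable_cpe(vit, max_img_size=1024, num_cls_tokens=1, register_multiple=8)

# The patched forward_features accepts any resolution that is a multiple of the patch size (16).
x = torch.randn(1, 3, 432, 640)
tokens = vit.forward_features(x)   # [1, n_extra + (432 // 16) * (640 // 16), 768]
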
tim/models/nvidia_radio/radio/enable_damp.py ADDED
@@ -0,0 +1,42 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ from logging import getLogger
10
+ import math
11
+ import os
12
+ from typing import Dict, List, Optional, Union, Tuple
13
+ from types import MethodType
14
+
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.utils import parametrize
19
+
20
+
21
+ # For now, don't do anything
22
+ class DAMP(nn.Identity):
23
+ def __init__(self, std: float):
24
+ super().__init__()
25
+ self.std = std
26
+
27
+
28
+ def enable_damp(model: nn.Module, std: float):
29
+ if isinstance(model, (list, tuple)):
30
+ for m in model:
31
+ enable_damp(m, std)
32
+ return
33
+
34
+ for name, module in model.named_modules():
35
+ if isinstance(module, nn.Linear):
36
+ parametrize.register_parametrization(module, 'weight', DAMP(std))
37
+
38
+
39
+ def configure_damp_from_args(model: nn.Module, args):
40
+ damp = getattr(args, 'damp', None)
41
+ if damp:
42
+ enable_damp(model, damp)
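
Editor's note: DAMP above is currently a no-op placeholder (an nn.Identity parametrization), so enable_damp() only wraps each nn.Linear weight without changing its values. A minimal sketch, assuming the functions above are imported from this module:

import torch
from torch import nn
from torch.nn.utils import parametrize

model = nn.Sequential(nn.Linear(16, 32), nn.GELU(), nn.Linear(32, 8))
enable_damp(model, std=0.02)

print(parametrize.is_parametrized(model[0], 'weight'))      # True
print(type(model[0].parametrizations.weight[0]).__name__)   # DAMP
y = model(torch.randn(4, 16))                               # forward is unchanged (identity parametrization)
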
tim/models/nvidia_radio/radio/enable_spectral_reparam.py ADDED
@@ -0,0 +1,277 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ from logging import getLogger
10
+ import math
11
+ import os
12
+ from typing import Dict, List, Optional, Union, Tuple
13
+ from types import MethodType
14
+
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.utils import parametrize
19
+ from torch.nn.utils.parametrizations import _SpectralNorm
20
+
21
+ from timm.models.vision_transformer import Attention, Mlp
22
+
23
+ _EPS = 1e-5
24
+
25
+
26
+ class _SNReweight(_SpectralNorm):
27
+ def __init__(self, weight: torch.Tensor, *args, init_norm_to_current: bool = False, alpha: float = 0.05, version: int = 2, **kwargs):
28
+ super().__init__(weight, *args, **kwargs)
29
+
30
+ self.alpha = alpha
31
+ self.version = version
32
+ self.register_buffer('_sn_version', torch.tensor(version))
33
+
34
+ if init_norm_to_current:
35
+ # This will set the numerator to match the denominator, which should preserve the original values
36
+ init_scale = self._get_sigma(weight, n_power_iterations=20).item()
37
+ else:
38
+ init_scale = 1.0
39
+
40
+ if version == 1:
41
+ init_value = init_scale
42
+ elif version == 2:
43
+ t = init_scale - alpha
44
+ if t < _EPS:
45
+ getLogger("spectral_reparam").warning(f'The initialized spectral norm {init_scale} is too small to be represented. Setting to {_EPS} instead.')
46
+ t = _EPS
47
+
48
+ init_value = math.log(math.exp(t) - 1)
49
+ else:
50
+ raise ValueError(f'Unsupported version: {version}')
51
+
52
+ # Make 2D so that weight decay gets applied
53
+ self.scale = nn.Parameter(torch.tensor([[init_value]], dtype=torch.float32, device=weight.device))
54
+
55
+ # Re-implementing this because we need to make division by sigma safe
56
+ def _get_sigma(self, weight: torch.Tensor, n_power_iterations: int = None) -> torch.Tensor:
57
+ if not n_power_iterations:
58
+ n_power_iterations = self.n_power_iterations
59
+ if weight.ndim == 1:
60
+ # Faster and more exact path, no need to approximate anything
61
+ sigma = weight.norm()
62
+ else:
63
+ weight_mat = self._reshape_weight_to_matrix(weight)
64
+ if self.training:
65
+ self._power_method(weight_mat, n_power_iterations)
66
+ # See above on why we need to clone
67
+ u = self._u.clone(memory_format=torch.contiguous_format)
68
+ v = self._v.clone(memory_format=torch.contiguous_format)
69
+ # The proper way of computing this should be through F.bilinear, but
70
+ # it seems to have some efficiency issues:
71
+ # https://github.com/pytorch/pytorch/issues/58093
72
+ sigma = torch.dot(u, torch.mv(weight_mat, v))
73
+
74
+ return sigma + self.eps
75
+
76
+ def forward(self, weight: torch.Tensor, *args, **kwargs):
77
+ dtype = weight.dtype
78
+ sigma = self._get_sigma(weight, *args, **kwargs)
79
+
80
+ if self.version == 1:
81
+ scale = self.scale
82
+ elif self.version == 2:
83
+ scale = F.softplus(self.scale) + self.alpha
84
+ else:
85
+ raise ValueError(f'Unsupported version: {self.version}')
86
+
87
+ scale = scale.float() / sigma.float()
88
+
89
+ y = weight * scale
90
+
91
+ if dtype in (torch.float16, torch.bfloat16):
92
+ y = y.to(dtype)
93
+ return y
94
+
95
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
96
+ version_key = f'{prefix}_sn_version'
97
+ if version_key not in state_dict:
98
+ self.version = 1
99
+ state_dict[version_key] = torch.tensor(1)
100
+ return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
101
+
102
+
103
+ class _ChunkedSNReweight(nn.Module):
104
+ def __init__(self, weight: torch.Tensor, num_chunks: int, *args, init_norm_to_current: bool = False, **kwargs):
105
+ super().__init__()
106
+
107
+ self.num_chunks = num_chunks
108
+ parts = weight.split(weight.shape[0] // num_chunks, dim=0)
109
+
110
+ self.parts = nn.ModuleList([
111
+ _SNReweight(p, *args, init_norm_to_current=init_norm_to_current, **kwargs)
112
+ for p in parts
113
+ ])
114
+
115
+ def forward(self, weight: torch.Tensor, *args, **kwargs):
116
+ parts = weight.split(weight.shape[0] // self.num_chunks, dim=0)
117
+
118
+ parts = [
119
+ fn(p)
120
+ for fn, p in zip(self.parts, parts)
121
+ ]
122
+
123
+ return torch.cat(parts, dim=0)
124
+
125
+
126
+ class _AttnSNReweight(_ChunkedSNReweight):
127
+ def __init__(self, weight: torch.Tensor, *args, init_norm_to_current: bool = False, renorm_values: bool = False, **kwargs):
128
+ super().__init__(weight, 3, *args, init_norm_to_current=init_norm_to_current, **kwargs)
129
+
130
+ if not renorm_values:
131
+ self.parts[2] = nn.Identity()
132
+
133
+
134
+ def enable_spectral_reparam(model: Union[nn.Module, List[nn.Module]],
135
+ n_power_iterations: int = 1,
136
+ eps: float = 1e-6,
137
+ init_norm_to_current: bool = False,
138
+ renorm_values: bool = True,
139
+ renorm_mlp: bool = True,
140
+ state_dict_guidance: Optional[Dict[str, torch.Tensor]] = None):
141
+ if isinstance(model, (list, tuple)):
142
+ for i, sub in enumerate(model):
143
+ sub_sd = state_dict_guidance[i] if isinstance(state_dict_guidance, (list, tuple)) else state_dict_guidance
144
+ enable_spectral_reparam(sub, n_power_iterations=n_power_iterations, eps=eps,
145
+ init_norm_to_current=init_norm_to_current, renorm_values=renorm_values,
146
+ renorm_mlp=renorm_mlp, state_dict_guidance=sub_sd)
147
+ return
148
+
149
+ print('Enabling spectral reparametrization')
150
+ args = dict(n_power_iterations=n_power_iterations, dim=0, eps=eps, init_norm_to_current=init_norm_to_current)
151
+ visited_prefixes = set()
152
+
153
+ def is_guidance_parametrized(name: str):
154
+ if state_dict_guidance is None:
155
+ return True
156
+
157
+ p_name = f'{name}.parametrizations'
158
+ is_prm = any(k for k in state_dict_guidance if k.startswith(p_name) and k.endswith('_sn_version'))
159
+ return is_prm
160
+
161
+ def parametrize_linear(linear: nn.Linear):
162
+ parametrize.register_parametrization(
163
+ linear,
164
+ 'weight',
165
+ _SNReweight(linear.weight, **args)
166
+ )
167
+
168
+ for name, mod in model.named_modules():
169
+ pref = '.'.join(name.split('.')[:-1])
170
+ if pref in visited_prefixes:
171
+ continue
172
+
173
+ if isinstance(mod, Attention) or name.endswith('.attn'):
174
+ if is_guidance_parametrized(f'{name}.qkv'):
175
+ parametrize.register_parametrization(
176
+ mod.qkv,
177
+ 'weight',
178
+ _AttnSNReweight(mod.qkv.weight, renorm_values=renorm_values, **args),
179
+ )
180
+ if hasattr(mod, 'proj') and is_guidance_parametrized(f'{name}.proj'):
181
+ parametrize_linear(mod.proj)
182
+ visited_prefixes.add(name)
183
+ elif name.endswith('mlp') and renorm_mlp and hasattr(mod, 'w12'):
184
+ if is_guidance_parametrized(f'{name}.w12'):
185
+ parametrize.register_parametrization(
186
+ mod.w12,
187
+ 'weight',
188
+ _ChunkedSNReweight(mod.w12.weight, num_chunks=2, **args),
189
+ )
190
+ if is_guidance_parametrized(f'{name}.w3'):
191
+ parametrize_linear(mod.w3)
192
+ visited_prefixes.add(name)
193
+ elif isinstance(mod, nn.Linear) and 'patch_generator' not in name and is_guidance_parametrized(name):
194
+ parametrize_linear(mod)
195
+
196
+
197
+ def configure_spectral_reparam_from_args(model: nn.Module, args, state_dict_guidance: Optional[Dict[str, torch.Tensor]] = None):
198
+ spectral_reparam = getattr(args, 'spectral_reparam', False)
199
+ if isinstance(spectral_reparam, bool) and spectral_reparam:
200
+ enable_spectral_reparam(model, init_norm_to_current=True, state_dict_guidance=state_dict_guidance)
201
+ elif isinstance(spectral_reparam, dict):
202
+ enable_spectral_reparam(
203
+ model,
204
+ n_power_iterations=spectral_reparam.get('n_power_iterations', 1),
205
+ eps=spectral_reparam.get('eps', 1e-12),
206
+ init_norm_to_current=True,
207
+ state_dict_guidance=state_dict_guidance,
208
+ )
209
+
210
+
211
+ def disable_spectral_reparam(model: nn.Module):
212
+ print('Disabling spectral reparametrization')
213
+ for name, mod in model.named_modules():
214
+ if parametrize.is_parametrized(mod):
215
+ parametrize.remove_parametrizations(mod, 'weight')
216
+ pass
217
+
218
+
219
+
220
+ if __name__ == '__main__':
221
+ import argparse
222
+ from . import radio_model as create_model
223
+
224
+ parser = argparse.ArgumentParser(description='Remove parametrization from state dict')
225
+ parser.add_argument('--checkpoint', type=str, required=True, help='The checkpoint to load')
226
+ parser.add_argument('--output', type=str, default='', help='Where to store the checkpoint')
227
+ parser.add_argument('--release', default=False, action='store_true', help='Prune extraneous checkpoint fields')
228
+ parser.add_argument('--strict', default=False, action='store_true', help='Strictly load the state dict')
229
+
230
+ args = parser.parse_args()
231
+
232
+ if not args.output:
233
+ chk_dir, chk_name = os.path.split(args.checkpoint)
234
+ args.output = os.path.join(chk_dir, f'clean_{chk_name}')
235
+ print(f'Set output to "{args.output}"')
236
+
237
+ chk = torch.load(args.checkpoint, map_location='cpu', mmap=True)
238
+
239
+ model = create_model.create_model_from_args(chk['args'])
240
+
241
+ key = 'base_model.'
242
+ mod_state = dict()
243
+ extra_state = dict()
244
+ for k, v in chk['state_dict'].items():
245
+ if k.startswith(key):
246
+ mod_state[k[len(key):]] = v
247
+ else:
248
+ extra_state[k] = v
249
+
250
+ chk_load_info = model.load_state_dict(mod_state, strict=args.strict)
251
+ if chk_load_info.unexpected_keys or chk_load_info.missing_keys:
252
+ print(chk_load_info)
253
+
254
+ if chk['args'].spectral_reparam:
255
+ disable_spectral_reparam(model)
256
+
257
+ if hasattr(chk['args'], 'dtype'):
258
+ model.to(dtype=chk['args'].dtype)
259
+
260
+ mod_state = model.state_dict()
261
+ final_state = dict()
262
+ final_state.update({f'{key}{k}': v for k, v in mod_state.items()})
263
+ final_state.update(extra_state)
264
+
265
+ chk['state_dict'] = final_state
266
+ chk['args'].spectral_reparam = False
267
+
268
+ if args.release:
269
+ chk = {
270
+ 'arch': chk['arch'],
271
+ 'epoch': chk['epoch'],
272
+ 'state_dict': chk['state_dict'],
273
+ 'args': chk['args'],
274
+ }
275
+
276
+ torch.save(chk, args.output)
277
+ pass
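A minimal usage sketch for the helpers above, assuming only torch and the functions defined in this file; the tiny nn.Sequential backbone is a hypothetical stand-in, not a model from this repo.

import torch
from torch import nn
from torch.nn.utils import parametrize

backbone = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64)).eval()  # hypothetical stand-in

enable_spectral_reparam(backbone, init_norm_to_current=True)     # wraps each Linear weight in _SNReweight
assert parametrize.is_parametrized(backbone[0], 'weight')

x = torch.randn(2, 64)
with torch.no_grad():
    y_reparam = backbone(x)

disable_spectral_reparam(backbone)                               # bakes the current effective weight back into .weight
assert not parametrize.is_parametrized(backbone[0])
with torch.no_grad():
    y_folded = backbone(x)
print(torch.allclose(y_reparam, y_folded, atol=1e-5))            # should typically print True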
tim/models/nvidia_radio/radio/eradio_model.py ADDED
@@ -0,0 +1,1392 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
6
+ # and proprietary rights in and to this software, related documentation
7
+ # and any modifications thereto. Any use, reproduction, disclosure or
8
+ # distribution of this software and related documentation without an express
9
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
10
+
11
+ # E-RADIO model from
12
+ # Mike Ranzinger, Greg Heinrich, Jan Kautz, and Pavlo Molchanov. "AM-RADIO: Agglomerative Model--Reduce All Domains Into One." arXiv preprint arXiv:2312.06709 (2023).
13
+
14
+ # based on FasterViT, Swin Transformer, YOLOv8
15
+
16
+ # FasterViT:
17
+ # Ali Hatamizadeh, Greg Heinrich, Hongxu Yin, Andrew Tao, Jose M. Alvarez, Jan Kautz, and Pavlo Molchanov. "FasterViT: Fast Vision Transformers with Hierarchical Attention." arXiv preprint arXiv:2306.06189 (2023).
18
+
19
+ import timm
20
+ import torch
21
+ import torch.nn as nn
22
+ from timm.models.registry import register_model
23
+
24
+ from timm.models.layers import trunc_normal_, DropPath, LayerNorm2d
25
+ import numpy as np
26
+ import torch.nn.functional as F
27
+ import math
28
+ import warnings
29
+
30
+ #######################
31
+ ## Codebase from YOLOv8
32
+ ## BEGINNING
33
+ #######################
34
+
35
+ class C2f(nn.Module):
36
+ """Faster Implementation of CSP Bottleneck with 2 convolutions."""
37
+ """From YOLOv8 codebase"""
38
+ def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, drop_path=None): # ch_in, ch_out, number, shortcut, groups, expansion
39
+ super().__init__()
40
+ if drop_path is None:
41
+ drop_path = [0.0] * n
42
+
43
+ self.c = int(c2 * e) # hidden channels
44
+ self.cv1 = Conv(c1, 2 * self.c, 1, 1)
45
+ self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2)
46
+ self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0, drop_path=drop_path[i]) for i in range(n))
47
+
48
+ def forward(self, x):
49
+ """Forward pass through C2f layer."""
50
+ y = list(self.cv1(x).chunk(2, 1))
51
+ y.extend(m(y[-1]) for m in self.m)
52
+ return self.cv2(torch.cat(y, 1))
53
+
54
+ def forward_split(self, x):
55
+ """Forward pass using split() instead of chunk()."""
56
+ y = list(self.cv1(x).split((self.c, self.c), 1))
57
+ y.extend(m(y[-1]) for m in self.m)
58
+ return self.cv2(torch.cat(y, 1))
59
+
60
+ class Bottleneck(nn.Module):
61
+ """Standard bottleneck."""
62
+
63
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5, drop_path=0.0): # ch_in, ch_out, shortcut, groups, kernels, expand
64
+ super().__init__()
65
+ c_ = int(c2 * e) # hidden channels
66
+ self.cv1 = Conv(c1, c_, k[0], 1)
67
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
68
+ self.add = shortcut and c1 == c2
69
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
70
+
71
+ def forward(self, x):
72
+ """'forward()' applies the YOLOv5 FPN to input data."""
73
+ return x + self.drop_path1(self.cv2(self.cv1(x))) if self.add else self.cv2(self.cv1(x))
74
+
75
+
76
+ class Conv(nn.Module):
77
+ """Modified to support layer fusion"""
78
+ default_act = nn.SiLU() # default activation
79
+
80
+ def __init__(self, a, b, kernel_size=1, stride=1, padding=None, g=1, dilation=1, bn_weight_init=1, bias=False, act=True):
81
+ super().__init__()
82
+
83
+ self.conv = torch.nn.Conv2d(a, b, kernel_size, stride, autopad(kernel_size, padding, dilation), dilation, g, bias=False)
84
+ if 1:
85
+ self.bn = torch.nn.BatchNorm2d(b)
86
+ torch.nn.init.constant_(self.bn.weight, bn_weight_init)
87
+ torch.nn.init.constant_(self.bn.bias, 0)
88
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
89
+
90
+
91
+ def forward(self,x):
92
+ x = self.conv(x)
93
+ x = self.bn(x)
94
+ x = self.act(x)
95
+ return x
96
+
97
+ @torch.no_grad()
98
+ def switch_to_deploy(self):
99
+ # return 1
100
+ if not isinstance(self.bn, nn.Identity):
101
+ c, bn = self.conv, self.bn
102
+ w = bn.weight / (bn.running_var + bn.eps) ** 0.5
103
+ w = c.weight * w[:, None, None, None]
104
+ b = bn.bias - bn.running_mean * bn.weight / \
105
+ (bn.running_var + bn.eps)**0.5
106
+
107
+ self.conv.weight.data.copy_(w)
108
+ self.conv.bias = nn.Parameter(b)
109
+
110
+ self.bn = nn.Identity()
111
+
112
+ def autopad(k, p=None, d=1): # kernel, padding, dilation
113
+ """Pad to 'same' shape outputs."""
114
+ if d > 1:
115
+ k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size
116
+ if p is None:
117
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
118
+ return p
119
+
120
+
121
+ #######################
122
+ ## Codebase from YOLOv8
123
+ ## END
124
+ #######################
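A small sanity-check sketch for the YOLOv8-style blocks above: build a C2f stage, fold its BatchNorm layers into the convolutions via switch_to_deploy, and confirm the outputs match in eval mode; all sizes are illustrative.

import torch

block = C2f(c1=64, c2=64, n=2, shortcut=True, e=0.5).eval()   # eval: BN uses running stats, so folding is exact
x = torch.randn(1, 64, 32, 32)
with torch.no_grad():
    y_ref = block(x)                                          # (1, 64, 32, 32); channels and resolution preserved
    for m in block.modules():
        if hasattr(m, 'switch_to_deploy'):
            m.switch_to_deploy()                              # folds BN into the conv and replaces it with nn.Identity
    y_fused = block(x)
torch.testing.assert_close(y_ref, y_fused, rtol=1e-4, atol=1e-5)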
125
+
126
+ def pixel_unshuffle(data, factor=2):
127
+ # performs nn.PixelShuffle(factor) in reverse; torch's op has issues with ONNX and TRT export, so it is done manually
128
+ B, C, H, W = data.shape
129
+ return data.view(B, C, factor, H//factor, factor, W//factor).permute(0,1,2,4,3,5).reshape(B, -1, H//factor, W//factor)
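A quick shape check, illustrative only: with factor=2 each spatial dimension shrinks by 2 and the channel count grows by 4.

import torch

x = torch.randn(2, 16, 8, 8)
y = pixel_unshuffle(x, factor=2)
assert y.shape == (2, 64, 4, 4)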
130
+
131
+ class SwiGLU(nn.Module):
132
+ # could be more elaborate, but that doesn't improve results so far
133
+ def forward(self, x):
134
+ x, gate = x.chunk(2, dim=-1)
135
+ return F.silu(gate) * x
136
+
137
+
138
+ def window_partition(x, window_size):
139
+ """
140
+ Function for partitioning image into windows and later do windowed attention
141
+ Args:
142
+ x: (B, C, H, W)
143
+ window_size: window size
144
+ Returns:
145
+ windows - local window features (num_windows*B, window_size*window_size, C)
146
+ (Hp, Wp) - the size of the padded image
147
+ """
148
+ B, C, H, W = x.shape
149
+
150
+ if window_size == 0 or (window_size==H and window_size==W):
151
+ windows = x.flatten(2).transpose(1, 2)
152
+ Hp, Wp = H, W
153
+ else:
154
+ pad_h = (window_size - H % window_size) % window_size
155
+ pad_w = (window_size - W % window_size) % window_size
156
+ if pad_h > 0 or pad_w > 0:
157
+ x = F.pad(x, (0, pad_w, 0, pad_h), mode="reflect")
158
+ Hp, Wp = H + pad_h, W + pad_w
159
+
160
+ x = x.view(B, C, Hp // window_size, window_size, Wp // window_size, window_size)
161
+ windows = x.permute(0, 2, 4, 3, 5, 1).reshape(-1, window_size*window_size, C)
162
+
163
+ return windows, (Hp, Wp)
164
+
165
+ class Conv2d_BN(nn.Module):
166
+ '''
167
+ Conv2d + BN layer with folding capability to speed up inference
168
+ Can be merged with Conv() function with additional arguments
169
+ '''
170
+ def __init__(self, a, b, kernel_size=1, stride=1, padding=0, dilation=1, groups=1, bn_weight_init=1, bias=False):
171
+ super().__init__()
172
+ self.conv = torch.nn.Conv2d(a, b, kernel_size, stride, padding, dilation, groups, bias=False)
173
+ if 1:
174
+ self.bn = torch.nn.BatchNorm2d(b)
175
+ torch.nn.init.constant_(self.bn.weight, bn_weight_init)
176
+ torch.nn.init.constant_(self.bn.bias, 0)
177
+
178
+ def forward(self,x):
179
+ x = self.conv(x)
180
+ x = self.bn(x)
181
+ return x
182
+
183
+ @torch.no_grad()
184
+ def switch_to_deploy(self):
185
+ if not isinstance(self.bn, nn.Identity):
186
+ c, bn = self.conv, self.bn
187
+ w = bn.weight / (bn.running_var + bn.eps) ** 0.5
188
+ w = c.weight * w[:, None, None, None]
189
+ b = bn.bias - bn.running_mean * bn.weight / \
190
+ (bn.running_var + bn.eps)**0.5
191
+ self.conv.weight.data.copy_(w)
192
+ self.conv.bias = nn.Parameter(b)
193
+ self.bn = nn.Identity()
194
+
195
+
196
+
197
+ def window_reverse(windows, window_size, H, W, pad_hw):
198
+ """
199
+ Windows to the full feature map
200
+ Args:
201
+ windows: local window features (num_windows*B, window_size*window_size, C)
202
+ window_size: Window size
203
+ H: Height of image
204
+ W: Width of image
205
+ pad_hw - a tuple (Hp, Wp) with the padded image size used in the windowing step
206
+ Returns:
207
+ x: (B, C, H, W)
208
+
209
+ """
210
+ # print(f"window_reverse, windows.shape {windows.shape}")
211
+ Hp, Wp = pad_hw
212
+ if window_size == 0 or (window_size==H and window_size==W):
213
+ B = int(windows.shape[0] / (Hp * Wp / window_size / window_size))
214
+ x = windows.transpose(1, 2).view(B, -1, H, W)
215
+ else:
216
+ B = int(windows.shape[0] / (Hp * Wp / window_size / window_size))
217
+ x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
218
+ x = x.permute(0, 5, 1, 3, 2, 4).reshape(B,windows.shape[2], Hp, Wp)
219
+
220
+ if Hp > H or Wp > W:
221
+ x = x[:, :, :H, :W, ].contiguous()
222
+
223
+ return x
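A round-trip sketch for the two helpers above, with illustrative sizes: partitioning a feature map into windows and reassembling it reproduces the input when nothing is changed in between.

import torch

B, C, H, W = 2, 32, 14, 14
x = torch.randn(B, C, H, W)
windows, pad_hw = window_partition(x, window_size=7)        # (B * 4 windows, 49 tokens, C), pad_hw == (14, 14)
assert windows.shape == (B * 4, 7 * 7, C)
y = window_reverse(windows, window_size=7, H=H, W=W, pad_hw=pad_hw)
torch.testing.assert_close(y, x)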
224
+
225
+
226
+
227
+ class PosEmbMLPSwinv2D(nn.Module):
228
+ """
229
+ 2D positional embedding from Swin Transformer v2
230
+ Added functionality to store the positional embedding in the model and not recompute it every time
231
+ """
232
+ def __init__(
233
+ self, window_size, pretrained_window_size, num_heads, seq_length, no_log=False, cpb_mlp_hidden=512,
234
+ ):
235
+ super().__init__()
236
+ self.window_size = window_size
237
+ self.num_heads = num_heads
238
+ # mlp to generate continuous relative position bias
239
+ self.cpb_mlp = nn.Sequential(
240
+ nn.Linear(2, cpb_mlp_hidden, bias=True),
241
+ nn.ReLU(inplace=True),
242
+ nn.Linear(cpb_mlp_hidden, num_heads, bias=False),
243
+ )
244
+
245
+ self.grid_exists = False
246
+ self.seq_length = seq_length
247
+ self.deploy = False
248
+ self.num_heads = num_heads
249
+ self.no_log = no_log
250
+ self.pretrained_window_size = pretrained_window_size
251
+ self.relative_bias_window_size = window_size
252
+
253
+ relative_coords_table, relative_position_index, relative_bias = self.relative_bias_initialization(window_size, num_heads,
254
+ pretrained_window_size, seq_length,
255
+ no_log)
256
+
257
+ self.register_buffer("relative_coords_table", relative_coords_table)
258
+ self.register_buffer("relative_position_index", relative_position_index)
259
+ self.register_buffer("relative_bias", relative_bias) # for EMA
260
+
261
+ def relative_bias_initialization(self, window_size, num_heads, pretrained_window_size, seq_length, no_log):
262
+ # kept as a separate function to support changing the window size after the model weights are loaded
263
+ relative_coords_h = torch.arange(
264
+ -(window_size[0] - 1), window_size[0], dtype=torch.float32
265
+ )
266
+ relative_coords_w = torch.arange(
267
+ -(window_size[1] - 1), window_size[1], dtype=torch.float32
268
+ )
269
+ relative_coords_table = (
270
+ torch.stack(torch.meshgrid([relative_coords_h, relative_coords_w]))
271
+ .permute(1, 2, 0)
272
+ .contiguous()
273
+ .unsqueeze(0)
274
+ ) # 1, 2*Wh-1, 2*Ww-1, 2
275
+ if pretrained_window_size[0] > 0:
276
+ relative_coords_table[:, :, :, 0] /= pretrained_window_size[0] - 1
277
+ relative_coords_table[:, :, :, 1] /= pretrained_window_size[1] - 1
278
+ else:
279
+ relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1
280
+ relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1
281
+
282
+ if not no_log:
283
+ relative_coords_table *= 8 # normalize to -8, 8
284
+ relative_coords_table = (
285
+ torch.sign(relative_coords_table)
286
+ * torch.log2(torch.abs(relative_coords_table) + 1.0)
287
+ / np.log2(8)
288
+ )
289
+
290
+ # get pair-wise relative position index for each token inside the window
291
+ coords_h = torch.arange(self.window_size[0])
292
+ coords_w = torch.arange(self.window_size[1])
293
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
294
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
295
+ relative_coords = (
296
+ coords_flatten[:, :, None] - coords_flatten[:, None, :]
297
+ ) # 2, Wh*Ww, Wh*Ww
298
+ relative_coords = relative_coords.permute(
299
+ 1, 2, 0
300
+ ).contiguous() # Wh*Ww, Wh*Ww, 2
301
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
302
+ relative_coords[:, :, 1] += self.window_size[1] - 1
303
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
304
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
305
+
306
+ relative_bias = torch.zeros(1, num_heads, seq_length, seq_length)
307
+
308
+ self.relative_bias_window_size = window_size
309
+
310
+ return relative_coords_table, relative_position_index, relative_bias
311
+
312
+
313
+ def switch_to_deploy(self):
314
+ self.deploy = True
315
+ self.grid_exists = True
316
+
317
+ def forward(self, input_tensor):
318
+ # for efficiency, we want this forward to be folded into a single operation (sum)
319
+ # if resolution stays the same, then we dont need to recompute MLP layers
320
+
321
+ if not self.deploy or self.training:
322
+ self.grid_exists = False
323
+
324
+ #compare if all elements in self.window_size list match those in self.relative_bias_window_size
325
+ if not all([self.window_size[i] == self.relative_bias_window_size[i] for i in range(len(self.window_size))]):
326
+ relative_coords_table, relative_position_index, relative_bias = self.relative_bias_initialization(self.window_size, self.num_heads,
327
+ self.pretrained_window_size, self.seq_length,
328
+ self.no_log)
329
+
330
+ self.relative_coords_table = relative_coords_table.to(self.relative_coords_table.device)
331
+ self.relative_position_index = relative_position_index.to(self.relative_position_index.device)
332
+ self.relative_bias = relative_bias.to(self.relative_bias.device)
333
+
334
+ if self.deploy and self.grid_exists:
335
+ input_tensor = input_tensor + self.relative_bias
336
+ return input_tensor
337
+
338
+ if 1:
339
+ self.grid_exists = True
340
+
341
+ relative_position_bias_table = self.cpb_mlp(
342
+ self.relative_coords_table
343
+ ).view(-1, self.num_heads)
344
+ relative_position_bias = relative_position_bias_table[
345
+ self.relative_position_index.view(-1)
346
+ ].view(
347
+ self.window_size[0] * self.window_size[1],
348
+ self.window_size[0] * self.window_size[1],
349
+ -1,
350
+ ) # Wh*Ww,Wh*Ww,nH
351
+
352
+ relative_position_bias = relative_position_bias.permute(
353
+ 2, 0, 1
354
+ ).contiguous() # nH, Wh*Ww, Wh*Ww
355
+ relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
356
+
357
+ self.relative_bias = relative_position_bias.unsqueeze(0)
358
+
359
+ input_tensor = input_tensor + self.relative_bias
360
+ return input_tensor
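A usage sketch with illustrative sizes: the module is applied directly to attention logits and returns them with a per-head relative bias of shape (1, num_heads, seq_length, seq_length) added.

import torch

num_heads, window = 4, 7
seq_len = window * window
pos_emb = PosEmbMLPSwinv2D(window_size=[window, window],
                           pretrained_window_size=[window, window],
                           num_heads=num_heads, seq_length=seq_len)
attn = torch.randn(8, num_heads, seq_len, seq_len)   # (B * num_windows, heads, N, N) logits
attn = pos_emb(attn)                                 # same shape, relative bias added
assert attn.shape == (8, num_heads, seq_len, seq_len)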
361
+
362
+
363
+ class GRAAttentionBlock(nn.Module):
364
+ def __init__(self, window_size, dim_in, dim_out,
365
+ num_heads, drop_path=0., qk_scale=None, qkv_bias=False,
366
+ norm_layer=nn.LayerNorm, layer_scale=None,
367
+ use_swiglu=True,
368
+ subsample_ratio=1, dim_ratio=1, conv_base=False,
369
+ do_windowing=True, multi_query=False, use_shift=0,
370
+ cpb_mlp_hidden=512, conv_groups_ratio=0):
371
+ '''
372
+ Global Resolution Attention (GRA) block, see README for details
373
+ Attention with subsampling to obtain a larger receptive field
374
+ conv_base - use conv2d instead of avgpool2d for downsample / upsample
375
+
376
+
377
+ '''
378
+ super().__init__()
379
+
380
+ self.shift_size=window_size//2 if use_shift else 0
381
+
382
+ self.do_windowing = do_windowing
383
+ self.subsample_ratio = subsample_ratio
384
+
385
+
386
+
387
+ if do_windowing:
388
+ if conv_base:
389
+ self.downsample_op = nn.Conv2d(dim_in, dim_out, kernel_size=subsample_ratio, stride=subsample_ratio) if subsample_ratio > 1 else nn.Identity()
390
+
391
+
392
+ self.downsample_mixer = nn.Identity()
393
+ self.upsample_mixer = nn.Identity()
394
+ self.upsample_op = nn.ConvTranspose2d(dim_in, dim_out, kernel_size=subsample_ratio, stride=subsample_ratio) if subsample_ratio > 1 else nn.Identity()
395
+ else:
396
+ self.downsample_op = nn.AvgPool2d(kernel_size=subsample_ratio, stride=subsample_ratio) if subsample_ratio > 1 else nn.Identity()
397
+ self.downsample_mixer = Conv2d_BN(dim_in, dim_out, kernel_size=1, stride=1) if subsample_ratio > 1 else nn.Identity()
398
+ self.upsample_mixer = nn.Upsample(scale_factor=subsample_ratio, mode='nearest') if subsample_ratio > 1 else nn.Identity()
399
+ self.upsample_op = Conv2d_BN(dim_in, dim_out, kernel_size=1, stride=1, padding=0, bias=False) if subsample_ratio > 1 else nn.Identity()
400
+
401
+
402
+ # in case there is no downsampling conv we want to have it separately
403
+ # will help with information propagation between windows
404
+ if subsample_ratio == 1:
405
+ # conv_groups_ratio=0
406
+ self.pre_conv = Conv2d_BN(dim_in, dim_in, kernel_size=3, stride=1, padding=1, groups=max(1,int(conv_groups_ratio*dim_in)), bias=False)
407
+ # self.pre_conv = nn.Conv2d(dim_in, dim_in, kernel_size=3, stride=1, padding=1, groups=max(1,int(conv_groups_ratio*dim_in)), bias=False)
408
+ # self.pre_conv_act = nn.ReLU6()
409
+ #for simplicity:
410
+ self.pre_conv_act = nn.Identity()
411
+ if conv_groups_ratio == -1:
412
+ self.pre_conv = nn.Identity()
413
+ self.pre_conv_act = nn.Identity()
414
+
415
+ self.window_size = window_size
416
+
417
+ self.norm1 = norm_layer(dim_in)
418
+
419
+ self.attn = WindowAttention(
420
+ dim_in,
421
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
422
+ resolution=window_size,
423
+ seq_length=window_size**2, dim_out=dim_in, multi_query=multi_query,
424
+ shift_size=self.shift_size, cpb_mlp_hidden=cpb_mlp_hidden)
425
+
426
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
427
+
428
+ use_layer_scale = layer_scale is not None and type(layer_scale) in [int, float]
429
+ self.gamma1 = nn.Parameter(layer_scale * torch.ones(dim_in)) if use_layer_scale else 1
430
+
431
+ ### mlp layer
432
+ mlp_ratio = 4
433
+ self.norm2 = norm_layer(dim_in)
434
+ mlp_hidden_dim = int(dim_in * mlp_ratio)
435
+
436
+ activation = nn.GELU if not use_swiglu else SwiGLU
437
+ mlp_hidden_dim = int((4 * dim_in * 1 / 2) / 64) * 64 if use_swiglu else mlp_hidden_dim
438
+
439
+ self.mlp = Mlp(in_features=dim_in, hidden_features=mlp_hidden_dim, act_layer=activation, use_swiglu=use_swiglu)
440
+
441
+ self.gamma2 = nn.Parameter(layer_scale * torch.ones(dim_in)) if layer_scale else 1
442
+ self.drop_path2=DropPath(drop_path) if drop_path > 0. else nn.Identity()
443
+
444
+
445
+ def forward(self, x):
446
+ skip_connection = x
447
+ attn_mask = None
448
+
449
+ # in case there is no downsampling conv we want to have it separately
450
+ # will help with information propagation
451
+ if self.subsample_ratio == 1:
452
+ x = self.pre_conv_act(self.pre_conv(x)) + skip_connection
453
+
454
+ if self.do_windowing:
455
+ # performing windowing if required
456
+ x = self.downsample_op(x)
457
+ x = self.downsample_mixer(x)
458
+
459
+ if self.window_size>0:
460
+ H, W = x.shape[2], x.shape[3]
461
+
462
+ if self.shift_size > 0 and H>self.window_size and W>self.window_size:
463
+ # Swin-like cyclic shift; doesn't show better performance
464
+ x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(2, 3))
465
+
466
+ x, pad_hw = window_partition(x, self.window_size)
467
+
468
+ if self.shift_size > 0 and H>self.window_size and W>self.window_size:
469
+ # set atten matrix to have -100 and the top right square
470
+ # attn[:, :, :-self.shift_size, -self.shift_size:] = -100.0
471
+ # calculate attention mask for SW-MSA
472
+ # not used in final version, can be useful for some cases especially for high res
473
+ H, W = pad_hw
474
+ img_mask = torch.zeros((1, H, W, 1), device=x.device) # 1 H W 1
475
+ h_slices = (slice(0, -self.window_size),
476
+ slice(-self.window_size, -self.shift_size),
477
+ slice(-self.shift_size, None))
478
+ w_slices = (slice(0, -self.window_size),
479
+ slice(-self.window_size, -self.shift_size),
480
+ slice(-self.shift_size, None))
481
+ cnt = 0
482
+ for h in h_slices:
483
+ for w in w_slices:
484
+ img_mask[:, h, w, :] = cnt
485
+ cnt += 1
486
+ img_mask = img_mask.transpose(1,2).transpose(1,3)
487
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
488
+
489
+ mask_windows = mask_windows[0].view(-1, self.window_size * self.window_size)
490
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
491
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
492
+
493
+ # window attention
494
+ x = x + self.drop_path1(self.gamma1*self.attn(self.norm1(x), attn_mask=attn_mask)) # or pass H,W
495
+ # mlp layer
496
+ x = x + self.drop_path2(self.gamma2*self.mlp(self.norm2(x)))
497
+
498
+ if self.do_windowing:
499
+ if self.window_size > 0:
500
+ x = window_reverse(x, self.window_size, H, W, pad_hw)
501
+
502
+ # reverse cyclic shift
503
+ if self.shift_size > 0 and H>self.window_size and W>self.window_size:
504
+ # reverse the Swin-like cyclic shift, not tested
505
+ x = torch.roll(x, shifts=(self.shift_size, self.shift_size), dims=(2, 3))
506
+
507
+ x = self.upsample_mixer(x)
508
+ x = self.upsample_op(x)
509
+
510
+
511
+ if x.shape[2] != skip_connection.shape[2] or x.shape[3] != skip_connection.shape[3]:
512
+ x = torch.nn.functional.pad(x, ( 0, -x.shape[3] + skip_connection.shape[3], 0, -x.shape[2] + skip_connection.shape[2]), mode="reflect")
513
+ # need to add skip connection because downsampling and upsampling will break residual connection
514
+ # 0.5 is needed to make sure that the skip connection is not too strong
515
+ # in case of no downsample / upsample we can show that 0.5 compensates for the residual connection
516
+ x = 0.5 * x + 0.5 * skip_connection
517
+ return x
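An instantiation sketch for the block above, with illustrative parameter values: subsample 2x, run windowed attention, upsample, and blend with the skip connection.

import torch

block = GRAAttentionBlock(window_size=7, dim_in=64, dim_out=64, num_heads=4,
                          subsample_ratio=2, do_windowing=True, conv_base=False)
x = torch.randn(1, 64, 28, 28)
y = block(x)
assert y.shape == x.shape    # resolution and channels are preserved by the 0.5 / 0.5 blend with the skip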
518
+
519
+
520
+
521
+
522
+ class MultiResolutionAttention(nn.Module):
523
+ """
524
+ MultiResolutionAttention (MRA) module
525
+ The idea is to use multiple attention blocks with different resolution
526
+ Feature maps are downsampled / upsampled for each attention block on different blocks
527
+ Every attention block supports windowing
528
+ """
529
+
530
+ def __init__(self, window_size, sr_ratio,
531
+ dim, dim_ratio, num_heads,
532
+ do_windowing=True,
533
+ layer_scale=1e-5, norm_layer=nn.LayerNorm,
534
+ drop_path = 0, qkv_bias=False, qk_scale=1.0,
535
+ use_swiglu=True, multi_query=False, conv_base=False,
536
+ use_shift=0, cpb_mlp_hidden=512, conv_groups_ratio=0) -> None:
537
+ """
538
+ Args:
539
+ input_resolution: input image resolution
540
+ window_size: window size
541
+ compression_ratio: compression ratio
542
+ max_depth: maximum depth of the GRA module
543
+ use_shift: do window shifting
544
+ """
545
+ super().__init__()
546
+
547
+ depth = len(sr_ratio)
548
+
549
+ self.attention_blocks = nn.ModuleList()
550
+
551
+
552
+ for i in range(depth):
553
+ subsample_ratio = sr_ratio[i]
554
+ if len(window_size) > i:
555
+ window_size_local = window_size[i]
556
+ else:
557
+ window_size_local = window_size[0]
558
+
559
+ self.attention_blocks.append(GRAAttentionBlock(window_size=window_size_local,
560
+ dim_in=dim, dim_out=dim, num_heads=num_heads,
561
+ qkv_bias=qkv_bias, qk_scale=qk_scale, norm_layer=norm_layer,
562
+ layer_scale=layer_scale, drop_path=drop_path,
563
+ use_swiglu=use_swiglu, subsample_ratio=subsample_ratio, dim_ratio=dim_ratio,
564
+ do_windowing=do_windowing, multi_query=multi_query, conv_base=conv_base,
565
+ use_shift=use_shift, cpb_mlp_hidden=cpb_mlp_hidden, conv_groups_ratio=conv_groups_ratio),
566
+ )
567
+
568
+ def forward(self, x):
569
+
570
+ for attention_block in self.attention_blocks:
571
+ x = attention_block(x)
572
+
573
+ return x
574
+
575
+
576
+
577
+ class Mlp(nn.Module):
578
+ """
579
+ Multi-Layer Perceptron (MLP) block
580
+ """
581
+
582
+ def __init__(self,
583
+ in_features,
584
+ hidden_features=None,
585
+ out_features=None,
586
+ act_layer=nn.GELU,
587
+ use_swiglu=True,
588
+ drop=0.):
589
+ """
590
+ Args:
591
+ in_features: input features dimension.
592
+ hidden_features: hidden features dimension.
593
+ out_features: output features dimension.
594
+ act_layer: activation function.
595
+ drop: dropout rate.
596
+ """
597
+
598
+ super().__init__()
599
+ out_features = out_features or in_features
600
+ hidden_features = hidden_features or in_features
601
+ self.fc1 = nn.Linear(in_features, hidden_features * (2 if use_swiglu else 1), bias=False)
602
+ self.act = act_layer()
603
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=False)
604
+
605
+ def forward(self, x):
606
+ x_size = x.size()
607
+ x = x.view(-1, x_size[-1])
608
+ x = self.fc1(x)
609
+ x = self.act(x)
610
+ x = self.fc2(x)
611
+ x = x.view(x_size)
612
+ return x
613
+
614
+ class Downsample(nn.Module):
615
+ """
616
+ Down-sampling block
617
+ Pixel Unshuffle is used for down-sampling; it works well accuracy-wise but takes 10% more TRT time
618
+ """
619
+
620
+ def __init__(self,
621
+ dim,
622
+ shuffle = False,
623
+ ):
624
+ """
625
+ Args:
626
+ dim: feature size dimension.
627
+ shuffle: use PixelUnshuffle for down-sampling.
628
+ keep_dim: bool argument for maintaining the resolution.
629
+ """
630
+
631
+ super().__init__()
632
+ dim_out = 2 * dim
633
+
634
+ if shuffle:
635
+ self.norm = lambda x: pixel_unshuffle(x, factor=2)
636
+ self.reduction = Conv2d_BN(dim*4, dim_out, 1, 1, 0, bias=False)
637
+ # pixel unshuffling works well but doesn't provide any speedup
638
+ else:
639
+ # removed the layer norm; in this formulation we get 10% better speed
640
+ # LayerNorm for high resolution inputs will be a pain as it pools over the entire spatial dimension
641
+ # therefore we remove it compared to the original implementation in FasterViT
642
+ self.norm = nn.Identity()
643
+ self.reduction = Conv2d_BN(dim, dim_out, 3, 2, 1, bias=False)
644
+
645
+
646
+ def forward(self, x):
647
+ x = self.norm(x)
648
+ x = self.reduction(x)
649
+ return x
650
+
651
+
652
+ class PatchEmbed(nn.Module):
653
+ """
654
+ Patch embedding block
655
+ Used to convert image into an initial set of feature maps with lower resolution
656
+ """
657
+
658
+ def __init__(self, in_chans=3, in_dim=64, dim=96, shuffle_down=False):
659
+ """
660
+ Args:
661
+ in_chans: number of input channels.
662
+ in_dim: intermediate feature size dimension to speed up stem.
663
+ dim: final stem channel number
664
+ shuffle_down: use PixelUnshuffle for down-sampling, effectively increases the receptive field
665
+ """
666
+
667
+ super().__init__()
668
+ # shuffle_down = False
669
+ if not shuffle_down:
670
+ self.proj = nn.Identity()
671
+ self.conv_down = nn.Sequential(
672
+ Conv2d_BN(in_chans, in_dim, 3, 2, 1, bias=False),
673
+ nn.ReLU(),
674
+ Conv2d_BN(in_dim, dim, 3, 2, 1, bias=False),
675
+ nn.ReLU()
676
+ )
677
+ else:
678
+ self.proj = lambda x: pixel_unshuffle(x, factor=4)
679
+ self.conv_down = nn.Sequential(Conv2d_BN(in_chans*16, dim, 3, 1, 1),
680
+ nn.ReLU(),
681
+ )
682
+
683
+ def forward(self, x):
684
+ x = self.proj(x)
685
+ x = self.conv_down(x)
686
+ return x
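A shape sketch, illustrative only: with either stem variant the input resolution is reduced 4x.

import torch

x = torch.randn(1, 3, 224, 224)
stem = PatchEmbed(in_chans=3, in_dim=64, dim=96, shuffle_down=False)
assert stem(x).shape == (1, 96, 56, 56)
stem_shuffled = PatchEmbed(in_chans=3, in_dim=64, dim=96, shuffle_down=True)   # PixelUnshuffle-based stem
assert stem_shuffled(x).shape == (1, 96, 56, 56)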
687
+
688
+
689
+
690
+ class ConvBlock(nn.Module):
691
+ """
692
+ Convolutional block, used in first couple of stages
693
+ Experimented with plain ResNet-18-like modules; they are the best in terms of throughput
694
+ Finally, the YOLOv8 idea seems to work fine (a ResNet-18-like block with a squeezed feature dimension and feature concatenation at the end)
695
+ """
696
+ def __init__(self, dim,
697
+ drop_path=0.,
698
+ layer_scale=None,
699
+ kernel_size=3,
700
+ ):
701
+ super().__init__()
702
+
703
+ self.conv1 = Conv2d_BN(dim, dim, kernel_size=kernel_size, stride=1, padding=1)
704
+ self.act1 = nn.GELU()
705
+
706
+ self.conv2 = Conv2d_BN(dim, dim, kernel_size=kernel_size, stride=1, padding=1)
707
+
708
+ self.layer_scale = layer_scale
709
+ if layer_scale is not None and type(layer_scale) in [int, float]:
710
+ self.gamma = nn.Parameter(layer_scale * torch.ones(dim))
711
+ self.layer_scale = True
712
+ else:
713
+ self.layer_scale = False
714
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
715
+
716
+ def forward(self, x):
717
+ input = x
718
+
719
+ x = self.conv1(x)
720
+ x = self.act1(x)
721
+ x = self.conv2(x)
722
+
723
+ if self.layer_scale:
724
+ x = x * self.gamma.view(1, -1, 1, 1)
725
+ x = input + self.drop_path(x)
726
+ return x
727
+
728
+
729
+ class WindowAttention(nn.Module):
730
+ # Windowed Attention from SwinV2
731
+ # use a MLP trick to deal with various input image resolutions, then fold it to improve speed
732
+
733
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, resolution=0,
734
+ seq_length=0, dim_out=None, multi_query=False, shift_size=0, cpb_mlp_hidden=512):
735
+ # taken from EdgeViT and tweaked with attention bias.
736
+ super().__init__()
737
+ if not dim_out: dim_out = dim
738
+ self.shift_size = shift_size
739
+ self.multi_query = multi_query
740
+ self.num_heads = num_heads
741
+ head_dim = dim // num_heads
742
+ self.head_dim = dim // num_heads
743
+
744
+ self.dim_internal = dim
745
+
746
+ self.scale = qk_scale or head_dim ** -0.5
747
+ if not multi_query:
748
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
749
+ else:
750
+ self.qkv = nn.Linear(dim, dim + 2*self.head_dim, bias=qkv_bias)
751
+
752
+ self.proj = nn.Linear(dim, dim_out, bias=False)
753
+ # attention positional bias
754
+ self.pos_emb_funct = PosEmbMLPSwinv2D(window_size=[resolution, resolution],
755
+ pretrained_window_size=[resolution, resolution],
756
+ num_heads=num_heads,
757
+ seq_length=seq_length,
758
+ cpb_mlp_hidden=cpb_mlp_hidden)
759
+
760
+ self.resolution = resolution
761
+
762
+ def forward(self, x, attn_mask = None):
763
+ B, N, C = x.shape
764
+
765
+ if not self.multi_query:
766
+ qkv = self.qkv(x).reshape(B, -1, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
767
+ q, k, v = qkv[0], qkv[1], qkv[2]
768
+ else:
769
+ qkv = self.qkv(x)
770
+ (q, k, v) = qkv.split([self.dim_internal, self.head_dim, self.head_dim], dim=2)
771
+
772
+ q = q.reshape(B, -1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
773
+ k = k.reshape(B, -1, 1, C // self.num_heads).permute(0, 2, 1, 3)
774
+ v = v.reshape(B, -1, 1, C // self.num_heads).permute(0, 2, 1, 3)
775
+
776
+ attn = (q @ k.transpose(-2, -1)) * self.scale
777
+
778
+ attn = self.pos_emb_funct(attn)
779
+
780
+ #add window shift
781
+ if attn_mask is not None:
782
+ nW = attn_mask.shape[0]
783
+ attn = attn.view(B // nW, nW, self.num_heads, N, N) + attn_mask.unsqueeze(1).unsqueeze(0)
784
+ attn = attn.view(-1, self.num_heads, N, N)
785
+
786
+ attn = attn.softmax(dim=-1)
787
+ x = (attn @ v).transpose(1, 2).reshape(B, -1, C)
788
+ x = self.proj(x)
789
+ return x
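A usage sketch for the attention above, run over a batch of flattened windows with illustrative sizes; with multi_query=True the key and value are shared across heads, which shrinks the qkv projection.

import torch

attn = WindowAttention(dim=64, num_heads=4, resolution=7, seq_length=49, multi_query=True)
tokens = torch.randn(8, 49, 64)                        # 8 windows of 7x7 tokens, 64 channels
out = attn(tokens)
assert out.shape == (8, 49, 64)
assert attn.qkv.out_features == 64 + 2 * (64 // 4)     # full query projection plus one shared k and v head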
790
+
791
+
792
+
793
+ class ERADIOLayer(nn.Module):
794
+ """
795
+ E-RADIO Layer
796
+ """
797
+
798
+ def __init__(self,
799
+ dim,
800
+ depth,
801
+ num_heads,
802
+ window_size,
803
+ conv=False,
804
+ downsample=True,
805
+ mlp_ratio=4.,
806
+ qkv_bias=False,
807
+ qk_scale=None,
808
+ norm_layer=nn.LayerNorm,
809
+ drop_path=0.,
810
+ layer_scale=None,
811
+ layer_scale_conv=None,
812
+ sr_dim_ratio=1,
813
+ sr_ratio=1,
814
+ multi_query=False,
815
+ use_swiglu=True,
816
+ yolo_arch=False,
817
+ downsample_shuffle=False,
818
+ conv_base=False,
819
+ use_shift=False,
820
+ cpb_mlp_hidden=512,
821
+ conv_groups_ratio=0,
822
+ verbose: bool = True,
823
+
824
+ ):
825
+ """
826
+ Args:
827
+ dim: feature size dimension.
828
+ depth: number of layers in each stage.
829
+ input_resolution: input image resolution.
830
+ window_size: window size in each stage.
831
+ downsample: bool argument for down-sampling.
832
+ mlp_ratio: MLP ratio.
833
+ num_heads: number of heads in each stage.
834
+ qkv_bias: bool argument for query, key, value learnable bias.
835
+ qk_scale: bool argument to scaling query, key.
836
+ drop: dropout rate.
837
+ attn_drop: attention dropout rate.
838
+ drop_path: drop path rate.
839
+ norm_layer: normalization layer.
840
+ layer_scale: layer scaling coefficient.
841
+ use_shift: SWIN like window shifting for half the window size for every alternating layer (considering multi-resolution)
842
+ conv_groups_ratio: group ratio for conv when no subsampling in multi-res attention
843
+ """
844
+
845
+ super().__init__()
846
+ self.conv = conv
847
+ self.yolo_arch=False
848
+ self.verbose = verbose
849
+ if conv:
850
+ if not yolo_arch:
851
+ self.blocks = nn.ModuleList([
852
+ ConvBlock(dim=dim,
853
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
854
+ layer_scale=layer_scale_conv)
855
+ for i in range(depth)])
856
+ self.blocks = nn.Sequential(*self.blocks)
857
+ else:
858
+ self.blocks = C2f(dim,dim,n=depth,shortcut=True,e=0.5)
859
+ self.yolo_arch=True
860
+ else:
861
+ if not isinstance(window_size, list): window_size = [window_size]
862
+ self.window_size = window_size[0]
863
+ self.do_single_windowing = True
864
+ if not isinstance(sr_ratio, list): sr_ratio = [sr_ratio]
865
+ self.sr_ratio = sr_ratio
866
+ if any([sr!=1 for sr in sr_ratio]) or len(set(window_size))>1:
867
+ self.do_single_windowing = False
868
+ do_windowing = True
869
+ else:
870
+ self.do_single_windowing = True
871
+ do_windowing = False
872
+
873
+ #for v2_2
874
+ if conv_groups_ratio != -1:
875
+ self.do_single_windowing = False
876
+ do_windowing = True
877
+
878
+ self.blocks = nn.ModuleList()
879
+ for i in range(depth):
880
+ self.blocks.append(
881
+ MultiResolutionAttention(window_size=window_size,
882
+ sr_ratio=sr_ratio,
883
+ dim=dim,
884
+ dim_ratio = sr_dim_ratio,
885
+ num_heads=num_heads,
886
+ norm_layer=norm_layer,
887
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
888
+ layer_scale=layer_scale,
889
+ qkv_bias=qkv_bias,
890
+ qk_scale=qk_scale,
891
+ use_swiglu=use_swiglu,
892
+ do_windowing=do_windowing,
893
+ multi_query=multi_query,
894
+ conv_base=conv_base,
895
+ cpb_mlp_hidden=cpb_mlp_hidden,
896
+ use_shift =0 if ((not use_shift) or ((i) % 2 == 0)) else True ,
897
+ conv_groups_ratio=conv_groups_ratio,
898
+ ))
899
+ self.blocks = nn.Sequential(*self.blocks)
900
+
901
+ self.transformer = not conv
902
+ self.downsample = None if not downsample else Downsample(dim=dim, shuffle=downsample_shuffle)
903
+
904
+
905
+ def forward(self, x):
906
+ B, C, H, W = x.shape
907
+
908
+ # do padding / interpolation for the transformer stages
909
+ interpolate = True
910
+ if self.transformer and interpolate:
911
+ # Windowed Attention will split feature map into windows with the size of window_size x window_size
912
+ # if the resolution is not divisible by window_size, we need to interpolate the feature map
913
+ # can be done via padding, but doing so after training hurts the model performance.
914
+ # interpolation affects the performance as well, but not as much as padding
915
+ if isinstance(self.window_size, list) or isinstance(self.window_size, tuple):
916
+ current_max_window_size = max(self.window_size)
917
+ else:
918
+ current_max_window_size = self.window_size
919
+
920
+ max_window_size = max([res_upsample*current_max_window_size for res_upsample in self.sr_ratio])
921
+ if H % max_window_size != 0 or W % max_window_size != 0:
922
+ new_h = int(np.ceil(H/max_window_size)*max_window_size)
923
+ new_w = int(np.ceil(W/max_window_size)*max_window_size)
924
+ x = F.interpolate(x, size=(new_h, new_w), mode='nearest')
925
+ if self.verbose:
926
+ warnings.warn(f"Chosen window size is not optimal for the given resolution. Feature maps will be interpolated, which can affect performance. Max window size is {max_window_size}, feature map size is {H}x{W}, interpolated feature map size is {new_h}x{new_w}.")
927
+
928
+
929
+ if self.transformer and self.do_single_windowing:
930
+ H, W = x.shape[2], x.shape[3]
931
+ x, pad_hw = window_partition(x, self.window_size)
932
+
933
+ #run main blocks
934
+ x = self.blocks(x)
935
+
936
+ if self.transformer and self.do_single_windowing:
937
+ x = window_reverse(x, self.window_size, H, W, pad_hw)
938
+
939
+ if self.transformer and interpolate:
940
+ # keep the original resolution; not ideal, but the upsampling tower expects this resolution.
941
+ x = F.interpolate(x, size=(H, W), mode='nearest')
942
+
943
+ if self.downsample is None:
944
+ return x, x
945
+
946
+ return self.downsample(x), x # changing to output pre downsampled features
947
+
948
+
949
+ class InterpolateLayer(nn.Module):
950
+ def __init__(self, size=None, scale_factor=None, mode='nearest'):
951
+ super(InterpolateLayer, self).__init__()
952
+ self.size = size
953
+ self.scale_factor = scale_factor
954
+ self.mode = mode
955
+
956
+ def forward(self, x):
957
+ return F.interpolate(x, size=self.size, scale_factor=self.scale_factor, mode=self.mode)
958
+
959
+
960
+ class HiResNeck(nn.Module):
961
+ """
962
+ The block is used to output dense features from all stages
963
+ Otherwise, by default, only the last stage features are returned with E-RADIO
964
+ """
965
+ def __init__(self, dim, depths, neck_start_stage, full_features_head_dim, downsample_enabled):
966
+
967
+ '''
968
+ Hi Resolution neck to support output of high res features that are useful for dense tasks.
969
+ depths - total number of layers in the base model
970
+ neck_start_stage - when to start the neck, 0 - start from the first stage, 1 - start from the second stage etc.
971
+ earlier layers result in higher resolution features at the cost of compute
972
+ full_features_head_dim - number of channels in the dense features head
973
+ '''
974
+ super().__init__()
975
+ # create feature projection layers for segmentation output
976
+ self.neck_features_proj = nn.ModuleList()
977
+ self.neck_start_stage = neck_start_stage
978
+ upsample_ratio = 1
979
+ for i in range(len(depths)):
980
+ level_n_features_output = int(dim * 2 ** i)
981
+
982
+ if self.neck_start_stage > i: continue
983
+
984
+ if (upsample_ratio > 1) or full_features_head_dim!=level_n_features_output:
985
+ feature_projection = nn.Sequential()
986
+ if False:
987
+ feature_projection.add_module("norm",nn.BatchNorm2d(level_n_features_output)) #fast, but worse
988
+ feature_projection.add_module("dconv", nn.ConvTranspose2d(level_n_features_output,
989
+ full_features_head_dim, kernel_size=upsample_ratio, stride=upsample_ratio))
990
+ else:
991
+ # B, in_channels, H, W -> B, in_channels, H*upsample_ratio, W*upsample_ratio
992
+ # print("upsample ratio", upsample_ratio, level_n_features_output, level_n_features_output)
993
+ feature_projection.add_module("upsample", InterpolateLayer(scale_factor=upsample_ratio, mode='nearest'))
994
+ feature_projection.add_module("conv1", nn.Conv2d(level_n_features_output, level_n_features_output, kernel_size=3, stride=1, padding=1, groups=level_n_features_output))
995
+ feature_projection.add_module("norm",nn.BatchNorm2d(level_n_features_output))
996
+ # B, in_channels, H*upsample_ratio, W*upsample_ratio -> B, full_features_head_dim, H*upsample_ratio, W*upsample_ratio
997
+ feature_projection.add_module("conv2", nn.Conv2d(level_n_features_output, full_features_head_dim, kernel_size=1, stride=1, padding=0))
998
+ else:
999
+ feature_projection = nn.Sequential()
1000
+
1001
+ self.neck_features_proj.append(feature_projection)
1002
+
1003
+ if i>0 and downsample_enabled[i]:
1004
+ upsample_ratio *= 2
1005
+
1006
+ def forward(self, x, il_level=-1, full_features=None):
1007
+ if self.neck_start_stage > il_level:
1008
+ return full_features
1009
+
1010
+ if full_features is None:
1011
+ full_features = self.neck_features_proj[il_level - self.neck_start_stage](x)
1012
+ else:
1013
+ #upsample torch tensor x to match full_features size, and add to full_features
1014
+ feature_projection = self.neck_features_proj[il_level - self.neck_start_stage](x)
1015
+ if feature_projection.shape[2] != full_features.shape[2] or feature_projection.shape[3] != full_features.shape[3]:
1016
+ feature_projection = torch.nn.functional.pad(feature_projection, ( 0, -feature_projection.shape[3] + full_features.shape[3], 0, -feature_projection.shape[2] + full_features.shape[2]))
1017
+ full_features = full_features + feature_projection
1018
+ return full_features
1019
+
1020
+ class ERADIO(nn.Module):
1021
+ """
1022
+ Efficient RADIO
1023
+ """
1024
+
1025
+ def __init__(self,
1026
+ dim,
1027
+ in_dim,
1028
+ depths,
1029
+ window_size,
1030
+ mlp_ratio,
1031
+ num_heads,
1032
+ drop_path_rate=0.2,
1033
+ in_chans=3,
1034
+ num_classes=1000,
1035
+ qkv_bias=False,
1036
+ qk_scale=None,
1037
+ layer_scale=None,
1038
+ layer_scale_conv=None,
1039
+ layer_norm_last=False,
1040
+ sr_ratio = [1, 1, 1, 1],
1041
+ max_depth = -1,
1042
+ conv_base=False,
1043
+ use_swiglu=False,
1044
+ multi_query=False,
1045
+ norm_layer=nn.LayerNorm,
1046
+ drop_uniform=False,
1047
+ yolo_arch=False,
1048
+ shuffle_down=False,
1049
+ downsample_shuffle=False,
1050
+ return_full_features=False,
1051
+ full_features_head_dim=128,
1052
+ neck_start_stage=1,
1053
+ use_neck=False,
1054
+ use_shift=False,
1055
+ cpb_mlp_hidden=512,
1056
+ conv_groups_ratio=0,
1057
+ verbose: bool = False,
1058
+ **kwargs):
1059
+ """
1060
+ Args:
1061
+ dim: feature size dimension.
1062
+ depths: number of layers in each stage.
1063
+ window_size: window size in each stage.
1064
+ mlp_ratio: MLP ratio.
1065
+ num_heads: number of heads in each stage.
1066
+ drop_path_rate: drop path rate.
1067
+ in_chans: number of input channels.
1068
+ num_classes: number of classes.
1069
+ qkv_bias: bool argument for query, key, value learnable bias.
1070
+ qk_scale: bool argument to scaling query, key.
1071
+ drop_rate: dropout rate.
1072
+ attn_drop_rate: attention dropout rate.
1073
+ norm_layer: normalization layer.
1074
+ layer_scale: layer scaling coefficient.
1075
+ return_full_features: output dense features as well as logits
1076
+ full_features_head_dim: number of channels in the dense features head
1077
+ neck_start_stage: stage id at which to start the full-feature neck. The model has 4 stages; index starts at 0
1078
+ for 224 resolution, the output of the stage before downsample:
1079
+ stage 0: 56x56, stage 1: 28x28, stage 2: 14x14, stage 3: 7x7
1080
+ use_neck: even for summarization embedding use neck
1081
+ use_shift: SWIN like window shifting but without masking attention
1082
+ conv_groups_ratio: will be used for conv blocks where there is no multires attention,
1083
+ if 0 then normal conv,
1084
+ if 1 then channels are independent,
1085
+ if -1 then no conv at all
1086
+
1087
+ """
1088
+ super().__init__()
1089
+
1090
+ num_features = int(dim * 2 ** (len(depths) - 1))
1091
+ self.num_classes = num_classes
1092
+ self.patch_embed = PatchEmbed(in_chans=in_chans, in_dim=in_dim, dim=dim, shuffle_down=shuffle_down)
1093
+ # set return_full_features true if we want to return full features from all stages
1094
+ self.return_full_features = return_full_features
1095
+ self.use_neck = use_neck
1096
+
1097
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
1098
+ if drop_uniform:
1099
+ dpr = [drop_path_rate for x in range(sum(depths))]
1100
+
1101
+ if not isinstance(max_depth, list): max_depth = [max_depth] * len(depths)
1102
+
1103
+ self.levels = nn.ModuleList()
1104
+ for i in range(len(depths)):
1105
+ conv = True if (i == 0 or i == 1) else False
1106
+
1107
+ level = ERADIOLayer(dim=int(dim * 2 ** i),
1108
+ depth=depths[i],
1109
+ num_heads=num_heads[i],
1110
+ window_size=window_size[i],
1111
+ mlp_ratio=mlp_ratio,
1112
+ qkv_bias=qkv_bias,
1113
+ qk_scale=qk_scale,
1114
+ conv=conv,
1115
+ drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
1116
+ downsample=(i < len(depths) - 1),
1117
+ layer_scale=layer_scale,
1118
+ layer_scale_conv=layer_scale_conv,
1119
+ sr_ratio=sr_ratio[i],
1120
+ use_swiglu=use_swiglu,
1121
+ multi_query=multi_query,
1122
+ norm_layer=norm_layer,
1123
+ yolo_arch=yolo_arch,
1124
+ downsample_shuffle=downsample_shuffle,
1125
+ conv_base=conv_base,
1126
+ cpb_mlp_hidden=cpb_mlp_hidden,
1127
+ use_shift=use_shift,
1128
+ conv_groups_ratio=conv_groups_ratio,
1129
+ verbose=verbose)
1130
+
1131
+ self.levels.append(level)
1132
+
1133
+ if self.return_full_features or self.use_neck:
1134
+ #num_heads
1135
+ downsample_enabled = [self.levels[i-1].downsample is not None for i in range(len(self.levels))]
1136
+ self.high_res_neck = HiResNeck(dim, depths, neck_start_stage, full_features_head_dim, downsample_enabled)
1137
+
1138
+ self.switched_to_deploy = False
1139
+
1140
+ self.norm = LayerNorm2d(num_features) if layer_norm_last else nn.BatchNorm2d(num_features)
1141
+ self.avgpool = nn.AdaptiveAvgPool2d(1)
1142
+ self.head = nn.Linear(num_features, num_classes) if num_classes > 0 else nn.Identity()
1143
+ self.apply(self._init_weights)
1144
+
1145
+ def _init_weights(self, m):
1146
+ if isinstance(m, nn.Linear):
1147
+ trunc_normal_(m.weight, std=.02)
1148
+ if isinstance(m, nn.Linear) and m.bias is not None:
1149
+ nn.init.constant_(m.bias, 0)
1150
+ elif isinstance(m, nn.LayerNorm):
1151
+ nn.init.constant_(m.bias, 0)
1152
+ nn.init.constant_(m.weight, 1.0)
1153
+ elif isinstance(m, LayerNorm2d):
1154
+ nn.init.constant_(m.bias, 0)
1155
+ nn.init.constant_(m.weight, 1.0)
1156
+ elif isinstance(m, nn.BatchNorm2d):
1157
+ nn.init.ones_(m.weight)
1158
+ nn.init.zeros_(m.bias)
1159
+
1160
+ @torch.jit.ignore
1161
+ def no_weight_decay_keywords(self):
1162
+ return {'rpb'}
1163
+
1164
+ def forward_features(self, x):
1165
+ _, _, H, W = x.shape
1166
+ if H % 32 != 0 or W % 32 != 0:
1167
+ raise ValueError(f"E-RADIO requires input dimensions to be divisible by 32 but got H x W: {H} x {W}")
1168
+ x = self.patch_embed(x)
1169
+ full_features = None
1170
+ for il, level in enumerate(self.levels):
1171
+ x, pre_downsample_x = level(x)
1172
+
1173
+ if self.return_full_features or self.use_neck:
1174
+ full_features = self.high_res_neck(pre_downsample_x, il, full_features)
1175
+
1176
+ # x = self.norm(full_features if (self.return_full_features or self.use_neck) else x)
1177
+ x = self.norm(x) # new version for
1178
+
1179
+ if not self.return_full_features:
1180
+ return x, None
1181
+
1182
+ return x, full_features
1183
+
1184
+ def forward(self, x):
1185
+ x, full_features = self.forward_features(x)
1186
+
1187
+ x = self.avgpool(x)
1188
+ x = torch.flatten(x, 1)
1189
+
1190
+ x = self.head(x)
1191
+ if full_features is not None:
1192
+ return x, full_features
1193
+ return x
1194
+
1195
+ def switch_to_deploy(self):
1196
+ '''
1197
+ A method to perform model self-compression
1198
+ merges BN into conv layers
1199
+ converts MLP relative positional bias into precomputed buffers
1200
+ '''
1201
+ if not self.switched_to_deploy:
1202
+ for level in [self.patch_embed, self.levels, self.head]:
1203
+ for module in level.modules():
1204
+ if hasattr(module, 'switch_to_deploy'):
1205
+ module.switch_to_deploy()
1206
+ self.switched_to_deploy = True
1207
+
1208
+
1209
+ def change_window_size(self, new_window_size):
1210
+ """
1211
+ E-RADIO employs windowed attention, which may be sensitive to the choice of this parameter,
1212
+ especially in cases of uneven partitioning of the feature maps.
1213
+ E-RADIO allows for the adjustment of the window size after training,
1214
+ making it adaptable to different input image resolutions.
1215
+ The recommended values for window size based on input resolution are as follows:
1216
+
1217
+ Input Resolution | Window Size
1218
+ 224 | 7
1219
+ 256 | 8
1220
+ 384 | 12
1221
+ 512 | 16
1222
+ Ideally, the window size should be a factor of the input resolution. In the third stage, we divide the resolution by 16, so the window size should be
1223
+ img_res/16/2
1224
+ for the third stage and img_res/32 for the last stage. While this can be applied in a brute-force manner, a better way is to do model.change_window_size.
1225
+ Manual way to change resolution -> model.change_window_size(resolution)
1226
+ """
1227
+ window_size = new_window_size
1228
+ print(f"Setting window size to {window_size}")
1229
+ for module in self.modules():
1230
+ if hasattr(module, "window_size"):
1231
+ # check if tuple or a number
1232
+ if isinstance(module.window_size, tuple):
1233
+ if module.window_size[0] != window_size:
1234
+ module.window_size = (window_size, window_size)
1235
+ elif isinstance(module.window_size, list):
1236
+ if module.window_size[0] != window_size:
1237
+ module.window_size = [window_size, window_size]
1238
+ else:
1239
+ module.window_size = window_size
1240
+
1241
+
1242
+ def set_optimal_window_size(self, image_dim, max_window_size = 16):
1243
+ """
1244
+ Using hand picked window size for various resolutions.
1245
+
1246
+ E-RADIO employs windowed attention, which may be sensitive to the choice of this parameter,
1247
+ especially in cases of uneven partitioning of the feature maps.
1248
+ E-RADIO allows for the adjustment of the window size after training,
1249
+ making it adaptable to different input image resolutions.
1250
+ The recommended values for window size based on input resolution are as follows:
1251
+
1252
+ Input Resolution | Window Size
1253
+ 224 | 7
1254
+ 256 | 8
1255
+ 384 | 12
1256
+ 512 | 16
1257
+ Ideally, the window size should be a factor of the input resolution. In the third stage, we divide the resolution by 16, so the window size should be
1258
+ img_res/16/2
1259
+ for the third stage and img_res/32 for the last stage. While this can be applied in a brute-force manner, a better way is to do model.change_window_size.
1260
+ Manual way to change resolution -> model.change_window_size(resolution)
1261
+
1262
+ """
1263
+ # import math
1264
+
1265
+ def divisorGenerator(n):
1266
+ large_divisors = []
1267
+ for i in range(1, int(math.sqrt(n) + 1)):
1268
+ if n % i == 0:
1269
+ yield i
1270
+ if i*i != n:
1271
+ large_divisors.append(n / i)
1272
+ for divisor in reversed(large_divisors):
1273
+ yield divisor
1274
+
1275
+ if isinstance(image_dim, list) or isinstance(image_dim, tuple):
1276
+ image_dim = min(image_dim)
1277
+
1278
+ # we do windowed attention in the 3rd stage for the first time, therefore //16,
1279
+ # we do subsampled attention with downsample by 2 so need to get //32 actually
1280
+ # ideally we should rewrite this to be dependent on the structure of the model like what if subsampled is removed etc
1281
+ all_divisors = np.array(list(divisorGenerator(image_dim//32)))
1282
+ new_window_size = int(min(all_divisors[all_divisors <= max_window_size][-1], max_window_size))
1283
+
1284
+ # for image_dim in [128, 224, 256, 384, 512, 768, 1024]:
1285
+ # all_divisors = np.array(list(divisorGenerator(image_dim//32)))
1286
+ # new_window_size = int(min(all_divisors[all_divisors <= max_window_size][-1], max_window_size))
1287
+ # print(f"Setting window size to {new_window_size} for image resolution {image_dim}")
1288
+
1289
+ self.change_window_size(new_window_size = new_window_size)
1290
+
1291
+
1292
+ @register_model
1293
+ def eradio_large_fullres_ws16(pretrained=False, **kwargs):
1294
+ model = ERADIO(
1295
+ depths=[3, 3, 5, 5],
1296
+ num_heads=[2, 4, 8, 16],
1297
+ window_size=[None, None, [16, 16], 16],
1298
+ dim=192,
1299
+ in_dim=64,
1300
+ mlp_ratio=4,
1301
+ drop_path_rate=0.0,
1302
+ sr_ratio=[1, 1, [2, 1], 1],
1303
+ use_swiglu=False,
1304
+ yolo_arch=True,
1305
+ shuffle_down=False,
1306
+ conv_base=True,
1307
+ use_neck=True,
1308
+ full_features_head_dim=1536,
1309
+ neck_start_stage=2,
1310
+ **kwargs,
1311
+ )
1312
+ if pretrained:
1313
+ model.load_state_dict(torch.load(pretrained)["state_dict"])
1314
+ return model
1315
+
1316
+
1317
+ @register_model
1318
+ def eradio_xxxtiny(pretrained=False, **kwargs): # ,
1319
+ model = ERADIO(
1320
+ depths=[1, 3, 4, 5],
1321
+ num_heads=[2, 4, 8, 16],
1322
+ window_size=[None, None, [16, 16], 16],
1323
+ dim=32,
1324
+ in_dim=32,
1325
+ mlp_ratio=4,
1326
+ drop_path_rate=0.0,
1327
+ sr_ratio=[1, 1, [2, 1], 1],
1328
+ use_swiglu=False,
1329
+ yolo_arch=True,
1330
+ shuffle_down=False,
1331
+ conv_base=True,
1332
+ use_neck=True,
1333
+ full_features_head_dim=256,
1334
+ neck_start_stage=2,
1335
+ **kwargs,
1336
+ )
1337
+ if pretrained:
1338
+ model.load_state_dict(torch.load(pretrained))
1339
+ return model
1340
+
1341
+ @register_model
1342
+ def eradio_xxxtiny_8x_ws12(pretrained=False, **kwargs):
1343
+ model = ERADIO(depths=[1, 3, 4, 5],
1344
+ num_heads=[2, 4, 8, 16],
1345
+ window_size=[None, None, [12, 12], 12],
1346
+ dim=32,
1347
+ in_dim=32,
1348
+ mlp_ratio=4,
1349
+ drop_path_rate=0.0,
1350
+ sr_ratio=[1, 1, [2, 1], 1],
1351
+ use_swiglu=False,
1352
+ downsample_shuffle=False,
1353
+ yolo_arch=True,
1354
+ shuffle_down=False,
1355
+ cpb_mlp_hidden=64,
1356
+ use_neck=True,
1357
+ full_features_head_dim=256,
1358
+ neck_start_stage=2,
1359
+ conv_groups_ratio = 1,
1360
+ **kwargs)
1361
+ if pretrained:
1362
+ model.load_state_dict(torch.load(pretrained)["state_dict"])
1363
+ return model
1364
+
1365
+
1366
+ @register_model
1367
+ def eradio_xxxtiny_8x_ws16(pretrained=False, **kwargs):
1368
+ model = ERADIO(depths=[1, 3, 4, 5],
1369
+ num_heads=[2, 4, 8, 16],
1370
+ window_size=[None, None, [16, 16], 16],
1371
+ dim=32,
1372
+ in_dim=32,
1373
+ mlp_ratio=4,
1374
+ drop_path_rate=0.0,
1375
+ sr_ratio=[1, 1, [2, 1], 1],
1376
+ use_swiglu=False,
1377
+ downsample_shuffle=False,
1378
+ yolo_arch=True,
1379
+ shuffle_down=False,
1380
+ cpb_mlp_hidden=64,
1381
+ use_neck=True,
1382
+ full_features_head_dim=256,
1383
+ neck_start_stage=1,
1384
+ conv_groups_ratio = 1,
1385
+ **kwargs)
1386
+ if pretrained:
1387
+ model.load_state_dict(torch.load(pretrained)["state_dict"])
1388
+ return model
1389
+
1390
+ @register_model
1391
+ def eradio(pretrained=False, **kwargs):
1392
+ return eradio_large_fullres_ws16(pretrained=pretrained, **kwargs)
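As a sanity check on the divisor-based rule above, the selection in `set_optimal_window_size` can be reproduced in isolation. The sketch below is illustrative only (the helper name is not part of the repo) and assumes the same stage-3 stride of 32:

```python
# Minimal sketch of the window-size rule used by ERADIO.set_optimal_window_size:
# take the largest divisor of (image_dim // 32) that does not exceed max_window_size.
def pick_window_size(image_dim: int, max_window_size: int = 16) -> int:
    n = image_dim // 32  # stage 3 downsamples by 16 and its attention subsamples by 2
    divisors = [d for d in range(1, n + 1) if n % d == 0]
    return max(d for d in divisors if d <= max_window_size)

for res in (224, 256, 384, 512):
    print(res, pick_window_size(res))  # -> 7, 8, 12, 16 (matches the table in the docstring)
```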
tim/models/nvidia_radio/radio/extra_models.py ADDED
@@ -0,0 +1,206 @@
1
+ from distutils.version import LooseVersion
2
+ from types import MethodType
3
+ from typing import List, Optional, Tuple, Union
4
+ import warnings
5
+
6
+ import torch
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+
10
+ from timm.models.registry import register_model
11
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
12
+
13
+ from .forward_intermediates import forward_intermediates
14
+ from .input_conditioner import InputConditioner
15
+
16
+ _has_torch_sdpa = hasattr(F, 'scaled_dot_product_attention')
17
+
18
+
19
+ class PaliGemmaWrapper(nn.Module):
20
+ def __init__(self, vis_model: nn.Module, embed_dim: int):
21
+ super().__init__()
22
+
23
+ self.vis_model = vis_model
24
+ self.embed_dim = embed_dim
25
+
26
+ @property
27
+ def patch_size(self):
28
+ return self.vis_model.embeddings.patch_size
29
+
30
+ @property
31
+ def blocks(self):
32
+ return self.vis_model.encoder.layers
33
+
34
+ @property
35
+ def embed_dim(self):
36
+ return self.vis_model.embeddings.embed_dim
37
+
38
+ def forward(self, x: torch.Tensor):
39
+ outputs = self.vis_model(
40
+ x,
41
+ return_dict=False,
42
+ interpolate_pos_encoding=True,
43
+ )
44
+
45
+ features = outputs[0].to(torch.float32)
46
+
47
+ summary = features.mean(dim=1)
48
+
49
+ return summary, features
50
+
51
+ def forward_features(self, x: torch.Tensor):
52
+ return self(x)
53
+
54
+
55
+ def _get_paligemma_model(repo: str, embed_dim: int = None, dtype: torch.dtype = torch.bfloat16):
56
+ from transformers import PaliGemmaForConditionalGeneration, __version__ as tx_version
57
+
58
+ if LooseVersion(tx_version) > LooseVersion('4.44.2'):
59
+ warnings.warn(f'Your transformers version "{tx_version}" is newer than 4.44.2; PaliGemma support has not been validated beyond that version and may be broken.')
60
+
61
+ extra_args = dict()
62
+
63
+ if dtype is not None:
64
+ extra_args['torch_dtype'] = dtype
65
+ rev = str(dtype).split('.')[-1]
66
+ extra_args['revision'] = rev
67
+
68
+ model = PaliGemmaForConditionalGeneration.from_pretrained(repo, **extra_args)
69
+
70
+ vis_model = model.vision_tower.vision_model
71
+
72
+ vis_model = PaliGemmaWrapper(vis_model, embed_dim)
73
+
74
+ return vis_model
75
+
76
+ @register_model
77
+ def paligemma_896_student(**kwargs):
78
+ model = _get_paligemma_model('google/paligemma-3b-pt-896', embed_dim=1152, dtype=None)
79
+
80
+ return model
81
+
82
+
83
+ def dv2_sdpa(self, x: torch.Tensor) -> torch.Tensor:
84
+ B, N, C = x.shape
85
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
86
+
87
+ q, k, v = qkv[0], qkv[1], qkv[2]
88
+ x = F.scaled_dot_product_attention(
89
+ q, k, v,
90
+ is_causal=False,
91
+ dropout_p=self.attn_drop.p if self.training else 0.,
92
+ scale=self.scale,
93
+ )
94
+ x = x.transpose(1, 2).reshape(B, N, C)
95
+ x = self.proj(x)
96
+ x = self.proj_drop(x)
97
+ return x
98
+
99
+ def _load_dino_v2(dino_v2_model, cache_dir: Optional[str] = None, pretrained=True, **kwargs):
100
+ if cache_dir:
101
+ torch.hub.set_dir(cache_dir)
102
+ model: nn.Module = torch.hub.load(
103
+ 'facebookresearch/dinov2',
104
+ dino_v2_model,
105
+ pretrained=pretrained,
106
+ # **kwargs,
107
+ )
108
+
109
+ if _has_torch_sdpa:
110
+ for n, m in model.named_modules():
111
+ if n.endswith('.attn'):
112
+ m.forward = MethodType(dv2_sdpa, m)
113
+
114
+ return model
115
+
116
+ class DinoWrapper(nn.Module):
117
+ def __init__(self, dino_model: nn.Module):
118
+ super().__init__()
119
+
120
+ self.inner = dino_model
121
+ dino_model.blocks = nn.Sequential(*dino_model.blocks)
122
+
123
+ @property
124
+ def embed_dim(self):
125
+ return self.inner.embed_dim
126
+
127
+ @property
128
+ def patch_size(self):
129
+ return self.inner.patch_size
130
+
131
+ @property
132
+ def num_cls_tokens(self):
133
+ return getattr(self.inner, 'num_tokens', 1)
134
+
135
+ @property
136
+ def num_registers(self):
137
+ return getattr(self.inner, 'num_register_tokens', 0)
138
+
139
+ @property
140
+ def num_summary_tokens(self):
141
+ return self.num_cls_tokens + self.num_registers
142
+
143
+ @property
144
+ def blocks(self):
145
+ return self.inner.blocks
146
+
147
+ def forward(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
148
+ parts = self.inner.forward_features(*args, **kwargs)
149
+
150
+ cls_token = parts['x_norm_clstoken']
151
+ features = parts['x_norm_patchtokens']
152
+
153
+ return cls_token, features
154
+
155
+ def forward_features(self, x: torch.Tensor):
156
+ x = self.inner.prepare_tokens_with_masks(x)
157
+ x = self.inner.blocks(x)
158
+ x_norm = self.inner.norm(x)
159
+
160
+ return x_norm[:, 0], x_norm[:, self.num_summary_tokens:]
161
+
162
+ def patchify(self, x: torch.Tensor) -> torch.Tensor:
163
+ return self.inner.prepare_tokens_with_masks(x)
164
+
165
+ def forward_intermediates(self,
166
+ x: torch.Tensor,
167
+ norm: bool = False,
168
+ **kwargs,
169
+ ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
170
+ return forward_intermediates(
171
+ self,
172
+ patch_extractor=self.inner.prepare_tokens_with_masks,
173
+ num_summary_tokens=self.num_summary_tokens,
174
+ num_cls_tokens=self.num_cls_tokens,
175
+ norm=self.inner.norm if norm else lambda y: y,
176
+ x=x,
177
+ **kwargs,
178
+ )
179
+
180
+
181
+ def _dino_student(arch: str, **kwargs):
182
+ from . import dinov2_arch
183
+
184
+ factory = getattr(dinov2_arch, arch)
185
+ model = factory()
186
+
187
+ model = DinoWrapper(model)
188
+
189
+ conditioner = InputConditioner(
190
+ input_scale=1.0,
191
+ norm_mean=IMAGENET_DEFAULT_MEAN,
192
+ norm_std=IMAGENET_DEFAULT_STD,
193
+ )
194
+
195
+ model.input_conditioner = conditioner
196
+
197
+ return model
198
+
199
+
200
+ @register_model
201
+ def dino_v2_l_student(**kwargs):
202
+ return _dino_student('dinov2_vitl14_reg', **kwargs)
203
+
204
+ @register_model
205
+ def dino_v2_g_student(**kwargs):
206
+ return _dino_student('dinov2_vitg14_reg', **kwargs)
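A hedged usage sketch for the registrations above: because `@register_model` runs at import time, the module must be imported before `timm.create_model` can resolve the name. The `dinov2_vitl14_reg` factory is assumed to be provided by `dinov2_arch` (it is referenced, not shown, here), and weights are randomly initialized:

```python
import torch
import timm

# Importing the module registers dino_v2_l_student / dino_v2_g_student with timm.
from tim.models.nvidia_radio.radio import extra_models  # noqa: F401

model = timm.create_model("dino_v2_l_student")
model.eval()

x = torch.rand(1, 3, 224, 224)            # [0, 1] images; 224 is divisible by the patch size (14)
x = model.input_conditioner(x)            # ImageNet mean/std standardization attached above
with torch.no_grad():
    summary, features = model(x)          # summary: (B, C), features: (B, N, C)
```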
tim/models/nvidia_radio/radio/extra_timm_models.py ADDED
@@ -0,0 +1,206 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import math
10
+ import warnings
11
+
12
+ import torch
13
+ from torch import nn
14
+ from torch.nn import functional as F
15
+
16
+ from timm.models import register_model
17
+ from timm.models.vision_transformer import (
18
+ VisionTransformer,
19
+ _create_vision_transformer as _timm_create_vision_transformer,
20
+ Mlp,
21
+ Block,
22
+ LayerScale as TIMMLayerScale,
23
+ )
24
+
25
+ # Import these to also register them
26
+ from . import dinov2_arch
27
+
28
+
29
+ @register_model
30
+ def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
31
+ """ ViT-Tiny (Vit-Ti/16)
32
+ """
33
+ model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3)
34
+ model = _create_vision_transformer('vit_tiny_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
35
+ return model
36
+
37
+
38
+ @register_model
39
+ def vit_small_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
40
+ """ ViT-Small (ViT-S/16)
41
+ """
42
+ model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6)
43
+ model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
44
+ return model
45
+
46
+
47
+ @register_model
48
+ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
49
+ """ ViT-Base (ViT-B/14) from original paper (https://arxiv.org/abs/2010.11929).
50
+ ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
51
+ """
52
+ model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12)
53
+ model = _create_vision_transformer('vit_base_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
54
+ return model
55
+
56
+
57
+ @register_model
58
+ def vit_base_patch16_v2_224(pretrained=False, **kwargs) -> VisionTransformer:
59
+ """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
60
+ ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
61
+ """
62
+ model_args = dict(
63
+ patch_size=16, embed_dim=768, depth=12, num_heads=12, init_values=1e-5,
64
+ reg_tokens=4, no_embed_class=True, img_size=518 * 16 // 14
65
+ )
66
+ model = _create_vision_transformer(
67
+ 'vit_base_patch14_reg4_dinov2', pretrained=pretrained, **dict(model_args, **kwargs))
68
+ return model
69
+
70
+
71
+ @register_model
72
+ def vit_large_patch16_v2_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
73
+ """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
74
+ ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
75
+ """
76
+ name = 'vit_large_patch14_reg4_dinov2'
77
+ model_args = dict(
78
+ patch_size=16, embed_dim=1024, depth=24, num_heads=16, init_values=1e-5,
79
+ reg_tokens=4, no_embed_class=True, img_size=518 * 16 // 14
80
+ )
81
+ model = _create_vision_transformer(name, pretrained=pretrained, **dict(model_args, **kwargs))
82
+
83
+ return model
84
+
85
+ @register_model
86
+ def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
87
+ """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
88
+ """
89
+ model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16)
90
+ if pretrained:
91
+ # There is no pretrained version of ViT-H/16, but we can adapt a ViT-H/14 for this purpose
92
+ model = _create_vision_transformer('vit_huge_patch14_224', pretrained=True, **dict(model_args, **kwargs))
93
+ else:
94
+ model = _create_vision_transformer('vit_huge_patch16_224', pretrained=False, **dict(model_args, **kwargs))
95
+ return model
96
+
97
+
98
+ @register_model
99
+ def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransformer:
100
+ """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
101
+ """
102
+ model = vit_huge_patch16_224(pretrained=pretrained, **kwargs)
103
+
104
+ for m in model.modules():
105
+ if isinstance(m, Mlp) and not isinstance(m.norm, nn.LayerNorm):
106
+ m.norm = nn.LayerNorm(m.fc1.out_features)
107
+
108
+ return model
109
+
110
+
111
+ @register_model
112
+ def vit_giant_patch16_224(pretrained=False, scaled_ln: bool = False, **kwargs) -> VisionTransformer:
113
+ """ ViT-giant model (ViT-g/16) from original paper (https://arxiv.org/abs/2010.11929).
114
+ """
115
+ model_args = dict(patch_size=16, embed_dim=1536, depth=40, num_heads=24)
116
+ model = _create_vision_transformer('vit_giant_patch16_224', pretrained=False, **dict(model_args, **kwargs))
117
+ if scaled_ln:
118
+ _apply_scaled_ln(model)
119
+ return model
120
+
121
+
122
+ @register_model
123
+ def vit_bigG_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
124
+ model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6)
125
+ model = _create_vision_transformer('vit_bigG_patch14', pretrained=False, **dict(model_args, **kwargs))
126
+ return model
127
+
128
+
129
+ def _create_vision_transformer(*args, **kwargs):
130
+ model = _timm_create_vision_transformer(*args, **kwargs)
131
+ _patch_layer_scale(model)
132
+ return model
133
+
134
+
135
+ def _patch_layer_scale(model: VisionTransformer):
136
+ def replace_ls(old_ls: TIMMLayerScale):
137
+ new_ls = dinov2_arch.LayerScale(old_ls.gamma.shape[0], inplace=old_ls.inplace)
138
+ new_ls.load_state_dict(old_ls.state_dict())
139
+ return new_ls
140
+
141
+ # Monkey patch: Replace TIMM's LayerScale with our modified DINOv2 one, that uses a param name
142
+ # other than gamma, so that HFHub doesn't mess with it!
143
+ for mod in model.modules():
144
+ if isinstance(mod, Block):
145
+ if isinstance(mod.ls1, TIMMLayerScale):
146
+ mod.ls1 = replace_ls(mod.ls1)
147
+ if isinstance(mod.ls2, TIMMLayerScale):
148
+ mod.ls2 = replace_ls(mod.ls2)
149
+ pass
150
+
151
+
152
+ class ScaledLayerNorm(nn.LayerNorm):
153
+ '''
154
+ https://arxiv.org/pdf/2502.05795v1
155
+ '''
156
+ def __init__(self, ln_base: nn.LayerNorm, depth: int = 0):
157
+ super().__init__(ln_base.normalized_shape, eps=ln_base.eps, elementwise_affine=ln_base.elementwise_affine)
158
+ self.load_state_dict(ln_base.state_dict())
159
+ self.register_buffer('ln_scale', torch.tensor(1.0 / math.sqrt(depth)), persistent=False)
160
+
161
+ def forward(self, x):
162
+ y = super().forward(x)
163
+ y = y * self.ln_scale
164
+ return y
165
+
166
+
167
+ class DyT(nn.Module):
168
+ def __init__(self, C: int, init_alpha: float):
169
+ super().__init__()
170
+ self.alpha = nn.Parameter(torch.full((1,), init_alpha))
171
+ self.gamma = nn.Parameter(torch.ones(C))
172
+ self.beta = nn.Parameter(torch.zeros(C))
173
+
174
+ def forward(self, x: torch.Tensor):
175
+ x = F.tanh(self.alpha * x)
176
+ return self.gamma * x + self.beta
177
+
178
+ @register_model
179
+ def vit_large_dyt_patch16_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
180
+ """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
181
+ ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
182
+ """
183
+ model_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16)
184
+ model = _create_vision_transformer('vit_large_dyt_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
185
+
186
+ def _replace_ln_with_dyt(ln: nn.LayerNorm, depth: int):
187
+ return DyT(ln.normalized_shape[0], init_alpha=0.9)
188
+ _replace_ln(model, _replace_ln_with_dyt)
189
+
190
+ return model
191
+
192
+
193
+ def _apply_scaled_ln(model: VisionTransformer):
194
+ warnings.warn('Post-LayerNorm scaling activated!')
195
+
196
+ _replace_ln(model, lambda ln, depth: ScaledLayerNorm(ln, depth=depth))
197
+
198
+ def _replace_ln(model: VisionTransformer, fn):
199
+ def _inner_replace_ln(block: Block, depth: int, key: str):
200
+ prev = getattr(block, key)
201
+ if isinstance(prev, nn.LayerNorm):
202
+ setattr(block, key, fn(prev, depth=depth))
203
+
204
+ for i, block in enumerate(model.blocks):
205
+ _inner_replace_ln(block, i + 1, 'norm1')
206
+ _inner_replace_ln(block, i + 1, 'norm2')
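For clarity on the `DyT` module defined above: it is a tanh-based, element-wise substitute for LayerNorm, so it preserves input shape. A small illustrative check (assuming the repo is importable as the `tim` package):

```python
import torch
from tim.models.nvidia_radio.radio.extra_timm_models import DyT

# y = gamma * tanh(alpha * x) + beta, applied element-wise over the channel dimension.
dyt = DyT(C=768, init_alpha=0.9)
x = torch.randn(2, 196, 768)
y = dyt(x)
assert y.shape == x.shape
```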
tim/models/nvidia_radio/radio/feature_normalizer.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+ from collections import namedtuple
9
+ from typing import NamedTuple, Optional, Tuple
10
+ import torch
11
+ from torch import nn
12
+
13
+
14
+ def _run_kernel(x: torch.Tensor, mean: torch.Tensor, tx: torch.Tensor):
15
+ if x.ndim <= 3:
16
+ x = x - mean
17
+ x = x @ tx.T
18
+ elif x.ndim == 4:
19
+ x = x - mean.reshape(1, -1, 1, 1)
20
+ kernel = tx.reshape(*tx.shape, 1, 1)
21
+ x = torch.nn.functional.conv2d(x, weight=kernel, bias=None, stride=1, padding=0)
22
+ else:
23
+ raise ValueError(f'Unsupported input dimension: {x.ndim}, shape: {x.shape}')
24
+ return x
25
+
26
+
27
+ class FeatureNormalizer(nn.Module):
28
+ def __init__(self, embed_dim: int, dtype: torch.dtype = torch.float32):
29
+ super().__init__()
30
+
31
+ self.register_buffer('mean', torch.zeros(embed_dim, dtype=dtype))
32
+ self.register_buffer('tx', torch.eye(embed_dim, dtype=dtype))
33
+
34
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
35
+ x = _run_kernel(x, self.mean, self.tx)
36
+ return x
37
+
38
+
39
+ class InterFeatState(NamedTuple):
40
+ y: torch.Tensor
41
+ alpha: torch.Tensor
42
+
43
+
44
+ class IntermediateFeatureNormalizerBase(nn.Module):
45
+ def forward(self, x: torch.Tensor, index: int, rot_index: int = None, skip: Optional[int] = None) -> InterFeatState:
46
+ raise NotImplementedError()
47
+
48
+
49
+ class IntermediateFeatureNormalizer(IntermediateFeatureNormalizerBase):
50
+ def __init__(self, num_intermediates: int, embed_dim: int, rot_per_layer: bool = False, dtype: torch.dtype = torch.float32):
51
+ super().__init__()
52
+ self.register_buffer('alphas', torch.ones(num_intermediates, dtype=dtype))
53
+
54
+ rot = torch.eye(embed_dim, dtype=dtype)
55
+ if rot_per_layer:
56
+ rot = rot.unsqueeze(0).repeat(num_intermediates, 1, 1)
57
+
58
+ self.register_buffer('rotation', rot.contiguous())
59
+ self.register_buffer('means', torch.zeros(num_intermediates, embed_dim, dtype=dtype))
60
+
61
+ def forward(self, x: torch.Tensor, index: int, rot_index: int = None, skip: Optional[int] = None) -> InterFeatState:
62
+ if rot_index is None:
63
+ rot_index = index
64
+
65
+ if skip:
66
+ assert x.ndim == 3, f'Cannot use the `skip` parameter when the `x` tensor isn\'t 3-dimensional.'
67
+ prefix, x = x[:, :skip], x[:, skip:]
68
+
69
+ rotation = self._get_rotation(rot_index)
70
+ y = _run_kernel(x, self.means[index], rotation)
71
+
72
+ alpha = self.alphas[index]
73
+ if skip:
74
+ alpha = torch.cat([
75
+ torch.ones(skip, dtype=alpha.dtype, device=alpha.device),
76
+ alpha[None].expand(y.shape[1]),
77
+ ]).reshape(1, -1, 1)
78
+ y = torch.cat([prefix, y], dim=1)
79
+ else:
80
+ if x.ndim == 3:
81
+ alpha = alpha.reshape(1, 1, 1).expand(1, y.shape[1], 1)
82
+ elif x.ndim == 4:
83
+ alpha = alpha.reshape(1, 1, 1, 1).expand(1, 1, *y.shape[2:])
84
+ else:
85
+ raise ValueError(f'Unsupported input dimension: {x.ndim}')
86
+
87
+ return InterFeatState(y, alpha)
88
+
89
+ def _get_rotation(self, rot_index: int) -> torch.Tensor:
90
+ if self.rotation.ndim == 2:
91
+ return self.rotation
92
+ return self.rotation[rot_index]
93
+
94
+
95
+ class NullIntermediateFeatureNormalizer(IntermediateFeatureNormalizerBase):
96
+ instances = dict()
97
+
98
+ def __init__(self, dtype: torch.dtype, device: torch.device):
99
+ super().__init__()
100
+ self.register_buffer('alpha', torch.tensor(1, dtype=dtype, device=device))
101
+
102
+ @staticmethod
103
+ def get_instance(dtype: torch.dtype, device: torch.device):
104
+ instance = NullIntermediateFeatureNormalizer.instances.get((dtype, device), None)
105
+ if instance is None:
106
+ instance = NullIntermediateFeatureNormalizer(dtype, device)
107
+ NullIntermediateFeatureNormalizer.instances[(dtype, device)] = instance
108
+ return instance
109
+
110
+ def forward(self, x: torch.Tensor, index: int, rot_index: int = None, skip: Optional[int] = None) -> InterFeatState:
111
+ return InterFeatState(x, self.alpha)
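Note that `FeatureNormalizer` is constructed with a zero mean and an identity transform, so it is a no-op until checkpoint statistics are loaded; it accepts both NLC and NCHW inputs. A short illustrative check (assuming the repo is importable as the `tim` package):

```python
import torch
from tim.models.nvidia_radio.radio.feature_normalizer import FeatureNormalizer

norm = FeatureNormalizer(embed_dim=64)
tokens = torch.randn(2, 196, 64)        # NLC: (B, N, C) goes through the matmul path
fmap = torch.randn(2, 64, 14, 14)       # NCHW: (B, C, H, W) goes through the 1x1 conv path
assert torch.allclose(norm(tokens), tokens, atol=1e-5)
assert torch.allclose(norm(fmap), fmap, atol=1e-5)
```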
tim/models/nvidia_radio/radio/forward_intermediates.py ADDED
@@ -0,0 +1,138 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ from typing import Callable, Dict, List, Optional, Set, Tuple, Union, Any, Iterable
10
+ from types import MethodType
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from .feature_normalizer import IntermediateFeatureNormalizerBase, NullIntermediateFeatureNormalizer
16
+
17
+
18
+ def _take_indices(
19
+ num_blocks: int,
20
+ n: Optional[Union[int, List[int], Tuple[int]]],
21
+ ) -> Tuple[Set[int], int]:
22
+ if isinstance(n, int):
23
+ assert n >= 0
24
+ take_indices = {x for x in range(num_blocks - n, num_blocks)}
25
+ else:
26
+ take_indices = {num_blocks + idx if idx < 0 else idx for idx in n}
27
+ return take_indices, max(take_indices)
28
+
29
+
30
+ def forward_intermediates(
31
+ model: nn.Module,
32
+ patch_extractor: Callable[[torch.Tensor], torch.Tensor],
33
+ norm: nn.Module,
34
+ num_summary_tokens: int,
35
+ num_cls_tokens: int,
36
+ x: torch.Tensor,
37
+ indices: Optional[Union[int, List[int], Tuple[int]]] = None,
38
+ return_prefix_tokens: bool = False,
39
+ stop_early: bool = False,
40
+ output_fmt: str = 'NCHW',
41
+ intermediates_only: bool = False,
42
+ aggregation: Optional[str] = "sparse",
43
+ inter_feature_normalizer: Optional[IntermediateFeatureNormalizerBase] = None,
44
+ norm_alpha_scheme = "post-alpha",
45
+ block_kwargs: Dict = None,
46
+ ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
47
+ """ Forward features that returns intermediates.
48
+
49
+ The Dense layer aggregation method is inspired from the paper: "Dense Connector for MLLMs"
50
+ by Yao, Huanjin et al. (2024), arXiv preprint arXiv:2405.13800.
51
+
52
+ Args:
53
+ x: Input image tensor
54
+ indices: Take last n blocks if int, select matching indices if sequence
55
+ return_prefix_tokens: Return both prefix and spatial intermediate tokens
56
+ norm: Apply norm layer to all intermediates
57
+ stop_early: Stop iterating over blocks when last desired intermediate hit
58
+ output_fmt: Shape of intermediate feature outputs
59
+ intermediates_only: Only return intermediate features
60
+ aggregation: intermediate layer aggregation method (sparse or dense)
61
+ norm_alpha_scheme: apply alpha before ("pre-alpha") or after accumulation ("post-alpha")
62
+ Returns:
63
+ """
64
+ assert output_fmt in ('NCHW', 'NLC'), 'Output format must be one of NCHW or NLC.'
65
+ assert aggregation in ('sparse', 'dense'), 'Aggregation must be one of sparse or dense.'
66
+ reshape = output_fmt == 'NCHW'
67
+ intermediates = []
68
+
69
+ block_kwargs = block_kwargs or dict()
70
+
71
+ blocks = model.blocks
72
+
73
+ take_indices, max_index = _take_indices(len(blocks), indices)
74
+ take_indices = sorted(take_indices)
75
+ # forward pass
76
+ B, _, height, width = x.shape
77
+
78
+ x = patch_extractor(x)
79
+
80
+ if stop_early:
81
+ blocks = blocks[:max_index + 1]
82
+
83
+ if inter_feature_normalizer is None or norm_alpha_scheme == 'none':
84
+ inter_feature_normalizer = NullIntermediateFeatureNormalizer.get_instance(x.dtype, x.device)
85
+
86
+ assert norm_alpha_scheme in ('none', 'pre-alpha', 'post-alpha'), f'Unsupported alpha scheme: {norm_alpha_scheme}'
87
+ post_alpha_scheme = norm_alpha_scheme == 'post-alpha'
88
+
89
+ accumulator = 0
90
+ alpha_sum = 0
91
+ num_accumulated = 0
92
+
93
+ take_off = 0
94
+
95
+ for i, blk in enumerate(blocks):
96
+ x = blk(x, **block_kwargs)
97
+ if aggregation == "dense":
98
+ # Arbitrarily use the rotation matrix from the final layer in the dense group
99
+ y, alpha = inter_feature_normalizer(x, i, rot_index=take_indices[take_off], skip=num_summary_tokens)
100
+ if post_alpha_scheme:
101
+ accumulator = accumulator + y
102
+ alpha_sum = alpha_sum + alpha
103
+ else:
104
+ accumulator = accumulator + (alpha * y)
105
+ alpha_sum += 1
106
+ num_accumulated += 1
107
+ if i == take_indices[take_off]:
108
+ if aggregation == "dense":
109
+ alpha = alpha_sum / num_accumulated
110
+ x_ = alpha * accumulator / num_accumulated
111
+ num_accumulated = 0
112
+ accumulator = 0
113
+ alpha_sum = 0
114
+ else:
115
+ y, alpha = inter_feature_normalizer(x, i, skip=num_summary_tokens)
116
+ x_ = alpha * y
117
+ # normalize intermediates with final norm layer if enabled
118
+ intermediates.append(norm(x_))
119
+ take_off = min(take_off + 1, len(take_indices) - 1)
120
+
121
+ # process intermediates
122
+
123
+ # split prefix (e.g. class, distill) and spatial feature tokens
124
+ prefix_tokens = [y[:, :num_cls_tokens] for y in intermediates]
125
+ intermediates = [y[:, num_summary_tokens:] for y in intermediates]
126
+
127
+ if reshape:
128
+ # reshape to BCHW output format
129
+ H = height // model.patch_size
130
+ W = width // model.patch_size
131
+ intermediates = [y.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() for y in intermediates]
132
+ if not torch.jit.is_scripting() and return_prefix_tokens:
133
+ # return_prefix not support in torchscript due to poor type handling
134
+ intermediates = list(zip(prefix_tokens, intermediates))
135
+ if intermediates_only:
136
+ return intermediates
137
+ x = norm(x)
138
+ return x, intermediates
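The `indices` argument accepts either an integer (take the last n blocks) or an explicit, possibly negative, list of block indices; `_take_indices` resolves both forms. A quick illustration (assuming the repo is importable as the `tim` package):

```python
from tim.models.nvidia_radio.radio.forward_intermediates import _take_indices

print(_take_indices(12, 4))        # ({8, 9, 10, 11}, 11): the last four of twelve blocks
print(_take_indices(12, [2, -1]))  # ({2, 11}, 11): explicit indices, negatives wrap around
```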
tim/models/nvidia_radio/radio/hf_model.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from collections import namedtuple
15
+ from typing import Callable, Dict, Optional, List, Union
16
+
17
+ from timm.models import VisionTransformer
18
+ import torch
19
+ from torch import nn
20
+ from transformers import PretrainedConfig, PreTrainedModel
21
+
22
+
23
+ from .common import RESOURCE_MAP, DEFAULT_VERSION
24
+
25
+ # Import all required modules.
26
+ from .adaptor_base import AdaptorBase, RadioOutput, AdaptorInput
27
+ from .adaptor_generic import GenericAdaptor, AdaptorBase
28
+ from .adaptor_mlp import create_mlp_from_config
29
+ from .adaptor_registry import adaptor_registry
30
+ from .cls_token import ClsToken
31
+ from .dinov2_arch import dinov2_vitg14_reg
32
+ from .enable_cpe_support import enable_cpe
33
+ from .enable_spectral_reparam import configure_spectral_reparam_from_args
34
+ from .eradio_model import eradio
35
+ from .feature_normalizer import FeatureNormalizer, IntermediateFeatureNormalizer
36
+ from .forward_intermediates import forward_intermediates
37
+ from .radio_model import create_model_from_args
38
+ from .radio_model import RADIOModel as RADIOModelBase, Resolution
39
+ from .input_conditioner import get_default_conditioner, InputConditioner
40
+ from .open_clip_adaptor import OpenCLIP_RADIO
41
+ from .vit_patch_generator import ViTPatchGenerator
42
+ from .vitdet import apply_vitdet_arch, VitDetArgs
43
+
44
+ # Register extra models
45
+ from .extra_timm_models import *
46
+ from .extra_models import *
47
+
48
+
49
+ class RADIOConfig(PretrainedConfig):
50
+ """Pretrained Hugging Face configuration for RADIO models."""
51
+
52
+ def __init__(
53
+ self,
54
+ args: Optional[dict] = None,
55
+ version: Optional[str] = DEFAULT_VERSION,
56
+ patch_size: Optional[int] = None,
57
+ max_resolution: Optional[int] = None,
58
+ preferred_resolution: Optional[Resolution] = None,
59
+ adaptor_names: Union[str, List[str]] = None,
60
+ adaptor_configs: Dict[str, Dict[str, int]] = None,
61
+ vitdet_window_size: Optional[int] = None,
62
+ feature_normalizer_config: Optional[dict] = None,
63
+ inter_feature_normalizer_config: Optional[dict] = None,
64
+ **kwargs,
65
+ ):
66
+ self.args = args
67
+ for field in ["dtype", "amp_dtype"]:
68
+ if self.args is not None and field in self.args:
69
+ # Convert to a string in order to make it serializable.
70
+ # For example for torch.float32 we will store "float32",
71
+ # for "bfloat16" we will store "bfloat16".
72
+ self.args[field] = str(args[field]).split(".")[-1]
73
+ self.version = version
74
+ resource = RESOURCE_MAP[version]
75
+ self.patch_size = patch_size or resource.patch_size
76
+ self.max_resolution = max_resolution or resource.max_resolution
77
+ self.preferred_resolution = (
78
+ preferred_resolution or resource.preferred_resolution
79
+ )
80
+ self.adaptor_names = adaptor_names
81
+ self.adaptor_configs = adaptor_configs
82
+ self.vitdet_window_size = vitdet_window_size
83
+ self.feature_normalizer_config = feature_normalizer_config
84
+ self.inter_feature_normalizer_config = inter_feature_normalizer_config
85
+ super().__init__(**kwargs)
86
+
87
+
88
+
89
+ class RADIOModel(PreTrainedModel):
90
+ """Pretrained Hugging Face model for RADIO.
91
+
92
+ This class inherits from PreTrainedModel, which provides
93
+ HuggingFace's functionality for loading and saving models.
94
+ """
95
+
96
+ config_class = RADIOConfig
97
+
98
+ def __init__(self, config: RADIOConfig):
99
+ super().__init__(config)
100
+
101
+ RADIOArgs = namedtuple("RADIOArgs", config.args.keys())
102
+ args = RADIOArgs(**config.args)
103
+ self.config = config
104
+
105
+ model = create_model_from_args(args)
106
+ input_conditioner: InputConditioner = get_default_conditioner()
107
+
108
+ dtype = getattr(args, "dtype", torch.float32)
109
+ if isinstance(dtype, str):
110
+ # Convert the dtype's string representation back to a dtype.
111
+ dtype = getattr(torch, dtype)
112
+ model.to(dtype=dtype)
113
+ input_conditioner.dtype = dtype
114
+
115
+ summary_idxs = torch.tensor(
116
+ [i for i, t in enumerate(args.teachers) if t.get("use_summary", True)],
117
+ dtype=torch.int64,
118
+ )
119
+
120
+ adaptor_configs = config.adaptor_configs
121
+ adaptor_names = config.adaptor_names or []
122
+
123
+ adaptors = dict()
124
+ for adaptor_name in adaptor_names:
125
+ mlp_config = adaptor_configs[adaptor_name]
126
+ adaptor = GenericAdaptor(args, None, None, mlp_config)
127
+ adaptor.head_idx = mlp_config["head_idx"]
128
+ adaptors[adaptor_name] = adaptor
129
+
130
+ feature_normalizer = None
131
+ if config.feature_normalizer_config is not None:
132
+ # Actual normalization values will be restored when loading checkpoint weights.
133
+ feature_normalizer = FeatureNormalizer(config.feature_normalizer_config["embed_dim"])
134
+
135
+ inter_feature_normalizer = None
136
+ if config.inter_feature_normalizer_config is not None:
137
+ inter_feature_normalizer = IntermediateFeatureNormalizer(
138
+ config.inter_feature_normalizer_config["num_intermediates"],
139
+ config.inter_feature_normalizer_config["embed_dim"],
140
+ rot_per_layer=config.inter_feature_normalizer_config["rot_per_layer"],
141
+ dtype=dtype)
142
+
143
+ self.radio_model = RADIOModelBase(
144
+ model,
145
+ input_conditioner,
146
+ summary_idxs=summary_idxs,
147
+ patch_size=config.patch_size,
148
+ max_resolution=config.max_resolution,
149
+ window_size=config.vitdet_window_size,
150
+ preferred_resolution=config.preferred_resolution,
151
+ adaptors=adaptors,
152
+ feature_normalizer=feature_normalizer,
153
+ inter_feature_normalizer=inter_feature_normalizer,
154
+ )
155
+
156
+ @property
157
+ def adaptors(self) -> nn.ModuleDict:
158
+ return self.radio_model.adaptors
159
+
160
+ @property
161
+ def model(self) -> VisionTransformer:
162
+ return self.radio_model.model
163
+
164
+ @property
165
+ def input_conditioner(self) -> InputConditioner:
166
+ return self.radio_model.input_conditioner
167
+
168
+ @property
169
+ def num_summary_tokens(self) -> int:
170
+ return self.radio_model.num_summary_tokens
171
+
172
+ @property
173
+ def patch_size(self) -> int:
174
+ return self.radio_model.patch_size
175
+
176
+ @property
177
+ def max_resolution(self) -> int:
178
+ return self.radio_model.max_resolution
179
+
180
+ @property
181
+ def preferred_resolution(self) -> Resolution:
182
+ return self.radio_model.preferred_resolution
183
+
184
+ @property
185
+ def window_size(self) -> int:
186
+ return self.radio_model.window_size
187
+
188
+ @property
189
+ def min_resolution_step(self) -> int:
190
+ return self.radio_model.min_resolution_step
191
+
192
+ def make_preprocessor_external(self) -> Callable[[torch.Tensor], torch.Tensor]:
193
+ return self.radio_model.make_preprocessor_external()
194
+
195
+ def get_nearest_supported_resolution(self, height: int, width: int) -> Resolution:
196
+ return self.radio_model.get_nearest_supported_resolution(height, width)
197
+
198
+ def switch_to_deploy(self):
199
+ return self.radio_model.switch_to_deploy()
200
+
201
+ def forward(self, x: torch.Tensor):
202
+ return self.radio_model.forward(x)
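For context, this config/model pair is what Hugging Face `trust_remote_code` loading resolves to. The repo id below is an assumption; substitute whichever RADIO checkpoint you actually use, and note the input resolution must be a multiple of `model.min_resolution_step`:

```python
import torch
from transformers import AutoModel

# Hypothetical checkpoint id; any repo shipping this RADIOConfig/RADIOModel code loads the same way.
model = AutoModel.from_pretrained("nvidia/RADIO", trust_remote_code=True)
model.eval()

x = torch.rand(1, 3, 512, 512)   # dynamic range [0, 1]
with torch.no_grad():
    out = model(x)               # RadioOutput(summary, features), or a dict keyed by adaptor name
```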
tim/models/nvidia_radio/radio/input_conditioner.py ADDED
@@ -0,0 +1,49 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ from typing import Union, Tuple
10
+
11
+ import torch
12
+ from torch import nn
13
+
14
+
15
+ norm_t = Union[Tuple[float, float, float], torch.Tensor]
16
+
17
+ class InputConditioner(nn.Module):
18
+ def __init__(self,
19
+ input_scale: float,
20
+ norm_mean: norm_t,
21
+ norm_std: norm_t,
22
+ dtype: torch.dtype = None,
23
+ ):
24
+ super().__init__()
25
+
26
+ self.dtype = dtype
27
+
28
+ self.register_buffer("norm_mean", _to_tensor(norm_mean) / input_scale)
29
+ self.register_buffer("norm_std", _to_tensor(norm_std) / input_scale)
30
+
31
+ def forward(self, x: torch.Tensor):
32
+ y = (x - self.norm_mean) / self.norm_std
33
+ if self.dtype is not None:
34
+ y = y.to(self.dtype)
35
+ return y
36
+
37
+
38
+ def get_default_conditioner():
39
+ from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
40
+
41
+ return InputConditioner(
42
+ input_scale=1.0,
43
+ norm_mean=OPENAI_CLIP_MEAN,
44
+ norm_std=OPENAI_CLIP_STD,
45
+ )
46
+
47
+
48
+ def _to_tensor(v: norm_t):
49
+ return torch.as_tensor(v, dtype=torch.float32).view(-1, 1, 1)
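The default conditioner simply standardizes `[0, 1]` images with the OpenAI CLIP mean/std, broadcast over the channel dimension. A minimal illustration (assuming the repo is importable as the `tim` package):

```python
import torch
from tim.models.nvidia_radio.radio.input_conditioner import get_default_conditioner

cond = get_default_conditioner()
x = torch.rand(2, 3, 224, 224)    # values in [0, 1]
y = cond(x)                       # roughly zero-mean, unit-variance per channel
print(y.mean().item(), y.std().item())
```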
tim/models/nvidia_radio/radio/open_clip_adaptor.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+ from argparse import Namespace
9
+
10
+ import torch
11
+ from torch import nn
12
+ import torch.nn.functional as F
13
+
14
+ from .adaptor_registry import adaptor_registry, dict_t, state_t
15
+
16
+ from .adaptor_generic import GenericAdaptor
17
+
18
+
19
+ class OpenCLIP_RADIO(GenericAdaptor):
20
+ def __init__(self, main_config: Namespace, adaptor_config: dict_t, state: state_t):
21
+ super().__init__(main_config, adaptor_config, state)
22
+
23
+ import open_clip
24
+
25
+ self.oc_model = open_clip.create_model_from_pretrained(
26
+ model_name=adaptor_config['model'],
27
+ pretrained=adaptor_config['pretrained'],
28
+ return_transform=False,
29
+ )
30
+ # Unload these parameters
31
+ self.oc_model.visual = None
32
+
33
+ self.tokenizer = open_clip.get_tokenizer(model_name=adaptor_config['model'])
34
+
35
+ def encode_text(self, text, normalize: bool = False):
36
+ return self.oc_model.encode_text(text, normalize=normalize)
37
+
38
+
39
+ @adaptor_registry.register_adaptor("open_clip")
40
+ def create_open_clip_adaptor(main_config: Namespace, adaptor_config: dict_t, state: state_t):
41
+ return OpenCLIP_RADIO(main_config, adaptor_config, state)
tim/models/nvidia_radio/radio/radio_model.py ADDED
@@ -0,0 +1,375 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+ from typing import Callable, Dict, Iterable, List, NamedTuple, Optional, Tuple, Union
9
+
10
+ import torch
11
+ from torch import nn
12
+
13
+ from timm.models import create_model, VisionTransformer
14
+ from types import MethodType
15
+
16
+ from .enable_cpe_support import enable_cpe
17
+ from .input_conditioner import InputConditioner
18
+ from .adaptor_base import AdaptorBase, RadioOutput, AdaptorInput
19
+ from . import eradio_model
20
+ from .enable_spectral_reparam import configure_spectral_reparam_from_args
21
+ from .feature_normalizer import FeatureNormalizer, IntermediateFeatureNormalizer
22
+ from . import dual_hybrid_vit
23
+
24
+
25
+ class Resolution(NamedTuple):
26
+ height: int
27
+ width: int
28
+
29
+
30
+ class RADIOModel(nn.Module):
31
+ def __init__(
32
+ self,
33
+ model: nn.Module,
34
+ input_conditioner: InputConditioner,
35
+ patch_size: int,
36
+ max_resolution: int,
37
+ preferred_resolution: Resolution,
38
+ summary_idxs: Optional[torch.Tensor] = None,
39
+ window_size: int = None,
40
+ adaptors: Dict[str, AdaptorBase] = None,
41
+ feature_normalizer: Optional[FeatureNormalizer] = None,
42
+ inter_feature_normalizer: Optional[IntermediateFeatureNormalizer] = None,
43
+ ):
44
+ super().__init__()
45
+
46
+ self.model = model
47
+ self.input_conditioner = input_conditioner
48
+ if summary_idxs is not None:
49
+ self.register_buffer('summary_idxs', summary_idxs)
50
+ else:
51
+ self.summary_idxs = None
52
+
53
+ self._preferred_resolution = preferred_resolution
54
+ self._patch_size = patch_size
55
+ self._max_resolution = max_resolution
56
+ self._window_size = window_size
57
+
58
+ adaptors = adaptors or dict()
59
+ self.adaptors = nn.ModuleDict(adaptors)
60
+
61
+ if feature_normalizer is None:
62
+ feature_normalizer = nn.Identity()
63
+ self.feature_normalizer = feature_normalizer
64
+ self.inter_feature_normalizer = inter_feature_normalizer
65
+
66
+ @property
67
+ def num_summary_tokens(self) -> int:
68
+ if hasattr(self.model, 'num_summary_tokens'):
69
+ return self.model.num_summary_tokens
70
+
71
+ patch_gen = getattr(self.model, "patch_generator", None)
72
+ if patch_gen is not None:
73
+ return patch_gen.num_skip
74
+ elif getattr(self.model, 'global_pool', None) == 'avg':
75
+ return 0
76
+ return 1
77
+
78
+ @property
79
+ def num_cls_tokens(self) -> int:
80
+ if hasattr(self.model, 'num_cls_tokens'):
81
+ return self.model.num_cls_tokens
82
+
83
+ patch_gen = getattr(self.model, 'patch_generator', None)
84
+ if patch_gen is not None:
85
+ return patch_gen.num_cls_tokens
86
+ elif getattr(self.model, 'global_pool', None) == 'avg':
87
+ return 0
88
+ return 1
89
+
90
+ @property
91
+ def patch_size(self) -> int:
92
+ if self._patch_size is not None:
93
+ return self._patch_size
94
+ if hasattr(self.model, "patch_size"):
95
+ return self.model.patch_size
96
+ patch_gen = getattr(self.model, "patch_generator", None)
97
+ if patch_gen is not None:
98
+ return patch_gen.patch_size
99
+ return None
100
+
101
+ @property
102
+ def max_resolution(self) -> int:
103
+ return self._max_resolution
104
+
105
+ @property
106
+ def preferred_resolution(self) -> Resolution:
107
+ return self._preferred_resolution
108
+
109
+ @property
110
+ def window_size(self) -> int:
111
+ return self._window_size
112
+
113
+ @property
114
+ def min_resolution_step(self) -> int:
115
+ res = self.patch_size
116
+ if self.window_size is not None:
117
+ res *= self.window_size
118
+ return res
119
+
120
+ @property
121
+ def blocks(self) -> Iterable[nn.Module]:
122
+ blocks = getattr(self.model, 'blocks', None)
123
+ if blocks is not None:
124
+ return blocks
125
+ return None
126
+
127
+ @property
128
+ def embed_dim(self) -> int:
129
+ return self.model.embed_dim
130
+
131
+ def make_preprocessor_external(self) -> Callable[[torch.Tensor], torch.Tensor]:
132
+ ret = self.input_conditioner
133
+ self.input_conditioner = nn.Identity()
134
+ return ret
135
+
136
+ def get_nearest_supported_resolution(self, height: int, width: int) -> Resolution:
137
+ height = int(round(height / self.min_resolution_step) * self.min_resolution_step)
138
+ width = int(round(width / self.min_resolution_step) * self.min_resolution_step)
139
+
140
+ height = max(height, self.min_resolution_step)
141
+ width = max(width, self.min_resolution_step)
142
+
143
+ return Resolution(height=height, width=width)
144
+
145
+ def switch_to_deploy(self):
146
+ fn = getattr(self.model, 'switch_to_deploy', None)
147
+ if fn is not None:
148
+ fn()
149
+
150
+ def forward(self, x: torch.Tensor, feature_fmt: str = 'NLC') -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
151
+ '''
152
+ Forward process for model.
153
+ Args:
154
+ x: Input tensor. Its dynamic range is expected to be `[0, 1]` unless `make_preprocessor_external` has been called,
155
+ in which case `x` is expected to be mean centered with unit standard deviation.
156
+ feature_fmt: ['NLC', 'NCHW'] - The output format for the features.
157
+ '''
158
+ res_step = self.min_resolution_step
159
+ if res_step is not None and (x.shape[-2] % res_step != 0 or x.shape[-1] % res_step != 0):
160
+ raise ValueError('The input resolution must be a multiple of `self.min_resolution_step`. '
161
+ '`self.get_nearest_supported_resolution(<height>, <width>) is provided as a convenience API. '
162
+ f'Input: {x.shape[-2:]}, Nearest: {self.get_nearest_supported_resolution(*x.shape[-2:])}')
163
+
164
+ x = self.input_conditioner(x)
165
+ y = self.model.forward_features(x)
166
+ ret = self._extract_final(x, y, feature_fmt=feature_fmt)
167
+ return ret
168
+
169
+ def forward_pack(self, x: List[torch.Tensor], feature_fmt: str = 'NLC') -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
170
+ '''
171
+ Forward process for model.
172
+ Args:
173
+ x: Input tensor. Unless `make_preprocessor_external` has been called, then the dynamic range of `x` is expected to be `[0, 1]`,
174
+ otherwise `x` is expected to be mean centered with unit standard deviation.
175
+ feature_format: ['NLC', 'NCHW'] - The output format for the features.
176
+ '''
177
+ res_step = self.min_resolution_step
178
+ for _x in x:
179
+ if res_step is not None and (_x.shape[-2] % res_step != 0 or _x.shape[-1] % res_step != 0):
180
+ raise ValueError('The input resolution must be a multiple of `self.min_resolution_step`. '
181
+ '`self.get_nearest_supported_resolution(<height>, <width>) is provided as a convenience API. '
182
+ f'Input: {_x.shape[-2:]}, Nearest: {self.get_nearest_supported_resolution(*_x.shape[-2:])}')
183
+
184
+ x = [self.input_conditioner(_x) for _x in x]
185
+ y, cu_seqlens = self.model.forward_features(x)
186
+ all_summary, spatial_features = [], []
187
+ num_cls_tokens = self.model.patch_generator.num_cls_tokens
188
+ num_skip = self.model.patch_generator.num_skip
189
+ for i in range(len(cu_seqlens)-1):
190
+ summary = y[cu_seqlens[i]: cu_seqlens[i+1]][: num_cls_tokens]
191
+ all_feat = y[cu_seqlens[i]: cu_seqlens[i+1]][num_skip :]
192
+ all_summary.append(summary)
193
+ spatial_features.append(all_feat)
194
+ all_summary = torch.cat(all_summary)
195
+ spatial_features = torch.cat(spatial_features)
196
+ return all_summary, spatial_features
197
+
198
+ def _extract_final(self, x: torch.Tensor, y: torch.Tensor, feature_fmt: str = 'NLC'):
199
+ if isinstance(self.model, VisionTransformer):
200
+ patch_gen = getattr(self.model, "patch_generator", None)
201
+ if patch_gen is not None:
202
+ all_summary = y[:, : patch_gen.num_cls_tokens]
203
+ if self.summary_idxs is not None:
204
+ bb_summary = all_summary[:, self.summary_idxs]
205
+ else:
206
+ bb_summary = all_summary
207
+ all_feat = y[:, patch_gen.num_skip :]
208
+ elif self.model.global_pool == "avg":
209
+ all_summary = y[:, self.model.num_prefix_tokens :].mean(dim=1)
210
+ bb_summary = all_summary
211
+ all_feat = y
212
+ else:
213
+ all_summary = y[:, 0]
214
+ bb_summary = all_summary
215
+ all_feat = y[:, 1:]
216
+ elif isinstance(self.model, eradio_model.ERADIO):
217
+ _, f = y
218
+ all_feat = f.flatten(2).transpose(1, 2)
219
+ all_summary = all_feat.mean(dim=1)
220
+ bb_summary = all_summary
221
+ elif isinstance(y, (list, tuple)):
222
+ all_summary, all_feat = y
223
+ bb_summary = all_summary
224
+ else:
225
+ all_summary = y[:, :self.num_cls_tokens]
226
+ if self.summary_idxs is not None and all_summary.shape[1] > 1:
227
+ if all_summary.shape[1] == 1:
228
+ # Create dummy duplicates
229
+ all_summary = all_summary.expand(-1, 128, -1)
230
+ bb_summary = all_summary[:, self.summary_idxs]
231
+ else:
232
+ bb_summary = all_summary
233
+ all_feat = y[:, self.num_summary_tokens:]
234
+
235
+ all_feat = self.feature_normalizer(all_feat)
236
+
237
+ if feature_fmt == 'NCHW':
238
+ fmt_feat = (all_feat.reshape(all_feat.shape[0], x.shape[-2] // self.patch_size, x.shape[-1] // self.patch_size, all_feat.shape[2])
239
+ .permute(0, 3, 1, 2)
240
+ )
241
+ elif feature_fmt == 'NLC':
242
+ fmt_feat = all_feat
243
+ else:
244
+ raise ValueError(f'Unsupported feature_fmt: {feature_fmt}. Must be one of ["NLC", "NCHW"]')
245
+
246
+ ret = RadioOutput(bb_summary.flatten(1), fmt_feat)
247
+
248
+ if self.adaptors:
249
+ ret = dict(backbone=ret)
250
+ for name, adaptor in self.adaptors.items():
251
+ if all_summary.ndim == 3:
252
+ if all_summary.shape[1] == 1:
253
+ summary = all_summary[:, 0]
254
+ else:
255
+ summary = all_summary[:, adaptor.head_idx]
256
+ else:
257
+ summary = all_summary
258
+ ada_input = AdaptorInput(images=x, summary=summary.float(), features=all_feat, feature_fmt=feature_fmt, patch_size=self.patch_size)
259
+ v = adaptor(ada_input).to(torch.float32)
260
+ ret[name] = v
261
+
262
+ return ret
263
+
264
+ def forward_intermediates(
265
+ self,
266
+ x: torch.Tensor,
267
+ indices: Optional[Union[int, List[int], Tuple[int]]] = None,
268
+ return_prefix_tokens: bool = False,
269
+ norm: bool = False,
270
+ stop_early: bool = False,
271
+ output_fmt: str = 'NCHW',
272
+ intermediates_only: bool = False,
273
+ aggregation: Optional[str] = "sparse",
274
+ norm_alpha_scheme: Optional[str] = "post-alpha",
275
+ ) -> List[RadioOutput]:
276
+ """ Forward features that returns intermediates.
277
+ Args:
278
+ x: Input image tensor
279
+ indices: Take last n blocks if int, select matching indices if sequence
280
+ return_prefix_tokens: Return both prefix and spatial intermediate tokens
281
+ norm: Apply norm layer to all intermediates
282
+ stop_early: Stop iterating over blocks when last desired intermediate hit
283
+ output_fmt: Shape of intermediate feature outputs. Options: NCHW, NLC
284
+ intermediates_only: Only return intermediate features
285
+ aggregation: intermediate layer aggregation method (sparse or dense).
286
+ Dense accumulation is done by averaging the features in each group.
287
+ norm_alpha_scheme: apply alpha before ("pre-alpha") or after accumulation ("post-alpha"), or don't normalize ("none")
288
+ Only affects dense aggregation
289
+ Returns:
290
+ List of RadioOutput objects.
291
+ """
292
+ x = self.input_conditioner(x)
293
+ intermediates = self.model.forward_intermediates(
294
+ x,
295
+ indices=indices,
296
+ return_prefix_tokens=return_prefix_tokens,
297
+ norm=norm,
298
+ stop_early=stop_early,
299
+ output_fmt=output_fmt,
300
+ intermediates_only=intermediates_only,
301
+ aggregation=aggregation,
302
+ inter_feature_normalizer=self.inter_feature_normalizer,
303
+ norm_alpha_scheme=norm_alpha_scheme,
304
+ )
305
+
306
+ if not intermediates_only:
307
+ final, intermediates = intermediates
308
+
309
+ def prepare_summary(summ: Optional[torch.Tensor]):
310
+ if summ is None:
311
+ return summ
312
+ if self.summary_idxs is not None and summ.shape[1] > 1:
313
+ summ = summ[:, self.summary_idxs]
314
+ return summ.flatten(1)
315
+
316
+ if return_prefix_tokens:
317
+ radio_outputs = [
318
+ RadioOutput(prepare_summary(summary), features)
319
+ for summary, features in intermediates
320
+ ]
321
+ else:
322
+ radio_outputs = intermediates
323
+
324
+ if intermediates_only:
325
+ return radio_outputs
326
+ else:
327
+ final = self._extract_final(x, final, feature_fmt=output_fmt)
328
+ return final, radio_outputs
329
+
330
+
331
+
332
+ def create_model_from_args(args) -> nn.Module:
333
+ in_chans = 3
334
+ if args.in_chans is not None:
335
+ in_chans = args.in_chans
336
+ elif args.input_size is not None:
337
+ in_chans = args.input_size[0]
338
+
339
+ # Skip weight initialization unless it's explicitly requested.
340
+ weight_init = args.model_kwargs.pop("weight_init", "skip")
341
+
342
+ model = create_model(
343
+ args.model,
344
+ pretrained=args.pretrained,
345
+ in_chans=in_chans,
346
+ num_classes=args.num_classes,
347
+ drop_rate=args.drop,
348
+ drop_path_rate=args.drop_path,
349
+ drop_block_rate=args.drop_block,
350
+ global_pool=args.gp,
351
+ bn_momentum=args.bn_momentum,
352
+ bn_eps=args.bn_eps,
353
+ scriptable=args.torchscript,
354
+ checkpoint_path=args.initial_checkpoint,
355
+ weight_init=weight_init,
356
+ **args.model_kwargs,
357
+ )
358
+
359
+ if hasattr(model, 'norm') and not getattr(args, 'model_norm', False):
360
+ model.norm = nn.Identity()
361
+
362
+ model.head = nn.Identity()
363
+
364
+ if args.cpe_max_size is not None:
365
+ uq_teachers = set(t['name'] for t in args.teachers)
366
+ enable_cpe(
367
+ model,
368
+ args.cpe_max_size,
369
+ num_cls_tokens=len(uq_teachers) if args.cls_token_per_teacher else 1,
370
+ register_multiple=getattr(args, 'register_multiple', None),
371
+ num_registers=getattr(args, 'cpe_num_registers', None),
372
+ support_packing=args.support_packing,
373
+ )
374
+
375
+ return model
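A small standalone sketch of the rounding rule in `get_nearest_supported_resolution`: heights and widths are snapped to the nearest multiple of `min_resolution_step` (the patch size, multiplied by the window size when one is set). The helper name and the step of 16 below are illustrative assumptions:

```python
def nearest_supported(height: int, width: int, step: int = 16):
    # Mirror of get_nearest_supported_resolution: round to the nearest multiple of `step`,
    # never going below `step` itself.
    snap = lambda v: max(int(round(v / step) * step), step)
    return snap(height), snap(width)

print(nearest_supported(500, 750))  # -> (496, 752) with a 16-pixel step
```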
tim/models/nvidia_radio/radio/vision_transformer_xpos.py ADDED
@@ -0,0 +1,357 @@
1
+ import math
2
+ from typing import Final, List, Optional, Tuple, Union
3
+
4
+
5
+ from einops import rearrange
6
+ from timm.models import register_model
7
+ import torch
8
+ from torch import Type, nn
9
+ from torch.nn import functional as F
10
+ from torch.nn.init import xavier_normal_, xavier_uniform_, zeros_
11
+
12
+ from .forward_intermediates import forward_intermediates
13
+
14
+
15
+ def _get_init_scale(num_encoder_layers: int, num_decoder_layers: int, is_encoder: bool):
16
+ if num_encoder_layers > 0 and num_decoder_layers == 0:
17
+ return math.sqrt(math.log(2 * num_encoder_layers))
18
+ if num_decoder_layers > 0 and num_encoder_layers == 0:
19
+ return math.sqrt(math.log(2 * num_decoder_layers))
20
+ if is_encoder:
21
+ # Both encoders and decoders
22
+ return math.sqrt(
23
+ 0.33 * math.log(3 * num_decoder_layers) * math.log(2 * num_encoder_layers)
24
+ )
25
+
26
+ return math.sqrt(math.log(3 * num_decoder_layers))
27
+
28
+
29
+ # [1,2] [1,1,2,2]
30
+ # [3,4] -> [3,3,4,4]
31
+ # [5,6] [5,5,6,6]
32
+ def duplicate_interleave(m):
33
+ return m.view(-1, 1).repeat(1, 2).view(m.shape[0], -1)
34
+
35
+ # 0,1,2,3,4,5,6,7 -> -1,0,-3,2,-5,4,-7,6
36
+ def rotate_every_two(x):
37
+ x1 = x[:, :, ::2]
38
+ x2 = x[:, :, 1::2]
39
+ x = torch.stack((-x2, x1), dim=-1)
40
+ return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')\
41
+
42
+
43
+ class XPosEmbedding2D(torch.nn.Module):
44
+ """Implementation of xPos based on RotaryEmbedding from GPT-NeoX.
45
+ This implementation is designed to operate on queries and keys that are compatible with
46
+ [batch_size, n_heads_per_partition, seq_len, head_dim] (e.g. MinGPTAttention format).
47
+ """
48
+
49
+ def __init__(
50
+ self,
51
+ head_dim: int,
52
+ base=50000,
53
+ scale_base=512
54
+ ):
55
+ super().__init__()
56
+ half_dim = head_dim // 2
57
+ self.half_dim = half_dim
58
+ inv_freq = 1.0 / (base ** (torch.arange(0, half_dim, 2).float() / half_dim))
59
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
60
+ self.head_dim = head_dim
61
+ self.token_shape_cached = None
62
+ self.batch_size_cached = None
63
+ self.cos_cached: torch.Tensor | None = None
64
+ self.sin_cached: torch.Tensor | None = None
65
+ self.scale_cached: torch.Tensor | None = None
66
+ self.scale_base = scale_base
67
+ self.register_buffer("scale",
68
+ (torch.arange(0, half_dim, 2) + 0.4 * half_dim) / (1.4 * half_dim))
69
+
70
+ def cos_sin(
71
+ self,
72
+ token_shape: Tuple[int, int],
73
+ device="cuda",
74
+ dtype=torch.bfloat16,
75
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
76
+ if token_shape != self.token_shape_cached:
77
+ self.token_shape_cached = token_shape
78
+ y = torch.arange(token_shape[0], device=device, dtype=self.inv_freq.dtype)
79
+ x = torch.arange(token_shape[1], device=device, dtype=self.inv_freq.dtype)
80
+ x, y = torch.meshgrid(x, y, indexing='xy')
81
+
82
+ y_freqs = torch.einsum("i,j->ij", y.flatten(), self.inv_freq)
83
+ x_freqs = torch.einsum("i,j->ij", x.flatten(), self.inv_freq)
84
+
85
+ y_scales = self.scale ** y.flatten().div(self.scale_base)[:, None]
86
+ x_scales = self.scale ** x.flatten().div(self.scale_base)[:, None]
87
+
88
+ freqs = torch.cat([y_freqs, x_freqs], dim=-1)
89
+ emb = torch.repeat_interleave(freqs, repeats=2, dim=-1)
90
+
91
+ scales = torch.cat([y_scales, x_scales], dim=-1)
92
+ scales = torch.repeat_interleave(scales, repeats=2, dim=-1)
93
+
94
+ if dtype in [torch.float16, torch.bfloat16]:
95
+ emb = emb.float()
96
+
97
+ self.cos_cached = emb.cos()[None, :, :]
98
+ self.sin_cached = emb.sin()[None, :, :]
99
+ self.scale_cached = scales[None, :, :]
100
+
101
+ self.cos_cached = self.cos_cached.type(dtype)
102
+ self.sin_cached = self.sin_cached.type(dtype)
103
+ self.scale_cached = self.scale_cached.type(dtype)
104
+
105
+ return self.cos_cached, self.sin_cached, self.scale_cached
106
+
107
+ def forward(self, q: torch.Tensor, k: torch.Tensor, token_shape: Tuple[int, int]):
108
+ batch, seq_len, head_dim = q.shape
109
+ cos, sin, scale = self.cos_sin(token_shape, q.device, q.dtype)
110
+ # scale = self.scale**torch.arange(seq_len).to(self.scale).div(self.scale_base)[:, None]
111
+ # scale = torch.repeat_interleave(scale, 2, dim=-1).to(q.device)
112
+ # scale = torch.cat([scale, scale], dim=-1)
113
+ # scale = 1
114
+ return (
115
+ (q * cos * scale) + (rotate_every_two(q) * sin * scale),
116
+ (k * cos * (1 / scale)) + (rotate_every_two(k) * sin * (1 / scale)),
117
+ )
118
+
119
+
120
+ class MagnetoAttention(nn.Module):
121
+ def __init__(self, d_model: int, n_head: int, pos_emb: XPosEmbedding2D):
122
+ super().__init__()
123
+ self.num_heads = n_head
124
+ self.head_dim = d_model // n_head
125
+ self.scale = self.head_dim ** -0.5
126
+
127
+ self.qkv = nn.Linear(d_model, d_model * 3, bias=False)
128
+ self.proj = nn.Linear(d_model, d_model)
129
+ self.pos_emb = pos_emb
130
+
131
+ self.norm0 = nn.LayerNorm(d_model)
132
+ self.norm1 = nn.LayerNorm(d_model)
133
+
134
+ def forward(self, x: torch.Tensor, num_prefix_tokens: int, patch_shape: Tuple[int, int]) -> torch.Tensor:
135
+ B, N, C = x.shape
136
+ x = self.norm0(x)
137
+
138
+ qkv = self.qkv(x).reshape(B, N, 3, C).permute(2, 0, 1, 3)
139
+ q, k, v = qkv.unbind(0)
140
+
141
+ q_pref = q[:, :num_prefix_tokens]
142
+ q_patch = q[:, num_prefix_tokens:]
143
+
144
+ k_pref = k[:, :num_prefix_tokens]
145
+ k_patch = k[:, num_prefix_tokens:]
146
+
147
+ q_patch, k_patch = self.pos_emb(q_patch, k_patch, patch_shape)
148
+
149
+ q = torch.cat([q_pref, q_patch], dim=1)
150
+ k = torch.cat([k_pref, k_patch], dim=1)
151
+
152
+ def head_reshape(t: torch.Tensor):
153
+ return t.reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
154
+
155
+ q = head_reshape(q)
156
+ k = head_reshape(k)
157
+ v = head_reshape(v)
158
+
159
+ x = F.scaled_dot_product_attention(q, k, v)
160
+ x = x.transpose(1, 2).reshape(B, N, C)
161
+ x = self.norm1(x)
162
+ x = self.proj(x)
163
+ return x
164
+
165
+ def _reset_parameters(self):
166
+ xavier_uniform_(self.qkv.weight)
167
+ if self.qkv.bias is not None:
168
+ zeros_(self.qkv.bias)
169
+ xavier_normal_(self.proj.weight)
170
+ zeros_(self.proj.bias)
171
+
172
+
173
+ class MagnetoTransformerEncoderLayer(nn.Module):
174
+ def __init__(self, d_model: int, nhead: int, pos_emb: XPosEmbedding2D,
175
+ num_encoder_layers: int, num_decoder_layers: int = 0,
176
+ dim_mhsa: int = 0,
177
+ dim_feedforward: int = 2048,
178
+ layer_norm_eps: float = 1e-5,
179
+ batch_first: bool = True):
180
+ super().__init__()
181
+
182
+ if dim_mhsa == 0:
183
+ dim_mhsa = d_model
184
+
185
+ self._num_encoder_layers = num_encoder_layers
186
+ self._num_decoder_layers = num_decoder_layers
187
+
188
+ self.attn = MagnetoAttention(d_model, nhead, pos_emb)
189
+
190
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
191
+ self.linear2 = nn.Linear(d_model, dim_feedforward)
192
+ self.norm3 = nn.LayerNorm(dim_feedforward, eps=layer_norm_eps)
193
+ self.linear3 = nn.Linear(dim_feedforward, d_model)
194
+
195
+ def initialize(self):
196
+ gamma = _get_init_scale(self._num_encoder_layers, self._num_decoder_layers, is_encoder=True)
197
+
198
+ # Magneto Initialization
199
+ for mod in self.children():
200
+ if isinstance(mod, nn.Linear):
201
+ xavier_normal_(mod.weight.data, gamma)
202
+ elif isinstance(mod, MagnetoAttention):
203
+ mod._reset_parameters()
204
+
205
+ def forward(self, x: torch.Tensor, num_prefix_tokens: int, patch_shape: Tuple[int, int]) -> torch.Tensor:
206
+ x = x + self._sa_block(x, num_prefix_tokens, patch_shape)
207
+ x = x + self._ff_block(x)
208
+ return x
209
+
210
+ def _sa_block(self, x: torch.Tensor, num_prefix_tokens: int, patch_shape: Tuple[int, int]) -> torch.Tensor:
211
+ x = self.attn(x, num_prefix_tokens, patch_shape)
212
+ return x
213
+
214
+ def _ff_block(self, x: torch.Tensor) -> torch.Tensor:
215
+ x = self.norm2(x)
216
+ x = self.linear2(x)
217
+ x = F.gelu(x)
218
+ x = self.norm3(x)
219
+ x = self.linear3(x)
220
+ return x
221
+
222
+
223
+ class VisionTransformer(nn.Module):
224
+ """ Vision Transformer
225
+
226
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
227
+ - https://arxiv.org/abs/2010.11929
228
+ """
229
+ dynamic_img_size: Final[bool]
230
+
231
+ def __init__(
232
+ self,
233
+ patch_size: Union[int, Tuple[int, int]] = 16,
234
+ in_chans: int = 3,
235
+ embed_dim: int = 768,
236
+ depth: int = 12,
237
+ num_heads: int = 12,
238
+ mlp_ratio: float = 4.,
239
+ num_cls_tokens: int = 1,
240
+ num_reg_tokens: int = 0,
241
+ ) -> None:
242
+ """
243
+ Args:
244
+ patch_size: Patch size.
245
+ in_chans: Number of image input channels.
246
+ embed_dim: Transformer embedding dimension.
247
+ depth: Depth of transformer.
248
+ num_heads: Number of attention heads.
249
+ mlp_ratio: Ratio of mlp hidden dim to embedding dim.
250
+ num_cls_tokens: Number of cls tokens
251
+ num_reg_tokens: Number of register tokens.
252
+ block_fn: Transformer block layer.
253
+ """
254
+ super().__init__()
255
+
256
+ self.patch_size = patch_size
257
+ self.embed_dim = embed_dim
258
+ self.num_cls_tokens = num_cls_tokens
259
+ self.num_reg_tokens = num_reg_tokens
260
+
261
+ self.patch_embed = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
262
+
263
+ self.prefix_buffer = nn.Parameter(torch.randn(1, self.num_prefix_tokens, embed_dim) * .02)
264
+
265
+ pos_emb = XPosEmbedding2D(embed_dim)
266
+
267
+ self.blocks = nn.ModuleList([
268
+ MagnetoTransformerEncoderLayer(
269
+ d_model=embed_dim,
270
+ nhead=num_heads,
271
+ num_encoder_layers=depth,
272
+ num_decoder_layers=0,
273
+ dim_feedforward=int(embed_dim * mlp_ratio),
274
+ pos_emb=pos_emb,
275
+ )
276
+ for _ in range(depth)
277
+ ])
278
+
279
+ for block in self.blocks:
280
+ block.initialize()
281
+
282
+ @property
283
+ def num_prefix_tokens(self):
284
+ return self.num_cls_tokens + self.num_reg_tokens
285
+
286
+ @property
287
+ def num_summary_tokens(self):
288
+ return self.num_prefix_tokens
289
+
290
+ def forward_features(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
291
+ x, patch_shape = self._patchify(x)
292
+
293
+ for block in self.blocks:
294
+ x = block(x, self.num_prefix_tokens, patch_shape)
295
+
296
+ summary = x[:, :self.num_cls_tokens]
297
+ features = x[:, self.num_prefix_tokens:]
298
+
299
+ return summary, features
300
+
301
+ def forward_intermediates(self, x: torch.Tensor, norm: bool = False, **kwargs):
302
+ patch_shape = tuple(d // self.patch_size for d in x.shape[-2:])
303
+
304
+ def patch_extractor(x: torch.Tensor):
305
+ x, _ = self._patchify(x)
306
+ return x
307
+
308
+ return forward_intermediates(
309
+ self,
310
+ patch_extractor=patch_extractor,
311
+ num_summary_tokens=self.num_prefix_tokens,
312
+ num_cls_tokens=self.num_cls_tokens,
313
+ norm=lambda y: y,
314
+ x=x,
315
+ block_kwargs=dict(num_prefix_tokens=self.num_prefix_tokens, patch_shape=patch_shape),
316
+ **kwargs,
317
+ )
318
+
319
+ def _patchify(self, x: torch.Tensor):
320
+ x = self.patch_embed(x)
321
+ patch_shape = x.shape[-2:]
322
+ x = rearrange(x, 'b c h w -> b (h w) c')
323
+
324
+ prefix = self.prefix_buffer.expand(x.shape[0], -1, -1)
325
+
326
+ x = torch.cat([prefix, x], dim=1)
327
+ return x, patch_shape
328
+
329
+
330
+ @register_model
331
+ def vit_base_patch16_xpos(num_cls_tokens: int = 1, num_reg_tokens: int = 0, **kwargs) -> VisionTransformer:
332
+ return VisionTransformer(patch_size=16, embed_dim=768, depth=12, num_heads=12,
333
+ num_cls_tokens=num_cls_tokens, num_reg_tokens=num_reg_tokens)
334
+
335
+
336
+ @register_model
337
+ def vit_large_patch16_xpos(num_cls_tokens: int = 1, num_reg_tokens: int = 0, **kwargs) -> VisionTransformer:
338
+ return VisionTransformer(patch_size=16, embed_dim=1024, depth=24, num_heads=16,
339
+ num_cls_tokens=num_cls_tokens, num_reg_tokens=num_reg_tokens)
340
+
341
+
342
+ @register_model
343
+ def vit_huge_patch16_xpos(num_cls_tokens: int = 1, num_reg_tokens: int = 0, **kwargs) -> VisionTransformer:
344
+ return VisionTransformer(patch_size=16, embed_dim=1280, depth=32, num_heads=16,
345
+ num_cls_tokens=num_cls_tokens, num_reg_tokens=num_reg_tokens)
346
+
347
+
348
+ @register_model
349
+ def vit_giant_patch16_xpos(num_cls_tokens: int = 1, num_reg_tokens: int = 0, **kwargs) -> VisionTransformer:
350
+ return VisionTransformer(patch_size=16, embed_dim=1408, depth=40, num_heads=16,
351
+ num_cls_tokens=num_cls_tokens, num_reg_tokens=num_reg_tokens)
352
+
353
+
354
+ @register_model
355
+ def vit_bigG_patch16_xpos(num_cls_tokens: int = 1, num_reg_tokens: int = 0, **kwargs) -> VisionTransformer:
356
+ return VisionTransformer(patch_size=16, embed_dim=1664, depth=48, num_heads=16,
357
+ num_cls_tokens=num_cls_tokens, num_reg_tokens=num_reg_tokens)
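A quick smoke test of the xPos ViT defined above, assuming input side lengths divisible by the 16-pixel patch size (shapes indicative; weights are randomly initialized):

import torch
# from tim.models.nvidia_radio.radio.vision_transformer_xpos import vit_base_patch16_xpos

model = vit_base_patch16_xpos()                      # registered above via @register_model
img = torch.randn(1, 3, 224, 224)
summary, features = model.forward_features(img)
# summary: (1, 1, 768) cls token(s); features: (1, 196, 768) patch tokens on the 14x14 grid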
tim/models/nvidia_radio/radio/vit_patch_generator.py ADDED
@@ -0,0 +1,287 @@
1
+ # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import math
10
+ from typing import Union, Tuple, Optional
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from torch import nn
15
+ from einops import rearrange
16
+
17
+ from .cls_token import ClsToken
18
+
19
+ input_dim_t = Union[int, Tuple[int, int]]
20
+
21
+ try:
22
+ # raise ImportError()
23
+ from indirect_grid_sample import indirect_grid_sample
24
+ except ImportError:
25
+ indirect_grid_sample = None
26
+
27
+ class ViTPatchGenerator(nn.Module):
28
+ def __init__(self,
29
+ patch_size: int,
30
+ embed_dim: int,
31
+ input_dims: input_dim_t,
32
+ abs_pos: bool = True,
33
+ normalize_patches: bool = False,
34
+ cls_token: bool = False,
35
+ max_input_dims: Optional[input_dim_t] = None,
36
+ pos_dropout: float = 0.0,
37
+ return_pos_enc: bool = False,
38
+ num_cls_tokens: int = 1,
39
+ register_multiple: Optional[int] = None,
40
+ num_registers: Optional[int] = None,
41
+ patch_bias: bool = False,
42
+ device=None, dtype=None,
43
+ ):
44
+ super().__init__()
45
+
46
+ if isinstance(input_dims, int):
47
+ input_dims = (input_dims, input_dims)
48
+
49
+ if max_input_dims is None:
50
+ max_input_dims = input_dims
51
+ if isinstance(max_input_dims, int):
52
+ max_input_dims = (max_input_dims, max_input_dims)
53
+
54
+ max_input_dims = tuple(
55
+ int(math.ceil(d / patch_size) * patch_size)
56
+ for d in max_input_dims
57
+ )
58
+
59
+ self.cpe_mode = max_input_dims != input_dims
60
+ self.pos_dropout = pos_dropout
61
+ self.return_pos_enc = return_pos_enc
62
+
63
+ factory = dict(device=device, dtype=dtype)
64
+
65
+ self.patch_size = patch_size
66
+ self.abs_pos = abs_pos
67
+ self.embed_dim = embed_dim
68
+
69
+ self.num_rows = max_input_dims[0] // patch_size
70
+ self.num_cols = max_input_dims[1] // patch_size
71
+ self.input_dims = tuple(d // patch_size for d in input_dims)
72
+ self.num_patches = self.num_rows * self.num_cols
73
+ self.max_input_dims = max_input_dims
74
+
75
+ self.im_to_patches = Im2Patches(patch_size)
76
+ self.embedder = ViTPatchLinear(patch_size, embed_dim, bias=patch_bias, **factory)
77
+
78
+ if abs_pos:
79
+ scale = embed_dim ** -0.5
80
+ self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches, embed_dim, **factory) * scale)
81
+
82
+ self.cls_token = ClsToken(
83
+ embed_dim,
84
+ num_tokens=num_cls_tokens,
85
+ enabled=cls_token,
86
+ register_multiple=register_multiple,
87
+ num_registers=num_registers,
88
+ )
89
+
90
+ self.patch_normalizer = nn.LayerNorm(embed_dim) if normalize_patches else nn.Identity()
91
+
92
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
93
+ patches = self.embed_patches(x)
94
+ patches, pos_enc = self.apply_pos_enc(patches, input_size=x.shape[2:])
95
+ patches = self.cls_token(patches)
96
+ patches = self.patch_normalizer(patches)
97
+ if self.return_pos_enc:
98
+ return patches, pos_enc
99
+ return patches
100
+
101
+ @property
102
+ def apply_cls_token(self):
103
+ return self.cls_token.enabled
104
+
105
+ @property
106
+ def num_cls_tokens(self):
107
+ return self.cls_token.num_tokens
108
+
109
+ @property
110
+ def num_cls_patches(self):
111
+ return self.cls_token.num_patches
112
+
113
+ @property
114
+ def num_registers(self):
115
+ return self.cls_token.num_registers
116
+
117
+ @property
118
+ def num_skip(self):
119
+ return self.num_cls_tokens + self.num_registers
120
+
121
+ def no_weight_decay(self):
122
+ return [
123
+ 'pos_embed',
124
+ ]
125
+
126
+ def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter):
127
+ if src_embed.shape != targ_embed.shape:
128
+ src_size = int(math.sqrt(src_embed.shape[1]))
129
+
130
+ assert src_size ** 2 == src_embed.shape[1], 'Unable to interpolate non-square embedding'
131
+
132
+ src_embed = rearrange(src_embed, 'b (h w) c -> b c h w', h=src_size, w=src_size)
133
+ src_embed = F.interpolate(src_embed, size=(self.num_rows, self.num_cols), mode='bicubic', align_corners=True, antialias=False)
134
+ src_embed = rearrange(src_embed, 'b c h w -> b (h w) c')
135
+ targ_embed.data.copy_(src_embed)
136
+
137
+ def _load_projection(self, src_proj_weight: torch.Tensor, targ_proj_weight: torch.Tensor):
138
+ if src_proj_weight.shape != targ_proj_weight.shape:
139
+ src_patch_size = int(math.sqrt(src_proj_weight.shape[1] // 3))
140
+
141
+ assert (src_patch_size ** 2) * 3 == src_proj_weight.shape[1], 'Unable to interpolate non-square patch size'
142
+
143
+ src_proj_weight = rearrange(src_proj_weight, 'b (c h w) -> b c h w', c=3, h=src_patch_size, w=src_patch_size)
144
+ src_proj_weight = F.interpolate(src_proj_weight, size=(self.patch_size, self.patch_size), mode='bicubic', align_corners=True, antialias=False)
145
+ src_proj_weight = rearrange(src_proj_weight, 'b c h w -> b (c h w)')
146
+ targ_proj_weight.data.copy_(src_proj_weight)
147
+
148
+ def embed_patches(self, x: torch.Tensor) -> torch.Tensor:
149
+ patches = self.im_to_patches(x)
150
+ patches = self.embedder(patches)
151
+ return patches
152
+
153
+ def apply_pos_enc(self,
154
+ patches: torch.Tensor,
155
+ patch_idxs: Optional[torch.Tensor] = None,
156
+ input_size: Optional[Tuple[int, int]] = None,
157
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
158
+ if not self.abs_pos:
159
+ return patches, None  # keep the (patches, pos_enc) arity expected by forward()
160
+
161
+ pos_enc = self.get_pos_enc(patches.shape[0], patch_idxs, input_size)
162
+
163
+ if self.training and self.pos_dropout > 0:
164
+ keeps = torch.rand(patches.shape[0], 1, 1, dtype=pos_enc.dtype, device=pos_enc.device) > self.pos_dropout
165
+ pos_enc_drop = torch.where(keeps, pos_enc, 0)
166
+ else:
167
+ pos_enc_drop = pos_enc
168
+
169
+ return patches + pos_enc_drop, pos_enc
170
+
171
+ def get_pos_enc(self,
172
+ batch_size: int,
173
+ patch_idxs: Optional[torch.Tensor] = None,
174
+ input_size: Optional[Tuple[int, int]] = None,
175
+ ) -> torch.Tensor:
176
+ if input_size is None:
177
+ input_dims = self.input_dims
178
+ else:
179
+ input_dims = tuple(d // self.patch_size for d in input_size)
180
+
181
+ pos_embed = self._get_pos_embeddings(batch_size, input_dims)
182
+
183
+ if patch_idxs is None:
184
+ return pos_embed
185
+
186
+ exp_patch_idxs = patch_idxs.unsqueeze(-1).expand(-1, -1, pos_embed.shape[-1])
187
+
188
+ pos_embed = torch.gather(pos_embed.expand(patch_idxs.shape[0], -1, -1), dim=1, index=exp_patch_idxs)
189
+ return pos_embed
190
+
191
+
192
+ def _get_pos_embeddings(self, batch_size: int, input_dims: Tuple[int, int]):
193
+ if (self.num_rows, self.num_cols) == input_dims:
194
+ return self.pos_embed
195
+
196
+ pos_embed = self.pos_embed.reshape(1, self.num_rows, self.num_cols, -1).permute(0, 3, 1, 2)
197
+
198
+ def window_select(pos_embed):
199
+ if input_dims[0] < pos_embed.shape[-2]:
200
+ pos_embed = pos_embed[..., :input_dims[0], :]
201
+ if input_dims[1] < pos_embed.shape[-1]:
202
+ pos_embed = pos_embed[..., :, :input_dims[1]]
203
+ return pos_embed
204
+
205
+ if self.cpe_mode:
206
+ if self.training:
207
+ min_scale = math.sqrt(0.1)
208
+ scale = torch.rand(batch_size, 1, 1, device=pos_embed.device) * (1 - min_scale) + min_scale
209
+ aspect_min = math.log(3 / 4)
210
+ aspect_max = -aspect_min
211
+ aspect = torch.exp(torch.rand(batch_size, 1, 1, device=pos_embed.device) * (aspect_max - aspect_min) + aspect_min)
212
+
213
+ scale_x = scale * aspect
214
+ scale_y = scale * (1 / aspect)
215
+ scale_xy = torch.stack([scale_x, scale_y], dim=-1).clamp_(0, 1)
216
+
217
+ pos_xy = torch.rand(batch_size, 1, 1, 2, device=pos_embed.device) * (1 - scale_xy)
218
+
219
+ lin_x = torch.linspace(0, 1, steps=input_dims[1], device=pos_embed.device)[None, None].expand(batch_size, input_dims[0], -1)
220
+ lin_y = torch.linspace(0, 1, steps=input_dims[0], device=pos_embed.device)[None, :, None].expand(batch_size, -1, input_dims[1])
221
+
222
+ lin_xy = torch.stack([lin_x, lin_y], dim=-1)
223
+
224
+ grid_xy = lin_xy * scale_xy + pos_xy
225
+
226
+ # Convert to [-1, 1] range
227
+ grid_xy.mul_(2).sub_(1)
228
+
229
+ pos_embed = F.grid_sample(
230
+ pos_embed.float().expand(batch_size, -1, -1, -1),
231
+ grid=grid_xy,
232
+ mode='bilinear',
233
+ padding_mode='zeros',
234
+ align_corners=True,
235
+ ).to(pos_embed.dtype)
236
+ else:
237
+ # i_rows, i_cols = input_dims
238
+ # p_rows, p_cols = pos_embed.shape[2:]
239
+ # if i_rows <= p_rows and i_cols <= p_cols:
240
+ # left = (p_cols - i_cols) // 2
241
+ # top = (p_rows - i_rows) // 2
242
+ # pos_embed = pos_embed[..., top:top+i_rows, left:left+i_cols]
243
+ # else:
244
+ max_dim = max(input_dims)
245
+ pos_embed = F.interpolate(pos_embed.float(), size=(max_dim, max_dim), align_corners=True, mode='bilinear').to(pos_embed.dtype)
246
+
247
+ pos_embed = window_select(pos_embed)
248
+ else:
249
+ pos_embed = window_select(pos_embed)
250
+
251
+ if pos_embed.shape[-2:] != input_dims:
252
+ pos_embed = F.interpolate(pos_embed.float(), size=input_dims, align_corners=True, mode='bilinear').to(pos_embed.dtype)
253
+
254
+ pos_embed = pos_embed.flatten(2).permute(0, 2, 1)
255
+
256
+ return pos_embed
257
+
258
+
259
+ class Im2Patches(nn.Module):
260
+ def __init__(self, patch_size: int):
261
+ super().__init__()
262
+ self.patch_size = patch_size
263
+
264
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
265
+ if self.patch_size == 1:
266
+ patches = x.flatten(2)
267
+ patches = patches.permute(0, 2, 1)
268
+ return patches
269
+
270
+ py = x.shape[-2] // self.patch_size
271
+ px = x.shape[-1] // self.patch_size
272
+ patches = rearrange(x, 'b c (py yy) (px xx) -> b (py px) (c yy xx)',
273
+ py=py, yy=self.patch_size,
274
+ px=px, xx=self.patch_size,
275
+ )
276
+ return patches
277
+
278
+
279
+ class ViTPatchLinear(nn.Linear):
280
+ def __init__(self, patch_size: int, embed_dim: int, bias: bool = False, **factory):
281
+ super().__init__(
282
+ 3 * (patch_size ** 2),
283
+ embed_dim,
284
+ bias=bias,
285
+ **factory
286
+ )
287
+ self.patch_size = patch_size
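An illustrative construction of the patch generator above (untested sketch; the exact number of prefix tokens prepended by ClsToken depends on its register settings):

import torch
# from tim.models.nvidia_radio.radio.vit_patch_generator import ViTPatchGenerator

gen = ViTPatchGenerator(patch_size=16, embed_dim=768, input_dims=224, cls_token=True)
tokens = gen(torch.randn(2, 3, 224, 224))
# tokens: (2, num_cls_patches + 14*14, 768); set return_pos_enc=True to also receive the positional encodings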
tim/models/nvidia_radio/radio/vitdet.py ADDED
@@ -0,0 +1,188 @@
1
+ from collections import defaultdict
2
+ from contextlib import contextmanager
3
+ from logging import getLogger
4
+ import math
5
+ import sys
6
+ from typing import List, Union, Iterable
7
+
8
+ import numpy as np
9
+ import torch
10
+ from torch import nn
11
+
12
+ from timm.models import VisionTransformer
13
+ from einops import rearrange
14
+
15
+ from .extra_models import DinoWrapper
16
+
17
+ DEFAULT_NUM_WINDOWED = 5
18
+ DEFAULT_NUM_GLOBAL = 4
19
+
20
+
21
+ class VitDetArgs:
22
+ def __init__(self,
23
+ window_size: int,
24
+ num_summary_tokens: int,
25
+ num_windowed: int = None,
26
+ num_global: int = None,
27
+ ):
28
+ self.window_size = window_size
29
+ self.num_summary_tokens = num_summary_tokens
30
+ self.num_windowed = num_windowed
31
+ self.num_global = num_global
32
+
33
+
34
+ def apply_vitdet_arch(model: Union[VisionTransformer, DinoWrapper], args: VitDetArgs):
35
+ if isinstance(model, VisionTransformer):
36
+ patch_embed = getattr(model, 'patch_generator', model.patch_embed)
37
+
38
+ return ViTDetHook(patch_embed, model.blocks, args)
39
+ elif isinstance(model, DinoWrapper):
40
+ inner = model.inner
41
+
42
+ patch_embed = getattr(inner, 'patch_generator', inner.patch_embed)
43
+ return ViTDetHook(patch_embed, inner.blocks, args)
44
+ else:
45
+ print(f'Warning: Unable to apply VitDet aug!', file=sys.stderr)
46
+
47
+
48
+ class ViTDetHook:
49
+ def __init__(self,
50
+ embedder: nn.Module,
51
+ blocks: nn.Sequential,
52
+ args: VitDetArgs,
53
+ ):
54
+ self.blocks = blocks
55
+ self.num_summary_tokens = args.num_summary_tokens
56
+ self.window_size = args.window_size
57
+
58
+ self._input_resolution = None
59
+ self._num_windows = None
60
+ self._cls_patch = None
61
+ self._order_cache = dict()
62
+
63
+ embedder.register_forward_pre_hook(self._enter_model)
64
+
65
+ # This will decide if we window-fy the patches
66
+ # and enable vit-det for this iteration, and if so,
67
+ # rearrange the patches for efficient mode switching
68
+ blocks.register_forward_pre_hook(self._enter_blocks)
69
+
70
+ is_global = True
71
+ if args.num_windowed is not None:
72
+ period = args.num_windowed + 1
73
+ else:
74
+ num_global = args.num_global or DEFAULT_NUM_GLOBAL
75
+ period = max(len(blocks) // num_global, 1)
76
+
77
+ for i, layer in enumerate(blocks[:-1]):
78
+ ctr = i % period
79
+ if ctr == 0:
80
+ layer.register_forward_pre_hook(self._to_windows)
81
+ is_global = False
82
+ elif ctr == period - 1:
83
+ layer.register_forward_pre_hook(self._to_global)
84
+ is_global = True
85
+
86
+ # Always ensure the final layer is a global layer
87
+ if not is_global:
88
+ blocks[-1].register_forward_pre_hook(self._to_global)
89
+
90
+ blocks.register_forward_hook(self._exit_model)
91
+
92
+ def _enter_model(self, _, input: List[torch.Tensor]):
93
+ self._input_resolution = input[0].shape[-2:]
94
+
95
+ def _enter_blocks(self, _, input: List[torch.Tensor]):
96
+ # print(f'{get_rank()} - ViTDet Window Size: {self._window_size}', file=sys.stderr)
97
+
98
+ patches = input[0]
99
+ patches = self._rearrange_patches(patches)
100
+
101
+ return (patches,) + input[1:]
102
+
103
+ def _to_windows(self, _, input: List[torch.Tensor]):
104
+ patches = input[0]
105
+
106
+ if self.num_summary_tokens:
107
+ self._cls_patch = patches[:, :self.num_summary_tokens]
108
+ patches = patches[:, self.num_summary_tokens:]
109
+
110
+ patches = rearrange(
111
+ patches, 'b (p t) c -> (b p) t c',
112
+ p=self._num_windows, t=self.window_size ** 2,
113
+ )
114
+
115
+ return (patches,) + input[1:]
116
+
117
+ def _to_global(self, _, input: List[torch.Tensor]):
118
+ patches = input[0]
119
+
120
+ patches = rearrange(
121
+ patches, '(b p) t c -> b (p t) c',
122
+ p=self._num_windows, t=self.window_size ** 2,
123
+ b=patches.shape[0] // self._num_windows,
124
+ )
125
+
126
+ if self.num_summary_tokens:
127
+ patches = torch.cat([
128
+ self._cls_patch,
129
+ patches,
130
+ ], dim=1)
131
+
132
+ return (patches,) + input[1:]
133
+
134
+ def _exit_model(self, _, inputs: List[torch.Tensor], patches: torch.Tensor):
135
+ # Return patches to their original order
136
+ patch_order = self._order_cache[self._input_resolution][0]
137
+ patch_order = patch_order.reshape(1, -1, 1).expand_as(patches)
138
+
139
+ ret_patches = torch.empty_like(patches)
140
+ ret_patches = torch.scatter(
141
+ ret_patches,
142
+ dim=1,
143
+ index=patch_order,
144
+ src=patches,
145
+ )
146
+
147
+ return ret_patches
148
+
149
+ def _rearrange_patches(self, patches: torch.Tensor):
150
+ # We rearrange the patches so that we can efficiently
151
+ # switch between windowed and global mode by just
152
+ # reshaping the tensor
153
+
154
+ patch_order, self._num_windows = self._order_cache.get(self._input_resolution, (None, None))
155
+ if patch_order is None:
156
+ num_feat_patches = patches.shape[1] - self.num_summary_tokens
157
+ num_pixels = self._input_resolution[0] * self._input_resolution[1]
158
+
159
+ patch_size = int(round(math.sqrt(num_pixels / num_feat_patches)))
160
+ rows = self._input_resolution[-2] // patch_size
161
+ cols = self._input_resolution[-1] // patch_size
162
+
163
+ w_rows = rows // self.window_size
164
+ w_cols = cols // self.window_size
165
+
166
+ patch_order = torch.arange(0, num_feat_patches, device=patches.device)
167
+
168
+ patch_order = rearrange(
169
+ patch_order, '(wy py wx px) -> (wy wx py px)',
170
+ wy=w_rows, wx=w_cols,
171
+ py=self.window_size, px=self.window_size,
172
+ )
173
+
174
+ if self.num_summary_tokens:
175
+ patch_order = torch.cat([
176
+ torch.arange(self.num_summary_tokens, dtype=patch_order.dtype, device=patch_order.device),
177
+ patch_order + self.num_summary_tokens,
178
+ ])
179
+
180
+ self._num_windows = w_rows * w_cols
181
+ self._order_cache[self._input_resolution] = (
182
+ patch_order,
183
+ self._num_windows,
184
+ )
185
+
186
+ patch_order = patch_order.reshape(1, -1, 1).expand_as(patches)
187
+ patches = torch.gather(patches, dim=1, index=patch_order)
188
+ return patches
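A sketch of attaching the ViTDet-style hook above to a stock timm ViT (hypothetical values; window_size should evenly divide the patch grid, e.g. 7 on the 14x14 grid of a 224px input):

import timm
# from tim.models.nvidia_radio.radio.vitdet import VitDetArgs, apply_vitdet_arch

vit = timm.create_model('vit_base_patch16_224', pretrained=False)
hook = apply_vitdet_arch(vit, VitDetArgs(window_size=7, num_summary_tokens=1))
# Subsequent forward passes alternate windowed and global attention via the registered hooks.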
tim/models/t2i/tim_model.py ADDED
@@ -0,0 +1,493 @@
1
+ # This source code is licensed under the license found in the
2
+ # LICENSE file in the root directory of this source tree.
3
+ # --------------------------------------------------------
4
+ # References:
5
+ # GLIDE: https://github.com/openai/glide-text2im
6
+ # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
7
+ # --------------------------------------------------------
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import numpy as np
13
+ import math
14
+ from timm.layers.mlp import SwiGLU, Mlp
15
+ from timm.models.vision_transformer import PatchEmbed  # the local Attention class below replaces timm's
16
+ from tim.models.utils.funcs import build_mlp, modulate, get_parameter_dtype
17
+ from tim.models.utils.rope import VisionRotaryEmbedding, rotate_half
18
+ from flash_attn import flash_attn_func
19
+
20
+
21
+ #################################################################################
22
+ # Embedding Layers for Timesteps and Class Labels #
23
+ #################################################################################
24
+ class TimestepEmbedder(nn.Module):
25
+ """
26
+ Embeds scalar timesteps into vector representations.
27
+ """
28
+ def __init__(self, hidden_size, frequency_embedding_size=256):
29
+ super().__init__()
30
+ self.mlp = nn.Sequential(
31
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
32
+ nn.SiLU(),
33
+ nn.Linear(hidden_size, hidden_size, bias=True),
34
+ )
35
+ self.frequency_embedding_size = frequency_embedding_size
36
+
37
+ @staticmethod
38
+ def positional_embedding(t, dim, max_period=10000):
39
+ """
40
+ Create sinusoidal timestep embeddings.
41
+ :param t: a 1-D Tensor of N indices, one per batch element.
42
+ These may be fractional.
43
+ :param dim: the dimension of the output.
44
+ :param max_period: controls the minimum frequency of the embeddings.
45
+ :return: an (N, D) Tensor of positional embeddings.
46
+ """
47
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
48
+ half = dim // 2
49
+ freqs = torch.exp(
50
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
51
+ ).to(device=t.device)
52
+ args = t[:, None].float() * freqs[None]
53
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
54
+ if dim % 2:
55
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
56
+ return embedding
57
+
58
+ def forward(self, t):
59
+ t_freq = self.positional_embedding(t, dim=self.frequency_embedding_size).to(t.dtype)
61
+ t_emb = self.mlp(t_freq)
62
+ return t_emb
63
+
64
+
65
+ class CaptionEmbedder(nn.Module):
66
+ """
67
+ Embeds caption features into vector representations for conditioning the transformer.
68
+ """
69
+ def __init__(self, cap_feat_dim, hidden_size):
70
+ super().__init__()
71
+ self.norm = nn.LayerNorm(cap_feat_dim)
72
+ self.mlp = SwiGLU(in_features=cap_feat_dim, hidden_features=hidden_size*4, out_features=hidden_size)
73
+
74
+
75
+ def forward(self, cap_feats):
76
+ '''
77
+ cfg is also essential in text-to-image generation
78
+ '''
79
+ cap_feats = self.mlp(self.norm(cap_feats))
80
+ return cap_feats
81
+
82
+
83
+
84
+ #################################################################################
85
+ # Attention Block #
86
+ #################################################################################
87
+
88
+ class Attention(nn.Module):
89
+ def __init__(
90
+ self,
91
+ dim: int,
92
+ num_heads: int = 8,
93
+ qkv_bias: bool = False,
94
+ qk_norm: bool = False,
95
+ attn_drop: float = 0.,
96
+ proj_drop: float = 0.,
97
+ norm_layer: nn.Module = nn.LayerNorm,
98
+ distance_aware: bool = False,
99
+ ) -> None:
100
+ super().__init__()
101
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
102
+ self.num_heads = num_heads
103
+ self.head_dim = dim // num_heads
104
+ self.scale = self.head_dim ** -0.5
105
+
106
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
107
+ self.distance_aware = distance_aware
108
+ if distance_aware:
109
+ self.qkv_d = nn.Linear(dim, dim * 3, bias=False)
110
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
111
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
112
+ self.attn_drop = nn.Dropout(attn_drop)
113
+ self.proj = nn.Linear(dim, dim)
114
+ self.proj_drop = nn.Dropout(proj_drop)
115
+
116
+ def forward(self, x: torch.Tensor, freqs_cos, freqs_sin, attn_type='fused_attn', delta_t=None) -> torch.Tensor:
117
+ B, N, C = x.shape
118
+ if self.distance_aware:
119
+ qkv = self.qkv(x) + self.qkv_d(delta_t)
120
+ else:
121
+ qkv = self.qkv(x)
122
+ if attn_type == 'flash_attn': # q, k, v: (B, N, n_head, d_head)
123
+ qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 1, 3, 4)
124
+ else: # q, k, v: (B, n_head, N, d_head)
125
+ qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
126
+ ori_dtype = qkv.dtype
127
+ q, k, v = qkv.unbind(0)
128
+ q, k = self.q_norm(q), self.k_norm(k)
129
+
130
+ q = q * freqs_cos + rotate_half(q) * freqs_sin
131
+ k = k * freqs_cos + rotate_half(k) * freqs_sin
132
+ q, k = q.to(ori_dtype), k.to(ori_dtype)
133
+
134
+ if attn_type == 'flash_attn':
135
+ x = flash_attn_func(
136
+ q, k, v,
137
+ dropout_p=self.attn_drop.p if self.training else 0.,
138
+ )
139
+ x = x.reshape(B, N, C)
140
+ elif attn_type == 'fused_attn':
141
+ x = F.scaled_dot_product_attention(
142
+ q, k, v,
143
+ dropout_p=self.attn_drop.p if self.training else 0.,
144
+ )
145
+ x = x.transpose(1, 2).reshape(B, N, C)
146
+ else:
147
+ q = q * self.scale
148
+ attn = q @ k.transpose(-2, -1)
149
+ attn = attn.softmax(dim=-1)
150
+ attn = self.attn_drop(attn)
151
+ x = attn @ v
152
+ x = x.transpose(1, 2).reshape(B, N, C)
153
+
154
+ x = self.proj(x)
155
+ x = self.proj_drop(x)
156
+ return x
157
+
158
+
159
+
160
+
161
+
162
+
163
+ #################################################################################
164
+ # Cross Attention Block #
165
+ #################################################################################
166
+
167
+ class CrossAttention(nn.Module):
168
+ def __init__(
169
+ self,
170
+ dim: int,
171
+ num_heads: int = 8,
172
+ qkv_bias: bool = False,
173
+ qk_norm: bool = False,
174
+ attn_drop: float = 0.,
175
+ proj_drop: float = 0.,
176
+ norm_layer: nn.Module = nn.LayerNorm,
177
+ ) -> None:
178
+ super().__init__()
179
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
180
+ self.num_heads = num_heads
181
+ self.head_dim = dim // num_heads
182
+ self.scale = self.head_dim ** -0.5
183
+
184
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
185
+ self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
186
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
187
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
188
+ self.attn_drop = nn.Dropout(attn_drop)
189
+ self.proj = nn.Linear(dim, dim)
190
+ self.proj_drop = nn.Dropout(proj_drop)
191
+
192
+ def forward(self, x: torch.Tensor, y: torch.Tensor, freqs_cos, freqs_sin, attn_type='fused_attn') -> torch.Tensor:
193
+ B, N, C = x.shape
194
+ _, M, _ = y.shape
195
+ if attn_type == 'flash_attn': # q, k, v: (B, N, n_head, d_head)
196
+ q = self.q(x).reshape(B, N, self.num_heads, self.head_dim)
197
+ kv = self.kv(y).reshape(B, M, 2, self.num_heads, self.head_dim).permute(2, 0, 1, 3, 4)
198
+ else: # q, k, v: (B, n_head, N, d_head)
199
+ q = self.q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
200
+ kv = self.kv(y).reshape(B, M, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
201
+ ori_dtype = q.dtype
202
+ k, v = kv.unbind(0)
203
+ q, k = self.q_norm(q), self.k_norm(k)
204
+ q = q * freqs_cos + rotate_half(q) * freqs_sin
205
+ q, k = q.to(ori_dtype), k.to(ori_dtype)
206
+
207
+ if attn_type == 'flash_attn':
208
+ x = flash_attn_func(
209
+ q, k, v,
210
+ dropout_p=self.attn_drop.p if self.training else 0.,
211
+ )
212
+ x = x.reshape(B, N, C)
213
+ elif attn_type == 'fused_attn':
214
+ x = F.scaled_dot_product_attention(
215
+ q, k, v,
216
+ dropout_p=self.attn_drop.p if self.training else 0.,
217
+ )
218
+ x = x.transpose(1, 2).reshape(B, N, C)
219
+ else:
220
+ q = q * self.scale
221
+ attn = q @ k.transpose(-2, -1)
222
+ attn = attn.softmax(dim=-1)
223
+ attn = self.attn_drop(attn)
224
+ x = attn @ v
225
+ x = x.transpose(1, 2).reshape(B, N, C)
226
+
227
+ x = self.proj(x)
228
+ x = self.proj_drop(x)
229
+ return x
230
+
231
+
232
+
233
+
234
+
235
+
236
+ #################################################################################
237
+ # Core TiM Model #
238
+ #################################################################################
239
+
240
+ class TiMBlock(nn.Module):
241
+ """
242
+ A TiM block with adaptive layer norm zero (adaLN-Zero) conditioning.
243
+ """
244
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
245
+ super().__init__()
246
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
247
+ distance_aware = block_kwargs.get('distance_aware', False)
248
+ self.attn = Attention(
249
+ hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=block_kwargs["qk_norm"],
250
+ distance_aware=distance_aware
251
+ )
252
+ self.norm2_i = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
253
+ self.norm2_t = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
254
+ self.cross_attn = CrossAttention(
255
+ hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=block_kwargs["qk_norm"]
256
+ )
257
+ self.norm3 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
258
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
259
+ self.mlp = SwiGLU(
260
+ in_features=hidden_size, hidden_features=(mlp_hidden_dim*2)//3, bias=True
261
+ )
262
+ if block_kwargs.get('lora_hidden_size', None) != None:
263
+ lora_hidden_size = block_kwargs['lora_hidden_size']
264
+ else:
265
+ lora_hidden_size = (hidden_size//4)*3
266
+ self.adaLN_modulation = SwiGLU(
267
+ in_features=hidden_size, hidden_features=lora_hidden_size, out_features=9*hidden_size, bias=True
268
+ )
269
+
270
+
271
+
272
+ def forward(self, x, y, c, freqs_cos, freqs_sin, attn_type, delta_t=None):
273
+ (
274
+ shift_msa, scale_msa, gate_msa,
275
+ shift_msc, scale_msc, gate_msc,
276
+ shift_mlp, scale_mlp, gate_mlp
277
+ ) = self.adaLN_modulation(c).chunk(9, dim=-1)
278
+ x = x + gate_msa * self.attn(modulate(self.norm1(x), shift_msa, scale_msa), freqs_cos, freqs_sin, attn_type, delta_t)
279
+ x = x + gate_msc * self.cross_attn(modulate(self.norm2_i(x), shift_msc, scale_msc), self.norm2_t(y), freqs_cos, freqs_sin, attn_type)
280
+ x = x + gate_mlp * self.mlp(modulate(self.norm3(x), shift_mlp, scale_mlp))
281
+
282
+ return x
283
+
284
+
285
+ class FinalLayer(nn.Module):
286
+ """
287
+ The final layer of TiM.
288
+ """
289
+ def __init__(self, hidden_size, patch_size, out_channels):
290
+ super().__init__()
291
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
292
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
293
+ self.adaLN_modulation = SwiGLU(
294
+ in_features=hidden_size, hidden_features=hidden_size//2, out_features=2*hidden_size, bias=True
295
+ )
296
+
297
+
298
+ def forward(self, x, c):
299
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
300
+ x = modulate(self.norm_final(x), shift, scale)
301
+ x = self.linear(x)
302
+
303
+ return x
304
+
305
+
306
+ class TiM(nn.Module):
307
+ """
308
+ Diffusion model with a Transformer backbone.
309
+ """
310
+ def __init__(
311
+ self,
312
+ input_size=32,
313
+ patch_size=2,
314
+ in_channels=4,
315
+ hidden_size=1152,
316
+ encoder_depth=8,
317
+ depth=28,
318
+ num_heads=16,
319
+ mlp_ratio=4.0,
320
+ cap_feat_dim=2048,
321
+ z_dim=768,
322
+ projector_dim=2048,
323
+ use_checkpoint: bool = False,
324
+ new_condition: str = 't-r',
325
+ use_new_embed: bool = False,
326
+ **block_kwargs # qk_norm
327
+ ):
328
+ super().__init__()
329
+ self.in_channels = in_channels
330
+ self.out_channels = in_channels
331
+ self.patch_size = patch_size
332
+ self.num_heads = num_heads
333
+ self.cap_feat_dim = cap_feat_dim
334
+ self.encoder_depth = encoder_depth
335
+ self.use_checkpoint = use_checkpoint
336
+ self.new_condition = new_condition
337
+ self.use_new_embed = use_new_embed
338
+
339
+ self.x_embedder = PatchEmbed(
340
+ input_size, patch_size, in_channels, hidden_size, bias=True, strict_img_size=False
341
+ )
342
+ self.t_embedder = TimestepEmbedder(hidden_size) # timestep embedding type
343
+ if use_new_embed:
344
+ self.delta_embedder = TimestepEmbedder(hidden_size)
345
+ self.y_embedder = CaptionEmbedder(cap_feat_dim, hidden_size)
346
+ # 2D rotary position embedding (RoPE) over the patch grid:
347
+ self.rope = VisionRotaryEmbedding(head_dim=hidden_size//num_heads)
348
+
349
+ self.blocks = nn.ModuleList([
350
+ TiMBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, **block_kwargs) for _ in range(depth)
351
+ ])
352
+ self.projector = build_mlp(hidden_size, projector_dim, z_dim)
353
+ self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
354
+ self.initialize_weights()
355
+
356
+ def initialize_weights(self):
357
+ # Initialize transformer layers:
358
+ def _basic_init(module):
359
+ if isinstance(module, nn.Linear):
360
+ torch.nn.init.xavier_uniform_(module.weight)
361
+ if module.bias is not None:
362
+ nn.init.constant_(module.bias, 0)
363
+ self.apply(_basic_init)
364
+
365
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
366
+ w = self.x_embedder.proj.weight.data
367
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
368
+ nn.init.constant_(self.x_embedder.proj.bias, 0)
369
+
370
+ # Initialize label embedding table:
371
+ nn.init.normal_(self.y_embedder.mlp.fc1_g.weight, std=0.02)
372
+ nn.init.normal_(self.y_embedder.mlp.fc1_x.weight, std=0.02)
373
+ nn.init.normal_(self.y_embedder.mlp.fc2.weight, std=0.02)
374
+
375
+ # Initialize timestep embedding MLP:
376
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
377
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
378
+
379
+ # Zero-out adaLN modulation layers in TiM blocks:
380
+ for block in self.blocks:
381
+ nn.init.constant_(block.adaLN_modulation.fc2.weight, 0)
382
+ nn.init.constant_(block.adaLN_modulation.fc2.bias, 0)
383
+
384
+
385
+ # Zero-out output layers:
386
+ nn.init.constant_(self.final_layer.adaLN_modulation.fc2.weight, 0)
387
+ nn.init.constant_(self.final_layer.adaLN_modulation.fc2.bias, 0)
388
+ nn.init.constant_(self.final_layer.linear.weight, 0)
389
+ nn.init.constant_(self.final_layer.linear.bias, 0)
390
+
391
+ def unpatchify(self, x, H, W):
392
+ """
393
+ x: (N, T, patch_size**2 * C)
394
+ imgs: (N, C, H, W)
395
+ """
396
+ c = self.out_channels
397
+ p = self.patch_size
398
+ h, w = int(H/p), int(W/p)
399
+
400
+
401
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
402
+ x = torch.einsum('nhwpqc->nchpwq', x)
403
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
404
+ return imgs
405
+
406
+ def get_rope(self, h, w, attn_type):
407
+ grid_h = torch.arange(h)
408
+ grid_w = torch.arange(w)
409
+ grid = torch.meshgrid(grid_h, grid_w, indexing='xy')
410
+ grid = torch.stack(grid, dim=0).reshape(2, -1).unsqueeze(0)
411
+ freqs_cos, freqs_sin = self.rope.get_cached_2d_rope_from_grid(grid)
412
+ if attn_type == 'flash_attn': # (1, N, 1, d_head)
413
+ return freqs_cos.unsqueeze(2), freqs_sin.unsqueeze(2)
414
+ else: # (1, 1, N, d_head)
415
+ return freqs_cos.unsqueeze(1), freqs_sin.unsqueeze(1)
416
+
417
+
418
+ def forward(self, x, t, r, y, attn_type='flash_attn', return_zs=False, jvp=False):
419
+ """
420
+ Forward pass of TiM.
421
+ x: (B, C, H, W) tensor of spatial inputs (images or latent representations of images)
422
+ t: (B,) tensor of diffusion timesteps
423
+ r: (B,) tensor of second timesteps; the model is conditioned on the pair (t, r), e.g. via t - r
+ y: (B, M, cap_feat_dim) tensor of caption features
424
+ """
425
+ B, C, H, W = x.shape
426
+ x = self.x_embedder(x) # (B, T, D), where T = H * W / patch_size ** 2
427
+
428
+ # timestep and class embedding
429
+ t_embed = self.t_embedder(t).unsqueeze(1) # (B, 1, D)
430
+ delta_embed = self.get_delta_embed(t, r).unsqueeze(1) # (B, 1, D)
431
+ y = self.y_embedder(y) # (B, M, D)
432
+ c = t_embed + delta_embed # (B, 1, D)
433
+
434
+
435
+ freqs_cos, freqs_sin = self.get_rope(
436
+ int(H/self.patch_size), int(W/self.patch_size), attn_type
437
+ )
438
+
439
+ for i, block in enumerate(self.blocks):
440
+ if not self.use_checkpoint or jvp:
441
+ x = block(x, y, c, freqs_cos, freqs_sin, attn_type, delta_embed) # (B, N, D)
442
+ else:
443
+ x = torch.utils.checkpoint.checkpoint(
444
+ self.ckpt_wrapper(block), x, y, c, freqs_cos, freqs_sin, attn_type, delta_embed
445
+ )
446
+ if (i + 1) == self.encoder_depth:
447
+ h_proj = self.projector(x)
448
+ x = self.final_layer(x, c) # (B, N, patch_size ** 2 * out_channels)
449
+ x = self.unpatchify(x, H, W) # (b, out_channels, H, W)
450
+
451
+ if return_zs:
452
+ return x, h_proj
453
+ else:
454
+ return x
455
+
456
+ def get_delta_embed(self, t, r):
457
+ if self.use_new_embed:
458
+ delta_embedder = self.delta_embedder
459
+ else:
460
+ delta_embedder = self.t_embedder
461
+ if self.new_condition == 't-r':
462
+ delta_embed = delta_embedder(t-r)
463
+ elif self.new_condition == 'r':
464
+ delta_embed = delta_embedder(r)
465
+ elif self.new_condition == 't,r':
466
+ delta_embed = self.t_embedder(t) + delta_embedder(r)
467
+ elif self.new_condition == 't,t-r':
468
+ delta_embed = self.t_embedder(t) + delta_embedder(t-r)
469
+ elif self.new_condition == 'r,t-r':
470
+ delta_embed = self.t_embedder(r) + delta_embedder(t-r)
471
+ elif self.new_condition == 't,r,t-r':
472
+ delta_embed = self.t_embedder(t) + self.t_embedder(r) + delta_embedder(t-r)
473
+ else:
474
+ raise NotImplementedError
475
+ return delta_embed
476
+
477
+ def ckpt_wrapper(self, module):
478
+ def ckpt_forward(*inputs):
479
+ outputs = module(*inputs)
480
+ return outputs
481
+ return ckpt_forward
482
+
483
+
484
+ @property
485
+ def dtype(self) -> torch.dtype:
486
+ """
487
+ `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
488
+ """
489
+ return get_parameter_dtype(self)
490
+
491
+
492
+
493
+
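A small sanity check of the timestep embedding defined above (the full TiM forward additionally needs caption features and the RoPE utilities, and note that this module imports flash_attn at the top, so that package must be installed even for the fused or naive attention paths; hidden_size=256 is an arbitrary illustration):

import torch
# from tim.models.t2i.tim_model import TimestepEmbedder

t_embedder = TimestepEmbedder(hidden_size=256)
t = torch.rand(8)              # scalar diffusion timesteps
print(t_embedder(t).shape)     # torch.Size([8, 256])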
tim/models/utils/funcs.py ADDED
@@ -0,0 +1,53 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from torch import Tensor
5
+ from typing import List, Tuple
6
+ from itertools import chain
7
+
8
+
9
+
10
+ def expand_t_like_x(t, x):
11
+ """Function to reshape time t to broadcastable dimension of x
12
+ Args:
13
+ t: [batch_dim,], time vector
14
+ x: [batch_dim,...], data point
15
+ """
16
+ dims = [1] * (len(x.size()) - 1)
17
+ t = t.view(t.size(0), *dims)
18
+ return t
19
+
20
+
21
+ def build_mlp(hidden_size, projector_dim, z_dim):
22
+ return nn.Sequential(
23
+ nn.Linear(hidden_size, projector_dim),
24
+ nn.SiLU(),
25
+ nn.Linear(projector_dim, projector_dim),
26
+ nn.SiLU(),
27
+ nn.Linear(projector_dim, z_dim),
28
+ )
29
+
30
+ def modulate(x, shift, scale):
31
+ return x * (1 + scale) + shift
32
+
33
+
34
+ def get_parameter_dtype(parameter: torch.nn.Module):
35
+ try:
36
+ params = tuple(parameter.parameters())
37
+ if len(params) > 0:
38
+ return params[0].dtype
39
+
40
+ buffers = tuple(parameter.buffers())
41
+ if len(buffers) > 0:
42
+ return buffers[0].dtype
43
+
44
+ except StopIteration:
45
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
46
+
47
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
48
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
49
+ return tuples
50
+
51
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
52
+ first_tuple = next(gen)
53
+ return first_tuple[1].dtype
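For reference, modulate() above implements the standard adaLN shift/scale transform x * (1 + scale) + shift; a tiny numeric check:

import torch
# from tim.models.utils.funcs import modulate

x = torch.ones(1, 4, 8)
shift = torch.zeros(1, 1, 8)
scale = torch.full((1, 1, 8), 0.5)
print(modulate(x, shift, scale).unique())  # tensor([1.5000])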
tim/models/utils/norms.py ADDED
@@ -0,0 +1,403 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+
9
+ from functools import partial
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ import triton
15
+ import triton.language as tl
16
+ import torch.nn.functional as F
17
+
18
+
19
+ def create_norm(norm_type: str, dim: int, eps: float = 1e-6):
20
+ """
21
+ Creates the specified normalization layer based on the norm_type.
22
+
23
+ Args:
24
+ norm_type (str): The type of normalization layer to create.
25
+ Supported types: layernorm, layernorm_32, np_layernorm, np_layernorm_32, rmsnorm, np_rmsnorm, fused_rmsnorm, fused_rmsnorm_32.
26
+ dim (int): The dimension of the normalization layer.
27
+ eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6.
28
+
29
+ Returns:
30
+ The created normalization layer.
31
+
32
+ Note:
+ Unknown or empty norm_type values fall back to nn.Identity() rather than raising.
34
+ """
35
+ if norm_type is None or norm_type == "":
36
+ return nn.Identity()
37
+ norm_type = norm_type.lower() # Normalize to lowercase
38
+
39
+ if norm_type == "layernorm":
40
+ return nn.LayerNorm(dim, eps=eps, bias=False)
41
+ elif norm_type == "np_layernorm":
42
+ return nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False)
43
+ elif norm_type == "np_layernorm_32":
44
+ return FP32_Layernorm(dim, eps=eps, elementwise_affine=False, bias=True)
45
+ elif norm_type == "layernorm_32":
46
+ return FP32_Layernorm(dim, eps=eps, bias=True)
47
+ elif norm_type == "rmsnorm":
48
+ return RMSNorm(dim, include_weight=True, eps=eps)
49
+ elif norm_type == "np_rmsnorm":
50
+ return RMSNorm(dim, include_weight=False, eps=1e-6)
51
+ elif norm_type == "fused_rmsnorm":
52
+ return FusedRMSNorm(dim, eps=1/65536)
53
+ elif norm_type == "fused_rmsnorm_32":
54
+ return FusedRMSNorm32(dim, eps=1e-6)
55
+ elif norm_type == 'none':
56
+ return nn.Identity()
57
+ else:
58
+ return nn.Identity()
59
+
60
+ class FP32_Layernorm(nn.LayerNorm):
61
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
62
+ origin_dtype = inputs.dtype
63
+ if self.bias is None and self.weight is None:
64
+ return F.layer_norm(
65
+ input=inputs.float(),
66
+ normalized_shape=self.normalized_shape,
67
+ eps=self.eps
68
+ ).to(origin_dtype)
69
+ elif self.bias is None:
70
+ return F.layer_norm(
71
+ input=inputs.float(),
72
+ normalized_shape=self.normalized_shape,
73
+ weight=self.weight.float(),
74
+ eps=self.eps
75
+ ).to(origin_dtype)
76
+ else:
77
+ return F.layer_norm(
78
+ input=inputs.float(),
79
+ normalized_shape=self.normalized_shape,
80
+ weight=self.weight.float(),
81
+ bias=self.bias.float(),
82
+ eps=self.eps
83
+ ).to(origin_dtype)
84
+
85
+ class FusedRMSNorm(nn.Module):
86
+ """Fused RMS Norm, wraps a fused Triton Kernel"""
87
+
88
+ def __init__(
89
+ self,
90
+ dim: int,
91
+ eps: float = 1e-6,
92
+ ):
93
+ super().__init__()
94
+ self.eps = eps
95
+ self.weight = nn.Parameter(torch.ones(dim))
96
+ self.fused_rms_norm_fn = fused_rms_norm_fn
97
+
98
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
99
+ """leverages Triton Fused RMS Norm kernel"""
100
+ return self.fused_rms_norm_fn(
101
+ x,
102
+ self.weight,
103
+ eps=self.eps,
104
+ )
105
+
106
+ def reset_parameters(self):
107
+ torch.nn.init.ones_(self.weight) # type: ignore
108
+
109
+ class FusedRMSNorm32(nn.Module):
110
+ """Fused RMS Norm, wraps a fused Triton Kernel"""
111
+
112
+ def __init__(
113
+ self,
114
+ dim: int,
115
+ eps: float = 1e-6,
116
+ ):
117
+ super().__init__()
118
+ self.eps = eps
119
+ self.weight = nn.Parameter(torch.ones(dim))
120
+ self.fused_rms_norm_fn = fused_rms_norm_fn
121
+
122
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
123
+ """leverages Triton Fused RMS Norm kernel"""
124
+ dtype = x.dtype
125
+ return self.fused_rms_norm_fn(
126
+ x.to(torch.float32),
127
+ self.weight,
128
+ eps=self.eps,
129
+ ).to(dtype)
130
+
131
+ def reset_parameters(self):
132
+ torch.nn.init.ones_(self.weight) # type: ignore
133
+
134
+ class RMSNorm(nn.Module):
135
+ def __init__(self, dim: int, include_weight: bool = True, eps: float = 1e-6, **block_kwargs):
136
+ """
137
+ Initialize the RMSNorm normalization layer.
138
+
139
+ Args:
140
+ dim (int): The dimension of the input tensor.
141
+ include_weight: bool: Whether include weight in the normalization
142
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
143
+
144
+ Attributes:
145
+ eps (float): A small value added to the denominator for numerical stability.
146
+ weight (nn.Parameter): Learnable scaling parameter.
147
+
148
+ """
149
+ super().__init__()
150
+ self.eps = eps
151
+ if include_weight:
152
+ self.weight = nn.Parameter(torch.ones(dim))
153
+ else:
154
+ self.weight = None
155
+
156
+ def _norm(self, x):
157
+ """
158
+ Apply the RMSNorm normalization to the input tensor.
159
+
160
+ Args:
161
+ x (torch.Tensor): The input tensor.
162
+
163
+ Returns:
164
+ torch.Tensor: The normalized tensor.
165
+
166
+ """
167
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
168
+
169
+ def forward(self, x):
170
+ """
171
+ Forward pass through the RMSNorm layer.
172
+
173
+ Args:
174
+ x (torch.Tensor): The input tensor.
175
+
176
+ Returns:
177
+ torch.Tensor: The output tensor after applying RMSNorm.
178
+
179
+ """
180
+ output = self._norm(x.float()).type_as(x)
181
+ if self.weight is None:
182
+ return output
183
+ else:
184
+ return output * self.weight
185
+
186
+
187
+
188
+ # FusedRMSNorm in Triton
189
+
190
+ # Credit
191
+ # Tri Dao's Triton LayerNorm: https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/triton/layer_norm.py
192
+ # Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
193
+
194
+
195
+ @triton.autotune(
196
+ configs=[
197
+ triton.Config({}, num_warps=1),
198
+ triton.Config({}, num_warps=2),
199
+ triton.Config({}, num_warps=4),
200
+ triton.Config({}, num_warps=8),
201
+ triton.Config({}, num_warps=16),
202
+ triton.Config({}, num_warps=32),
203
+ ],
204
+ key=["N"],
205
+ )
206
+ @triton.jit
207
+ def _rms_norm_fwd_kernel(
208
+ X,
209
+ stride_x,
210
+ Y,
211
+ stride_y,
212
+ W,
213
+ Rstd,
214
+ eps,
215
+ M, # num rows
216
+ N, # num cols
217
+ block_N: tl.constexpr,
218
+ ):
219
+ row = tl.program_id(0)
220
+ cols = tl.arange(0, block_N)
221
+
222
+ # Load input data and weights
223
+ mask = cols < N
224
+ x = tl.load(X + row * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
225
+ w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)
226
+
227
+ # Compute mean and variance
228
+ xbar = tl.where(cols < N, x, 0.0)
229
+ var = tl.sum(xbar * xbar, axis=0) / N
230
+ rstd = 1 / tl.sqrt(var + eps)
231
+
232
+ # Store the reciprocal standard deviation
233
+ tl.store(Rstd + row, rstd)
234
+
235
+ # Normalize and apply linear transformation
236
+ x_hat = x * rstd
237
+ y = x_hat * w
238
+
239
+ # Write output
240
+ tl.store(Y + row * stride_y + cols, y, mask=mask)
241
+
242
+
243
+ @triton.autotune(
244
+ configs=[
245
+ triton.Config({}, num_warps=1),
246
+ triton.Config({}, num_warps=2),
247
+ triton.Config({}, num_warps=4),
248
+ triton.Config({}, num_warps=8),
249
+ triton.Config({}, num_warps=16),
250
+ triton.Config({}, num_warps=32),
251
+ ],
252
+ key=["N"],
253
+ )
254
+ @triton.jit
255
+ def _rms_norm_bwd_kernel_sm(
256
+ X,
257
+ stride_x,
258
+ W,
259
+ DY,
260
+ stride_dy,
261
+ DX,
262
+ stride_dx,
263
+ Rstd,
264
+ DW,
265
+ eps,
266
+ M, # num rows
267
+ N, # num cols
268
+ rows_per_program,
269
+ block_N: tl.constexpr,
270
+ ):
271
+ row_block_id = tl.program_id(0)
272
+ row_start = row_block_id * rows_per_program
273
+ cols = tl.arange(0, block_N)
274
+ mask = cols < N
275
+
276
+ # Load weights
277
+ w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)
278
+
279
+ # Accumulate gradients for weights
280
+ dw = tl.zeros((block_N,), dtype=tl.float32)
281
+
282
+ row_end = min(row_start + rows_per_program, M)
283
+ for row in range(row_start, row_end):
284
+ # Load input, output gradient, and reciprocal standard deviation
285
+ x = tl.load(X + row * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
286
+ dy = tl.load(DY + row * stride_dy + cols, mask=mask, other=0.0).to(tl.float32)
287
+ rstd = tl.load(Rstd + row)
288
+
289
+ # Compute normalized input and gradients
290
+ x_hat = x * rstd
291
+ wdy = w * dy
292
+ dw += dy * x_hat
293
+ c1 = tl.sum(x_hat * wdy, axis=0) / N
294
+ dx = (wdy - x_hat * c1) * rstd
295
+
296
+ # Store input gradient
297
+ tl.store(DX + row * stride_dx + cols, dx, mask=mask)
298
+
299
+ # Store weight gradients
300
+ tl.store(DW + row_block_id * N + cols, dw, mask=mask)
301
+
302
+
303
+ class TritonFusedRMSNorm(torch.autograd.Function):
304
+ @staticmethod
305
+ def forward(ctx, x, weight, eps):
306
+ x_shape_start = x.shape
307
+
308
+ # Flatten input
309
+ x = x.view(-1, x.shape[-1])
310
+ if x.stride(-1) != 1:
311
+ x = x.contiguous()
312
+ if weight.stride(-1) != 1:
313
+ weight = weight.contiguous()
314
+
315
+ M, N = x.shape
316
+ y = torch.empty_like(x)
317
+ rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
318
+
319
+ max_size = 65536 // x.element_size()
320
+ block_N = min(max_size, triton.next_power_of_2(N))
321
+
322
+ if N > block_N:
323
+ raise ValueError(f"N {N} must be <= {block_N=}")
324
+
325
+ grid = lambda meta: (M,)
326
+ _rms_norm_fwd_kernel[grid](
327
+ x,
328
+ x.stride(0),
329
+ y,
330
+ y.stride(0),
331
+ weight,
332
+ rstd,
333
+ eps,
334
+ M,
335
+ N,
336
+ block_N,
337
+ )
338
+
339
+ ctx.eps = eps
340
+ ctx.save_for_backward(x, weight, rstd)
341
+ ctx.x_shape_start = x_shape_start
342
+
343
+ y = y.reshape(x_shape_start)
344
+ return y
345
+
346
+ @staticmethod
347
+ def backward(ctx, dy):
348
+ x, weight, rstd = ctx.saved_tensors
349
+ eps = ctx.eps
350
+ x_shape_start = ctx.x_shape_start
351
+
352
+ # Flatten input and output gradients
353
+ dy = dy.view(-1, dy.shape[-1])
354
+ if dy.stride(-1) != 1:
355
+ dy = dy.contiguous()
356
+
357
+ M, N = dy.shape
358
+ dx = torch.empty_like(x)
359
+ dw = torch.empty_like(weight)
360
+
361
+ sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
362
+ _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
363
+
364
+ max_size = 65536 // x.element_size()
365
+ block_N = min(max_size, triton.next_power_of_2(N))
366
+ rows_per_sm = math.ceil(M / sm_count)
367
+
368
+ if N > block_N:
369
+ raise ValueError(f"N {N} must be <= {block_N=}")
370
+
371
+ grid = lambda meta: (sm_count,)
372
+ _rms_norm_bwd_kernel_sm[grid](
373
+ x,
374
+ x.stride(0),
375
+ weight,
376
+ dy,
377
+ dy.stride(0),
378
+ dx,
379
+ dx.stride(0),
380
+ rstd,
381
+ _dw,
382
+ eps,
383
+ M,
384
+ N,
385
+ rows_per_sm,
386
+ block_N,
387
+ )
388
+ dw = _dw.sum(0).to(weight.dtype)
389
+ dx = dx.view(x_shape_start)
390
+ return dx, dw, None
391
+
392
+
393
+ # expose the fused RMS norm as a function
394
+ def fused_rms_norm_fn(
395
+ x,
396
+ weight,
397
+ eps=1e-6,
398
+ ):
399
+ return TritonFusedRMSNorm.apply(
400
+ x,
401
+ weight,
402
+ eps,
403
+ )
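A minimal usage sketch for the norm layers above (a sketch only: it assumes a CUDA device with Triton installed, and the import path simply mirrors this repo's tim/models/utils/norms.py; the all-ones weight is just the default initialization):

import torch
from tim.models.utils.norms import RMSNorm, fused_rms_norm_fn

dim = 1024
x = torch.randn(8, 256, dim, device="cuda")

# Reference path: the plain PyTorch RMSNorm with its default all-ones weight.
ref = RMSNorm(dim).to("cuda")
y_ref = ref(x)

# Fused path: the Triton kernel wrapped by TritonFusedRMSNorm.
y_fused = fused_rms_norm_fn(x, ref.weight, eps=1e-6)

# Both paths compute x * rsqrt(mean(x^2) + eps) * weight,
# so they should agree up to floating-point tolerance.
print(torch.allclose(y_ref, y_fused, atol=1e-5, rtol=1e-5))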
tim/models/utils/rope.py ADDED
@@ -0,0 +1,305 @@
1
+ # --------------------------------------------------------
2
+ # FiT: A Flexible Vision Transformer for Image Generation
3
+ #
4
+ # Based on the following repositories:
5
+ # https://github.com/lucidrains/rotary-embedding-torch
6
+ # https://github.com/jquesnelle/yarn/blob/HEAD/scaled_rope
7
+ # https://colab.research.google.com/drive/1VI2nhlyKvd5cw4-zHvAIk00cAVj2lCCC#scrollTo=b80b3f37
8
+ # --------------------------------------------------------
9
+
10
+ import math
11
+ from math import pi
12
+ from typing import Optional, Any, Union, Tuple
13
+ import torch
14
+ from torch import nn
15
+
16
+ from einops import rearrange, repeat
17
+ from functools import lru_cache
18
+
19
+ #################################################################################
20
+ # NTK Operations #
21
+ #################################################################################
22
+
23
+ def find_correction_factor(num_rotations, dim, base=10000, max_position_embeddings=2048):
24
+ return (dim * math.log(max_position_embeddings/(num_rotations * 2 * math.pi)))/(2 * math.log(base)) #Inverse dim formula to find number of rotations
25
+
26
+ def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048):
27
+ low = math.floor(find_correction_factor(low_rot, dim, base, max_position_embeddings))
28
+ high = math.ceil(find_correction_factor(high_rot, dim, base, max_position_embeddings))
29
+ return max(low, 0), min(high, dim-1) #Clamp values just in case
30
+
31
+ def linear_ramp_mask(min, max, dim):
32
+ if min == max:
33
+ max += 0.001 #Prevent singularity
34
+
35
+ linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
36
+ ramp_func = torch.clamp(linear_func, 0, 1)
37
+ return ramp_func
38
+
39
+ def find_newbase_ntk(dim, base=10000, scale=1):
40
+ # Base change formula
41
+ return base * scale ** (dim / (dim-2))
42
+
43
+ def get_mscale(scale: torch.Tensor):
44
+ # if scale <= 1:
45
+ # return 1.0
46
+ # return 0.1 * math.log(scale) + 1.0
47
+ return torch.where(scale <= 1., torch.tensor(1.0), 0.1 * torch.log(scale) + 1.0)
48
+
49
+ def get_proportion(L_test, L_train):
50
+ L_test = L_test * 2
51
+ return torch.where(torch.tensor(L_test/L_train) <= 1., torch.tensor(1.0), torch.sqrt(torch.log(torch.tensor(L_test))/torch.log(torch.tensor(L_train))))
52
+ # return torch.sqrt(torch.log(torch.tensor(L_test))/torch.log(torch.tensor(L_train)))
53
+
54
+
55
+
56
+ #################################################################################
57
+ # Rotate Q or K #
58
+ #################################################################################
59
+
60
+ def rotate_half(x):
61
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
62
+ x1, x2 = x.unbind(dim = -1)
63
+ x = torch.stack((-x2, x1), dim = -1)
64
+ return rearrange(x, '... d r -> ... (d r)')
65
+
66
+
67
+
68
+ #################################################################################
69
+ # Core Vision RoPE #
70
+ #################################################################################
71
+
72
+ class VisionRotaryEmbedding(nn.Module):
73
+ def __init__(
74
+ self,
75
+ head_dim: int, # embed dimension for each head
76
+ custom_freqs: str = 'normal',
77
+ theta: int = 10000,
78
+ online_rope: bool = False,
79
+ max_cached_len: int = 1024,
80
+ max_pe_len_h: Optional[int] = None,
81
+ max_pe_len_w: Optional[int] = None,
82
+ decouple: bool = False,
83
+ ori_max_pe_len: Optional[int] = None,
84
+ ):
85
+ super().__init__()
86
+
87
+ dim = head_dim // 2
88
+ assert dim % 2 == 0 # actually, this is important
89
+ self.dim = dim
90
+ self.custom_freqs = custom_freqs.lower()
91
+ self.theta = theta
92
+ self.decouple = decouple
93
+ self.ori_max_pe_len = ori_max_pe_len
94
+
95
+ self.custom_freqs = custom_freqs.lower()
96
+ if not online_rope:
97
+ if self.custom_freqs in ['normal', 'scale1', 'scale2']:
98
+ freqs_h = 1. / (theta ** (torch.arange(0, dim, 2).float() / dim))
99
+ freqs_w = 1. / (theta ** (torch.arange(0, dim, 2).float() / dim))
100
+ else:
101
+ if decouple:
102
+ freqs_h = self.get_1d_rope_freqs(theta, dim, max_pe_len_h, ori_max_pe_len)
103
+ freqs_w = self.get_1d_rope_freqs(theta, dim, max_pe_len_w, ori_max_pe_len)
104
+ else:
105
+ max_pe_len = max(max_pe_len_h, max_pe_len_w)
106
+ freqs_h = self.get_1d_rope_freqs(theta, dim, max_pe_len, ori_max_pe_len)
107
+ freqs_w = self.get_1d_rope_freqs(theta, dim, max_pe_len, ori_max_pe_len)
108
+
109
+ self.register_buffer('freqs_h', freqs_h, persistent=False)
110
+ self.register_buffer('freqs_w', freqs_w, persistent=False)
111
+
112
+ if max_pe_len_h is not None and max_pe_len_w is not None and ori_max_pe_len is not None:
113
+ attn_factor = 1.0
114
+ scale = torch.clamp_min(torch.tensor(max(max_pe_len_h, max_pe_len_w)) / ori_max_pe_len, 1.0) # dynamic scale
115
+ self.mscale = get_mscale(scale).to(scale) * attn_factor # Get n-d magnitude scaling corrected for interpolation
116
+ self.proportion1 = get_proportion(max(max_pe_len_h, max_pe_len_w), ori_max_pe_len)
117
+ self.proportion2 = get_proportion(max_pe_len_h * max_pe_len_w, ori_max_pe_len ** 2)
118
+
119
+
120
+ freqs_h_cached = torch.einsum('..., f -> ... f', torch.arange(max_cached_len), self.freqs_h)
121
+ freqs_h_cached = repeat(freqs_h_cached, '... n -> ... (n r)', r = 2)
122
+ self.register_buffer('freqs_h_cached', freqs_h_cached, persistent=False)
123
+ freqs_w_cached = torch.einsum('..., f -> ... f', torch.arange(max_cached_len), self.freqs_w)
124
+ freqs_w_cached = repeat(freqs_w_cached, '... n -> ... (n r)', r = 2)
125
+ self.register_buffer('freqs_w_cached', freqs_w_cached, persistent=False)
126
+
127
+
128
+ def get_1d_rope_freqs(self, theta, dim, max_pe_len, ori_max_pe_len):
129
+ # scaling operations for extrapolation
130
+ assert isinstance(ori_max_pe_len, int)
131
+ # scale = max_pe_len / ori_max_pe_len
132
+ if not isinstance(max_pe_len, torch.Tensor):
133
+ max_pe_len = torch.tensor(max_pe_len)
134
+ scale = torch.clamp_min(max_pe_len / ori_max_pe_len, 1.0) # dynamic scale
135
+
136
+ if self.custom_freqs == 'linear': # equal to position interpolation
137
+ freqs = 1. / torch.einsum('..., f -> ... f', scale, theta ** (torch.arange(0, dim, 2).float() / dim))
138
+ elif self.custom_freqs in ('ntk-aware', 'ntk-aware-pro1', 'ntk-aware-pro2'):
139
+ freqs = 1. / torch.pow(
140
+ find_newbase_ntk(dim, theta, scale).view(-1, 1),
141
+ (torch.arange(0, dim, 2).to(scale).float() / dim)
142
+ ).squeeze()
143
+ elif self.custom_freqs == 'ntk-by-parts':
144
+ #Interpolation constants found experimentally for LLaMA (might not be totally optimal though)
145
+ #Do not change unless there is a good reason for doing so!
146
+ beta_0 = 1.25
147
+ beta_1 = 0.75
148
+ gamma_0 = 16
149
+ gamma_1 = 2
150
+ ntk_factor = 1
151
+ extrapolation_factor = 1
152
+
153
+ #Three RoPE extrapolation/interpolation methods
154
+ freqs_base = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
155
+ freqs_linear = 1.0 / torch.einsum('..., f -> ... f', scale, (theta ** (torch.arange(0, dim, 2).to(scale).float() / dim)))
156
+ freqs_ntk = 1. / torch.pow(
157
+ find_newbase_ntk(dim, theta, scale).view(-1, 1),
158
+ (torch.arange(0, dim, 2).to(scale).float() / dim)
159
+ ).squeeze()
160
+
161
+ #Combine NTK and Linear
162
+ low, high = find_correction_range(beta_0, beta_1, dim, theta, ori_max_pe_len)
163
+ freqs_mask = (1 - linear_ramp_mask(low, high, dim // 2).to(scale)) * ntk_factor
164
+ freqs = freqs_linear * (1 - freqs_mask) + freqs_ntk * freqs_mask
165
+
166
+ #Combine Extrapolation and NTK and Linear
167
+ low, high = find_correction_range(gamma_0, gamma_1, dim, theta, ori_max_pe_len)
168
+ freqs_mask = (1 - linear_ramp_mask(low, high, dim // 2).to(scale)) * extrapolation_factor
169
+ freqs = freqs * (1 - freqs_mask) + freqs_base * freqs_mask
170
+
171
+ elif self.custom_freqs == 'yarn':
172
+ #Interpolation constants found experimentally for LLaMA (might not be totally optimal though)
173
+ #Do not change unless there is a good reason for doing so!
174
+ beta_fast = 32
175
+ beta_slow = 1
176
+ extrapolation_factor = 1
177
+
178
+ freqs_extrapolation = 1.0 / (theta ** (torch.arange(0, dim, 2).to(scale).float() / dim))
179
+ freqs_interpolation = 1.0 / torch.einsum('..., f -> ... f', scale, (theta ** (torch.arange(0, dim, 2).to(scale).float() / dim)))
180
+
181
+ low, high = find_correction_range(beta_fast, beta_slow, dim, theta, ori_max_pe_len)
182
+ freqs_mask = (1 - linear_ramp_mask(low, high, dim // 2).to(scale).float()) * extrapolation_factor # Get n-d rotational scaling corrected for extrapolation
183
+ freqs = freqs_interpolation * (1 - freqs_mask) + freqs_extrapolation * freqs_mask
184
+ else:
185
+ raise ValueError(f"Unknown custom_freqs mode {self.custom_freqs}. Supported modes: normal, linear, ntk-aware, ntk-aware-pro1, ntk-aware-pro2, ntk-by-parts, yarn.")
186
+ return freqs
187
+
188
+
189
+ def online_get_2d_rope_from_grid(self, grid, size):
190
+ '''
191
+ grid: (B, 2, N)
192
+ N = H * W
193
+ the first dimension represents width, and the second represents height
194
+ e.g., [0. 1. 2. 3. 0. 1. 2. 3. 0. 1. 2. 3.]
195
+ [0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2.]
196
+ size: (B, 1, 2), h goes first and w goes last
197
+ '''
198
+ size = size.squeeze() # (B, 1, 2) -> (B, 2)
199
+ if self.decouple:
200
+ size_h = size[:, 0]
201
+ size_w = size[:, 1]
202
+ freqs_h = self.get_1d_rope_freqs(self.theta, self.dim, size_h, self.ori_max_pe_len)
203
+ freqs_w = self.get_1d_rope_freqs(self.theta, self.dim, size_w, self.ori_max_pe_len)
204
+ else:
205
+ size_max = torch.max(size[:, 0], size[:, 1])
206
+ freqs_h = self.get_1d_rope_freqs(self.theta, self.dim, size_max, self.ori_max_pe_len)
207
+ freqs_w = self.get_1d_rope_freqs(self.theta, self.dim, size_max, self.ori_max_pe_len)
208
+ freqs_w = grid[:, 0][..., None] * freqs_w[:, None, :]
209
+ freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)
210
+
211
+ freqs_h = grid[:, 1][..., None] * freqs_h[:, None, :]
212
+ freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)
213
+
214
+ freqs = torch.cat([freqs_h, freqs_w], dim=-1) # (B, N, D)
215
+
216
+ if self.custom_freqs == 'yarn':
217
+ freqs_cos = freqs.cos() * self.mscale[:, None, None]
218
+ freqs_sin = freqs.sin() * self.mscale[:, None, None]
219
+ elif self.custom_freqs == 'ntk-aware-pro1':
220
+ freqs_cos = freqs.cos() * self.proportion1[:, None, None]
221
+ freqs_sin = freqs.sin() * self.proportion1[:, None, None]
222
+ elif self.custom_freqs == 'ntk-aware-pro2':
223
+ freqs_cos = freqs.cos() * self.proportion2[:, None, None]
224
+ freqs_sin = freqs.sin() * self.proportion2[:, None, None]
225
+ else:
226
+ freqs_cos = freqs.cos()
227
+ freqs_sin = freqs.sin()
228
+
229
+ return freqs_cos, freqs_sin
230
+
231
+ @lru_cache()
232
+ def get_2d_rope_from_grid(self, grid):
233
+ '''
234
+ grid: (B, 2, N)
235
+ N = H * W
236
+ the first dimension represents width, and the second represents height
237
+ e.g., [0. 1. 2. 3. 0. 1. 2. 3. 0. 1. 2. 3.]
238
+ [0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2.]
239
+ '''
240
+ freqs_h = torch.einsum('..., f -> ... f', grid[:, 0], self.freqs_h)
241
+ freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)
242
+ freqs_w = torch.einsum('..., f -> ... f', grid[:, 1], self.freqs_w)
243
+ freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)
244
+
245
+ freqs = torch.cat([freqs_h, freqs_w], dim=-1) # (B, N, D)
246
+
247
+ if self.custom_freqs == 'yarn':
248
+ freqs_cos = freqs.cos() * self.mscale
249
+ freqs_sin = freqs.sin() * self.mscale
250
+ elif self.custom_freqs in ['ntk-aware-pro1', 'scale1']:
251
+ freqs_cos = freqs.cos() * self.proportion1
252
+ freqs_sin = freqs.sin() * self.proportion1
253
+ elif self.custom_freqs in ['ntk-aware-pro2', 'scale2']:
254
+ freqs_cos = freqs.cos() * self.proportion2
255
+ freqs_sin = freqs.sin() * self.proportion2
256
+ else:
257
+ freqs_cos = freqs.cos()
258
+ freqs_sin = freqs.sin()
259
+
260
+ return freqs_cos, freqs_sin
261
+
262
+ @lru_cache()
263
+ def get_cached_2d_rope_from_grid(self, grid: torch.Tensor):
264
+ '''
265
+ grid: (B, 2, N)
266
+ N = H * W
267
+ the first dimension represents width, and the second represents height
268
+ e.g., [0. 1. 2. 3. 0. 1. 2. 3. 0. 1. 2. 3.]
269
+ [0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2.]
270
+ '''
271
+ if len(grid.shape) == 3: # (B, 2, N)
272
+ freqs_h, freqs_w = self.freqs_h_cached[grid[:, 0]], self.freqs_w_cached[grid[:, 1]]
273
+ elif len(grid.shape) == 2: # (2, N)
274
+ freqs_h, freqs_w = self.freqs_h_cached[grid[0]], self.freqs_w_cached[grid[1]]
275
+ freqs = torch.cat([freqs_h, freqs_w], dim=-1) # (B, N, D)
276
+
277
+ if self.custom_freqs == 'yarn':
278
+ freqs_cos = freqs.cos() * self.mscale
279
+ freqs_sin = freqs.sin() * self.mscale
280
+ elif self.custom_freqs in ['ntk-aware-pro1', 'scale1']:
281
+ freqs_cos = freqs.cos() * self.proportion1
282
+ freqs_sin = freqs.sin() * self.proportion1
283
+ elif self.custom_freqs in ['ntk-aware-pro2', 'scale2']:
284
+ freqs_cos = freqs.cos() * self.proportion2
285
+ freqs_sin = freqs.sin() * self.proportion2
286
+ else:
287
+ freqs_cos = freqs.cos()
288
+ freqs_sin = freqs.sin()
289
+
290
+ return freqs_cos, freqs_sin
291
+
292
+
293
+ def forward(self, x, grid):
294
+ '''
295
+ x: (B, n_head, N, D)
296
+ grid: (B, 2, N)
297
+ '''
298
+ # freqs_cos, freqs_sin = self.get_2d_rope_from_grid(grid)
299
+ # freqs_cos, freqs_sin = freqs_cos.unsqueeze(1), freqs_sin.unsqueeze(1)
300
+ # using cache to accelerate, this is the same with the above codes:
301
+ freqs_cos, freqs_sin = self.get_cached_2d_rope_from_grid(grid)
302
+ freqs_cos, freqs_sin = freqs_cos.unsqueeze(1), freqs_sin.unsqueeze(1)
303
+ return x * freqs_cos + rotate_half(x) * freqs_sin
304
+
305
+
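A minimal sketch of how VisionRotaryEmbedding might be applied to attention queries or keys (a sketch under assumed defaults: 'normal' frequencies and the cached path; the import path mirrors tim/models/utils/rope.py, and the grid layout follows the docstrings, with the width index in row 0 and the height index in row 1):

import torch
from tim.models.utils.rope import VisionRotaryEmbedding

B, n_head, head_dim = 2, 8, 64       # head_dim // 2 must be even
H = W = 16                           # 16 x 16 patch grid -> N = 256 tokens
rope = VisionRotaryEmbedding(head_dim=head_dim, custom_freqs='normal')

# Grid layout from the docstrings: row 0 holds the width (x) index of each
# token, row 1 holds the height (y) index, flattened in row-major order.
ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing='ij')
grid = torch.stack([xs.flatten(), ys.flatten()], dim=0)   # (2, N)
grid = grid.unsqueeze(0).expand(B, -1, -1)                # (B, 2, N)

q = torch.randn(B, n_head, H * W, head_dim)
q_rot = rope(q, grid)                # rotary positions applied, same shape
print(q_rot.shape)                   # torch.Size([2, 8, 256, 64])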
tim/models/utils/text_encoders.py ADDED
@@ -0,0 +1,63 @@
1
+ import os
2
+ import torch
3
+ from transformers import T5EncoderModel, AutoModelForCausalLM, AutoTokenizer
4
+
5
+
6
+ # load text-encoder
7
+ def load_text_encoder(text_encoder_dir, device, weight_dtype):
8
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
9
+ tokenizer = AutoTokenizer.from_pretrained(text_encoder_dir)
10
+ if "gemma" in text_encoder_dir:
11
+ tokenizer.padding_side = "right"
12
+ text_encoder = AutoModelForCausalLM.from_pretrained(
13
+ text_encoder_dir,
14
+ attn_implementation="flash_attention_2",
15
+ device_map="cpu",
16
+ torch_dtype=weight_dtype,
17
+ ).model
18
+ elif "t5" in text_encoder_dir:
19
+ text_encoder = T5EncoderModel.from_pretrained(
20
+ text_encoder_dir,
21
+ attn_implementation="sdpa",
22
+ device_map="cpu",
23
+ torch_dtype=weight_dtype,
24
+ )
25
+ else:
26
+ raise NotImplementedError
27
+ text_encoder.requires_grad_(False)
28
+ text_encoder = text_encoder.eval().to(device=device, dtype=weight_dtype)
29
+
30
+ return text_encoder, tokenizer
31
+
32
+
33
+ def encode_prompt(
34
+ tokenizer,
35
+ text_encoder,
36
+ device,
37
+ weight_dtype,
38
+ captions,
39
+ use_last_hidden_state,
40
+ max_seq_length=256,
41
+ ):
42
+ text_inputs = tokenizer(
43
+ captions,
44
+ padding="max_length",
45
+ max_length=max_seq_length,
46
+ truncation=True,
47
+ return_tensors="pt",
48
+ )
49
+ text_input_ids = text_inputs.input_ids.to(device)
50
+ prompt_masks = text_inputs.attention_mask.to(device)
51
+ with torch.no_grad(), torch.autocast("cuda", dtype=weight_dtype):
52
+ results = text_encoder(
53
+ input_ids=text_input_ids,
54
+ attention_mask=prompt_masks,
55
+ output_hidden_states=True,
56
+ )
57
+
58
+ if use_last_hidden_state:
59
+ prompt_embeds = results.last_hidden_state
60
+ else: # from Imagen paper
61
+ prompt_embeds = results.hidden_states[-2]
62
+
63
+ return prompt_embeds, prompt_masks
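A minimal sketch of how these helpers might be called (a sketch only: the T5 checkpoint name is illustrative, a CUDA device is assumed for the autocast context, and the import path mirrors tim/models/utils/text_encoders.py):

import torch
from tim.models.utils.text_encoders import load_text_encoder, encode_prompt

device = "cuda"
weight_dtype = torch.bfloat16

# Any checkpoint whose name contains "t5" takes the T5EncoderModel branch above.
text_encoder, tokenizer = load_text_encoder("google/t5-v1_1-xl", device, weight_dtype)

prompt_embeds, prompt_masks = encode_prompt(
    tokenizer,
    text_encoder,
    device,
    weight_dtype,
    captions=["a photo of a corgi wearing sunglasses"],
    use_last_hidden_state=False,    # penultimate hidden state, as in the Imagen setup
    max_seq_length=256,
)
print(prompt_embeds.shape, prompt_masks.shape)   # (1, 256, hidden_dim), (1, 256)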