recursionaut
committed on
Commit
•
6ded986
Parent(s):
c1a2b2a
testing files upload (#7)
- testing files upload (f489a598b0d6a46d9a99c819210b220269d9b29b)
- .gitignore +32 -0
- LICENSE +399 -0
- MODELCARD.md +128 -0
- README.md +34 -120
- config.yaml +16 -0
- generate_reconstructions.ipynb +0 -0
- huggingface_mae.py +293 -0
- loss.py +59 -0
- mae_modules.py +273 -0
- mae_utils.py +70 -0
- masking.py +51 -0
- normalizer.py +7 -0
- pyproject.toml +34 -0
- sample/AA41_s1_1.jp2 +0 -0
- sample/AA41_s1_2.jp2 +0 -0
- sample/AA41_s1_3.jp2 +0 -0
- sample/AA41_s1_4.jp2 +0 -0
- sample/AA41_s1_5.jp2 +0 -0
- sample/AA41_s1_6.jp2 +0 -0
- test_huggingface_mae.py +32 -0
- vit.py +309 -0
- vit_encoder.py +61 -0
.gitignore
ADDED
@@ -0,0 +1,32 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# model artifacts
*.pickle
*.ckpt
*.safetensors
LICENSE
ADDED
@@ -0,0 +1,399 @@
Attribution-NonCommercial 4.0 International

=======================================================================

[Standard, unmodified text of the Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0): 399 lines, ending with "Creative Commons may be contacted at creativecommons.org."]
MODELCARD.md
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
---
library_name: transformers
tags: []
---

# Model Card for Phenom CA-MAE-S/16

Channel-agnostic image encoding model designed for microscopy image featurization.
The model uses a vision transformer backbone with channelwise cross-attention over patch tokens to create contextualized representations separately for each channel.


## Model Details

### Model Description

This model is a [channel-agnostic masked autoencoder](https://openaccess.thecvf.com/content/CVPR2024/html/Kraus_Masked_Autoencoders_for_Microscopy_are_Scalable_Learners_of_Cellular_Biology_CVPR_2024_paper.html) trained to reconstruct microscopy images over three datasets:
1. RxRx3
2. JUMP-CP overexpression
3. JUMP-CP gene-knockouts

- **Developed, funded, and shared by:** Recursion
- **Model type:** Vision transformer CA-MAE
- **Image modality:** Optimized for microscopy images from the CellPainting assay
- **License:**


### Model Sources

- **Repository:** [https://github.com/recursionpharma/maes_microscopy](https://github.com/recursionpharma/maes_microscopy)
- **Paper:** [Masked Autoencoders for Microscopy are Scalable Learners of Cellular Biology](https://openaccess.thecvf.com/content/CVPR2024/html/Kraus_Masked_Autoencoders_for_Microscopy_are_Scalable_Learners_of_Cellular_Biology_CVPR_2024_paper.html)


## Uses

NOTE: the model's embeddings tend to extract useful features only after standard batch-correction post-processing. **We recommend**, at a *minimum*, applying the standard `PCA-CenterScale` pattern after running inference over your images, or better yet Typical Variation Normalization; a sketch follows the steps below:

1. Fit a PCA kernel on all the *control images* (or all images, if there are no controls) from across all experimental batches (e.g. the plates of wells from your assay),
2. Transform all the embeddings with that PCA kernel,
3. For each experimental batch, fit a separate StandardScaler on the transformed embeddings of the controls from step 2, then transform the rest of the embeddings from that batch with that StandardScaler.

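The following is a minimal sketch of that recipe using scikit-learn; the variable names (`control_embeddings`, `embeddings_per_batch`) and the `n_components` value are illustrative assumptions, not part of the released code.

```python
# Sketch of the recommended PCA-CenterScale post-processing (assumed names, not repo code).
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def pca_center_scale(control_embeddings, embeddings_per_batch, n_components=128):
    """control_embeddings: (N_controls, D) embeddings of control wells pooled across all batches.
    embeddings_per_batch: dict of batch_id -> {"controls": (n_i, D), "all": (m_i, D)} arrays.
    Returns a dict of batch_id -> batch-corrected embeddings."""
    # Step 1: fit the PCA kernel on controls gathered from across all experimental batches.
    pca = PCA(n_components=n_components).fit(control_embeddings)

    corrected = {}
    for batch_id, arrays in embeddings_per_batch.items():
        # Step 2: transform every embedding with the shared PCA kernel.
        controls_pca = pca.transform(arrays["controls"])
        batch_pca = pca.transform(arrays["all"])
        # Step 3: center/scale the batch with a scaler fit only on that batch's controls.
        scaler = StandardScaler().fit(controls_pca)
        corrected[batch_id] = scaler.transform(batch_pca)
    return corrected
```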
### Direct Use

- Create biologically useful embeddings of microscopy images
- Create contextualized embeddings of each channel of a microscopy image (set `return_channelwise_embeddings=True`)
- Leverage the full MAE encoder + decoder to predict new channels / stains for images without all 6 CellPainting channels

### Downstream Use

- A determined ML expert could fine-tune the encoder for downstream tasks such as classification

### Out-of-Scope Use

- Unlikely to be especially performant on brightfield microscopy images
- Out-of-domain medical images, such as H&E (though it may still be a decent baseline)

## Bias, Risks, and Limitations

- The primary limitation is that the embeddings tend to be more useful at scale. For example, if you only have 1 plate of microscopy images, the embeddings might underperform compared to a bespoke supervised model.

## How to Get Started with the Model

You should be able to run the tests below, which demonstrate how to use the model at inference time.

```python
import pytest
import torch

from huggingface_mae import MAEModel

huggingface_phenombeta_model_dir = "."
# huggingface_modelpath = "recursionpharma/test-pb-model"


@pytest.fixture
def huggingface_model():
    # Make sure you have the model/config downloaded from https://huggingface.co/recursionpharma/test-pb-model to this directory
    # huggingface-cli download recursionpharma/test-pb-model --local-dir=.
    huggingface_model = MAEModel.from_pretrained(huggingface_phenombeta_model_dir)
    huggingface_model.eval()
    return huggingface_model


@pytest.mark.parametrize("C", [1, 4, 6, 11])
@pytest.mark.parametrize("return_channelwise_embeddings", [True, False])
def test_model_predict(huggingface_model, C, return_channelwise_embeddings):
    example_input_array = torch.randint(
        low=0,
        high=255,
        size=(2, C, 256, 256),
        dtype=torch.uint8,
        device=huggingface_model.device,
    )
    huggingface_model.return_channelwise_embeddings = return_channelwise_embeddings
    embeddings = huggingface_model.predict(example_input_array)
    expected_output_dim = 384 * C if return_channelwise_embeddings else 384
    assert embeddings.shape == (2, expected_output_dim)
```
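Outside of pytest, the same flow reduces to a few lines. This is a hedged sketch that assumes the checkpoint and config have already been downloaded to the current directory as in the test above; `my_images` is a stand-in for your own `uint8` tensor of shape `(batch, channels, 256, 256)`.

```python
# Hedged usage sketch (not from the repository): embed a batch of images.
import torch
from huggingface_mae import MAEModel

model = MAEModel.from_pretrained(".")  # assumes model.safetensors + config are in "."
model.eval()

my_images = torch.randint(0, 255, (8, 6, 256, 256), dtype=torch.uint8)  # stand-in data
with torch.no_grad():
    embeddings = model.predict(my_images)      # shape (8, 384)
    model.return_channelwise_embeddings = True
    per_channel = model.predict(my_images)     # shape (8, 384 * 6)
```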


## Training, evaluation and testing details

See the paper linked above for details on model training and evaluation. The primary hyperparameters are included in the repo linked above.


## Environmental Impact

- **Hardware Type:** Nvidia H100 Hopper nodes
- **Hours used:** 400
- **Cloud Provider:** private cloud
- **Carbon Emitted:** 138.24 kg CO2 (roughly the equivalent of one car driving from Toronto to Montreal)

**BibTeX:**

```TeX
@inproceedings{kraus2024masked,
  title={Masked Autoencoders for Microscopy are Scalable Learners of Cellular Biology},
  author={Kraus, Oren and Kenyon-Dean, Kian and Saberian, Saber and Fallah, Maryam and McLean, Peter and Leung, Jess and Sharma, Vasudev and Khan, Ayla and Balakrishnan, Jia and Celik, Safiye and others},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={11757--11768},
  year={2024}
}
```

## Model Card Contact

- Kian Kenyon-Dean: [email protected]
- Oren Kraus: [email protected]
- Or, email: [email protected]
README.md
CHANGED
@@ -1,128 +1,42 @@
- [128 lines removed: the previous README body, a model card for this checkpoint that is largely the same text now added as MODELCARD.md above; its test snippet pointed at models/phenom_beta_huggingface instead of the repository root.]
# Masked Autoencoders are Scalable Learners of Cellular Morphology
Official repo for Recursion's two recently accepted papers:
- Spotlight full-length paper at [CVPR 2024](https://cvpr.thecvf.com/Conferences/2024/AcceptedPapers) -- Masked Autoencoders for Microscopy are Scalable Learners of Cellular Biology
  - Paper: https://arxiv.org/abs/2404.10242
  - CVPR poster page with video: https://cvpr.thecvf.com/virtual/2024/poster/31565
- Spotlight workshop paper at [NeurIPS 2023 Generative AI & Biology workshop](https://openreview.net/group?id=NeurIPS.cc/2023/Workshop/GenBio)
  - Paper: https://arxiv.org/abs/2309.16064

![vit_diff_mask_ratios](https://github.com/recursionpharma/maes_microscopy/assets/109550980/c15f46b1-cdb9-41a7-a4af-bdc9684a971d)


## Provided code
See the repo for the ingredients required to define our MAEs. Users seeking to re-implement training will need to stitch together the Encoder and Decoder modules according to their use case.

Furthermore, the baseline Vision Transformer backbone used in this work can be built with the following code snippet using timm:
```
import timm.models.vision_transformer as vit

def vit_base_patch16_256(**kwargs):
    default_kwargs = dict(
        img_size=256,
        in_chans=6,
        num_classes=0,
        fc_norm=None,
        class_token=True,
        drop_path_rate=0.1,
        init_values=0.0001,
        block_fn=vit.ParallelScalingBlock,
        qkv_bias=False,
        qk_norm=True,
    )
    for k, v in kwargs.items():
        default_kwargs[k] = v
    return vit.vit_base_patch16_224(**default_kwargs)
```
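As a quick sanity check, the factory above can be exercised as follows. This is a hedged example, not code from the repo: it assumes the snippet above has been run and that a recent timm release providing `ParallelScalingBlock` is installed; the expected output width of 768 follows from ViT-B with `num_classes=0`.

```python
# Usage sketch for the vit_base_patch16_256 factory defined above.
import torch

model = vit_base_patch16_256()      # ViT-B/16 backbone for 6-channel 256x256 inputs
model.eval()

x = torch.randn(2, 6, 256, 256)     # dummy 6-channel, CellPainting-sized batch
with torch.no_grad():
    feats = model(x)                # pooled features from the backbone
print(feats.shape)                  # expected: torch.Size([2, 768])
```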

## Provided models
A publicly available model for research can be found via Nvidia's BioNeMo platform, which handles inference and auto-scaling: https://www.rxrx.ai/phenom

We have partnered with Nvidia to host a publicly available, smaller, and more flexible version of the MAE phenomics foundation model, called Phenom-Beta. Interested parties can access it directly through the Nvidia BioNeMo API:
- https://blogs.nvidia.com/blog/drug-discovery-bionemo-generative-ai/
- https://www.youtube.com/watch?v=Gch6bX1toB0
config.yaml
ADDED
@@ -0,0 +1,16 @@
# © Recursion Pharmaceuticals 2024
loss:
  _target_: torch.nn.MSELoss  # combine with fourier loss weighted at 0.01 mixing factor for best results
  reduction: none
optimizer:
  _target_: timm.optim.lion.Lion
  _partial_: true
  lr: &lr 1.0e-4  # YAML anchor, reused by the scheduler below; 1e-4 for <= ViT-B, and 3e-5 for ViT-L
  weight_decay: 0.05
  betas: [0.9, 0.95]
lr_scheduler:
  _target_: torch.optim.lr_scheduler.OneCycleLR
  _partial_: true
  max_lr: *lr  # alias to the optimizer learning rate anchored above
  pct_start: 0.1
  anneal_strategy: cos
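The `_target_` / `_partial_` keys follow the Hydra instantiation convention. Below is a minimal sketch of how such a config could be turned into live objects; it assumes Hydra and OmegaConf are installed, and the stand-in `nn.Linear` module and `total_steps` value are illustrative, not taken from the uploaded files.

```python
# Hedged sketch: instantiating the config above with Hydra utilities.
import torch.nn as nn
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")
model = nn.Linear(384, 384)  # stand-in module with parameters to optimize

loss_fn = instantiate(cfg.loss)                              # torch.nn.MSELoss(reduction="none")
optimizer = instantiate(cfg.optimizer)(model.parameters())   # _partial_: true -> callable factory
scheduler = instantiate(cfg.lr_scheduler)(optimizer, total_steps=10_000)
```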
generate_reconstructions.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
huggingface_mae.py
ADDED
@@ -0,0 +1,293 @@
from typing import Dict, Tuple, Union

import torch
import torch.nn as nn

from transformers import PretrainedConfig, PreTrainedModel

from loss import FourierLoss
from normalizer import Normalizer
from mae_modules import CAMAEDecoder, MAEDecoder, MAEEncoder
from mae_utils import flatten_images
from vit import (
    generate_2d_sincos_pos_embeddings,
    sincos_positional_encoding_vit,
    vit_small_patch16_256,
)

TensorDict = Dict[str, torch.Tensor]


class MAEConfig(PretrainedConfig):
    model_type = "MAE"

    def __init__(
        self,
        mask_ratio=0.75,
        encoder=None,
        decoder=None,
        loss=None,
        optimizer=None,
        input_norm=None,
        fourier_loss=None,
        fourier_loss_weight=0.0,
        lr_scheduler=None,
        use_MAE_weight_init=False,
        crop_size=-1,
        mask_fourier_loss=True,
        return_channelwise_embeddings=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.mask_ratio = mask_ratio
        self.encoder = encoder
        self.decoder = decoder
        self.loss = loss
        self.optimizer = optimizer
        self.input_norm = input_norm
        self.fourier_loss = fourier_loss
        self.fourier_loss_weight = fourier_loss_weight
        self.lr_scheduler = lr_scheduler
        self.use_MAE_weight_init = use_MAE_weight_init
        self.crop_size = crop_size
        self.mask_fourier_loss = mask_fourier_loss
        self.return_channelwise_embeddings = return_channelwise_embeddings


class MAEModel(PreTrainedModel):
    config_class = MAEConfig

    # Loss metrics
    TOTAL_LOSS = "loss"
    RECON_LOSS = "reconstruction_loss"
    FOURIER_LOSS = "fourier_loss"

    def __init__(self, config: MAEConfig):
        super().__init__(config)

        self.mask_ratio = config.mask_ratio

        # Could use Hydra to instantiate instead
        self.encoder = MAEEncoder(
            vit_backbone=sincos_positional_encoding_vit(
                vit_backbone=vit_small_patch16_256(global_pool="avg")
            ),
            max_in_chans=11,  # upper limit on number of input channels
            channel_agnostic=True,
        )
        self.decoder = CAMAEDecoder(
            depth=8,
            embed_dim=512,
            mlp_ratio=4,
            norm_layer=nn.LayerNorm,
            num_heads=16,
            num_modalities=6,
            qkv_bias=True,
            tokens_per_modality=256,
        )
        self.input_norm = torch.nn.Sequential(
            Normalizer(),
            nn.InstanceNorm2d(None, affine=False, track_running_stats=False),
        )

        self.fourier_loss_weight = config.fourier_loss_weight
        self.mask_fourier_loss = config.mask_fourier_loss
        self.return_channelwise_embeddings = config.return_channelwise_embeddings
        self.tokens_per_channel = 256  # hardcode the number of tokens per channel since we are patch16 crop 256

        # loss stuff
        self.loss = torch.nn.MSELoss(reduction="none")

        self.fourier_loss = FourierLoss(num_multimodal_modalities=6)
        if self.fourier_loss_weight > 0 and self.fourier_loss is None:
            raise ValueError(
                "FourierLoss weight is activated but no fourier_loss was defined in constructor"
            )
        elif self.fourier_loss_weight >= 1:
            raise ValueError(
                "FourierLoss weight is too large to do mixing factor, weight should be < 1"
            )

        self.patch_size = int(self.encoder.vit_backbone.patch_embed.patch_size[0])

        # projection layer between the encoder and decoder
        self.encoder_decoder_proj = nn.Linear(
            self.encoder.embed_dim, self.decoder.embed_dim, bias=True
        )

        self.decoder_pred = nn.Linear(
            self.decoder.embed_dim,
            self.patch_size**2
            * (1 if self.encoder.channel_agnostic else self.in_chans),
            bias=True,
        )  # linear layer from decoder embedding to input dims

        # overwrite decoder pos embeddings based on encoder params
        self.decoder.pos_embeddings = generate_2d_sincos_pos_embeddings(  # type: ignore[assignment]
            self.decoder.embed_dim,
            length=self.encoder.vit_backbone.patch_embed.grid_size[0],
            use_class_token=self.encoder.vit_backbone.cls_token is not None,
            num_modality=(
                self.decoder.num_modalities if self.encoder.channel_agnostic else 1
            ),
        )

        if config.use_MAE_weight_init:
            w = self.encoder.vit_backbone.patch_embed.proj.weight.data
            torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

            torch.nn.init.normal_(self.encoder.vit_backbone.cls_token, std=0.02)
            torch.nn.init.normal_(self.decoder.mask_token, std=0.02)

            self.apply(self._MAE_init_weights)

    def setup(self, stage: str) -> None:
        super().setup(stage)

    def _MAE_init_weights(self, m):
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @staticmethod
    def decode_to_reconstruction(
        encoder_latent: torch.Tensor,
        ind_restore: torch.Tensor,
        proj: torch.nn.Module,
        decoder: MAEDecoder | CAMAEDecoder,
        pred: torch.nn.Module,
    ) -> torch.Tensor:
        """Feed forward the encoder latent through the decoders necessary projections and transformations."""
        decoder_latent_projection = proj(
            encoder_latent
        )  # projection from encoder.embed_dim to decoder.embed_dim
        decoder_tokens = decoder.forward_masked(
            decoder_latent_projection, ind_restore
        )  # decoder.embed_dim output
        predicted_reconstruction = pred(
            decoder_tokens
        )  # linear projection to input dim
        return predicted_reconstruction[:, 1:, :]  # drop class token

    def forward(
        self, imgs: torch.Tensor, constant_noise: Union[torch.Tensor, None] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        imgs = self.input_norm(imgs)
        latent, mask, ind_restore = self.encoder.forward_masked(
            imgs, self.mask_ratio, constant_noise
        )  # encoder blocks
        reconstruction = self.decode_to_reconstruction(
            latent,
            ind_restore,
            self.encoder_decoder_proj,
            self.decoder,
            self.decoder_pred,
        )
        return latent, reconstruction, mask

    def compute_MAE_loss(
        self,
        reconstruction: torch.Tensor,
        img: torch.Tensor,
        mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, Dict[str, float]]:
        """Computes final loss and returns specific values of component losses for metric reporting."""
        loss_dict = {}
        img = self.input_norm(img)
        target_flattened = flatten_images(
            img,
            patch_size=self.patch_size,
            channel_agnostic=self.encoder.channel_agnostic,
        )

        loss: torch.Tensor = self.loss(
            reconstruction, target_flattened
        )  # should be with MSE or MAE (L1) with reduction='none'
        loss = loss.mean(
            dim=-1
        )  # average over embedding dim -> mean loss per patch (N,L)
        loss = (loss * mask).sum() / mask.sum()  # mean loss on masked patches only
        loss_dict[self.RECON_LOSS] = loss.item()

        # compute fourier loss
        if self.fourier_loss_weight > 0:
            floss: torch.Tensor = self.fourier_loss(reconstruction, target_flattened)
            if not self.mask_fourier_loss:
                floss = floss.mean()
            else:
                floss = floss.mean(dim=-1)
                floss = (floss * mask).sum() / mask.sum()

            loss_dict[self.FOURIER_LOSS] = floss.item()

        # here we use a mixing factor to keep the loss magnitude appropriate with fourier
        if self.fourier_loss_weight > 0:
            loss = (1 - self.fourier_loss_weight) * loss + (
                self.fourier_loss_weight * floss
            )
        return loss, loss_dict

    def training_step(self, batch: TensorDict, batch_idx: int) -> TensorDict:
        img = batch["pixels"]
        latent, reconstruction, mask = self(img.clone())
        full_loss, loss_dict = self.compute_MAE_loss(reconstruction, img.float(), mask)
        return {
            "loss": full_loss,
            **loss_dict,  # type: ignore[dict-item]
        }

    def validation_step(self, batch: TensorDict, batch_idx: int) -> TensorDict:
        return self.training_step(batch, batch_idx)

    def update_metrics(self, outputs: TensorDict, batch: TensorDict) -> None:
        self.metrics["lr"].update(value=self.lr_scheduler.get_last_lr())
        for key, value in outputs.items():
            if key.endswith("loss"):
                self.metrics[key].update(value)

    def on_validation_batch_end(  # type: ignore[override]
        self,
        outputs: TensorDict,
        batch: TensorDict,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        super().on_validation_batch_end(outputs, batch, batch_idx, dataloader_idx)

    def predict(self, imgs: torch.Tensor) -> torch.Tensor:
        imgs = self.input_norm(imgs)
        X = self.encoder.vit_backbone.forward_features(
            imgs
        )  # 3d tensor N x num_tokens x dim
        if self.return_channelwise_embeddings:
            N, _, d = X.shape
            num_channels = imgs.shape[1]
            X_reshaped = X[:, 1:, :].view(N, num_channels, self.tokens_per_channel, d)
            pooled_segments = X_reshaped.mean(
                dim=2
            )  # Resulting shape: (N, num_channels, d)
            latent = pooled_segments.view(N, num_channels * d).contiguous()
        else:
            latent = X[:, 1:, :].mean(dim=1)  # 1 + 256 * C tokens
        return latent

    def save_pretrained(self, save_directory: str, **kwargs):
        filename = kwargs.pop("filename", "model.safetensors")
        modelpath = f"{save_directory}/{filename}"
        self.config.save_pretrained(save_directory)
        torch.save({"state_dict": self.state_dict()}, modelpath)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        filename = kwargs.pop("filename", "model.safetensors")

        modelpath = f"{pretrained_model_name_or_path}/{filename}"
        config = MAEConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        state_dict = torch.load(modelpath, map_location="cpu")
        model = cls(config, *model_args, **kwargs)
        model.load_state_dict(state_dict["state_dict"])
        return model
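For orientation, the pieces above compose as follows for a single masked-autoencoding step. This is a hedged sketch, not a training recipe from the repo: it assumes the repository modules imported by huggingface_mae.py (vit.py, normalizer.py, etc.) are importable, uses a randomly initialized model via the default `MAEConfig`, and feeds dummy data.

```python
# Hedged sketch: one masked reconstruction + loss computation with the classes above.
import torch
from huggingface_mae import MAEConfig, MAEModel

model = MAEModel(MAEConfig())  # randomly initialized CA-MAE (ViT-S/16 encoder)
imgs = torch.randint(0, 255, (2, 6, 256, 256), dtype=torch.uint8).float()  # dummy batch

latent, reconstruction, mask = model(imgs)                    # masked encode + decode
loss, loss_dict = model.compute_MAE_loss(reconstruction, imgs, mask)
loss.backward()                                               # loss is averaged over masked patches only
print(loss_dict)                                              # e.g. {"reconstruction_loss": ...}
```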
loss.py
ADDED
@@ -0,0 +1,59 @@
# © Recursion Pharmaceuticals 2024
import torch
import torch.nn as nn


class FourierLoss(nn.Module):
    def __init__(
        self,
        use_l1_loss: bool = True,
        num_multimodal_modalities: int = 1,  # set to 1 for vanilla MAE, 6 for channel-agnostic MAE
    ) -> None:
        """
        Fourier transform loss is only sound when using L1 or L2 loss to compare the frequency domains
        between the images / their radial histograms.

        We will always set `reduction="none"` and enforce that the computation of any reductions from the
        output of this loss be managed by the model under question.
        """
        super().__init__()
        self.loss = (
            nn.L1Loss(reduction="none") if use_l1_loss else nn.MSELoss(reduction="none")
        )
        self.num_modalities = num_multimodal_modalities

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        # input = reconstructed image, target = original image
        # flattened images from MAE are (B, H*W, C), so, here we convert to B x C x H x W (note we assume H == W)
        flattened_images = len(input.shape) == len(target.shape) == 3
        if flattened_images:
            B, H_W, C = input.shape
            H_W = H_W // self.num_modalities
            four_d_shape = (B, C * self.num_modalities, int(H_W**0.5), int(H_W**0.5))
            input = input.view(*four_d_shape)
            target = target.view(*four_d_shape)
        else:
            B, C, h, w = input.shape
            H_W = h * w

        if len(input.shape) != len(target.shape) != 4:
            raise ValueError(
                f"Invalid input shape: got {input.shape} and {target.shape}."
            )

        fft_reconstructed = torch.fft.fft2(input)
        fft_original = torch.fft.fft2(target)

        magnitude_reconstructed = torch.abs(fft_reconstructed)
        magnitude_original = torch.abs(fft_original)

        loss_tensor: torch.Tensor = self.loss(
            magnitude_reconstructed, magnitude_original
        )

        # (This version of the loss has no radial-histogram bins, so whenever the inputs
        # were flattened the output loss should be reshaped back to the (B, H*W, C) layout.)
        if flattened_images:
            loss_tensor = loss_tensor.reshape(B, H_W * self.num_modalities, C)

        return loss_tensor
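A small sketch of how `FourierLoss` behaves on MAE-style flattened patch tensors; the shapes mimic a 6-channel, 256-tokens-per-channel, 16x16-patch setup and are purely illustrative.

```python
# Hedged example: FourierLoss on flattened patch tensors.
import torch
from loss import FourierLoss

criterion = FourierLoss(num_multimodal_modalities=6)

# (B, H*W, C) layout as produced by the MAE flattening utilities:
# 6 modalities x 256 patches per modality, 16*16 pixels per patch.
reconstruction = torch.randn(2, 6 * 256, 16 * 16)
target = torch.randn(2, 6 * 256, 16 * 16)

per_element = criterion(reconstruction, target)  # reduction="none", same layout as the inputs
print(per_element.shape)                         # torch.Size([2, 1536, 256])
```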
mae_modules.py
ADDED
@@ -0,0 +1,273 @@
# © Recursion Pharmaceuticals 2024
from functools import partial
from typing import Tuple, Union

import torch
import torch.nn as nn
from timm.models.helpers import checkpoint_seq
from timm.models.vision_transformer import Block, Mlp, VisionTransformer

from masking import transformer_random_masking
from vit import channel_agnostic_vit

# If interested in training new MAEs, combine an encoder and decoder into a new module, and you should
# leverage the flattening and unflattening utilities as needed from mae_utils.py.
# Be sure to use an encoder-decoder Linear projection layer to match encoder dims with decoder dimensions.
# As described in the paper, images are self-standardized at the start.


class SelfStandardize(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.self_standardize = nn.LazyInstanceNorm2d(
            affine=False, track_running_stats=False
        )

    def forward(self, pixels: torch.Tensor) -> torch.Tensor:
        x = pixels.float() / 255.0
        return self.self_standardize(x)


class MAEEncoder(nn.Module):
    def __init__(
        self,
        vit_backbone: VisionTransformer,
        max_in_chans: int = 6,
        channel_agnostic: bool = False,
    ) -> None:
        super().__init__()
        if channel_agnostic:
            self.vit_backbone = channel_agnostic_vit(
                vit_backbone, max_in_chans=max_in_chans
            )
        else:
            self.vit_backbone = vit_backbone
        self.max_in_chans = max_in_chans
        self.channel_agnostic = channel_agnostic

    @property
    def embed_dim(self) -> int:
        return int(self.vit_backbone.embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.vit_backbone.forward_features(x)
        x = self.vit_backbone.forward_head(x)
        return x  # type: ignore[no-any-return]

    def forward_masked(
        self,
        x: torch.Tensor,
        mask_ratio: float,
        constant_noise: Union[torch.Tensor, None] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        x = self.vit_backbone.patch_embed(x)
        x = self.vit_backbone._pos_embed(x)  # adds class token
        x_ = x[:, 1:, :]  # no class token
        x_, mask, ind_restore = transformer_random_masking(
            x_, mask_ratio, constant_noise
        )
        x = torch.cat([x[:, :1, :], x_], dim=1)  # add class token
        x = self.vit_backbone.norm_pre(x)

        if self.vit_backbone.grad_checkpointing and not torch.jit.is_scripting():
            x = checkpoint_seq(self.vit_backbone.blocks, x)
        else:
            x = self.vit_backbone.blocks(x)
        x = self.vit_backbone.norm(x)
        return x, mask, ind_restore

80 |
+
class MAEDecoder(nn.Module):
|
81 |
+
def __init__(
|
82 |
+
self,
|
83 |
+
embed_dim: int = 512,
|
84 |
+
depth: int = 8,
|
85 |
+
num_heads: int = 16,
|
86 |
+
mlp_ratio: float = 4,
|
87 |
+
qkv_bias: bool = True,
|
88 |
+
norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), # type: ignore[assignment]
|
89 |
+
) -> None:
|
90 |
+
super().__init__()
|
91 |
+
self.embed_dim = embed_dim
|
92 |
+
self.pos_embeddings = None # to be overwritten by MAE class
|
93 |
+
self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
94 |
+
self.blocks = nn.Sequential(
|
95 |
+
*[
|
96 |
+
Block(
|
97 |
+
embed_dim,
|
98 |
+
num_heads,
|
99 |
+
mlp_ratio,
|
100 |
+
qkv_bias=qkv_bias,
|
101 |
+
norm_layer=norm_layer,
|
102 |
+
)
|
103 |
+
for i in range(depth)
|
104 |
+
]
|
105 |
+
)
|
106 |
+
self.norm = norm_layer(embed_dim)
|
107 |
+
|
108 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
109 |
+
x = x + self.pos_embeddings
|
110 |
+
x = self.blocks(x)
|
111 |
+
x = self.norm(x)
|
112 |
+
return x # type: ignore[no-any-return]
|
113 |
+
|
114 |
+
def forward_masked(
|
115 |
+
self, x: torch.Tensor, ind_restore: torch.Tensor
|
116 |
+
) -> torch.Tensor:
|
117 |
+
mask_tokens = self.mask_token.repeat(
|
118 |
+
x.shape[0], ind_restore.shape[1] + 1 - x.shape[1], 1
|
119 |
+
)
|
120 |
+
x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1) # remove class token
|
121 |
+
x_ = torch.gather(
|
122 |
+
x_, dim=1, index=ind_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])
|
123 |
+
) # unshuffle
|
124 |
+
x = torch.cat([x[:, :1, :], x_], dim=1) # add class token
|
125 |
+
|
126 |
+
x = x + self.pos_embeddings
|
127 |
+
x = self.blocks(x)
|
128 |
+
x = self.norm(x)
|
129 |
+
return x # type: ignore[no-any-return]
|
130 |
+
|
131 |
+
|
132 |
+
class CrossAttention(nn.Module):
|
133 |
+
def __init__(
|
134 |
+
self, embed_dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0
|
135 |
+
):
|
136 |
+
super().__init__()
|
137 |
+
self.num_heads = num_heads
|
138 |
+
head_dim = embed_dim // num_heads
|
139 |
+
self.scale = head_dim**-0.5
|
140 |
+
|
141 |
+
self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
|
142 |
+
self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias)
|
143 |
+
|
144 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
145 |
+
self.proj = nn.Linear(embed_dim, embed_dim)
|
146 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
147 |
+
|
148 |
+
def forward(self, x, context):
|
149 |
+
B, N, C = x.shape
|
150 |
+
_, M, _ = context.shape
|
151 |
+
|
152 |
+
q = (
|
153 |
+
self.q(x)
|
154 |
+
.reshape(B, N, self.num_heads, C // self.num_heads)
|
155 |
+
.permute(0, 2, 1, 3)
|
156 |
+
)
|
157 |
+
kv = (
|
158 |
+
self.kv(context)
|
159 |
+
.reshape(B, M, 2, self.num_heads, C // self.num_heads)
|
160 |
+
.permute(2, 0, 3, 1, 4)
|
161 |
+
)
|
162 |
+
k, v = kv[0], kv[1]
|
163 |
+
|
164 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
165 |
+
attn = attn.softmax(dim=-1)
|
166 |
+
attn = self.attn_drop(attn)
|
167 |
+
|
168 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
|
169 |
+
x = self.proj(x)
|
170 |
+
x = self.proj_drop(x)
|
171 |
+
return x
|
172 |
+
|
173 |
+
|
174 |
+
class CAMAEDecoder(nn.Module):
|
175 |
+
def __init__(
|
176 |
+
self,
|
177 |
+
num_modalities: int = 6,
|
178 |
+
tokens_per_modality: int = 256,
|
179 |
+
embed_dim: int = 256,
|
180 |
+
depth: int = 2,
|
181 |
+
num_heads: int = 16,
|
182 |
+
mlp_ratio: float = 4,
|
183 |
+
qkv_bias: bool = True,
|
184 |
+
norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6), # type: ignore[assignment]
|
185 |
+
) -> None:
|
186 |
+
super().__init__()
|
187 |
+
self.num_modalities = num_modalities
|
188 |
+
self.tokens_per_modality = tokens_per_modality
|
189 |
+
self.embed_dim = embed_dim
|
190 |
+
self.pos_embeddings = None # to be overwritten by MAE class
|
191 |
+
self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
192 |
+
self.placeholder = nn.Parameter(
|
193 |
+
torch.zeros(1, 1, embed_dim), requires_grad=False
|
194 |
+
)
|
195 |
+
self.modality_tokens = nn.ParameterList(
|
196 |
+
[
|
197 |
+
nn.Parameter(torch.zeros(1, 1, self.embed_dim))
|
198 |
+
for modality in range(self.num_modalities)
|
199 |
+
]
|
200 |
+
)
|
201 |
+
|
202 |
+
self.cross_attention = CrossAttention(embed_dim=self.embed_dim)
|
203 |
+
self.mlp = Mlp(self.embed_dim, hidden_features=int(self.embed_dim * mlp_ratio))
|
204 |
+
|
205 |
+
self.decoders = nn.ModuleList(
|
206 |
+
[
|
207 |
+
nn.Sequential(
|
208 |
+
*[
|
209 |
+
Block(
|
210 |
+
embed_dim,
|
211 |
+
num_heads,
|
212 |
+
mlp_ratio,
|
213 |
+
qkv_bias=qkv_bias,
|
214 |
+
norm_layer=norm_layer,
|
215 |
+
)
|
216 |
+
for i in range(depth)
|
217 |
+
]
|
218 |
+
)
|
219 |
+
for modality in range(self.num_modalities)
|
220 |
+
]
|
221 |
+
)
|
222 |
+
# self.norm = norm_layer(embed_dim) # we decided to drop the last layer norm
|
223 |
+
self.context_norm = norm_layer(embed_dim)
|
224 |
+
self.query_norm = norm_layer(embed_dim)
|
225 |
+
self.out_norm = norm_layer(embed_dim)
|
226 |
+
|
227 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
228 |
+
x_m_s = []
|
229 |
+
|
230 |
+
modality_tokens_concat = torch.cat(
|
231 |
+
[
|
232 |
+
self.placeholder,
|
233 |
+
] # placeholder for class token
|
234 |
+
+ [
|
235 |
+
m_t.repeat(1, self.tokens_per_modality, 1)
|
236 |
+
for m_t in self.modality_tokens
|
237 |
+
],
|
238 |
+
dim=1,
|
239 |
+
)
|
240 |
+
|
241 |
+
x = (
|
242 |
+
x + self.pos_embeddings + modality_tokens_concat
|
243 |
+
) # add pos and tiled modality tokens
|
244 |
+
x_ = x[:, 1:, :] # no class token
|
245 |
+
for m, decoder in enumerate(
|
246 |
+
self.decoders
|
247 |
+
): # iterate through modalities and decoders
|
248 |
+
x_m = x_[
|
249 |
+
:, m * self.tokens_per_modality : (m + 1) * self.tokens_per_modality, :
|
250 |
+
]
|
251 |
+
x_m = self.cross_attention(self.query_norm(x_m), self.context_norm(x_))
|
252 |
+
x_m = x_m + self.mlp(self.out_norm(x_m))
|
253 |
+
x_m = decoder(x_m)
|
254 |
+
x_m_s.append(x_m)
|
255 |
+
x_m_s = torch.cat(x_m_s, dim=1) # concat all tokens
|
256 |
+
# x_m_s = self.norm(x_m_s) # we decided to drop the last layer norm
|
257 |
+
x_m_s = torch.cat([x[:, :1, :], x_m_s], dim=1) # add back class token
|
258 |
+
|
259 |
+
return x_m_s
|
260 |
+
|
261 |
+
def forward_masked(
|
262 |
+
self, x: torch.Tensor, ind_restore: torch.Tensor
|
263 |
+
) -> torch.Tensor:
|
264 |
+
mask_tokens = self.mask_token.repeat(
|
265 |
+
x.shape[0], ind_restore.shape[1] + 1 - x.shape[1], 1
|
266 |
+
)
|
267 |
+
x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1) # remove class token
|
268 |
+
x_ = torch.gather(
|
269 |
+
x_, dim=1, index=ind_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])
|
270 |
+
) # unshuffle
|
271 |
+
x = torch.cat([x[:, :1, :], x_], dim=1) # add class token
|
272 |
+
x = self.forward(x)
|
273 |
+
return x
|
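
The comment at the top of this file describes how to assemble a trainable MAE from these parts: an encoder, a decoder, a Linear projection between their widths, self-standardized inputs, and the flatten/unflatten helpers from mae_utils.py. The repository upload does not include such a training module, so the following is only a hedged sketch under those instructions; the class name `SimpleMAE`, the zero decoder positions, and the `pred_head` are our own placeholders, not the authors' training code.

# A minimal, hypothetical wrapper following the comment above; not the authors' training module.
import torch
import torch.nn as nn

from mae_modules import MAEDecoder, MAEEncoder, SelfStandardize
from mae_utils import flatten_images
from vit import sincos_positional_encoding_vit, vit_small_patch16_256


class SimpleMAE(nn.Module):
    def __init__(self, mask_ratio: float = 0.75, patch_size: int = 16) -> None:
        super().__init__()
        self.mask_ratio = mask_ratio
        self.patch_size = patch_size
        self.self_standardize = SelfStandardize()  # images are self-standardized at the start
        backbone = sincos_positional_encoding_vit(vit_small_patch16_256())
        self.encoder = MAEEncoder(backbone, channel_agnostic=False)
        self.decoder = MAEDecoder(embed_dim=512)
        # encoder-decoder Linear projection to match encoder dims with decoder dims
        self.encoder_to_decoder = nn.Linear(self.encoder.embed_dim, self.decoder.embed_dim)
        # the decoder expects pos_embeddings to be set by the wrapping module;
        # zeros keep this sketch runnable, a real model would use sin/cos positions here too
        num_tokens = backbone.pos_embed.shape[1]
        self.decoder.pos_embeddings = nn.Parameter(
            torch.zeros(1, num_tokens, self.decoder.embed_dim), requires_grad=False
        )
        self.pred_head = nn.Linear(self.decoder.embed_dim, patch_size**2 * 6)

    def forward(self, pixels: torch.Tensor):
        x = self.self_standardize(pixels)
        latent, mask, ind_restore = self.encoder.forward_masked(x, self.mask_ratio)
        latent = self.encoder_to_decoder(latent)
        decoded = self.decoder.forward_masked(latent, ind_restore)
        pred = self.pred_head(decoded[:, 1:, :])     # drop the class token
        target = flatten_images(x, self.patch_size)  # flattened patches to compare against
        return pred, target, mask

A reconstruction loss (for example MSE or the FourierLoss above) would then be computed per token on `pred` vs `target` and averaged over the positions where `mask` is 1.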
mae_utils.py
ADDED
@@ -0,0 +1,70 @@
# © Recursion Pharmaceuticals 2024
import math

import torch


def flatten_images(
    img: torch.Tensor, patch_size: int, channel_agnostic: bool = False
) -> torch.Tensor:
    """
    Flattens 2D images into tokens with the same pixel values

    Parameters
    ----------
    img : input image tensor (N, C, H, W)

    Returns
    -------
    flattened_img: flattened image tensor (N, L, patch_size**2 * C)
    """

    if (img.shape[2] != img.shape[3]) or (img.shape[2] % patch_size != 0):
        raise ValueError("image H must equal image W and be divisible by patch_size")
    in_chans = img.shape[1]

    h = w = int(img.shape[2] // patch_size)
    x = img.reshape(shape=(img.shape[0], in_chans, h, patch_size, w, patch_size))

    if channel_agnostic:
        x = torch.permute(x, (0, 1, 2, 4, 3, 5))  # NCHPWQ -> NCHWPQ
        x = x.reshape(shape=(img.shape[0], in_chans * h * w, int(patch_size**2)))
    else:
        x = torch.permute(x, (0, 2, 4, 3, 5, 1))  # NCHPWQ -> NHWPQC
        x = x.reshape(shape=(img.shape[0], h * w, int(patch_size**2 * in_chans)))
    return x


def unflatten_tokens(
    tokens: torch.Tensor,
    patch_size: int,
    num_modalities: int = 1,
    channel_agnostic: bool = False,
) -> torch.Tensor:
    """
    Unflattens tokens (N, L, patch_size**2 * C) into image tensor (N, C, H, W) with the pixel values

    Parameters
    ----------
    tokens : input token tensor (N, L, patch_size**2 * C)

    Returns
    -------
    img: image tensor (N, C, H, W)
    """
    if num_modalities > 1 and not channel_agnostic:
        raise ValueError("Multiple modalities requires channel agnostic unflattening.")

    h = w = int(math.sqrt(tokens.shape[1] // num_modalities))
    if h * w != (tokens.shape[1] // num_modalities):
        raise ValueError("sqrt of number of tokens not integer")

    if channel_agnostic:
        x = tokens.reshape(shape=(tokens.shape[0], -1, h, w, patch_size, patch_size))
        x = torch.permute(x, (0, 1, 2, 4, 3, 5))  # NCHWPQ -> NCHPWQ
    else:
        x = tokens.reshape(shape=(tokens.shape[0], h, w, patch_size, patch_size, -1))
        x = torch.permute(x, (0, 5, 1, 3, 2, 4))  # NHWPQC -> NCHPWQ
    img = x.reshape(shape=(x.shape[0], -1, h * patch_size, h * patch_size))

    return img
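
Because `flatten_images` and `unflatten_tokens` are pure reshapes and permutes, they invert each other exactly for square images. A quick sanity check, with sizes chosen arbitrarily for illustration:

# Illustrative round trip; the tensor sizes are our own choice.
import torch

from mae_utils import flatten_images, unflatten_tokens

img = torch.rand(2, 6, 64, 64)  # (N, C, H, W)
tokens = flatten_images(img, patch_size=16, channel_agnostic=True)
print(tokens.shape)  # (2, 6 * 4 * 4, 16 * 16) == (2, 96, 256)
restored = unflatten_tokens(tokens, patch_size=16, num_modalities=6, channel_agnostic=True)
assert torch.equal(img, restored)  # lossless round trip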
masking.py
ADDED
@@ -0,0 +1,51 @@
# © Recursion Pharmaceuticals 2024
from typing import Tuple, Union

import torch


def transformer_random_masking(
    x: torch.Tensor, mask_ratio: float, constant_noise: Union[torch.Tensor, None] = None
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Random mask patches per sample

    Parameters
    ----------
    x : token tensor (N, L, D)
    mask_ratio : float - ratio of image to mask
    constant_noise : None, or a tensor of shape (N, L) to produce consistent masks

    Returns
    -------
    x_masked : sub-sampled version of x with shape (N, int(L * (1 - mask_ratio)), D)
    mask : binary mask indicating masked tokens (1 where masked) (N, L)
    ind_restore : locations of masked tokens, needed for decoder
    """

    N, L, D = x.shape  # batch, length, dim
    len_keep = int(L * (1 - mask_ratio))

    # use random noise to generate batch based random masks
    if constant_noise is not None:
        noise = constant_noise
    else:
        noise = torch.rand(N, L, device=x.device)

    shuffled_tokens = torch.argsort(noise, dim=1)  # shuffled index
    ind_restore = torch.argsort(shuffled_tokens, dim=1)  # unshuffled index

    # get masked input
    tokens_to_keep = shuffled_tokens[:, :len_keep]  # keep the first len_keep indices
    x_masked = torch.gather(
        x, dim=1, index=tokens_to_keep.unsqueeze(-1).repeat(1, 1, D)
    )

    # get binary mask used for loss masking: 0 is keep, 1 is remove
    mask = torch.ones([N, L], device=x.device)
    mask[:, :len_keep] = 0
    mask = torch.gather(
        mask, dim=1, index=ind_restore
    )  # unshuffle to get the binary mask

    return x_masked, mask, ind_restore
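
The returned shapes follow directly from the docstring above; for example, with a 0.75 mask ratio only a quarter of the tokens survive. Sizes here are arbitrary:

# Illustrative shapes only.
import torch

from masking import transformer_random_masking

tokens = torch.rand(2, 16, 8)  # (N, L, D)
x_masked, mask, ind_restore = transformer_random_masking(tokens, mask_ratio=0.75)
print(x_masked.shape)   # (2, 4, 8): int(L * (1 - mask_ratio)) tokens are kept
print(mask.sum(dim=1))  # tensor([12., 12.]): 1 marks a masked (removed) token
print(ind_restore.shape)  # (2, 16): used by the decoder to unshuffle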
normalizer.py
ADDED
@@ -0,0 +1,7 @@
import torch


class Normalizer(torch.nn.Module):
    def forward(self, pixels: torch.Tensor) -> torch.Tensor:
        pixels = pixels.float()
        return pixels / 255.0
pyproject.toml
ADDED
@@ -0,0 +1,34 @@
[build-system]
requires = ["setuptools >= 61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "maes_microscopy_project"
version = "0.1.0"
authors = [
    {name = "kian-kd", email = "[email protected]"},
    {name = "Laksh47", email = "[email protected]"},
]
requires-python = ">=3.10.4"

dependencies = [
    "huggingface-hub",
    "timm",
    "torch>=2.3",
    "torchmetrics",
    "torchvision",
    "tqdm",
    "transformers",
    "xformers",
    "zarr",
    "pytorch-lightning>=2.1",
    "matplotlib",
    "scikit-image",
    "ipykernel",
    "isort",
    "ruff",
    "pytest",
]

[tool.setuptools]
py-modules = []
sample/AA41_s1_1.jp2
ADDED
sample/AA41_s1_2.jp2
ADDED
sample/AA41_s1_3.jp2
ADDED
sample/AA41_s1_4.jp2
ADDED
sample/AA41_s1_5.jp2
ADDED
sample/AA41_s1_6.jp2
ADDED
test_huggingface_mae.py
ADDED
@@ -0,0 +1,32 @@
import pytest
import torch

from huggingface_mae import MAEModel

huggingface_phenombeta_model_dir = "."
# huggingface_modelpath = "recursionpharma/test-pb-model"


@pytest.fixture
def huggingface_model():
    # Make sure you have the model/config downloaded from https://huggingface.co/recursionpharma/test-pb-model to this directory
    # huggingface-cli download recursionpharma/test-pb-model --local-dir=.
    huggingface_model = MAEModel.from_pretrained(huggingface_phenombeta_model_dir)
    huggingface_model.eval()
    return huggingface_model


@pytest.mark.parametrize("C", [1, 4, 6, 11])
@pytest.mark.parametrize("return_channelwise_embeddings", [True, False])
def test_model_predict(huggingface_model, C, return_channelwise_embeddings):
    example_input_array = torch.randint(
        low=0,
        high=255,
        size=(2, C, 256, 256),
        dtype=torch.uint8,
        device=huggingface_model.device,
    )
    huggingface_model.return_channelwise_embeddings = return_channelwise_embeddings
    embeddings = huggingface_model.predict(example_input_array)
    expected_output_dim = 384 * C if return_channelwise_embeddings else 384
    assert embeddings.shape == (2, expected_output_dim)
vit.py
ADDED
@@ -0,0 +1,309 @@
# © Recursion Pharmaceuticals 2024
import timm.models.vision_transformer as vit
import torch


def generate_2d_sincos_pos_embeddings(
    embedding_dim: int,
    length: int,
    scale: float = 10000.0,
    use_class_token: bool = True,
    num_modality: int = 1,
) -> torch.nn.Parameter:
    """
    Generate 2-dimensional sin/cos positional embeddings

    Parameters
    ----------
    embedding_dim : int
        embedding dimension used in vit
    length : int
        number of tokens along height or width of image after patching (assuming square)
    scale : float
        scale for sin/cos functions
    use_class_token : bool
        True - a zero vector is prepended to be added to the class_token, False - no vector added
    num_modality: number of modalities. If 1, a single modality is assumed;
        otherwise the sin/cos encoding is tiled once per modality.

    Returns
    -------
    positional_encoding : torch.Tensor
        positional encoding to add to vit patch encodings
        [num_modality*length*length, embedding_dim] or [1+num_modality*length*length, embedding_dim]
        (w/ or w/o cls_token)
    """

    linear_positions = torch.arange(length, dtype=torch.float32)
    height_mesh, width_mesh = torch.meshgrid(
        linear_positions, linear_positions, indexing="ij"
    )
    positional_dim = embedding_dim // 4  # accommodate h and w x cos and sin embeddings
    positional_weights = (
        torch.arange(positional_dim, dtype=torch.float32) / positional_dim
    )
    positional_weights = 1.0 / (scale**positional_weights)

    height_weights = torch.outer(height_mesh.flatten(), positional_weights)
    width_weights = torch.outer(width_mesh.flatten(), positional_weights)

    positional_encoding = torch.cat(
        [
            torch.sin(height_weights),
            torch.cos(height_weights),
            torch.sin(width_weights),
            torch.cos(width_weights),
        ],
        dim=1,
    )[None, :, :]

    # repeat positional encoding for multiple channel modalities
    positional_encoding = positional_encoding.repeat(1, num_modality, 1)

    if use_class_token:
        class_token = torch.zeros([1, 1, embedding_dim], dtype=torch.float32)
        positional_encoding = torch.cat([class_token, positional_encoding], dim=1)

    positional_encoding = torch.nn.Parameter(positional_encoding, requires_grad=False)

    return positional_encoding


class ChannelAgnosticPatchEmbed(vit.PatchEmbed):  # type: ignore[misc]
    def __init__(
        self,
        img_size: int,
        patch_size: int,
        embed_dim: int,
        bias: bool = True,
    ) -> None:
        super().__init__(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=1,  # in_chans is used by self.proj, which we override anyway
            embed_dim=embed_dim,
            norm_layer=None,
            flatten=False,
            bias=bias,
        )
        # channel-agnostic MAE has a single projection for all chans
        self.proj = torch.nn.Conv2d(
            1, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        in_chans = x.shape[1]
        x = torch.stack(
            [self.proj(x[:, i : i + 1]) for i in range(in_chans)], dim=2
        )  # the single projection is applied to each channel
        x = x.flatten(2).transpose(1, 2)  # BCMHW -> BNC
        return x


class ChannelAgnosticViT(vit.VisionTransformer):  # type: ignore[misc]
    def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
        # rewrite of https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L586
        to_cat = []
        if self.cls_token is not None:
            to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))

        # TODO: upgrade timm to get access to register tokens
        # if self.vit_backbone.reg_token is not None:
        #     to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))

        # MAIN DIFFERENCE from timm: we DYNAMICALLY ADD POS EMBEDDINGS based on the shape of the inputs.
        # This supports having CA-MAEs actually be channel-agnostic at inference time.
        if self.no_embed_class:
            x = x + self.pos_embed[:, : x.shape[1]]
            if to_cat:
                x = torch.cat(to_cat + [x], dim=1)
        else:
            if to_cat:
                x = torch.cat(to_cat + [x], dim=1)
            x = x + self.pos_embed[:, : x.shape[1]]
        return self.pos_drop(x)  # type: ignore[no-any-return]


def channel_agnostic_vit(
    vit_backbone: vit.VisionTransformer, max_in_chans: int
) -> vit.VisionTransformer:
    # replace patch embedding with channel-agnostic version
    vit_backbone.patch_embed = ChannelAgnosticPatchEmbed(
        img_size=vit_backbone.patch_embed.img_size[0],
        patch_size=vit_backbone.patch_embed.patch_size[0],
        embed_dim=vit_backbone.embed_dim,
    )

    # replace positional embedding with channel-agnostic version
    vit_backbone.pos_embed = generate_2d_sincos_pos_embeddings(
        embedding_dim=vit_backbone.embed_dim,
        length=vit_backbone.patch_embed.grid_size[0],
        use_class_token=vit_backbone.cls_token is not None,
        num_modality=max_in_chans,
    )

    # change the class to be ChannelAgnostic so that it actually uses the new _pos_embed
    vit_backbone.__class__ = ChannelAgnosticViT
    return vit_backbone


def sincos_positional_encoding_vit(
    vit_backbone: vit.VisionTransformer, scale: float = 10000.0
) -> vit.VisionTransformer:
    """Attaches no-grad sin-cos positional embeddings to a pre-constructed ViT backbone model.

    Parameters
    ----------
    vit_backbone : timm.models.vision_transformer.VisionTransformer
        the constructed vision transformer from timm
    scale : float (default 10000.0)
        hyperparameter for sincos positional embeddings; recommend keeping at 10,000

    Returns
    -------
    timm.models.vision_transformer.VisionTransformer
        the same ViT but with fixed no-grad positional encodings to add to vit patch encodings
    """
    # length: number of tokens along height or width of image after patching (assuming square)
    length = (
        vit_backbone.patch_embed.img_size[0] // vit_backbone.patch_embed.patch_size[0]
    )
    pos_embeddings = generate_2d_sincos_pos_embeddings(
        vit_backbone.embed_dim,
        length=length,
        scale=scale,
        use_class_token=vit_backbone.cls_token is not None,
    )
    # note, if the model had weight_init == 'skip', this might get overwritten
    vit_backbone.pos_embed = pos_embeddings
    return vit_backbone


def vit_small_patch16_256(**kwargs):
    default_kwargs = dict(
        img_size=256,
        in_chans=6,
        num_classes=0,
        fc_norm=None,
        class_token=True,
        drop_path_rate=0.1,
        init_values=0.0001,
        block_fn=vit.ParallelScalingBlock,
        qkv_bias=False,
        qk_norm=True,
    )
    for k, v in kwargs.items():
        default_kwargs[k] = v
    return vit.vit_small_patch16_224(**default_kwargs)


def vit_small_patch32_512(**kwargs):
    default_kwargs = dict(
        img_size=512,
        in_chans=6,
        num_classes=0,
        fc_norm=None,
        class_token=True,
        drop_path_rate=0.1,
        init_values=0.0001,
        block_fn=vit.ParallelScalingBlock,
        qkv_bias=False,
        qk_norm=True,
    )
    for k, v in kwargs.items():
        default_kwargs[k] = v
    return vit.vit_small_patch32_384(**default_kwargs)


def vit_base_patch8_256(**kwargs):
    default_kwargs = dict(
        img_size=256,
        in_chans=6,
        num_classes=0,
        fc_norm=None,
        class_token=True,
        drop_path_rate=0.1,
        init_values=0.0001,
        block_fn=vit.ParallelScalingBlock,
        qkv_bias=False,
        qk_norm=True,
    )
    for k, v in kwargs.items():
        default_kwargs[k] = v
    return vit.vit_base_patch8_224(**default_kwargs)


def vit_base_patch16_256(**kwargs):
    default_kwargs = dict(
        img_size=256,
        in_chans=6,
        num_classes=0,
        fc_norm=None,
        class_token=True,
        drop_path_rate=0.1,
        init_values=0.0001,
        block_fn=vit.ParallelScalingBlock,
        qkv_bias=False,
        qk_norm=True,
    )
    for k, v in kwargs.items():
        default_kwargs[k] = v
    return vit.vit_base_patch16_224(**default_kwargs)


def vit_base_patch32_512(**kwargs):
    default_kwargs = dict(
        img_size=512,
        in_chans=6,
        num_classes=0,
        fc_norm=None,
        class_token=True,
        drop_path_rate=0.1,
        init_values=0.0001,
        block_fn=vit.ParallelScalingBlock,
        qkv_bias=False,
        qk_norm=True,
    )
    for k, v in kwargs.items():
        default_kwargs[k] = v
    return vit.vit_base_patch32_384(**default_kwargs)


def vit_large_patch8_256(**kwargs):
    default_kwargs = dict(
        img_size=256,
        in_chans=6,
        num_classes=0,
        fc_norm=None,
        class_token=True,
        patch_size=8,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        drop_path_rate=0.3,
        init_values=0.0001,
        block_fn=vit.ParallelScalingBlock,
        qkv_bias=False,
        qk_norm=True,
    )
    for k, v in kwargs.items():
        default_kwargs[k] = v
    return vit.VisionTransformer(**default_kwargs)


def vit_large_patch16_256(**kwargs):
    default_kwargs = dict(
        img_size=256,
        in_chans=6,
        num_classes=0,
        fc_norm=None,
        class_token=True,
        drop_path_rate=0.3,
        init_values=0.0001,
        block_fn=vit.ParallelScalingBlock,
        qkv_bias=False,
        qk_norm=True,
    )
    for k, v in kwargs.items():
        default_kwargs[k] = v
    return vit.vit_large_patch16_384(**default_kwargs)
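
Putting the pieces of this file together: `channel_agnostic_vit` swaps in the per-channel patch embedding and the modality-tiled sin/cos positions, after which the same weights accept any channel count up to `max_in_chans`. A hedged example (no pretrained weights; variable names and shapes are ours):

# Hypothetical wiring example.
import torch

from vit import channel_agnostic_vit, vit_small_patch16_256

backbone = vit_small_patch16_256()                 # 256x256 inputs, patch size 16
ca_backbone = channel_agnostic_vit(backbone, max_in_chans=6)

for C in (3, 6):                                   # channel counts up to max_in_chans
    imgs = torch.rand(2, C, 256, 256)
    feats = ca_backbone.forward_features(imgs)
    print(C, feats.shape)                          # (2, 1 + C * 256, 384) with the class token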
vit_encoder.py
ADDED
@@ -0,0 +1,61 @@
# © Recursion Pharmaceuticals 2024
from typing import Dict

import timm.models.vision_transformer as vit
import torch


def build_imagenet_baselines() -> Dict[str, torch.jit.ScriptModule]:
    """This returns the prepped imagenet encoders from timm, not bad for microscopy data."""
    vit_backbones = [
        _make_vit(vit.vit_small_patch16_384),
        _make_vit(vit.vit_base_patch16_384),
        _make_vit(vit.vit_base_patch8_224),
        _make_vit(vit.vit_large_patch16_384),
    ]
    model_names = [
        "vit_small_patch16_384",
        "vit_base_patch16_384",
        "vit_base_patch8_224",
        "vit_large_patch16_384",
    ]
    imagenet_encoders = list(map(_make_torchscripted_encoder, vit_backbones))
    return {name: model for name, model in zip(model_names, imagenet_encoders)}


def _make_torchscripted_encoder(vit_backbone) -> torch.jit.ScriptModule:
    dummy_input = torch.testing.make_tensor(
        (2, 6, 256, 256),
        low=0,
        high=255,
        dtype=torch.uint8,
        device=torch.device("cpu"),
    )
    encoder = torch.nn.Sequential(
        Normalizer(),
        torch.nn.LazyInstanceNorm2d(
            affine=False, track_running_stats=False
        ),  # this module performs self-standardization, very important
        vit_backbone,
    ).to(device="cpu")
    _ = encoder(dummy_input)  # get those lazy modules built
    return torch.jit.freeze(torch.jit.script(encoder.eval()))


def _make_vit(constructor):
    return constructor(
        pretrained=True,  # download imagenet weights
        img_size=256,  # 256x256 crops
        in_chans=6,  # we expect 6-channel microscopy images
        num_classes=0,
        fc_norm=None,
        class_token=True,
        global_pool="avg",  # minimal perf diff btwn "cls" and "avg"
    )


class Normalizer(torch.nn.Module):
    def forward(self, pixels: torch.Tensor) -> torch.Tensor:
        pixels = pixels.float()
        pixels /= 255.0
        return pixels
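
For completeness, a hedged usage example for the ImageNet baselines above; `build_imagenet_baselines` downloads the pretrained timm weights, so this assumes network access:

# Hypothetical usage; the pooled output dim depends on the backbone (e.g. 384 for vit_small_patch16_384).
import torch

from vit_encoder import build_imagenet_baselines

encoders = build_imagenet_baselines()  # dict: model name -> frozen TorchScript encoder
imgs = torch.randint(0, 255, (2, 6, 256, 256), dtype=torch.uint8)
with torch.no_grad():
    for name, encoder in encoders.items():
        print(name, encoder(imgs).shape)  # e.g. vit_small_patch16_384 torch.Size([2, 384])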