GuanghaoZhu committed
Commit ccc960e · verified · 1 Parent(s): ec8728a

update model and InfiMed.py

InfiMed.py CHANGED
@@ -181,14 +181,16 @@ class InfiMed(PreTrainedModel):
         if vision_model is not None:
             self.vision_model = vision_model
         else:
-            self.vision_model = SiglipVisionModel.from_pretrained(config.vision_config._name_or_path, hidden_act = "gelu")
+            # self.vision_model = SiglipVisionModel.from_pretrained(config.vision_config._name_or_path, hidden_act = "gelu")
+            self.vision_model = SiglipVisionModel(config.vision_config)

         if language_model is not None:
             self.language_model = language_model
             self.config.llm_config = language_model.config
         else:
             if config.llm_config.architectures[0] == 'Qwen3ForCausalLM':
-                self.language_model = Qwen3ForCausalLM.from_pretrained(config.llm_config._name_or_path, pad_token_id = 151670, bos_token_id = 128245, eos_token_id = 151645, tie_word_embeddings = False)
+                # self.language_model = Qwen3ForCausalLM.from_pretrained(config.llm_config._name_or_path, pad_token_id = 151670, bos_token_id = 128245, eos_token_id = 151645, tie_word_embeddings = False)
+                self.language_model = Qwen3ForCausalLM(config.llm_config)
             else:
                 raise NotImplementedError(
                     f'{config.llm_config.architectures[0]} is not implemented.')
@@ -520,3 +522,4 @@ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX
     return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)
     raise ValueError(f'Unsupported tensor type: {return_tensors}')
     return input_ids, labels
+
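The substantive change above: the constructor now builds the vision and language submodules from the sub-configs embedded in config.json rather than calling `from_pretrained` on the recorded `_name_or_path` directories; their weights are then filled in from this repo's safetensors shards when the wrapper is loaded with `InfiMed.from_pretrained(...)`. A minimal sketch of that pattern, assuming only the standard `transformers` API (the tiny sizes below are placeholders, not the repo's real values such as `hidden_size: 1152`):

```Python
# Sketch only, not the repository's code: build a submodule from a config
# object instead of from a checkpoint directory. The weights created here
# are random; in InfiMed they are overwritten by the tensors stored in this
# repo's model-*.safetensors shards when InfiMed.from_pretrained(...) runs.
from transformers import SiglipVisionConfig, SiglipVisionModel

tiny_vision_config = SiglipVisionConfig(  # placeholder sizes for illustration
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    image_size=384,
    patch_size=14,
)
vision_model = SiglipVisionModel(tiny_vision_config)
print(sum(p.numel() for p in vision_model.parameters()))  # shapes exist, values are untrained
```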
README.md CHANGED
@@ -1,3 +1,89 @@
- ---
- license: mit
- ---
+ ---
+ license: apache-2.0
+ language:
+ - en
+ - zh
+ base_model:
+ - google/siglip-so400m-patch14-384
+ - Qwen/Qwen3-4B
+ ---
+ ## Introduction
+
+ InfiMed-4B is a medical Multimodal Large Language Model (MLLM) developed by the InfiXAI team. Our model outperforms HuatuoGPT-V-7B and MedGemma-4B-IT. The goal of InfiMed-4B is to provide a high-performance medical MLLM that is accessible and affordable to a broad audience. We invite you to explore its capabilities and to contact us with any questions or opportunities.
+
+ ## Model Card
+
+ ### Model Architecture:
+
+ | Architecture | ViT | LLM | Adapter | Resolution |
+ | --- | --- | --- | --- | --- |
+ | 🤗InfiMed-4B | [🤗siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | [🤗Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) | 2-layer MLP | 384x384xN |
+
+ ## Evaluation
+
+ InfiMed-4B not only outperforms HuatuoGPT-V-7B and MedGemma-4B-IT, but is also competitive with recently released SoTA models.
+
+
+ ### Detailed Evaluations:
+
+ | Model | Size | MMMU-Med | VQA-RAD | SLAKE | PathVQA | PMC-VQA | OMVQA | MedXVQA | Avg. |
+ |---------------------|------|----------|---------|-------|---------|---------|-------|---------|-------|
+ | **Proprietary Models** | | | | | | | | | |
+ | GPT-5 | | 83.6 | 67.8 | 78.1 | 52.8 | 60.0 | 76.4 | 71.0 | 70.0 |
+ | GPT-5-mini | | 80.5 | 66.3 | 76.1 | 52.4 | 57.6 | 70.9 | 60.1 | 66.3 |
+ | GPT-5-nano | | 74.1 | 55.4 | 69.3 | 45.4 | 51.3 | 66.5 | 45.1 | 58.2 |
+ | GPT-4.1 | | 75.2 | 65.0 | 72.2 | 55.5 | 55.2 | 75.5 | 45.2 | 63.4 |
+ | Claude Sonnet 4 | | 74.6 | 67.6 | 70.6 | 54.2 | 54.4 | 65.5 | 43.3 | 61.5 |
+ | Gemini-2.5-Flash | | 76.9 | 68.5 | 75.8 | 55.4 | 55.4 | 71.0 | 52.8 | 65.1 |
+ | **General Open-source Models** | | | | | | | | | |
+ | Qwen2.5VL-3B | 3B | 51.3 | 56.8 | 63.2 | 37.1 | 50.6 | 64.5 | 20.7 | 49.2 |
+ | Qwen2.5VL-7B | 7B | 50.6 | 64.5 | 67.2 | 44.1 | 51.9 | 63.6 | 22.3 | 52.0 |
+ | InternVL3-8B | 8B | 59.2 | 65.4 | 72.8 | 48.6 | 53.8 | 79.1 | 22.4 | 57.3 |
+ | **Medical Open-source Models** | | | | | | | | | |
+ | MedGemma-4B-IT | 4B | 43.7 | 72.5 | 76.4 | 48.8 | 49.9 | 69.8 | 22.3 | 54.8 |
+ | LLaVA-Med-7B | 7B | 29.3 | 53.7 | 48.0 | 38.8 | 30.5 | 44.3 | 20.3 | 37.8 |
+ | HuatuoGPT-V-7B | 7B | 47.3 | 67.0 | 67.8 | 48.0 | 53.3 | 74.2 | 21.6 | 54.2 |
+ | Lingshu-7B | 7B | 54.0 | 67.9 | 83.1 | 61.9 | 56.3 | 82.9 | 26.7 | 61.8 |
+ | BioMediX2-8B | 8B | 39.8 | 49.2 | 57.7 | 37.0 | 43.5 | 63.3 | 21.8 | 44.6 |
+ | Infi-Med-1.7B | 1.7B | 34.7 | 56.3 | 75.3 | 60.7 | 48.1 | 58.9 | 21.8 | 50.8 |
+ | Infi-Med-4B | 4B | 43.3 | 57.9 | 77.7 | 63.4 | 56.6 | 76.8 | 21.9 | 56.4 |
+
+
+ ### Code:
+
+ ```Python
+ from InfiMed import InfiMed
+ from PIL import Image
+ import torch
+
+ # Define the path to the pretrained checkpoint
+ pretrained_model_path = "."
+
+ # Load the model from the pretrained checkpoint
+ model = InfiMed.from_pretrained(pretrained_model_path, device_map="auto", torch_dtype=torch.bfloat16)
+
+ image_path = "" # Replace with the path to your image file
+ image = Image.open(image_path).convert("RGB") # Ensure the image is in RGB format
+
+ # Prepare input messages
+ messages = {
+     "prompt": "What modality is used to take this image?",
+     "image": image  # Pass a PIL image here, or set to None for a text-only prompt
+ }
+
+ # Generate output
+ output_text = model.generate_output(messages)
+
+ # Print the result
+ print("Model Response:", output_text)
+
+ ```
+ <br>
+
+ ## Acknowledgements
+
+ Our model is built upon numerous outstanding open-source projects, and we are grateful for their contributions. We extend special thanks to the Google team and the Qwen team for their excellent base models.
+
+ ## License
+
+ This project is licensed under [Apache License 2.0](LICENSE).
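The Model Card above lists a 2-layer MLP adapter between the SigLIP vision tower and Qwen3-4B. A minimal illustrative sketch of such a projector, not the exact module in InfiMed.py (the SigLIP hidden size of 1152 comes from this repo's config.json; the Qwen3-4B hidden size of 2560 and the GELU activation are assumptions):

```Python
import torch
import torch.nn as nn

class MLPAdapter(nn.Module):
    """2-layer MLP mapping SigLIP patch features into the LLM embedding space (sketch)."""

    def __init__(self, vision_hidden: int = 1152, llm_hidden: int = 2560):
        super().__init__()
        self.fc1 = nn.Linear(vision_hidden, llm_hidden)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(llm_hidden, llm_hidden)

    def forward(self, patch_features: torch.Tensor) -> torch.Tensor:
        # patch_features: (batch, num_patches, vision_hidden)
        return self.fc2(self.act(self.fc1(patch_features)))

# A 384x384 crop with patch size 14 yields 27x27 = 729 patch tokens;
# the 384x384xN resolution in the table presumably means N such crops per image.
adapter = MLPAdapter()
tokens = adapter(torch.randn(1, 729, 1152))
print(tokens.shape)  # torch.Size([1, 729, 2560])
```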
config.json CHANGED
@@ -3,10 +3,13 @@
     "InfiMed"
   ],
   "llm_config": {
-    "_name_or_path": "/lustre/projects/polyullm/models/Qwen3/Qwen3-4B",
+    "_name_or_path": ".",
     "architectures": [
       "Qwen3ForCausalLM"
     ],
+    "pad_token_id": 151670,
+    "bos_token_id": 128245,
+    "eos_token_id": 151645,
     "attention_bias": false,
     "attention_dropout": 0.0,
     "bos_token_id": 151643,
@@ -30,7 +33,8 @@
     "torch_dtype": "bfloat16",
     "use_cache": true,
     "use_sliding_window": false,
-    "vocab_size": 151936
+    "vocab_size": 151936,
+    "tie_word_embeddings": false
   },
   "load_precision": "bf16",
   "max_length": 32,
@@ -47,10 +51,11 @@
   ],
   "transformers_version": "4.52.4",
   "vision_config": {
-    "_name_or_path": "/home/projects/polyullm/guanghao/train_code/siglip-so400m-patch14-384",
+    "_name_or_path": ".",
     "architectures": [
       "SiglipModel"
     ],
+    "hidden_act": "gelu",
     "attention_dropout": 0.0,
     "hidden_act": "gelu_pytorch_tanh",
     "hidden_size": 1152,
@@ -67,4 +72,4 @@
   },
   "wandb_entity": null,
   "wandb_project": "mmpretrain"
- }
+ }
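Besides swapping the `_name_or_path` entries for `.`, the commit adds special-token ids and `tie_word_embeddings` to `llm_config` and a `hidden_act` entry to `vision_config`. A small sketch, assuming a local clone of this repo as the working directory, that reads the nested sub-config the updated InfiMed.py now consumes directly:

```Python
# Sketch only: load config.json and rebuild the nested LLM sub-config that
# Qwen3ForCausalLM(config.llm_config) receives in the updated InfiMed.py.
import json
from transformers import Qwen3Config

with open("config.json") as f:  # assumes a local clone of this repo
    cfg = json.load(f)

llm_cfg = Qwen3Config(**cfg["llm_config"])
print(llm_cfg.pad_token_id)                 # 151670, added in this commit
print(llm_cfg.tie_word_embeddings)          # False: lm_head weights stored as their own tensor
print(cfg["vision_config"]["hidden_size"])  # 1152 (siglip-so400m-patch14-384)
```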
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1031cef38e5bc62149fd5d1b68bd67d00d7d723a82180c97fd17090c70e63449
+ oid sha256:69b2009323a7164d5b24aa2829f27dd67a64c588444a37220a976e17bc2ab9a3
  size 4966471968
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d3b8d072383149ce00cc8183badf0f5abf42ee7fa2e4a795dcfed5208e18b438
+ oid sha256:cbc5e171fac4c829b458fd9eb6e6233444209a40bb3f5d5963aa481b4880dadc
  size 4731957576
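Both weight shards are stored through Git LFS, so the diffs above only show their pointer files (sha256 oid and byte size). A small sketch for checking a downloaded shard against the updated pointer, using only the values visible above:

```Python
# Verify a downloaded shard against the sha256 oid recorded in its LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# oid from the new pointer of model-00001-of-00002.safetensors above
expected = "69b2009323a7164d5b24aa2829f27dd67a64c588444a37220a976e17bc2ab9a3"
print(sha256_of("model-00001-of-00002.safetensors") == expected)
```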