task_categories:
- visual-question-answering
language:
- en
tags:
- gguf
- remyx
- SpatialReasoning
- spatial-reasoning
- test-time-compute
- thinking
- reasoning
- multimodal
- vlm
- vision-language
- distance-estimation
- quantitative-spatial-reasoning
pretty_name: SpaceOm-GGUF
license: apache-2.0
datasets:
- remyxai/SpaceThinker
base_model:
- remyxai/SpaceOm
pipeline_tag: image-text-to-text
library_name: llama.cpp
model-index:
- name: SpaceOm
results:
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: 3DSRBench
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.5419
results_by_subcategory:
- name: 3D Positional Relation / Orientation
success_rate: 0.4877
- name: Object Localization / 3D Localization
success_rate: 0.6337
- name: Object Properties / Size
success_rate: 0.5043
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: BLINK
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.599
results_by_subcategory:
- name: 3D Positional Relation / Orientation
success_rate: 0.7972
- name: Counting / Object Counting
success_rate: 0.6167
- name: Depth and Distance / Relative
success_rate: 0.621
- name: Object Localization / 2D Localization
success_rate: 0.582
- name: Point and Object Tracking / Point Correspondence
success_rate: 0.3779
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: MMIU
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.388
results_by_subcategory:
- name: Camera and Image Transformation / 2D Transformation
success_rate: 0.255
- name: Camera and Image Transformation / 3D Camera Pose
success_rate: 0.4
- name: Camera and Image Transformation / Camera Motion
success_rate: 0.4436
- name: Depth and Distance / Absolute
success_rate: 0.265
- name: Object Localization / 3D Localization
success_rate: 0.3625
- name: Point and Object Tracking / 3D Tracking
success_rate: 0.725
- name: Point and Object Tracking / Point Correspondence
success_rate: 0.265
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: MMVP
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.5833
results_by_subcategory:
- name: Others / Miscellaneous
success_rate: 0.5833
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: QSpatialBench-Plus
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.4455
results_by_subcategory:
- name: Depth and Distance / Absolute
success_rate: 0.4455
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: QSpatialBench-ScanNet
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.4876
results_by_subcategory:
- name: Depth and Distance / Absolute
success_rate: 0.464
- name: Object Properties / Size
success_rate: 0.5111
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: RealWorldQA
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.6105
results_by_subcategory:
- name: Others / Miscellaneous
success_rate: 0.6105
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: SpatialSense
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.7043
results_by_subcategory:
- name: 3D Positional Relation / Orientation
success_rate: 0.7043
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: VGBench
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.3504
results_by_subcategory:
- name: Camera and Image Transformation / 2D Transformation
success_rate: 0.2568
- name: Camera and Image Transformation / 3D Camera Pose
success_rate: 0.4371
- name: Depth and Distance / Absolute
success_rate: 0.3339
- name: Depth and Distance / Relative
success_rate: 0.32
- name: Object Localization / 3D Localization
success_rate: 0.4283
- name: Point and Object Tracking / 3D Tracking
success_rate: 0.3264
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: VSI-Bench_8
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.2558
results_by_subcategory:
- name: 3D Positional Relation / Orientation
success_rate: 0.3998
- name: Counting / Object Counting
success_rate: 0.229
- name: Depth and Distance / Absolute
success_rate: 0.1562
- name: Depth and Distance / Relative
success_rate: 0.3648
- name: Object Properties / Size
success_rate: 0.1645
- name: Others / Miscellaneous
success_rate: 0.2204
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: VSR-ZeroShot
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.8085
results_by_subcategory:
- name: 3D Positional Relation / Orientation
success_rate: 0.8085
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: cvbench
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.6839
results_by_subcategory:
- name: Counting / Object Counting
success_rate: 0.6294
- name: Depth and Distance / Relative
success_rate: 0.7408
- name: Object Localization / 3D Localization
success_rate: 0.6815
- task:
type: visual-question-answering
name: Spatial Reasoning
dataset:
name: spatialbench
type: benchmark
metrics:
- type: success_rate
name: Overall Success Rate
value: 0.6553
results_by_subcategory:
- name: 3D Positional Relation / Orientation
success_rate: 0.6765
- name: Counting / Object Counting
success_rate: 0.75
- name: Object Properties / Existence
success_rate: 0.925
- name: Object Properties / Reachability
success_rate: 0.55
- name: Object Properties / Size
success_rate: 0.375
# SpaceOm

- **Model creator:** remyxai
- **Original model:** SpaceOm
- **GGUF quantization:** llama.cpp, commit `2baf07727f921d9a4a1b63a2eff941e95d0488ed`
## Description
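This repository provides GGUF quantizations of SpaceOm for use with llama.cpp. As a minimal, unofficial sketch, the quantized weights can be queried through llama.cpp's OpenAI-compatible `llama-server`; the snippet below assumes a server is already running on port 8080 with this GGUF and its vision projector loaded, and the filenames, image, and question are illustrative.

```python
# Minimal sketch: query a llama.cpp server hosting this GGUF via its
# OpenAI-compatible API. Assumes `llama-server` is already running on
# localhost:8080 with the quantized model and its vision projector loaded.
# The image path and question are illustrative.
import base64
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

# Send the image inline as a base64 data URL.
with open("scene.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="SpaceOm",  # llama-server serves whatever model it was started with
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            {"type": "text",
             "text": "How far apart are the chair and the table, in meters?"},
        ],
    }],
    max_tokens=512,
)
print(response.choices[0].message.content)
```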

## Model Overview
SpaceOm improves over SpaceThinker by adding:
- the target module `o_proj` in LoRA fine-tuning
- the SpaceOm dataset for longer reasoning traces
- the Robo2VLM-Reasoning dataset for more robotics-domain and MCVQA examples

The choice to include `o_proj` among the target modules in LoRA fine-tuning was inspired by the study here, which argues for the importance of this module in reasoning models.
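For illustration only, a LoRA setup that includes `o_proj` among the adapted modules might look like the sketch below using Hugging Face `peft`; the rank, alpha, dropout, and exact module list are assumptions, not the actual SpaceOm training configuration.

```python
# Illustrative only: a peft LoRA config that adds o_proj to the adapted
# modules. Hyperparameters and the module list are assumptions, not the
# actual SpaceOm training recipe.
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,                # adapter rank (assumed)
    lora_alpha=32,       # scaling factor (assumed)
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "o_proj",        # attention output projection discussed above
    ],
    task_type="CAUSAL_LM",
)
```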
The reasoning traces in the SpaceThinker dataset average roughly 200 "thinking" tokens, so the training data now includes longer reasoning traces to help the model spend more tokens on reasoning.
To improve alignment for robotics applications, we also trained on synthetic reasoning traces derived from the Robo2VLM-1 dataset.
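Because the model is trained to spend more tokens "thinking", downstream code typically separates the reasoning trace from the final answer. The helper below is a hypothetical sketch that assumes SpaceThinker-style `<think>...</think>` and `<answer>...</answer>` tags; adjust the tag names to whatever format your prompt template actually produces.

```python
# Hypothetical helper: split a response into its reasoning trace and final
# answer, assuming SpaceThinker-style <think>/<answer> tags. If the model's
# output format differs, change the tag names accordingly.
import re

def split_reasoning(output: str) -> tuple[str, str]:
    think = re.search(r"<think>(.*?)</think>", output, re.DOTALL)
    answer = re.search(r"<answer>(.*?)</answer>", output, re.DOTALL)
    return (
        think.group(1).strip() if think else "",
        answer.group(1).strip() if answer else output.strip(),
    )

thinking, answer = split_reasoning(
    "<think>The sofa spans roughly two floor tiles, each about 0.6 m...</think>"
    "<answer>approximately 1.2 meters</answer>"
)
print(answer)  # -> approximately 1.2 meters
```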
## Model Evaluation

### SpatialScore - 3B and 4B models

| Model | Overall | Count. | Obj.-Loc. | Pos.-Rel. | Dist. | Obj.-Prop. | Cam.&IT. | Tracking | Others |
|---|---|---|---|---|---|---|---|---|---|
| SpaceQwen2.5-VL-3B | 42.31 | 45.01 | 49.78 | 57.88 | 27.36 | 34.11 | 26.34 | 26.44 | 43.58 |
| SpatialBot-Phi2-3B | 41.65 | 53.23 | 54.32 | 55.40 | 27.12 | 26.10 | 24.21 | 27.57 | 41.66 |
| Kimi-VL-3B | 51.48 | 49.22 | 61.99 | 61.34 | 38.27 | 46.74 | 33.75 | 56.28 | 47.23 |
| Kimi-VL-3B-Thinking | 52.60 | 52.66 | 58.93 | 63.28 | 39.38 | 42.57 | 32.00 | 46.97 | 42.73 |
| Qwen2.5-VL-3B | 47.90 | 46.62 | 55.55 | 62.23 | 32.39 | 32.97 | 30.66 | 36.90 | 42.19 |
| InternVL2.5-4B | 49.82 | 53.32 | 62.02 | 62.02 | 32.80 | 27.00 | 32.49 | 37.02 | 48.95 |
| SpaceOm (3B) | 49.00 | 56.00 | 54.00 | 65.00 | 41.00 | 50.00 | 36.00 | 42.00 | 47.00 |
See all results from evaluating SpaceOm on the SpatialScore benchmark.

Compared to SpaceQwen, this model performs better in all categories. A category-level comparison against SpaceThinker is also available.
### SpaCE-10 Benchmark Comparison

This table compares SpaceOm, evaluated using GPT scoring, against several top models from the SpaCE-10 benchmark leaderboard. Top scores in each category are bolded.

| Model | EQ | SQ | SA | OO | OS | EP | FR | SP | Source |
|---|---|---|---|---|---|---|---|---|---|
| SpaceOm | 32.47 | 24.81 | **47.63** | **50.00** | **32.52** | 9.12 | 37.04 | 25.00 | GPT Eval |
| Qwen2.5-VL-7B-Instruct | 32.70 | 31.00 | 41.30 | 32.10 | 27.60 | 15.40 | 26.30 | 27.50 | Table |
| LLaVA-OneVision-7B | **37.40** | 36.20 | 42.90 | 44.20 | 27.10 | 11.20 | **45.60** | 27.20 | Table |
| VILA1.5-7B | 30.20 | **38.60** | 39.90 | 44.10 | 16.50 | **35.10** | 30.10 | **37.60** | Table |
| InternVL2.5-4B | 34.30 | 34.40 | 43.60 | 44.60 | 16.10 | 30.10 | 33.70 | 36.70 | Table |
Legend:
- EQ: Entity Quantification
- SQ: Scene Quantification
- SA: Size Assessment
- OO: Object-Object spatial relations
- OS: Object-Scene spatial relations
- EP: Entity Presence
- FR: Functional Reasoning
- SP: Spatial Planning
> ℹ️ Note: Scores for SpaceOm are generated via `gpt_eval_score` on single-choice (`*-single`) versions of the SpaCE-10 benchmark tasks. Other entries reflect leaderboard accuracy scores from the official SpaCE-10 evaluation table.
Read more about the SpaCE-10 benchmark
## Limitations

- Performance may degrade in cluttered environments or under unusual camera perspectives.
- This model was fine-tuned using synthetic reasoning over an internet image dataset.
- Multimodal biases inherent to the base model (Qwen2.5-VL) may persist.
- Not intended for use in safety-critical or legal decision-making.
Users are encouraged to evaluate outputs critically and to consider fine-tuning for domain-specific safety and performance. Distance estimates produced by an autoregressive transformer can support higher-order reasoning for planning and behavior, but they are not a substitute for measurements from high-precision sensors, calibrated stereo vision systems, or specialist monocular depth estimation models that deliver more accurate, pixel-wise predictions in real time.
## Citation

@article{chen2024spatialvlm,
  title   = {SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities},
  author  = {Chen, Boyuan and Xu, Zhuo and Kirmani, Sean and Ichter, Brian and Driess, Danny and Florence, Pete and Sadigh, Dorsa and Guibas, Leonidas and Xia, Fei},
  journal = {arXiv preprint arXiv:2401.12168},
  year    = {2024},
  url     = {https://arxiv.org/abs/2401.12168}
}

@misc{qwen2.5-VL,
  title  = {Qwen2.5-VL},
  url    = {https://qwenlm.github.io/blog/qwen2.5-vl/},
  author = {Qwen Team},
  month  = {January},
  year   = {2025}
}

@misc{vl-thinking2025,
  title        = {SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models},
  author       = {Hardy Chen and Haoqin Tu and Fali Wang and Hui Liu and Xianfeng Tang and Xinya Du and Yuyin Zhou and Cihang Xie},
  year         = {2025},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/UCSC-VLAA/VLAA-Thinking}}
}

@article{wu2025spatialscore,
  author  = {Wu, Haoning and Huang, Xiao and Chen, Yaohui and Zhang, Ya and Wang, Yanfeng and Xie, Weidi},
  title   = {SpatialScore: Towards Unified Evaluation for Multimodal Spatial Understanding},
  journal = {arXiv preprint arXiv:2505.17012},
  year    = {2025}
}

@article{gong2025space10,
  title   = {SpaCE-10: A Comprehensive Benchmark for Multimodal Large Language Models in Compositional Spatial Intelligence},
  author  = {Ziyang Gong and Wenhao Li and Oliver Ma and Songyuan Li and Jiayi Ji and Xue Yang and Gen Luo and Junchi Yan and Rongrong Ji},
  journal = {arXiv preprint arXiv:2506.07966},
  year    = {2025},
  url     = {https://arxiv.org/abs/2506.07966}
}