FineVision

Running

App Files Files Community

FineVision / app /src /content /bibliography.bib

lusxvr

added citation

5c744c8 about 1 month ago

raw

history blame contribute delete

192 kB


	@article{li_densefusion-1m_2024,
	title = {Densefusion-1m: Merging vision experts for comprehensive multimodal perception},
	volume = {37},
	url = {https://proceedings.neurips.cc/paper_files/paper/2024/hash/20ffc2b42c7de4a1960cfdadf305bbe2-Abstract-Datasets_and_Benchmarks_Track.html},
	shorttitle = {Densefusion-1m},
	pages = {18535--18556},
	journaltitle = {Advances in Neural Information Processing Systems},
	author = {Li, Xiaotong and Zhang, Fan and Diao, Haiwen and Wang, Yueze and Wang, Xinlong and Duan, Lingyu},
	urldate = {2025-08-28},
	date = {2024},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/ZVH2EVDN/Li et al. - 2024 - Densefusion-1m Merging vision experts for comprehensive multimodal perception.pdf:application/pdf},
	}

	@online{noauthor_hazal-karakusmscoco-controlnet,
	title = {hazal-karakus/mscoco-controlnet-canny-less-colors · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/hazal-karakus/mscoco-controlnet-canny-less-colors/viewer},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-08-28},
	file = {Snapshot:/Users/luis/Zotero/storage/ZWV2Q5QT/viewer.html:text/html},
	}

	@misc{pi_image_2024,
	title = {Image Textualization: An Automatic Framework for Creating Accurate and Detailed Image Descriptions},
	url = {http://arxiv.org/abs/2406.07502},
	doi = {10.48550/arXiv.2406.07502},
	shorttitle = {Image Textualization},
	abstract = {Image description datasets play a crucial role in the advancement of various applications such as image understanding, text-to-image generation, and text-image retrieval. Currently, image description datasets primarily originate from two sources. One source is the scraping of image-text pairs from the web. Despite their abundance, these descriptions are often of low quality and noisy. Another is through human labeling. Datasets such as {COCO} are generally very short and lack details. Although detailed image descriptions can be annotated by humans, the high annotation cost limits the feasibility. These limitations underscore the need for more efficient and scalable methods to generate accurate and detailed image descriptions. In this paper, we propose an innovative framework termed Image Textualization ({IT}), which automatically produces high-quality image descriptions by leveraging existing multi-modal large language models ({MLLMs}) and multiple vision expert models in a collaborative manner, which maximally convert the visual information into text. To address the current lack of benchmarks for detailed descriptions, we propose several benchmarks for comprehensive evaluation, which verifies the quality of image descriptions created by our framework. Furthermore, we show that {LLaVA}-7B, benefiting from training on {IT}-curated descriptions, acquire improved capability to generate richer image descriptions, substantially increasing the length and detail of their output with less hallucination.},
	number = {{arXiv}:2406.07502},
	publisher = {{arXiv}},
	author = {Pi, Renjie and Zhang, Jianshu and Zhang, Jipeng and Pan, Rui and Chen, Zhekai and Zhang, Tong},
	urldate = {2025-08-28},
	date = {2024-06-11},
	eprinttype = {arxiv},
	eprint = {2406.07502 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/TBAG4PFW/Pi et al. - 2024 - Image Textualization An Automatic Framework for Creating Accurate and Detailed Image Descriptions.pdf:application/pdf},
	}

	@incollection{vedaldi_connecting_2020,
	location = {Cham},
	title = {Connecting Vision and Language with Localized Narratives},
	volume = {12350},
	isbn = {978-3-030-58557-0 978-3-030-58558-7},
	url = {https://link.springer.com/10.1007/978-3-030-58558-7_38},
	pages = {647--664},
	booktitle = {Computer Vision – {ECCV} 2020},
	publisher = {Springer International Publishing},
	author = {Pont-Tuset, Jordi and Uijlings, Jasper and Changpinyo, Soravit and Soricut, Radu and Ferrari, Vittorio},
	editor = {Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael},
	urldate = {2025-08-28},
	date = {2020},
	langid = {english},
	doi = {10.1007/978-3-030-58558-7_38},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/JBX9L6E7/Pont-Tuset et al. - 2020 - Connecting Vision and Language with Localized Narratives.pdf:application/pdf},
	}

	@online{noauthor_sharegpt-4o_nodate,
	title = {{ShareGPT}-4o},
	url = {https://sharegpt4o.github.io/},
	urldate = {2025-08-28},
	file = {ShareGPT-4o:/Users/luis/Zotero/storage/ALYF2GT3/sharegpt4o.github.io.html:text/html},
	}

	@incollection{vedaldi_textcaps_2020,
	location = {Cham},
	title = {{TextCaps}: A Dataset for Image Captioning with Reading Comprehension},
	volume = {12347},
	isbn = {978-3-030-58535-8 978-3-030-58536-5},
	url = {https://link.springer.com/10.1007/978-3-030-58536-5_44},
	shorttitle = {{TextCaps}},
	pages = {742--758},
	booktitle = {Computer Vision – {ECCV} 2020},
	publisher = {Springer International Publishing},
	author = {Sidorov, Oleksii and Hu, Ronghang and Rohrbach, Marcus and Singh, Amanpreet},
	editor = {Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael},
	urldate = {2025-08-28},
	date = {2020},
	langid = {english},
	doi = {10.1007/978-3-030-58536-5_44},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/4VWQHTEL/Sidorov et al. - 2020 - TextCaps A Dataset for Image Captioning with Reading Comprehension.pdf:application/pdf},
	}

	@misc{kantharaj_chart-text_2022,
	title = {Chart-to-Text: A Large-Scale Benchmark for Chart Summarization},
	url = {http://arxiv.org/abs/2203.06486},
	doi = {10.48550/arXiv.2203.06486},
	shorttitle = {Chart-to-Text},
	abstract = {Charts are commonly used for exploring data and communicating insights. Generating natural language summaries from charts can be very helpful for people in inferring key insights that would otherwise require a lot of cognitive and perceptual efforts. We present Chart-to-text, a large-scale benchmark with two datasets and a total of 44,096 charts covering a wide range of topics and chart types. We explain the dataset construction process and analyze the datasets. We also introduce a number of state-of-the-art neural models as baselines that utilize image captioning and data-to-text generation techniques to tackle two problem variations: one assumes the underlying data table of the chart is available while the other needs to extract data from chart images. Our analysis with automatic and human evaluation shows that while our best models usually generate fluent summaries and yield reasonable {BLEU} scores, they also suffer from hallucinations and factual errors as well as difficulties in correctly explaining complex patterns and trends in charts.},
	number = {{arXiv}:2203.06486},
	publisher = {{arXiv}},
	author = {Kantharaj, Shankar and Leong, Rixie Tiffany Ko and Lin, Xiang and Masry, Ahmed and Thakkar, Megh and Hoque, Enamul and Joty, Shafiq},
	urldate = {2025-08-28},
	date = {2022-04-14},
	eprinttype = {arxiv},
	eprint = {2203.06486 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/D88D7JTV/Kantharaj et al. - 2022 - Chart-to-Text A Large-Scale Benchmark for Chart Summarization.pdf:application/pdf},
	}

	@misc{masry_chartqa_2022,
	title = {{ChartQA}: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning},
	url = {http://arxiv.org/abs/2203.10244},
	doi = {10.48550/arXiv.2203.10244},
	shorttitle = {{ChartQA}},
	abstract = {Charts are very popular for analyzing data. When exploring charts, people often ask a variety of complex reasoning questions that involve several logical and arithmetic operations. They also commonly refer to visual features of a chart in their questions. However, most existing datasets do not focus on such complex reasoning questions as their questions are template-based and answers come from a fixed-vocabulary. In this work, we present a large-scale benchmark covering 9.6K human-written questions as well as 23.1K questions generated from human-written chart summaries. To address the unique challenges in our benchmark involving visual and logical reasoning over charts, we present two transformer-based models that combine visual features and the data table of the chart in a unified way to answer questions. While our models achieve the state-of-the-art results on the previous datasets as well as on our benchmark, the evaluation also reveals several challenges in answering complex reasoning questions.},
	number = {{arXiv}:2203.10244},
	publisher = {{arXiv}},
	author = {Masry, Ahmed and Long, Do Xuan and Tan, Jia Qing and Joty, Shafiq and Hoque, Enamul},
	urldate = {2025-08-28},
	date = {2022-03-19},
	eprinttype = {arxiv},
	eprint = {2203.10244 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/92LBNRMX/Masry et al. - 2022 - ChartQA A Benchmark for Question Answering about Charts with Visual and Logical Reasoning.pdf:application/pdf},
	}

	@misc{yang_scaling_2025,
	title = {Scaling Text-Rich Image Understanding via Code-Guided Synthetic Multimodal Data Generation},
	url = {http://arxiv.org/abs/2502.14846},
	doi = {10.48550/arXiv.2502.14846},
	abstract = {Reasoning about images with rich text, such as charts and documents, is a critical application of vision-language models ({VLMs}). However, {VLMs} often struggle in these domains due to the scarcity of diverse text-rich vision-language data. To address this challenge, we present {CoSyn}, a framework that leverages the coding capabilities of text-only large language models ({LLMs}) to automatically create synthetic text-rich multimodal data. Given input text describing a target domain (e.g., "nutrition fact labels"), {CoSyn} prompts an {LLM} to generate code (Python, {HTML}, {LaTeX}, etc.) for rendering synthetic images. With the underlying code as textual representations of the synthetic images, {CoSyn} can generate high-quality instruction-tuning data, again relying on a text-only {LLM}. Using {CoSyn}, we constructed a dataset comprising 400K images and 2.7M rows of vision-language instruction-tuning data. Comprehensive experiments on seven benchmarks demonstrate that models trained on our synthetic data achieve state-of-the-art performance among competitive open-source models, including Llama 3.2, and surpass proprietary models such as {GPT}-4V and Gemini 1.5 Flash. Furthermore, {CoSyn} can produce synthetic pointing data, enabling {VLMs} to ground information within input images, showcasing its potential for developing multimodal agents capable of acting in real-world environments.},
	number = {{arXiv}:2502.14846},
	publisher = {{arXiv}},
	author = {Yang, Yue and Patel, Ajay and Deitke, Matt and Gupta, Tanmay and Weihs, Luca and Head, Andrew and Yatskar, Mark and Callison-Burch, Chris and Krishna, Ranjay and Kembhavi, Aniruddha and Clark, Christopher},
	urldate = {2025-08-28},
	date = {2025-05-21},
	eprinttype = {arxiv},
	eprint = {2502.14846 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/MJRL59N6/Yang et al. - 2025 - Scaling Text-Rich Image Understanding via Code-Guided Synthetic Multimodal Data Generation.pdf:application/pdf},
	}

	@inproceedings{kafle_dvqa_2018,
	title = {Dvqa: Understanding data visualizations via question answering},
	url = {http://openaccess.thecvf.com/content_cvpr_2018/html/Kafle_DVQA_Understanding_Data_CVPR_2018_paper.html},
	shorttitle = {Dvqa},
	pages = {5648--5656},
	booktitle = {Proceedings of the {IEEE} conference on computer vision and pattern recognition},
	author = {Kafle, Kushal and Price, Brian and Cohen, Scott and Kanan, Christopher},
	urldate = {2025-08-28},
	date = {2018},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/R7UNUPCF/Kafle et al. - 2018 - Dvqa Understanding data visualizations via question answering.pdf:application/pdf},
	}

	@misc{kahou_figureqa_2018,
	title = {{FigureQA}: An Annotated Figure Dataset for Visual Reasoning},
	url = {http://arxiv.org/abs/1710.07300},
	doi = {10.48550/arXiv.1710.07300},
	shorttitle = {{FigureQA}},
	abstract = {We introduce {FigureQA}, a visual reasoning corpus of over one million question-answer pairs grounded in over 100,000 images. The images are synthetic, scientific-style figures from five classes: line plots, dot-line plots, vertical and horizontal bar graphs, and pie charts. We formulate our reasoning task by generating questions from 15 templates; questions concern various relationships between plot elements and examine characteristics like the maximum, the minimum, area-under-the-curve, smoothness, and intersection. To resolve, such questions often require reference to multiple plot elements and synthesis of information distributed spatially throughout a figure. To facilitate the training of machine learning systems, the corpus also includes side data that can be used to formulate auxiliary objectives. In particular, we provide the numerical data used to generate each figure as well as bounding-box annotations for all plot elements. We study the proposed visual reasoning task by training several models, including the recently proposed Relation Network as a strong baseline. Preliminary results indicate that the task poses a significant machine learning challenge. We envision {FigureQA} as a first step towards developing models that can intuitively recognize patterns from visual representations of data.},
	number = {{arXiv}:1710.07300},
	publisher = {{arXiv}},
	author = {Kahou, Samira Ebrahimi and Michalski, Vincent and Atkinson, Adam and Kadar, Akos and Trischler, Adam and Bengio, Yoshua},
	urldate = {2025-08-28},
	date = {2018-02-22},
	eprinttype = {arxiv},
	eprint = {1710.07300 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/PY8DQBUQ/Kahou et al. - 2018 - FigureQA An Annotated Figure Dataset for Visual Reasoning.pdf:application/pdf},
	}

	@misc{chen_finqa_2022,
	title = {{FinQA}: A Dataset of Numerical Reasoning over Financial Data},
	url = {http://arxiv.org/abs/2109.00122},
	doi = {10.48550/arXiv.2109.00122},
	shorttitle = {{FinQA}},
	abstract = {The sheer volume of financial statements makes it difficult for humans to access and analyze a business's financials. Robust numerical reasoning likewise faces unique challenges in this domain. In this work, we focus on answering deep questions over financial data, aiming to automate the analysis of a large corpus of financial documents. In contrast to existing tasks on general domain, the finance domain includes complex numerical reasoning and understanding of heterogeneous representations. To facilitate analytical progress, we propose a new large-scale dataset, {FinQA}, with Question-Answering pairs over Financial reports, written by financial experts. We also annotate the gold reasoning programs to ensure full explainability. We further introduce baselines and conduct comprehensive experiments in our dataset. The results demonstrate that popular, large, pre-trained models fall far short of expert humans in acquiring finance knowledge and in complex multi-step numerical reasoning on that knowledge. Our dataset -- the first of its kind -- should therefore enable significant, new community research into complex application domains. The dataset and code are publicly available{\textbackslash}url\{https://github.com/czyssrs/{FinQA}\}.},
	number = {{arXiv}:2109.00122},
	publisher = {{arXiv}},
	author = {Chen, Zhiyu and Chen, Wenhu and Smiley, Charese and Shah, Sameena and Borova, Iana and Langdon, Dylan and Moussa, Reema and Beane, Matt and Huang, Ting-Hao and Routledge, Bryan and Wang, William Yang},
	urldate = {2025-08-28},
	date = {2022-05-07},
	eprinttype = {arxiv},
	eprint = {2109.00122 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/YXNA2ZVU/Chen et al. - 2022 - FinQA A Dataset of Numerical Reasoning over Financial Data.pdf:application/pdf},
	}

	@misc{cheng_hitab_2022,
	title = {{HiTab}: A Hierarchical Table Dataset for Question Answering and Natural Language Generation},
	url = {http://arxiv.org/abs/2108.06712},
	doi = {10.48550/arXiv.2108.06712},
	shorttitle = {{HiTab}},
	abstract = {Tables are often created with hierarchies, but existing works on table reasoning mainly focus on flat tables and neglect hierarchical tables. Hierarchical tables challenge existing methods by hierarchical indexing, as well as implicit relationships of calculation and semantics. This work presents {HiTab}, a free and open dataset to study question answering ({QA}) and natural language generation ({NLG}) over hierarchical tables. {HiTab} is a cross-domain dataset constructed from a wealth of statistical reports (analyses) and Wikipedia pages, and has unique characteristics: (1) nearly all tables are hierarchical, and (2) both target sentences for {NLG} and questions for {QA} are revised from original, meaningful, and diverse descriptive sentences authored by analysts and professions of reports. (3) to reveal complex numerical reasoning in statistical analyses, we provide fine-grained annotations of entity and quantity alignment. {HiTab} provides 10,686 {QA} pairs and descriptive sentences with well-annotated quantity and entity alignment on 3,597 tables with broad coverage of table hierarchies and numerical reasoning types. Targeting hierarchical structure, we devise a novel hierarchy-aware logical form for symbolic reasoning over tables, which shows high effectiveness. Targeting complex numerical reasoning, we propose partially supervised training given annotations of entity and quantity alignment, which helps models to largely reduce spurious predictions in the {QA} task. In the {NLG} task, we find that entity and quantity alignment also helps {NLG} models to generate better results in a conditional generation setting. Experiment results of state-of-the-art baselines suggest that this dataset presents a strong challenge and a valuable benchmark for future research.},
	number = {{arXiv}:2108.06712},
	publisher = {{arXiv}},
	author = {Cheng, Zhoujun and Dong, Haoyu and Wang, Zhiruo and Jia, Ran and Guo, Jiaqi and Gao, Yan and Han, Shi and Lou, Jian-Guang and Zhang, Dongmei},
	urldate = {2025-08-28},
	date = {2022-03-26},
	eprinttype = {arxiv},
	eprint = {2108.06712 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Information Retrieval},
	file = {Preprint PDF:/Users/luis/Zotero/storage/UI5LLWNJ/Cheng et al. - 2022 - HiTab A Hierarchical Table Dataset for Question Answering and Natural Language Generation.pdf:application/pdf},
	}

	@misc{liu_mmc_2024,
	title = {{MMC}: Advancing Multimodal Chart Understanding with Large-scale Instruction Tuning},
	url = {http://arxiv.org/abs/2311.10774},
	doi = {10.48550/arXiv.2311.10774},
	shorttitle = {{MMC}},
	abstract = {With the rapid development of large language models ({LLMs}) and their integration into large multimodal models ({LMMs}), there has been impressive progress in zero-shot completion of user-oriented vision-language tasks. However, a gap remains in the domain of chart image understanding due to the distinct abstract components in charts. To address this, we introduce a large-scale {MultiModal} Chart Instruction ({\textbackslash}textbf\{{MMC}-Instruction\}) dataset comprising 600k instances supporting diverse tasks and chart types. Leveraging this data, we develop {MultiModal} Chart Assistant ({\textbackslash}textbf\{{MMCA}\}), an {LMM} that achieves state-of-the-art performance on existing chart {QA} benchmarks. Recognizing the need for a comprehensive evaluation of {LMM} chart understanding, we also propose a {MultiModal} Chart Benchmark ({\textbackslash}textbf\{{MMC}-Benchmark\}), a comprehensive human-annotated benchmark with nine distinct tasks evaluating reasoning capabilities over charts. Extensive experiments on {MMC}-Benchmark reveal the limitations of existing {LMMs} on correctly interpreting charts, even for the most recent {GPT}-4V model. Our work provides an instruction-tuning methodology and benchmark to advance multimodal understanding of charts. Code and data are available at https://github.com/{FuxiaoLiu}/{MMC}.},
	number = {{arXiv}:2311.10774},
	publisher = {{arXiv}},
	author = {Liu, Fuxiao and Wang, Xiaoyang and Yao, Wenlin and Chen, Jianshu and Song, Kaiqiang and Cho, Sangwoo and Yacoob, Yaser and Yu, Dong},
	urldate = {2025-08-28},
	date = {2024-04-15},
	eprinttype = {arxiv},
	eprint = {2311.10774 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/Z5S2DC94/Liu et al. - 2024 - MMC Advancing Multimodal Chart Understanding with Large-scale Instruction Tuning.pdf:application/pdf},
	}

	@misc{zhao_multihiertt_2022,
	title = {{MultiHiertt}: Numerical Reasoning over Multi Hierarchical Tabular and Textual Data},
	url = {http://arxiv.org/abs/2206.01347},
	doi = {10.48550/arXiv.2206.01347},
	shorttitle = {{MultiHiertt}},
	abstract = {Numerical reasoning over hybrid data containing both textual and tabular content (e.g., financial reports) has recently attracted much attention in the {NLP} community. However, existing question answering ({QA}) benchmarks over hybrid data only include a single flat table in each document and thus lack examples of multi-step numerical reasoning across multiple hierarchical tables. To facilitate data analytical progress, we construct a new large-scale benchmark, {MultiHiertt}, with {QA} pairs over Multi Hierarchical Tabular and Textual data. {MultiHiertt} is built from a wealth of financial reports and has the following unique characteristics: 1) each document contain multiple tables and longer unstructured texts; 2) most of tables contained are hierarchical; 3) the reasoning process required for each question is more complex and challenging than existing benchmarks; and 4) fine-grained annotations of reasoning processes and supporting facts are provided to reveal complex numerical reasoning. We further introduce a novel {QA} model termed {MT}2Net, which first applies facts retrieving to extract relevant supporting facts from both tables and text and then uses a reasoning module to perform symbolic reasoning over retrieved facts. We conduct comprehensive experiments on various baselines. The experimental results show that {MultiHiertt} presents a strong challenge for existing baselines whose results lag far behind the performance of human experts. The dataset and code are publicly available at https://github.com/psunlpgroup/{MultiHiertt}.},
	number = {{arXiv}:2206.01347},
	publisher = {{arXiv}},
	author = {Zhao, Yilun and Li, Yunxiang and Li, Chenying and Zhang, Rui},
	urldate = {2025-08-28},
	date = {2022-06-03},
	eprinttype = {arxiv},
	eprint = {2206.01347 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/EG34XBNU/Zhao et al. - 2022 - MultiHiertt Numerical Reasoning over Multi Hierarchical Tabular and Textual Data.pdf:application/pdf},
	}

	@inproceedings{methani_plotqa_2020,
	title = {Plotqa: Reasoning over scientific plots},
	url = {http://openaccess.thecvf.com/content_WACV_2020/html/Methani_PlotQA_Reasoning_over_Scientific_Plots_WACV_2020_paper.html},
	shorttitle = {Plotqa},
	pages = {1527--1536},
	booktitle = {Proceedings of the ieee/cvf winter conference on applications of computer vision},
	author = {Methani, Nitesh and Ganguly, Pritha and Khapra, Mitesh M. and Kumar, Pratyush},
	urldate = {2025-08-28},
	date = {2020},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/32HZVCMK/Methani et al. - 2020 - Plotqa Reasoning over scientific plots.pdf:application/pdf},
	}

	@misc{zhao_robut_2023,
	title = {{RobuT}: A Systematic Study of Table {QA} Robustness Against Human-Annotated Adversarial Perturbations},
	url = {http://arxiv.org/abs/2306.14321},
	doi = {10.48550/arXiv.2306.14321},
	shorttitle = {{RobuT}},
	abstract = {Despite significant progress having been made in question answering on tabular data (Table {QA}), it's unclear whether, and to what extent existing Table {QA} models are robust to task-specific perturbations, e.g., replacing key question entities or shuffling table columns. To systematically study the robustness of Table {QA} models, we propose a benchmark called {RobuT}, which builds upon existing Table {QA} datasets ({WTQ}, {WikiSQL}-Weak, and {SQA}) and includes human-annotated adversarial perturbations in terms of table header, table content, and question. Our results indicate that both state-of-the-art Table {QA} models and large language models (e.g., {GPT}-3) with few-shot learning falter in these adversarial sets. We propose to address this problem by using large language models to generate adversarial examples to enhance training, which significantly improves the robustness of Table {QA} models. Our data and code is publicly available at https://github.com/yilunzhao/{RobuT}.},
	number = {{arXiv}:2306.14321},
	publisher = {{arXiv}},
	author = {Zhao, Yilun and Zhao, Chen and Nan, Linyong and Qi, Zhenting and Zhang, Wenlin and Tang, Xiangru and Mi, Boyu and Radev, Dragomir},
	urldate = {2025-08-28},
	date = {2023-06-25},
	eprinttype = {arxiv},
	eprint = {2306.14321 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/ND8GUKM7/Zhao et al. - 2023 - RobuT A Systematic Study of Table QA Robustness Against Human-Annotated Adversarial Perturbations.pdf:application/pdf},
	}

	@misc{lu_dynamic_2023,
	title = {Dynamic Prompt Learning via Policy Gradient for Semi-structured Mathematical Reasoning},
	url = {http://arxiv.org/abs/2209.14610},
	doi = {10.48550/arXiv.2209.14610},
	abstract = {Mathematical reasoning, a core ability of human intelligence, presents unique challenges for machines in abstract thinking and logical reasoning. Recent large pre-trained language models such as {GPT}-3 have achieved remarkable progress on mathematical reasoning tasks written in text form, such as math word problems ({MWP}). However, it is unknown if the models can handle more complex problems that involve math reasoning over heterogeneous information, such as tabular data. To fill the gap, we present Tabular Math Word Problems ({TabMWP}), a new dataset containing 38,431 open-domain grade-level problems that require mathematical reasoning on both textual and tabular data. Each question in {TabMWP} is aligned with a tabular context, which is presented as an image, semi-structured text, and a structured table. There are two types of questions: free-text and multi-choice, and each problem is annotated with gold solutions to reveal the multi-step reasoning process. We evaluate different pre-trained models on {TabMWP}, including the {GPT}-3 model in a few-shot setting. As earlier studies suggest, since few-shot {GPT}-3 relies on the selection of in-context examples, its performance is unstable and can degrade to near chance. The unstable issue is more severe when handling complex problems like {TabMWP}. To mitigate this, we further propose a novel approach, {PromptPG}, which utilizes policy gradient to learn to select in-context examples from a small amount of training data and then constructs the corresponding prompt for the test example. Experimental results show that our method outperforms the best baseline by 5.31\% on the accuracy metric and reduces the prediction variance significantly compared to random selection, which verifies its effectiveness in selecting in-context examples.},
	number = {{arXiv}:2209.14610},
	publisher = {{arXiv}},
	author = {Lu, Pan and Qiu, Liang and Chang, Kai-Wei and Wu, Ying Nian and Zhu, Song-Chun and Rajpurohit, Tanmay and Clark, Peter and Kalyan, Ashwin},
	urldate = {2025-08-28},
	date = {2023-03-02},
	eprinttype = {arxiv},
	eprint = {2209.14610 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/GIVRNCEK/Lu et al. - 2023 - Dynamic Prompt Learning via Policy Gradient for Semi-structured Mathematical Reasoning.pdf:application/pdf},
	}

	@inproceedings{zhu_towards_2022,
	location = {Lisboa Portugal},
	title = {Towards Complex Document Understanding By Discrete Reasoning},
	isbn = {978-1-4503-9203-7},
	url = {https://dl.acm.org/doi/10.1145/3503161.3548422},
	doi = {10.1145/3503161.3548422},
	eventtitle = {{MM} '22: The 30th {ACM} International Conference on Multimedia},
	pages = {4857--4866},
	booktitle = {Proceedings of the 30th {ACM} International Conference on Multimedia},
	publisher = {{ACM}},
	author = {Zhu, Fengbin and Lei, Wenqiang and Feng, Fuli and Wang, Chao and Zhang, Haozhou and Chua, Tat-Seng},
	urldate = {2025-08-28},
	date = {2022-10-10},
	langid = {english},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/D4QC7KKI/Zhu et al. - 2022 - Towards Complex Document Understanding By Discrete Reasoning.pdf:application/pdf},
	}

	@misc{zhu_tat-qa_2021,
	title = {{TAT}-{QA}: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance},
	url = {http://arxiv.org/abs/2105.07624},
	doi = {10.48550/arXiv.2105.07624},
	shorttitle = {{TAT}-{QA}},
	abstract = {Hybrid data combining both tabular and textual content (e.g., financial reports) are quite pervasive in the real world. However, Question Answering ({QA}) over such hybrid data is largely neglected in existing research. In this work, we extract samples from real financial reports to build a new large-scale {QA} dataset containing both Tabular And Textual data, named {TAT}-{QA}, where numerical reasoning is usually required to infer the answer, such as addition, subtraction, multiplication, division, counting, comparison/sorting, and the compositions. We further propose a novel {QA} model termed {TAGOP}, which is capable of reasoning over both tables and text. It adopts sequence tagging to extract relevant cells from the table along with relevant spans from the text to infer their semantics, and then applies symbolic reasoning over them with a set of aggregation operators to arrive at the final answer. {TAGOPachieves} 58.0\% {inF}1, which is an 11.1\% absolute increase over the previous best baseline model, according to our experiments on {TAT}-{QA}. But this result still lags far behind performance of expert human, i.e.90.8\% in F1. It is demonstrated that our {TAT}-{QA} is very challenging and can serve as a benchmark for training and testing powerful {QA} models that address hybrid form data.},
	number = {{arXiv}:2105.07624},
	publisher = {{arXiv}},
	author = {Zhu, Fengbin and Lei, Wenqiang and Huang, Youcheng and Wang, Chao and Zhang, Shuo and Lv, Jiancheng and Feng, Fuli and Chua, Tat-Seng},
	urldate = {2025-08-28},
	date = {2021-06-01},
	eprinttype = {arxiv},
	eprint = {2105.07624 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/A4HU4FP8/Zhu et al. - 2021 - TAT-QA A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance.pdf:application/pdf},
	}

	@misc{masry_unichart_2023,
	title = {{UniChart}: A Universal Vision-language Pretrained Model for Chart Comprehension and Reasoning},
	url = {http://arxiv.org/abs/2305.14761},
	doi = {10.48550/arXiv.2305.14761},
	shorttitle = {{UniChart}},
	abstract = {Charts are very popular for analyzing data, visualizing key insights and answering complex reasoning questions about data. To facilitate chart-based data analysis using natural language, several downstream tasks have been introduced recently such as chart question answering and chart summarization. However, most of the methods that solve these tasks use pretraining on language or vision-language tasks that do not attempt to explicitly model the structure of the charts (e.g., how data is visually encoded and how chart elements are related to each other). To address this, we first build a large corpus of charts covering a wide variety of topics and visual styles. We then present {UniChart}, a pretrained model for chart comprehension and reasoning. {UniChart} encodes the relevant text, data, and visual elements of charts and then uses a chart-grounded text decoder to generate the expected output in natural language. We propose several chart-specific pretraining tasks that include: (i) low-level tasks to extract the visual elements (e.g., bars, lines) and data from charts, and (ii) high-level tasks to acquire chart understanding and reasoning skills. We find that pretraining the model on a large corpus with chart-specific low- and high-level tasks followed by finetuning on three down-streaming tasks results in state-of-the-art performance on three downstream tasks.},
	number = {{arXiv}:2305.14761},
	publisher = {{arXiv}},
	author = {Masry, Ahmed and Kavehzadeh, Parsa and Do, Xuan Long and Hoque, Enamul and Joty, Shafiq},
	urldate = {2025-08-28},
	date = {2023-10-10},
	eprinttype = {arxiv},
	eprint = {2305.14761 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/PTZP5IKI/Masry et al. - 2023 - UniChart A Universal Vision-language Pretrained Model for Chart Comprehension and Reasoning.pdf:application/pdf},
	}

	@misc{tang_vistext_2023,
	title = {{VisText}: A Benchmark for Semantically Rich Chart Captioning},
	url = {http://arxiv.org/abs/2307.05356},
	doi = {10.48550/arXiv.2307.05356},
	shorttitle = {{VisText}},
	abstract = {Captions that describe or explain charts help improve recall and comprehension of the depicted data and provide a more accessible medium for people with visual disabilities. However, current approaches for automatically generating such captions struggle to articulate the perceptual or cognitive features that are the hallmark of charts (e.g., complex trends and patterns). In response, we introduce {VisText}: a dataset of 12,441 pairs of charts and captions that describe the charts' construction, report key statistics, and identify perceptual and cognitive phenomena. In {VisText}, a chart is available as three representations: a rasterized image, a backing data table, and a scene graph -- a hierarchical representation of a chart's visual elements akin to a web page's Document Object Model ({DOM}). To evaluate the impact of {VisText}, we fine-tune state-of-the-art language models on our chart captioning task and apply prefix-tuning to produce captions that vary the semantic content they convey. Our models generate coherent, semantically rich captions and perform on par with state-of-the-art chart captioning models across machine translation and text generation metrics. Through qualitative analysis, we identify six broad categories of errors that our models make that can inform future work.},
	number = {{arXiv}:2307.05356},
	publisher = {{arXiv}},
	author = {Tang, Benny J. and Boggust, Angie and Satyanarayan, Arvind},
	urldate = {2025-08-28},
	date = {2023-06-28},
	eprinttype = {arxiv},
	eprint = {2307.05356 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Human-Computer Interaction, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/DH48LX4U/Tang et al. - 2023 - VisText A Benchmark for Semantically Rich Chart Captioning.pdf:application/pdf},
	}

	@misc{chen_allava_2024,
	title = {{ALLaVA}: Harnessing {GPT}4V-Synthesized Data for Lite Vision-Language Models},
	url = {http://arxiv.org/abs/2402.11684},
	doi = {10.48550/arXiv.2402.11684},
	shorttitle = {{ALLaVA}},
	abstract = {Large vision-language models ({LVLMs}) have shown premise in a broad range of vision-language tasks with their strong reasoning and generalization capabilities. However, they require considerable computational resources for training and deployment. This study aims to bridge the performance gap between traditional-scale {LVLMs} and resource-friendly lite versions by adopting high-quality training data. To this end, we propose a comprehensive pipeline for generating a synthetic dataset. The key idea is to leverage strong proprietary models to generate (i) fine-grained image annotations for vision-language alignment and (ii) complex reasoning visual question-answering pairs for visual instruction fine-tuning, yielding 1.3M samples in total. We train a series of lite {VLMs} on the synthetic dataset and experimental results demonstrate the effectiveness of the proposed scheme, where they achieve competitive performance on 17 benchmarks among 4B {LVLMs}, and even perform on par with 7B/13B-scale models on various benchmarks. This work highlights the feasibility of adopting high-quality data in crafting more efficient {LVLMs}. We name our dataset {\textbackslash}textit\{{ALLaVA}\}, and open-source it to research community for developing better resource-efficient {LVLMs} for wider usage.},
	number = {{arXiv}:2402.11684},
	publisher = {{arXiv}},
	author = {Chen, Guiming Hardy and Chen, Shunian and Zhang, Ruifei and Chen, Junying and Wu, Xiangbo and Zhang, Zhiyi and Chen, Zhihong and Li, Jianquan and Wan, Xiang and Wang, Benyou},
	urldate = {2025-08-28},
	date = {2024-06-17},
	eprinttype = {arxiv},
	eprint = {2402.11684 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/T4H5UI72/Chen et al. - 2024 - ALLaVA Harnessing GPT4V-Synthesized Data for Lite Vision-Language Models.pdf:application/pdf},
	}

	@article{tong_cambrian-1_2024,
	title = {Cambrian-1: A fully open, vision-centric exploration of multimodal llms},
	volume = {37},
	url = {https://proceedings.neurips.cc/paper_files/paper/2024/hash/9ee3a664ccfeabc0da16ac6f1f1cfe59-Abstract-Conference.html},
	shorttitle = {Cambrian-1},
	pages = {87310--87356},
	journaltitle = {Advances in Neural Information Processing Systems},
	author = {Tong, Peter and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and {IYER}, Adithya Jairam Vedagiri and Akula, Sai Charitha and Yang, Shusheng and Yang, Jihan and Middepogu, Manoj and Wang, Ziteng},
	urldate = {2025-08-28},
	date = {2024},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/24INNH7R/Tong et al. - 2024 - Cambrian-1 A fully open, vision-centric exploration of multimodal llms.pdf:application/pdf},
	}

	@article{ren_exploring_2015,
	title = {Exploring models and data for image question answering},
	volume = {28},
	url = {https://proceedings.neurips.cc/paper/2015/hash/831c2f88a604a07ca94314b56a4921b8-Abstract.html},
	journaltitle = {Advances in neural information processing systems},
	author = {Ren, Mengye and Kiros, Ryan and Zemel, Richard},
	urldate = {2025-08-28},
	date = {2015},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/D6V7YF5L/Ren et al. - 2015 - Exploring models and data for image question answering.pdf:application/pdf},
	}

	@misc{belouadi_automatikz_2024,
	title = {{AutomaTikZ}: Text-Guided Synthesis of Scientific Vector Graphics with {TikZ}},
	url = {http://arxiv.org/abs/2310.00367},
	doi = {10.48550/arXiv.2310.00367},
	shorttitle = {{AutomaTikZ}},
	abstract = {Generating bitmap graphics from text has gained considerable attention, yet for scientific figures, vector graphics are often preferred. Given that vector graphics are typically encoded using low-level graphics primitives, generating them directly is difficult. To address this, we propose the use of {TikZ}, a well-known abstract graphics language that can be compiled to vector graphics, as an intermediate representation of scientific figures. {TikZ} offers human-oriented, high-level commands, thereby facilitating conditional language modeling with any large language model. To this end, we introduce {DaTikZ}, the first large-scale {TikZ} dataset consisting of 120k {TikZ} drawings aligned with captions. We fine-tune {LLaMA} on {DaTikZ}, as well as our new model {CLiMA}, which augments {LLaMA} with multimodal {CLIP} embeddings. In both human and automatic evaluation, {CLiMA} and {LLaMA} outperform commercial {GPT}-4 and Claude 2 in terms of similarity to human-created figures, with {CLiMA} additionally improving text-image alignment. Our detailed analysis shows that all models generalize well and are not susceptible to memorization. {GPT}-4 and Claude 2, however, tend to generate more simplistic figures compared to both humans and our models. We make our framework, {AutomaTikZ}, along with model weights and datasets, publicly available.},
	number = {{arXiv}:2310.00367},
	publisher = {{arXiv}},
	author = {Belouadi, Jonas and Lauscher, Anne and Eger, Steffen},
	urldate = {2025-08-28},
	date = {2024-01-23},
	eprinttype = {arxiv},
	eprint = {2310.00367 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/TG4FFN68/Belouadi et al. - 2024 - AutomaTikZ Text-Guided Synthesis of Scientific Vector Graphics with TikZ.pdf:application/pdf},
	}

	@incollection{leonardis_drivelm_2025,
	location = {Cham},
	title = {{DriveLM}: Driving with Graph Visual Question Answering},
	volume = {15110},
	isbn = {978-3-031-72942-3 978-3-031-72943-0},
	url = {https://link.springer.com/10.1007/978-3-031-72943-0_15},
	shorttitle = {{DriveLM}},
	pages = {256--274},
	booktitle = {Computer Vision – {ECCV} 2024},
	publisher = {Springer Nature Switzerland},
	author = {Sima, Chonghao and Renz, Katrin and Chitta, Kashyap and Chen, Li and Zhang, Hanxue and Xie, Chengen and Beißwenger, Jens and Luo, Ping and Geiger, Andreas and Li, Hongyang},
	editor = {Leonardis, Aleš and Ricci, Elisa and Roth, Stefan and Russakovsky, Olga and Sattler, Torsten and Varol, Gül},
	urldate = {2025-08-28},
	date = {2025},
	langid = {english},
	doi = {10.1007/978-3-031-72943-0_15},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/9WIWKVMA/Sima et al. - 2025 - DriveLM Driving with Graph Visual Question Answering.pdf:application/pdf},
	}

	@article{kiela_hateful_2020,
	title = {The hateful memes challenge: Detecting hate speech in multimodal memes},
	volume = {33},
	url = {https://proceedings.neurips.cc/paper/2020/hash/1b84c4cee2b8b3d823b30e2d604b1878-Abstract.html},
	shorttitle = {The hateful memes challenge},
	pages = {2611--2624},
	journaltitle = {Advances in neural information processing systems},
	author = {Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh, Amanpreet and Ringshia, Pratik and Testuggine, Davide},
	urldate = {2025-08-28},
	date = {2020},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/I2TQNYI6/Kiela et al. - 2020 - The hateful memes challenge Detecting hate speech in multimodal memes.pdf:application/pdf},
	}

	@misc{lu_iconqa_2022,
	title = {{IconQA}: A New Benchmark for Abstract Diagram Understanding and Visual Language Reasoning},
	url = {http://arxiv.org/abs/2110.13214},
	doi = {10.48550/arXiv.2110.13214},
	shorttitle = {{IconQA}},
	abstract = {Current visual question answering ({VQA}) tasks mainly consider answering human-annotated questions for natural images. However, aside from natural images, abstract diagrams with semantic richness are still understudied in visual understanding and reasoning research. In this work, we introduce a new challenge of Icon Question Answering ({IconQA}) with the goal of answering a question in an icon image context. We release {IconQA}, a large-scale dataset that consists of 107,439 questions and three sub-tasks: multi-image-choice, multi-text-choice, and filling-in-the-blank. The {IconQA} dataset is inspired by real-world diagram word problems that highlight the importance of abstract diagram understanding and comprehensive cognitive reasoning. Thus, {IconQA} requires not only perception skills like object recognition and text understanding, but also diverse cognitive reasoning skills, such as geometric reasoning, commonsense reasoning, and arithmetic reasoning. To facilitate potential {IconQA} models to learn semantic representations for icon images, we further release an icon dataset Icon645 which contains 645,687 colored icons on 377 classes. We conduct extensive user studies and blind experiments and reproduce a wide range of advanced {VQA} methods to benchmark the {IconQA} task. Also, we develop a strong {IconQA} baseline Patch-{TRM} that applies a pyramid cross-modal Transformer with input diagram embeddings pre-trained on the icon dataset. {IconQA} and Icon645 are available at https://iconqa.github.io.},
	number = {{arXiv}:2110.13214},
	publisher = {{arXiv}},
	author = {Lu, Pan and Qiu, Liang and Chen, Jiaqi and Xia, Tony and Zhao, Yizhou and Zhang, Wei and Yu, Zhou and Liang, Xiaodan and Zhu, Song-Chun},
	urldate = {2025-08-28},
	date = {2022-07-25},
	eprinttype = {arxiv},
	eprint = {2110.13214 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/3MAQSSLG/Lu et al. - 2022 - IconQA A New Benchmark for Abstract Diagram Understanding and Visual Language Reasoning.pdf:application/pdf},
	}

	@misc{cha_visually_2024,
	title = {Visually Dehallucinative Instruction Generation: Know What You Don't Know},
	url = {http://arxiv.org/abs/2402.09717},
	doi = {10.48550/arXiv.2402.09717},
	shorttitle = {Visually Dehallucinative Instruction Generation},
	abstract = {"When did the emperor Napoleon invented {iPhone}?" Such hallucination-inducing question is well known challenge in generative language modeling. In this study, we present an innovative concept of visual hallucination, referred to as "I Know ({IK})" hallucination, to address scenarios where "I Don't Know" is the desired response. To effectively tackle this issue, we propose the {VQAv}2-{IDK} benchmark, the subset of {VQAv}2 comprising unanswerable image-question pairs as determined by human annotators. Stepping further, we present the visually dehallucinative instruction generation method for {IK} hallucination and introduce the {IDK}-Instructions visual instruction database. Our experiments show that current methods struggle with {IK} hallucination. Yet, our approach effectively reduces these hallucinations, proving its versatility across different frameworks and datasets.},
	number = {{arXiv}:2402.09717},
	publisher = {{arXiv}},
	author = {Cha, Sungguk and Lee, Jusung and Lee, Younghyun and Yang, Cheoljong},
	urldate = {2025-08-28},
	date = {2024-02-15},
	eprinttype = {arxiv},
	eprint = {2402.09717 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/V7DCFQ6G/Cha et al. - 2024 - Visually Dehallucinative Instruction Generation Know What You Don't Know.pdf:application/pdf},
	}

	@article{liu_visual_2023,
	title = {Visual instruction tuning},
	volume = {36},
	url = {https://proceedings.neurips.cc/paper_files/paper/2023/hash/6dcf277ea32ce3288914faf369fe6de0-Abstract-Conference.html},
	pages = {34892--34916},
	journaltitle = {Advances in neural information processing systems},
	author = {Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
	urldate = {2025-08-28},
	date = {2023},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/RHUATQZF/Liu et al. - 2023 - Visual instruction tuning.pdf:application/pdf},
	}

	@misc{zhang_llavar_2024,
	title = {{LLaVAR}: Enhanced Visual Instruction Tuning for Text-Rich Image Understanding},
	url = {http://arxiv.org/abs/2306.17107},
	doi = {10.48550/arXiv.2306.17107},
	shorttitle = {{LLaVAR}},
	abstract = {Instruction tuning unlocks the superior capability of Large Language Models ({LLM}) to interact with humans. Furthermore, recent instruction-following datasets include images as visual inputs, collecting responses for image-based instructions. However, visual instruction-tuned models cannot comprehend textual details within images well. This work enhances the current visual instruction tuning pipeline with text-rich images (e.g., movie posters, book covers, etc.). Specifically, we first use publicly available {OCR} tools to collect results on 422K text-rich images from the {LAION} dataset. Moreover, we prompt text-only {GPT}-4 with recognized texts and image captions to generate 16K conversations, each containing question-answer pairs for text-rich images. By combining our collected data with previous multi-modal instruction-following data, our model, {LLaVAR}, substantially improves the {LLaVA} model's capability on text-based {VQA} datasets (up to 20\% accuracy improvement) while achieving an accuracy of 91.42\% on {ScienceQA}. The {GPT}-4-based instruction-following evaluation also demonstrates the improvement of our model on both natural images and text-rich images. Through qualitative analysis, {LLaVAR} shows promising interaction (e.g., reasoning, writing, and elaboration) skills with humans based on the latest real-world online content that combines text and images. We make our code/data/models publicly available at https://llavar.github.io/.},
	number = {{arXiv}:2306.17107},
	publisher = {{arXiv}},
	author = {Zhang, Yanzhe and Zhang, Ruiyi and Gu, Jiuxiang and Zhou, Yufan and Lipka, Nedim and Yang, Diyi and Sun, Tong},
	urldate = {2025-08-28},
	date = {2024-02-02},
	eprinttype = {arxiv},
	eprint = {2306.17107 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/G6DY2DKN/Zhang et al. - 2024 - LLaVAR Enhanced Visual Instruction Tuning for Text-Rich Image Understanding.pdf:application/pdf},
	}

	@online{noauthor_viks_nodate,
	title = {Vik's {ML} Research Blog},
	url = {https://vikhyat.net/posts/2024-08-17-lnqa.html},
	urldate = {2025-08-28},
	file = {Vik's ML Research Blog:/Users/luis/Zotero/storage/VM27FFP7/2024-08-17-lnqa.html:text/html},
	}

	@misc{liu_mitigating_2024,
	title = {Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning},
	url = {http://arxiv.org/abs/2306.14565},
	doi = {10.48550/arXiv.2306.14565},
	abstract = {Despite the promising progress in multi-modal tasks, current large multi-modal models ({LMMs}) are prone to hallucinating inconsistent descriptions with respect to the associated image and human instructions. This paper addresses this issue by introducing the first large and diverse visual instruction tuning dataset, named Large-scale Robust Visual ({LRV})-Instruction. Our dataset comprises 400k visual instructions generated by {GPT}4, covering 16 vision-and-language tasks with open-ended instructions and answers. Unlike existing studies that primarily focus on positive instruction samples, we design {LRV}-Instruction to include both positive and negative instructions for more robust visual instruction tuning. Our negative instructions are designed at three semantic levels: (i) Nonexistent Object Manipulation, (ii) Existent Object Manipulation and (iii) Knowledge Manipulation. To efficiently measure the hallucination generated by {LMMs}, we propose {GPT}4-Assisted Visual Instruction Evaluation ({GAVIE}), a stable approach to evaluate visual instruction tuning like human experts. {GAVIE} does not require human-annotated groundtruth answers and can adapt to diverse instruction formats. We conduct comprehensive experiments to investigate the hallucination of {LMMs}. Our results demonstrate existing {LMMs} exhibit significant hallucinations when presented with our negative instructions, particularly Existent Object and Knowledge Manipulation instructions. Moreover, we successfully mitigate hallucination by finetuning {MiniGPT}4 and {mPLUG}-Owl on {LRV}-Instruction while improving performance on several public datasets compared to state-of-the-art methods. Additionally, we observed that a balanced ratio of positive and negative instances in the training data leads to a more robust model. Code and data are available at https://github.com/{FuxiaoLiu}/{LRV}-Instruction.},
	number = {{arXiv}:2306.14565},
	publisher = {{arXiv}},
	author = {Liu, Fuxiao and Lin, Kevin and Li, Linjie and Wang, Jianfeng and Yacoob, Yaser and Wang, Lijuan},
	urldate = {2025-08-28},
	date = {2024-03-19},
	eprinttype = {arxiv},
	eprint = {2306.14565 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computational Engineering, Finance, and Science, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Multimedia},
	file = {Preprint PDF:/Users/luis/Zotero/storage/ICF5CV8V/Liu et al. - 2024 - Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning.pdf:application/pdf},
	}

	@misc{wang_see_2023,
	title = {To See is to Believe: Prompting {GPT}-4V for Better Visual Instruction Tuning},
	url = {http://arxiv.org/abs/2311.07574},
	doi = {10.48550/arXiv.2311.07574},
	shorttitle = {To See is to Believe},
	abstract = {Existing visual instruction tuning methods typically prompt large language models with textual descriptions to generate instruction-following data. Despite the promising performance achieved, these descriptions are derived from image annotations, which are oftentimes coarse-grained. Furthermore, the instructions might even contradict the visual content without observing the entire visual context. To address this challenge, we introduce a fine-grained visual instruction dataset, {LVIS}-Instruct4V, which contains 220K visually aligned and context-aware instructions produced by prompting the powerful {GPT}-4V with images from {LVIS}. Through experimental validation and case studies, we demonstrate that high-quality visual instructional data could improve the performance of {LLaVA}-1.5, a state-of-the-art large multimodal model, across a wide spectrum of benchmarks by clear margins. Notably, by simply replacing the {LLaVA}-Instruct with our {LVIS}-Instruct4V, we achieve better results than {LLaVA} on most challenging {LMM} benchmarks, e.g., {LLaVA}\${\textasciicircum}w\$ (76.7 vs. 70.7) and {MM}-Vet (40.2 vs. 35.4). We release our data and model at https://github.com/X2FD/{LVIS}-{INSTRUCT}4V.},
	number = {{arXiv}:2311.07574},
	publisher = {{arXiv}},
	author = {Wang, Junke and Meng, Lingchen and Weng, Zejia and He, Bo and Wu, Zuxuan and Jiang, Yu-Gang},
	urldate = {2025-08-28},
	date = {2023-11-29},
	eprinttype = {arxiv},
	eprint = {2311.07574 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/26CDJ58E/Wang et al. - 2023 - To See is to Believe Prompting GPT-4V for Better Visual Instruction Tuning.pdf:application/pdf},
	}

	@misc{luo_mmevol_2024,
	title = {{MMEvol}: Empowering Multimodal Large Language Models with Evol-Instruct},
	url = {http://arxiv.org/abs/2409.05840},
	doi = {10.48550/arXiv.2409.05840},
	shorttitle = {{MMEvol}},
	abstract = {The development of Multimodal Large Language Models ({MLLMs}) has seen significant advancements with increasing demands in various fields (e.g., multimodal agents, embodied intelligence). While model-driven approaches attempt to enhance {MLLMs} capabilities through diverse architectures, the gains have become increasingly marginal. Conversely, data-driven methods, which scale up image-text instruction data, are more effective but face limited data diversity and complexity challenges. The absence of high-quality data constitutes a significant development barrier for {MLLMs}. To address the data quality bottleneck, we propose {MMEvol}, a novel multimodal instruction data evolution framework. This framework iteratively improve data quality through a refined combination of fine-grained perception, cognitive reasoning, and interaction evolution, generating a more complex and diverse image-text instruction dataset that empowers {MLLMs} with enhanced capabilities. Beginning with an initial set of instructions, {SEED}-163K, we utilize {MMEvol} to systematically broaden the diversity of instruction types, extend visual reasoning steps to improve cognitive reasoning abilities, and thoroughly explore fine-grained information within images to enhance visual understanding and robustness. To comprehensively evaluate the effectiveness of our approach, we conduct extensive qualitative analysis and quantitative experiments across 13 vision-language tasks. Compared to baseline models trained with the initial seed data, the results demonstrate that our method achieves an average accuracy improvement of 3.1 percentage points. Furthermore, our approach reaches state-of-the-art ({SOTA}) performance in nine tasks using significantly less data compared to state-of-the-art models.},
	number = {{arXiv}:2409.05840},
	publisher = {{arXiv}},
	author = {Luo, Run and Zhang, Haonan and Chen, Longze and Lin, Ting-En and Liu, Xiong and Wu, Yuchuan and Yang, Min and Wang, Minzheng and Zeng, Pengpeng and Gao, Lianli and Shen, Heng Tao and Li, Yunshui and Xia, Xiaobo and Huang, Fei and Song, Jingkuan and Li, Yongbin},
	urldate = {2025-08-28},
	date = {2024-12-31},
	eprinttype = {arxiv},
	eprint = {2409.05840 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/IAPKVVNC/Luo et al. - 2024 - MMEvol Empowering Multimodal Large Language Models with Evol-Instruct.pdf:application/pdf},
	}

	@misc{wu_mmra_2024,
	title = {{MMRA}: A Benchmark for Evaluating Multi-Granularity and Multi-Image Relational Association Capabilities in Large Visual Language Models},
	url = {http://arxiv.org/abs/2407.17379},
	doi = {10.48550/arXiv.2407.17379},
	shorttitle = {{MMRA}},
	abstract = {Given the remarkable success that large visual language models ({LVLMs}) have achieved in image perception tasks, the endeavor to make {LVLMs} perceive the world like humans is drawing increasing attention. Current multi-modal benchmarks primarily focus on facts or specific topic-related knowledge contained within individual images. However, they often overlook the associative relations between multiple images, which require the identification and analysis of similarities among entities or content present in different images. Therefore, we propose the multi-image relation association task and a meticulously curated Multi-granularity Multi-image Relational Association ({MMRA}) benchmark, comprising 1,024 samples. In order to systematically and comprehensively evaluate current {LVLMs}, we establish an associational relation system among images that contain 11 subtasks (e.g, {UsageSimilarity}, {SubEvent}) at two granularity levels (i.e., image and entity) according to the relations in {ConceptNet}. Our experiments reveal that on the {MMRA} benchmark, current multi-image {LVLMs} exhibit distinct advantages and disadvantages across various subtasks. Notably, fine-grained, entity-level multi-image perception tasks pose a greater challenge for {LVLMs} compared to image-level tasks. Moreover, {LVLMs} perform poorly on spatial-related tasks, indicating that {LVLMs} still have limited spatial awareness. Additionally, our findings indicate that while {LVLMs} demonstrate a strong capability to perceive image details, enhancing their ability to associate information across multiple images hinges on improving the reasoning capabilities of their language model component. Moreover, we explored the ability of {LVLMs} to perceive image sequences within the context of our multi-image association task. Our experiments show that the majority of current {LVLMs} do not adequately model image sequences during the pre-training process.},
	number = {{arXiv}:2407.17379},
	publisher = {{arXiv}},
	author = {Wu, Siwei and Zhu, Kang and Bai, Yu and Liang, Yiming and Li, Yizhi and Wu, Haoning and Liu, J. H. and Liu, Ruibo and Qu, Xingwei and Cheng, Xuxin and Zhang, Ge and Huang, Wenhao and Lin, Chenghua},
	urldate = {2025-08-28},
	date = {2024-08-06},
	eprinttype = {arxiv},
	eprint = {2407.17379 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/SLZKEUUX/Wu et al. - 2024 - MMRA A Benchmark for Evaluating Multi-Granularity and Multi-Image Relational Association Capabiliti.pdf:application/pdf},
	}

	@misc{suhr_corpus_2019,
	title = {A Corpus for Reasoning About Natural Language Grounded in Photographs},
	url = {http://arxiv.org/abs/1811.00491},
	doi = {10.48550/arXiv.1811.00491},
	abstract = {We introduce a new dataset for joint reasoning about natural language and images, with a focus on semantic diversity, compositionality, and visual reasoning challenges. The data contains 107,292 examples of English sentences paired with web photographs. The task is to determine whether a natural language caption is true about a pair of photographs. We crowdsource the data using sets of visually rich images and a compare-and-contrast task to elicit linguistically diverse language. Qualitative analysis shows the data requires compositional joint reasoning, including about quantities, comparisons, and relations. Evaluation using state-of-the-art visual reasoning methods shows the data presents a strong challenge.},
	number = {{arXiv}:1811.00491},
	publisher = {{arXiv}},
	author = {Suhr, Alane and Zhou, Stephanie and Zhang, Ally and Zhang, Iris and Bai, Huajun and Artzi, Yoav},
	urldate = {2025-08-28},
	date = {2019-07-21},
	eprinttype = {arxiv},
	eprint = {1811.00491 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/GRZGR7GC/Suhr et al. - 2019 - A Corpus for Reasoning About Natural Language Grounded in Photographs.pdf:application/pdf},
	}

	@misc{tu_how_2023,
	title = {How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision {LLMs}},
	url = {http://arxiv.org/abs/2311.16101},
	doi = {10.48550/arXiv.2311.16101},
	shorttitle = {How Many Unicorns Are in This Image?},
	abstract = {This work focuses on the potential of Vision {LLMs} ({VLLMs}) in visual reasoning. Different from prior studies, we shift our focus from evaluating standard performance to introducing a comprehensive safety evaluation suite, covering both out-of-distribution ({OOD}) generalization and adversarial robustness. For the {OOD} evaluation, we present two novel {VQA} datasets, each with one variant, designed to test model performance under challenging conditions. In exploring adversarial robustness, we propose a straightforward attack strategy for misleading {VLLMs} to produce visual-unrelated responses. Moreover, we assess the efficacy of two jailbreaking strategies, targeting either the vision or language component of {VLLMs}. Our evaluation of 21 diverse models, ranging from open-source {VLLMs} to {GPT}-4V, yields interesting observations: 1) Current {VLLMs} struggle with {OOD} texts but not images, unless the visual information is limited; and 2) These {VLLMs} can be easily misled by deceiving vision encoders only, and their vision-language training often compromise safety protocols. We release this safety evaluation suite at https://github.com/{UCSC}-{VLAA}/vllm-safety-benchmark.},
	number = {{arXiv}:2311.16101},
	publisher = {{arXiv}},
	author = {Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han, Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
	urldate = {2025-08-28},
	date = {2023-11-27},
	eprinttype = {arxiv},
	eprint = {2311.16101 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/YU6YT5QB/Tu et al. - 2023 - How Many Unicorns Are in This Image A Safety Evaluation Benchmark for Vision LLMs.pdf:application/pdf},
	}

	@misc{yu_spark_2024,
	title = {{SPARK}: Multi-Vision Sensor Perception and Reasoning Benchmark for Large-scale Vision-Language Models},
	url = {http://arxiv.org/abs/2408.12114},
	doi = {10.48550/arXiv.2408.12114},
	shorttitle = {{SPARK}},
	abstract = {Large-scale Vision-Language Models ({LVLMs}) have significantly advanced with text-aligned vision inputs. They have made remarkable progress in computer vision tasks by aligning text modality with vision inputs. There are also endeavors to incorporate multi-vision sensors beyond {RGB}, including thermal, depth, and medical X-ray images. However, we observe that current {LVLMs} view images taken from multi-vision sensors as if they were in the same {RGB} domain without considering the physical characteristics of multi-vision sensors. They fail to convey the fundamental multi-vision sensor information from the dataset and the corresponding contextual knowledge properly. Consequently, alignment between the information from the actual physical environment and the text is not achieved correctly, making it difficult to answer complex sensor-related questions that consider the physical environment. In this paper, we aim to establish a multi-vision Sensor Perception And Reasoning {benchmarK} called {SPARK} that can reduce the fundamental multi-vision sensor information gap between images and multi-vision sensors. We generated 6,248 vision-language test samples to investigate multi-vision sensory perception and multi-vision sensory reasoning on physical sensor knowledge proficiency across different formats, covering different types of sensor-related questions. We utilized these samples to assess ten leading {LVLMs}. The results showed that most models displayed deficiencies in multi-vision sensory reasoning to varying extents. Codes and data are available at https://github.com/top-yun/{SPARK}},
	number = {{arXiv}:2408.12114},
	publisher = {{arXiv}},
	author = {Yu, Youngjoon and Chung, Sangyun and Lee, Byung-Kwan and Ro, Yong Man},
	urldate = {2025-08-28},
	date = {2024-10-11},
	eprinttype = {arxiv},
	eprint = {2408.12114 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/WUTRU3XM/Yu et al. - 2024 - SPARK Multi-Vision Sensor Perception and Reasoning Benchmark for Large-scale Vision-Language Models.pdf:application/pdf},
	}

	@inproceedings{yang_spatialsense_2019,
	title = {Spatialsense: An adversarially crowdsourced benchmark for spatial relation recognition},
	url = {http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_SpatialSense_An_Adversarially_Crowdsourced_Benchmark_for_Spatial_Relation_Recognition_ICCV_2019_paper.html},
	shorttitle = {Spatialsense},
	pages = {2051--2060},
	booktitle = {Proceedings of the {IEEE}/{CVF} International Conference on Computer Vision},
	author = {Yang, Kaiyu and Russakovsky, Olga and Deng, Jia},
	urldate = {2025-08-28},
	date = {2019},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/SFDNXZTL/Yang et al. - 2019 - Spatialsense An adversarially crowdsourced benchmark for spatial relation recognition.pdf:application/pdf},
	}

	@misc{jhamtani_learning_2018,
	title = {Learning to Describe Differences Between Pairs of Similar Images},
	url = {http://arxiv.org/abs/1808.10584},
	doi = {10.48550/arXiv.1808.10584},
	abstract = {In this paper, we introduce the task of automatically generating text to describe the differences between two similar images. We collect a new dataset by crowd-sourcing difference descriptions for pairs of image frames extracted from video-surveillance footage. Annotators were asked to succinctly describe all the differences in a short paragraph. As a result, our novel dataset provides an opportunity to explore models that align language and vision, and capture visual salience. The dataset may also be a useful benchmark for coherent multi-sentence generation. We perform a firstpass visual analysis that exposes clusters of differing pixels as a proxy for object-level differences. We propose a model that captures visual salience by using a latent variable to align clusters of differing pixels with output sentences. We find that, for both single-sentence generation and as well as multi-sentence generation, the proposed model outperforms the models that use attention alone.},
	number = {{arXiv}:1808.10584},
	publisher = {{arXiv}},
	author = {Jhamtani, Harsh and Berg-Kirkpatrick, Taylor},
	urldate = {2025-08-28},
	date = {2018-08-31},
	eprinttype = {arxiv},
	eprint = {1808.10584 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/4E4Q3HFG/Jhamtani and Berg-Kirkpatrick - 2018 - Learning to Describe Differences Between Pairs of Similar Images.pdf:application/pdf},
	}

	@misc{xu_vision-flan_2024,
	title = {Vision-Flan: Scaling Human-Labeled Tasks in Visual Instruction Tuning},
	url = {http://arxiv.org/abs/2402.11690},
	doi = {10.48550/arXiv.2402.11690},
	shorttitle = {Vision-Flan},
	abstract = {Despite vision-language models' ({VLMs}) remarkable capabilities as versatile visual assistants, two substantial challenges persist within the existing {VLM} frameworks: (1) lacking task diversity in pretraining and visual instruction tuning, and (2) annotation error and bias in {GPT}-4 synthesized instruction tuning data. Both challenges lead to issues such as poor generalizability, hallucination, and catastrophic forgetting. To address these challenges, we construct Vision-Flan, the most diverse publicly available visual instruction tuning dataset to date, comprising 187 diverse tasks and 1,664,261 instances sourced from academic datasets, and each task is accompanied by an expert-written instruction. In addition, we propose a two-stage instruction tuning framework, in which {VLMs} are firstly finetuned on Vision-Flan and further tuned on {GPT}-4 synthesized data. We find this two-stage tuning framework significantly outperforms the traditional single-stage visual instruction tuning framework and achieves the state-of-the-art performance across a wide range of multi-modal evaluation benchmarks. Finally, we conduct in-depth analyses to understand visual instruction tuning and our findings reveal that: (1) {GPT}-4 synthesized data does not substantially enhance {VLMs}' capabilities but rather modulates the model's responses to human-preferred formats; (2) A minimal quantity (e.g., 1,000) of {GPT}-4 synthesized data can effectively align {VLM} responses with human-preference; (3) Visual instruction tuning mainly helps large-language models ({LLMs}) to understand visual features.},
	number = {{arXiv}:2402.11690},
	publisher = {{arXiv}},
	author = {Xu, Zhiyang and Feng, Chao and Shao, Rulin and Ashby, Trevor and Shen, Ying and Jin, Di and Cheng, Yu and Wang, Qifan and Huang, Lifu},
	urldate = {2025-08-28},
	date = {2024-02-18},
	eprinttype = {arxiv},
	eprint = {2402.11690 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/C3U6SGAU/Xu et al. - 2024 - Vision-Flan Scaling Human-Labeled Tasks in Visual Instruction Tuning.pdf:application/pdf},
	}

	@inproceedings{zhu_visual7w_2016,
	title = {Visual7w: Grounded question answering in images},
	url = {http://openaccess.thecvf.com/content_cvpr_2016/html/Zhu_Visual7W_Grounded_Question_CVPR_2016_paper.html},
	shorttitle = {Visual7w},
	pages = {4995--5004},
	booktitle = {Proceedings of the {IEEE} conference on computer vision and pattern recognition},
	author = {Zhu, Yuke and Groth, Oliver and Bernstein, Michael and Fei-Fei, Li},
	urldate = {2025-08-28},
	date = {2016},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/C5U2AJV8/Zhu et al. - 2016 - Visual7w Grounded question answering in images.pdf:application/pdf},
	}

	@inproceedings{gurari_vizwiz_2018,
	title = {Vizwiz grand challenge: Answering visual questions from blind people},
	url = {http://openaccess.thecvf.com/content_cvpr_2018/html/Gurari_VizWiz_Grand_Challenge_CVPR_2018_paper.html},
	shorttitle = {Vizwiz grand challenge},
	pages = {3608--3617},
	booktitle = {Proceedings of the {IEEE} conference on computer vision and pattern recognition},
	author = {Gurari, Danna and Li, Qing and Stangl, Abigale J. and Guo, Anhong and Lin, Chi and Grauman, Kristen and Luo, Jiebo and Bigham, Jeffrey P.},
	urldate = {2025-08-28},
	date = {2018},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/D42P4K54/Gurari et al. - 2018 - Vizwiz grand challenge Answering visual questions from blind people.pdf:application/pdf},
	}

	@inproceedings{goyal_making_2017,
	title = {Making the v in vqa matter: Elevating the role of image understanding in visual question answering},
	url = {http://openaccess.thecvf.com/content_cvpr_2017/html/Goyal_Making_the_v_CVPR_2017_paper.html},
	shorttitle = {Making the v in vqa matter},
	pages = {6904--6913},
	booktitle = {Proceedings of the {IEEE} conference on computer vision and pattern recognition},
	author = {Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi},
	urldate = {2025-08-28},
	date = {2017},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/SEPZDPHF/Goyal et al. - 2017 - Making the v in vqa matter Elevating the role of image understanding in visual question answering.pdf:application/pdf},
	}

	@article{liu_visual_2023-1,
	title = {Visual spatial reasoning},
	volume = {11},
	url = {https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00566/116470},
	pages = {635--651},
	journaltitle = {Transactions of the Association for Computational Linguistics},
	author = {Liu, Fangyu and Emerson, Guy and Collier, Nigel},
	urldate = {2025-08-28},
	date = {2023},
	note = {Publisher: {MIT} Press One Broadway, 12th Floor, Cambridge, Massachusetts 02142, {USA} …},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/RQA5V34Z/Liu et al. - 2023 - Visual spatial reasoning.pdf:application/pdf},
	}

	@misc{laurencon_unlocking_2024,
	title = {Unlocking the conversion of Web Screenshots into {HTML} Code with the {WebSight} Dataset},
	url = {http://arxiv.org/abs/2403.09029},
	doi = {10.48550/arXiv.2403.09029},
	abstract = {Using vision-language models ({VLMs}) in web development presents a promising strategy to increase efficiency and unblock no-code solutions: by providing a screenshot or a sketch of a {UI}, a {VLM} could generate the code to reproduce it, for instance in a language like {HTML}. Despite the advancements in {VLMs} for various tasks, the specific challenge of converting a screenshot into a corresponding {HTML} has been minimally explored. We posit that this is mainly due to the absence of a suitable, high-quality dataset. This work introduces {WebSight}, a synthetic dataset consisting of 2 million pairs of {HTML} codes and their corresponding screenshots. We fine-tune a foundational {VLM} on our dataset and show proficiency in converting webpage screenshots to functional {HTML} code. To accelerate the research in this area, we open-source {WebSight}.},
	number = {{arXiv}:2403.09029},
	publisher = {{arXiv}},
	author = {Laurençon, Hugo and Tronchon, Léo and Sanh, Victor},
	urldate = {2025-08-28},
	date = {2024-03-14},
	eprinttype = {arxiv},
	eprint = {2403.09029 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Human-Computer Interaction},
	file = {Preprint PDF:/Users/luis/Zotero/storage/77YEIBI5/Laurençon et al. - 2024 - Unlocking the conversion of Web Screenshots into HTML Code with the WebSight Dataset.pdf:application/pdf},
	}

	@article{lu_wildvision_2024,
	title = {Wildvision: Evaluating vision-language models in the wild with human preferences},
	volume = {37},
	url = {https://proceedings.neurips.cc/paper_files/paper/2024/hash/563991b5c8b45fe75bea42db738223b2-Abstract-Datasets_and_Benchmarks_Track.html},
	shorttitle = {Wildvision},
	pages = {48224--48255},
	journaltitle = {Advances in Neural Information Processing Systems},
	author = {Lu, Yujie and Jiang, Dongfu and Chen, Wenhu and Wang, William Yang and Choi, Yejin and Lin, Bill Yuchen},
	urldate = {2025-08-28},
	date = {2024},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/TJ6NLWMA/Lu et al. - 2024 - Wildvision Evaluating vision-language models in the wild with human preferences.pdf:application/pdf},
	}

	@misc{nandy_yesbut_2024,
	title = {{YesBut}: A High-Quality Annotated Multimodal Dataset for evaluating Satire Comprehension capability of Vision-Language Models},
	url = {http://arxiv.org/abs/2409.13592},
	doi = {10.48550/arXiv.2409.13592},
	shorttitle = {{YesBut}},
	abstract = {Understanding satire and humor is a challenging task for even current Vision-Language models. In this paper, we propose the challenging tasks of Satirical Image Detection (detecting whether an image is satirical), Understanding (generating the reason behind the image being satirical), and Completion (given one half of the image, selecting the other half from 2 given options, such that the complete image is satirical) and release a high-quality dataset {YesBut}, consisting of 2547 images, 1084 satirical and 1463 non-satirical, containing different artistic styles, to evaluate those tasks. Each satirical image in the dataset depicts a normal scenario, along with a conflicting scenario which is funny or ironic. Despite the success of current Vision-Language Models on multimodal tasks such as Visual {QA} and Image Captioning, our benchmarking experiments show that such models perform poorly on the proposed tasks on the {YesBut} Dataset in Zero-Shot Settings w.r.t both automated as well as human evaluation. Additionally, we release a dataset of 119 real, satirical photographs for further research. The dataset and code are available at https://github.com/abhi1nandy2/yesbut\_dataset.},
	number = {{arXiv}:2409.13592},
	publisher = {{arXiv}},
	author = {Nandy, Abhilash and Agarwal, Yash and Patwa, Ashish and Das, Millon Madhur and Bansal, Aman and Raj, Ankit and Goyal, Pawan and Ganguly, Niloy},
	urldate = {2025-08-28},
	date = {2024-09-20},
	eprinttype = {arxiv},
	eprint = {2409.13592 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/5Z5CRURR/Nandy et al. - 2024 - YesBut A High-Quality Annotated Multimodal Dataset for evaluating Satire Comprehension capability o.pdf:application/pdf},
	}

	@misc{zheng_agentstudio_2025,
	title = {{AgentStudio}: A Toolkit for Building General Virtual Agents},
	url = {http://arxiv.org/abs/2403.17918},
	doi = {10.48550/arXiv.2403.17918},
	shorttitle = {{AgentStudio}},
	abstract = {General virtual agents need to handle multimodal observations, master complex action spaces, and self-improve in dynamic, open-domain environments. However, existing environments are often domain-specific and require complex setups, which limits agent development and evaluation in real-world settings. As a result, current evaluations lack in-depth analyses that decompose fundamental agent capabilities. We introduce {AgentStudio}, a trinity of environments, tools, and benchmarks to address these issues. {AgentStudio} provides a lightweight, interactive environment with highly generic observation and action spaces, e.g., video observations and {GUI}/{API} actions. It integrates tools for creating online benchmark tasks, annotating {GUI} elements, and labeling actions in videos. Based on our environment and tools, we curate an online task suite that benchmarks both {GUI} interactions and function calling with efficient auto-evaluation. We also reorganize existing datasets and collect new ones using our tools to establish three datasets: {GroundUI}, {IDMBench}, and {CriticBench}. These datasets evaluate fundamental agent abilities, including {GUI} grounding, learning from videos, and success detection, pointing to the desiderata for robust, general, and open-ended virtual agents.},
	number = {{arXiv}:2403.17918},
	publisher = {{arXiv}},
	author = {Zheng, Longtao and Huang, Zhiyuan and Xue, Zhenghai and Wang, Xinrun and An, Bo and Yan, Shuicheng},
	urldate = {2025-08-28},
	date = {2025-02-14},
	eprinttype = {arxiv},
	eprint = {2403.17918 [cs]},
	keywords = {Computer Science - Artificial Intelligence},
	file = {Preprint PDF:/Users/luis/Zotero/storage/MLTJDLLI/Zheng et al. - 2025 - AgentStudio A Toolkit for Building General Virtual Agents.pdf:application/pdf},
	}

	@inproceedings{acharya_tallyqa_2019,
	title = {Tallyqa: Answering complex counting questions},
	volume = {33},
	url = {https://ojs.aaai.org/index.php/AAAI/article/view/4815},
	shorttitle = {Tallyqa},
	pages = {8076--8084},
	booktitle = {Proceedings of the {AAAI} conference on artificial intelligence},
	author = {Acharya, Manoj and Kafle, Kushal and Kanan, Christopher},
	urldate = {2025-08-28},
	date = {2019},
	note = {Issue: 01},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/TTRXDMUE/Acharya et al. - 2019 - Tallyqa Answering complex counting questions.pdf:application/pdf},
	}

	@misc{lindstrom_clevr-math_2022,
	title = {{CLEVR}-Math: A Dataset for Compositional Language, Visual and Mathematical Reasoning},
	url = {http://arxiv.org/abs/2208.05358},
	doi = {10.48550/arXiv.2208.05358},
	shorttitle = {{CLEVR}-Math},
	abstract = {We introduce {CLEVR}-Math, a multi-modal math word problems dataset consisting of simple math word problems involving addition/subtraction, represented partly by a textual description and partly by an image illustrating the scenario. The text describes actions performed on the scene that is depicted in the image. Since the question posed may not be about the scene in the image, but about the state of the scene before or after the actions are applied, the solver envision or imagine the state changes due to these actions. Solving these word problems requires a combination of language, visual and mathematical reasoning. We apply state-of-the-art neural and neuro-symbolic models for visual question answering on {CLEVR}-Math and empirically evaluate their performances. Our results show how neither method generalise to chains of operations. We discuss the limitations of the two in addressing the task of multi-modal word problem solving.},
	number = {{arXiv}:2208.05358},
	publisher = {{arXiv}},
	author = {Lindström, Adam Dahlgren and Abraham, Savitha Sam},
	urldate = {2025-08-28},
	date = {2022-08-10},
	eprinttype = {arxiv},
	eprint = {2208.05358 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/J4Y7CAZY/Lindström and Abraham - 2022 - CLEVR-Math A Dataset for Compositional Language, Visual and Mathematical Reasoning.pdf:application/pdf},
	}

	@misc{gao_g-llava_2025,
	title = {G-{LLaVA}: Solving Geometric Problem with Multi-Modal Large Language Model},
	url = {http://arxiv.org/abs/2312.11370},
	doi = {10.48550/arXiv.2312.11370},
	shorttitle = {G-{LLaVA}},
	abstract = {Large language models ({LLMs}) have shown remarkable proficiency in human-level reasoning and generation capabilities, which encourages extensive research on their application in mathematical problem solving. However, current work has been largely focused on text-based mathematical problems, with limited investigation in problems involving geometric information. Addressing this gap, we aim to enable {LLMs} to solve geometric problems by understanding image input. We first analyze the limitations of current Multimodal Large Language Models ({MLLMs}) in this area: they struggle to accurately comprehending basic geometric elements and their relationships. To overcome these challenges, we take advantage of the unique characteristics of geometric problems (such as unique geometric logical form, and geometric scalability) and the capacity of the textual {LLMs} to build an enriched multimodal geometry dataset based on existing data. The augmented dataset, Geo170K, contains more than 170K geometric image-caption and question-answer pairs. Utilizing our constructed Geo170K dataset, we develop G-{LLaVA}, which demonstrates exceptional performance in solving geometric problems, significantly outperforming {GPT}-4-V on the {MathVista} benchmark with only 7B parameters.},
	number = {{arXiv}:2312.11370},
	publisher = {{arXiv}},
	author = {Gao, Jiahui and Pi, Renjie and Zhang, Jipeng and Ye, Jiacheng and Zhong, Wanjun and Wang, Yufei and Hong, Lanqing and Han, Jianhua and Xu, Hang and Li, Zhenguo and Kong, Lingpeng},
	urldate = {2025-08-28},
	date = {2025-08-20},
	eprinttype = {arxiv},
	eprint = {2312.11370 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/B4LTLCP3/Gao et al. - 2025 - G-LLaVA Solving Geometric Problem with Multi-Modal Large Language Model.pdf:application/pdf},
	}

	@misc{lu_inter-gps_2021,
	title = {Inter-{GPS}: Interpretable Geometry Problem Solving with Formal Language and Symbolic Reasoning},
	url = {http://arxiv.org/abs/2105.04165},
	doi = {10.48550/arXiv.2105.04165},
	shorttitle = {Inter-{GPS}},
	abstract = {Geometry problem solving has attracted much attention in the {NLP} community recently. The task is challenging as it requires abstract problem understanding and symbolic reasoning with axiomatic knowledge. However, current datasets are either small in scale or not publicly available. Thus, we construct a new large-scale benchmark, Geometry3K, consisting of 3,002 geometry problems with dense annotation in formal language. We further propose a novel geometry solving approach with formal language and symbolic reasoning, called Interpretable Geometry Problem Solver (Inter-{GPS}). Inter-{GPS} first parses the problem text and diagram into formal language automatically via rule-based text parsing and neural object detecting, respectively. Unlike implicit learning in existing methods, Inter-{GPS} incorporates theorem knowledge as conditional rules and performs symbolic reasoning step by step. Also, a theorem predictor is designed to infer the theorem application sequence fed to the symbolic solver for the more efficient and reasonable searching path. Extensive experiments on the Geometry3K and {GEOS} datasets demonstrate that Inter-{GPS} achieves significant improvements over existing methods. The project with code and data is available at https://lupantech.github.io/inter-gps.},
	number = {{arXiv}:2105.04165},
	publisher = {{arXiv}},
	author = {Lu, Pan and Gong, Ran and Jiang, Shibiao and Qiu, Liang and Huang, Siyuan and Liang, Xiaodan and Zhu, Song-Chun},
	urldate = {2025-08-28},
	date = {2021-07-20},
	eprinttype = {arxiv},
	eprint = {2105.04165 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Formal Languages and Automata Theory},
	file = {Preprint PDF:/Users/luis/Zotero/storage/DRWXWRLG/Lu et al. - 2021 - Inter-GPS Interpretable Geometry Problem Solving with Formal Language and Symbolic Reasoning.pdf:application/pdf},
	}

	@misc{kazemi_geomverse_2023,
	title = {{GeomVerse}: A Systematic Evaluation of Large Models for Geometric Reasoning},
	url = {http://arxiv.org/abs/2312.12241},
	doi = {10.48550/arXiv.2312.12241},
	shorttitle = {{GeomVerse}},
	abstract = {Large language models have shown impressive results for multi-hop mathematical reasoning when the input question is only textual. Many mathematical reasoning problems, however, contain both text and image. With the ever-increasing adoption of vision language models ({VLMs}), understanding their reasoning abilities for such problems is crucial. In this paper, we evaluate the reasoning capabilities of {VLMs} along various axes through the lens of geometry problems. We procedurally create a synthetic dataset of geometry questions with controllable difficulty levels along multiple axes, thus enabling a systematic evaluation. The empirical results obtained using our benchmark for state-of-the-art {VLMs} indicate that these models are not as capable in subjects like geometry (and, by generalization, other topics requiring similar reasoning) as suggested by previous benchmarks. This is made especially clear by the construction of our benchmark at various depth levels, since solving higher-depth problems requires long chains of reasoning rather than additional memorized knowledge. We release the dataset for further research in this area.},
	number = {{arXiv}:2312.12241},
	publisher = {{arXiv}},
	author = {Kazemi, Mehran and Alvari, Hamidreza and Anand, Ankit and Wu, Jialin and Chen, Xi and Soricut, Radu},
	urldate = {2025-08-28},
	date = {2023-12-19},
	eprinttype = {arxiv},
	eprint = {2312.12241 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/54X2GHCM/Kazemi et al. - 2023 - GeomVerse A Systematic Evaluation of Large Models for Geometric Reasoning.pdf:application/pdf},
	}

	@inproceedings{cao_augmented_2022,
	title = {An augmented benchmark dataset for geometric question answering through dual parallel text encoding},
	url = {https://aclanthology.org/2022.coling-1.130/},
	pages = {1511--1520},
	booktitle = {Proceedings of the 29th international conference on computational linguistics},
	author = {Cao, Jie and Xiao, Jing},
	urldate = {2025-08-28},
	date = {2022},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/B9LAWZSY/Cao and Xiao - 2022 - An augmented benchmark dataset for geometric question answering through dual parallel text encoding.pdf:application/pdf},
	}

	@inproceedings{seo_solving_2015,
	title = {Solving geometry problems: Combining text and diagram interpretation},
	url = {https://aclanthology.org/D15-1171.pdf},
	shorttitle = {Solving geometry problems},
	pages = {1466--1476},
	booktitle = {Proceedings of the 2015 conference on empirical methods in natural language processing},
	author = {Seo, Minjoon and Hajishirzi, Hannaneh and Farhadi, Ali and Etzioni, Oren and Malcolm, Clint},
	urldate = {2025-08-28},
	date = {2015},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/SETI5N9L/Seo et al. - 2015 - Solving geometry problems Combining text and diagram interpretation.pdf:application/pdf},
	}

	@misc{lu_inter-gps_2021-1,
	title = {Inter-{GPS}: Interpretable Geometry Problem Solving with Formal Language and Symbolic Reasoning},
	url = {http://arxiv.org/abs/2105.04165},
	doi = {10.48550/arXiv.2105.04165},
	shorttitle = {Inter-{GPS}},
	abstract = {Geometry problem solving has attracted much attention in the {NLP} community recently. The task is challenging as it requires abstract problem understanding and symbolic reasoning with axiomatic knowledge. However, current datasets are either small in scale or not publicly available. Thus, we construct a new large-scale benchmark, Geometry3K, consisting of 3,002 geometry problems with dense annotation in formal language. We further propose a novel geometry solving approach with formal language and symbolic reasoning, called Interpretable Geometry Problem Solver (Inter-{GPS}). Inter-{GPS} first parses the problem text and diagram into formal language automatically via rule-based text parsing and neural object detecting, respectively. Unlike implicit learning in existing methods, Inter-{GPS} incorporates theorem knowledge as conditional rules and performs symbolic reasoning step by step. Also, a theorem predictor is designed to infer the theorem application sequence fed to the symbolic solver for the more efficient and reasonable searching path. Extensive experiments on the Geometry3K and {GEOS} datasets demonstrate that Inter-{GPS} achieves significant improvements over existing methods. The project with code and data is available at https://lupantech.github.io/inter-gps.},
	number = {{arXiv}:2105.04165},
	publisher = {{arXiv}},
	author = {Lu, Pan and Gong, Ran and Jiang, Shibiao and Qiu, Liang and Huang, Siyuan and Liang, Xiaodan and Zhu, Song-Chun},
	urldate = {2025-08-28},
	date = {2021-07-20},
	eprinttype = {arxiv},
	eprint = {2105.04165 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Formal Languages and Automata Theory},
	file = {Preprint PDF:/Users/luis/Zotero/storage/GMRY8RHU/Lu et al. - 2021 - Inter-GPS Interpretable Geometry Problem Solving with Formal Language and Symbolic Reasoning.pdf:application/pdf},
	}

	@misc{zhang_mavis_2024,
	title = {{MAVIS}: Mathematical Visual Instruction Tuning with an Automatic Data Engine},
	url = {http://arxiv.org/abs/2407.08739},
	doi = {10.48550/arXiv.2407.08739},
	shorttitle = {{MAVIS}},
	abstract = {The mathematical capabilities of Multi-modal Large Language Models ({MLLMs}) remain under-explored with three areas to be improved: visual encoding of math diagrams, diagram-language alignment, and chain-of-thought ({CoT}) reasoning. This draws forth an urgent demand for an effective training paradigm and a large-scale, comprehensive dataset with detailed {CoT} rationales, which is challenging to collect and costly to annotate manually. To tackle this issue, we propose {MAVIS}, a {MAthematical} {VISual} instruction tuning pipeline for {MLLMs}, featuring an automatic data engine to efficiently create mathematical visual datasets. We design the data generation process to be entirely independent of human intervention or {GPT} {API} usage, while ensuring the diagram-caption correspondence, question-answer correctness, and {CoT} reasoning quality. With this approach, we curate two datasets, {MAVIS}-Caption (558K diagram-caption pairs) and {MAVIS}-Instruct (834K visual math problems with {CoT} rationales), and propose four progressive stages for training {MLLMs} from scratch. First, we utilize {MAVIS}-Caption to fine-tune a math-specific vision encoder ({CLIP}-Math) through contrastive learning, tailored for improved diagram visual encoding. Second, we also leverage {MAVIS}-Caption to align the {CLIP}-Math with a large language model ({LLM}) by a projection layer, enhancing vision-language alignment in mathematical domains. Third, we adopt {MAVIS}-Instruct to perform the instruction tuning for robust problem-solving skills, and term the resulting model as {MAVIS}-7B. Fourth, we apply Direct Preference Optimization ({DPO}) to enhance the {CoT} capabilities of our model, further refining its step-wise reasoning performance. Code and data will be released at https://github.com/{ZrrSkywalker}/{MAVIS}},
	number = {{arXiv}:2407.08739},
	publisher = {{arXiv}},
	author = {Zhang, Renrui and Wei, Xinyu and Jiang, Dongzhi and Guo, Ziyu and Li, Shicheng and Zhang, Yichi and Tong, Chengzhuo and Liu, Jiaming and Zhou, Aojun and Wei, Bin and Zhang, Shanghang and Gao, Peng and Li, Chunyuan and Li, Hongsheng},
	urldate = {2025-08-28},
	date = {2024-11-01},
	eprinttype = {arxiv},
	eprint = {2407.08739 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/ZHAY7T7E/Zhang et al. - 2024 - MAVIS Mathematical Visual Instruction Tuning with an Automatic Data Engine.pdf:application/pdf},
	}

	@inproceedings{zhang_raven_2019,
	title = {Raven: A dataset for relational and analogical visual reasoning},
	url = {http://openaccess.thecvf.com/content_CVPR_2019/html/Zhang_RAVEN_A_Dataset_for_Relational_and_Analogical_Visual_REasoNing_CVPR_2019_paper.html},
	shorttitle = {Raven},
	pages = {5317--5327},
	booktitle = {Proceedings of the {IEEE}/{CVF} conference on computer vision and pattern recognition},
	author = {Zhang, Chi and Gao, Feng and Jia, Baoxiong and Zhu, Yixin and Zhu, Song-Chun},
	urldate = {2025-08-28},
	date = {2019},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/KX6EITWS/Zhang et al. - 2019 - Raven A dataset for relational and analogical visual reasoning.pdf:application/pdf},
	}

	@inproceedings{li_super-clevr_2023,
	title = {Super-clevr: A virtual benchmark to diagnose domain robustness in visual reasoning},
	url = {http://openaccess.thecvf.com/content/CVPR2023/html/Li_Super-CLEVR_A_Virtual_Benchmark_To_Diagnose_Domain_Robustness_in_Visual_CVPR_2023_paper.html},
	shorttitle = {Super-clevr},
	pages = {14963--14973},
	booktitle = {Proceedings of the {IEEE}/{CVF} conference on computer vision and pattern recognition},
	author = {Li, Zhuowan and Wang, Xingrui and Stengel-Eskin, Elias and Kortylewski, Adam and Ma, Wufei and Van Durme, Benjamin and Yuille, Alan L.},
	urldate = {2025-08-28},
	date = {2023},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/XJDFSWK7/Li et al. - 2023 - Super-clevr A virtual benchmark to diagnose domain robustness in visual reasoning.pdf:application/pdf},
	}

	@inproceedings{mouchere_icdar_2013,
	title = {Icdar 2013 crohme: Third international competition on recognition of online handwritten mathematical expressions},
	url = {https://ieeexplore.ieee.org/abstract/document/6628849/},
	shorttitle = {Icdar 2013 crohme},
	pages = {1428--1432},
	booktitle = {2013 12th International Conference on Document Analysis and Recognition},
	publisher = {{IEEE}},
	author = {Mouchere, Harold and Viard-Gaudin, Christian and Zanibbi, Richard and Garain, Utpal and Kim, Dae Hwan and Kim, Jin Hyung},
	urldate = {2025-08-28},
	date = {2013},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/GSMF6J2S/Mouchere et al. - 2013 - Icdar 2013 crohme Third international competition on recognition of online handwritten mathematical.pdf:application/pdf},
	}

	@inproceedings{jaume_funsd_2019,
	title = {Funsd: A dataset for form understanding in noisy scanned documents},
	volume = {2},
	url = {https://ieeexplore.ieee.org/abstract/document/8892998/},
	shorttitle = {Funsd},
	pages = {1--6},
	booktitle = {2019 International Conference on Document Analysis and Recognition Workshops ({ICDARW})},
	publisher = {{IEEE}},
	author = {Jaume, Guillaume and Ekenel, Hazim Kemal and Thiran, Jean-Philippe},
	urldate = {2025-08-28},
	date = {2019},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/P7WQL3RE/Jaume et al. - 2019 - Funsd A dataset for form understanding in noisy scanned documents.pdf:application/pdf},
	}

	@inproceedings{yuan_syntax-aware_2022,
	title = {Syntax-aware network for handwritten mathematical expression recognition},
	url = {http://openaccess.thecvf.com/content/CVPR2022/html/Yuan_Syntax-Aware_Network_for_Handwritten_Mathematical_Expression_Recognition_CVPR_2022_paper.html},
	pages = {4553--4562},
	booktitle = {Proceedings of the {IEEE}/{CVF} conference on computer vision and pattern recognition},
	author = {Yuan, Ye and Liu, Xiao and Dikubab, Wondimu and Liu, Hui and Ji, Zhilong and Wu, Zhongqin and Bai, Xiang},
	urldate = {2025-08-28},
	date = {2022},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/KJ59QNFI/Yuan et al. - 2022 - Syntax-aware network for handwritten mathematical expression recognition.pdf:application/pdf},
	}

	@article{mathew_asking_2021,
	title = {Asking questions on handwritten document collections},
	volume = {24},
	issn = {1433-2833, 1433-2825},
	url = {https://link.springer.com/10.1007/s10032-021-00383-3},
	doi = {10.1007/s10032-021-00383-3},
	pages = {235--249},
	number = {3},
	journaltitle = {International Journal on Document Analysis and Recognition ({IJDAR})},
	shortjournal = {{IJDAR}},
	author = {Mathew, Minesh and Gomez, Lluis and Karatzas, Dimosthenis and Jawahar, C. V.},
	urldate = {2025-08-28},
	date = {2021-09},
	langid = {english},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/G2NCWNLD/Mathew et al. - 2021 - Asking questions on handwritten document collections.pdf:application/pdf},
	}

	@article{marti_iam-database_2002,
	title = {The {IAM}-database: an English sentence database for offline handwriting recognition},
	volume = {5},
	rights = {http://www.springer.com/tdm},
	issn = {1433-2833, 1433-2825},
	url = {http://link.springer.com/10.1007/s100320200071},
	doi = {10.1007/s100320200071},
	shorttitle = {The {IAM}-database},
	pages = {39--46},
	number = {1},
	journaltitle = {International Journal on Document Analysis and Recognition},
	shortjournal = {International Journal on Document Analysis and Recognition},
	author = {Marti, U.-V. and Bunke, H.},
	urldate = {2025-08-28},
	date = {2002-11-01},
	}

	@inproceedings{mishra_scene_2012,
	title = {Scene text recognition using higher order language priors},
	url = {https://inria.hal.science/hal-00818183/},
	booktitle = {{BMVC}-British machine vision conference},
	publisher = {{BMVA}},
	author = {Mishra, Anand and Alahari, Karteek and Jawahar, C. V.},
	urldate = {2025-08-28},
	date = {2012},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/A56CFS3G/Mishra et al. - 2012 - Scene text recognition using higher order language priors.pdf:application/pdf},
	}

	@article{krishnan_textstylebrush_2023,
	title = {Textstylebrush: transfer of text aesthetics from a single example},
	volume = {45},
	url = {https://ieeexplore.ieee.org/abstract/document/10027471/},
	shorttitle = {Textstylebrush},
	pages = {9122--9134},
	number = {7},
	journaltitle = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
	author = {Krishnan, Praveen and Kovvuri, Rama and Pang, Guan and Vassilev, Boris and Hassner, Tal},
	urldate = {2025-08-28},
	date = {2023},
	note = {Publisher: {IEEE}},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/KKSB87U3/Krishnan et al. - 2023 - Textstylebrush transfer of text aesthetics from a single example.pdf:application/pdf},
	}

	@online{noauthor_oleehyolatex-formulas_2024,
	title = {{OleehyO}/latex-formulas · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/OleehyO/latex-formulas},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-08-28},
	date = {2024-06-22},
	}

	@incollection{barney_smith_icdar_2024,
	location = {Cham},
	title = {{ICDAR} 2024 Competition on Historical Map Text Detection, Recognition, and Linking},
	volume = {14809},
	isbn = {978-3-031-70551-9 978-3-031-70552-6},
	url = {https://link.springer.com/10.1007/978-3-031-70552-6_22},
	pages = {363--380},
	booktitle = {Document Analysis and Recognition - {ICDAR} 2024},
	publisher = {Springer Nature Switzerland},
	author = {Li, Zekun and Lin, Yijun and Chiang, Yao-Yi and Weinman, Jerod and Tual, Solenn and Chazalon, Joseph and Perret, Julien and Duménieu, Bertrand and Abadie, Nathalie},
	editor = {Barney Smith, Elisa H. and Liwicki, Marcus and Peng, Liangrui},
	urldate = {2025-08-28},
	date = {2024},
	langid = {english},
	doi = {10.1007/978-3-031-70552-6_22},
	note = {Series Title: Lecture Notes in Computer Science},
	}

	@misc{sharma_semeval-2020_2020,
	title = {{SemEval}-2020 Task 8: Memotion Analysis -- The Visuo-Lingual Metaphor!},
	url = {http://arxiv.org/abs/2008.03781},
	doi = {10.48550/arXiv.2008.03781},
	shorttitle = {{SemEval}-2020 Task 8},
	abstract = {Information on social media comprises of various modalities such as textual, visual and audio. {NLP} and Computer Vision communities often leverage only one prominent modality in isolation to study social media. However, the computational processing of Internet memes needs a hybrid approach. The growing ubiquity of Internet memes on social media platforms such as Facebook, Instagram, and Twiter further suggests that we can not ignore such multimodal content anymore. To the best of our knowledge, there is not much attention towards meme emotion analysis. The objective of this proposal is to bring the attention of the research community towards the automatic processing of Internet memes. The task Memotion analysis released approx 10K annotated memes, with human-annotated labels namely sentiment (positive, negative, neutral), type of emotion (sarcastic, funny, offensive, motivation) and their corresponding intensity. The challenge consisted of three subtasks: sentiment (positive, negative, and neutral) analysis of memes, overall emotion (humour, sarcasm, offensive, and motivational) classification of memes, and classifying intensity of meme emotion. The best performances achieved were F1 (macro average) scores of 0.35, 0.51 and 0.32, respectively for each of the three subtasks.},
	number = {{arXiv}:2008.03781},
	publisher = {{arXiv}},
	author = {Sharma, Chhavi and Bhageria, Deepesh and Scott, William and {PYKL}, Srinivas and Das, Amitava and Chakraborty, Tanmoy and Pulabaigari, Viswanath and Gamback, Bjorn},
	urldate = {2025-08-28},
	date = {2020-08-09},
	eprinttype = {arxiv},
	eprint = {2008.03781 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/DZD9GDA7/Sharma et al. - 2020 - SemEval-2020 Task 8 Memotion Analysis -- The Visuo-Lingual Metaphor!.pdf:application/pdf},
	}

	@inproceedings{diem_icfhr_2014,
	title = {{ICFHR} 2014 competition on handwritten digit string recognition in challenging datasets ({HDSRC} 2014)},
	url = {https://ieeexplore.ieee.org/abstract/document/6981115/},
	pages = {779--784},
	booktitle = {2014 14th International Conference on Frontiers in Handwriting Recognition},
	publisher = {{IEEE}},
	author = {Diem, Markus and Fiel, Stefan and Kleber, Florian and Sablatnig, Robert and Saavedra, Jose M. and Contreras, David and Barrios, Juan Manuel and Oliveira, Luiz S.},
	urldate = {2025-08-28},
	date = {2014},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/5HXWCW8W/Diem et al. - 2014 - ICFHR 2014 competition on handwritten digit string recognition in challenging datasets (HDSRC 2014).pdf:application/pdf},
	}

	@online{noauthor_wendlercrenderedtext_2024,
	title = {wendlerc/{RenderedText} · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/wendlerc/RenderedText},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-08-28},
	date = {2024-10-14},
	file = {Snapshot:/Users/luis/Zotero/storage/238D48AZ/RenderedText.html:text/html},
	}

	@inproceedings{huang_icdar2019_2019,
	title = {Icdar2019 competition on scanned receipt ocr and information extraction},
	url = {https://ieeexplore.ieee.org/abstract/document/8977955/},
	pages = {1516--1520},
	booktitle = {2019 International Conference on Document Analysis and Recognition ({ICDAR})},
	publisher = {{IEEE}},
	author = {Huang, Zheng and Chen, Kai and He, Jianhua and Bai, Xiang and Karatzas, Dimosthenis and Lu, Shijian and Jawahar, C. V.},
	urldate = {2025-08-28},
	date = {2019},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/794W3CDK/Huang et al. - 2019 - Icdar2019 competition on scanned receipt ocr and information extraction.pdf:application/pdf},
	}

	@incollection{avidan_ocr-free_2022,
	location = {Cham},
	title = {{OCR}-Free Document Understanding Transformer},
	volume = {13688},
	isbn = {978-3-031-19814-4 978-3-031-19815-1},
	url = {https://link.springer.com/10.1007/978-3-031-19815-1_29},
	pages = {498--517},
	booktitle = {Computer Vision – {ECCV} 2022},
	publisher = {Springer Nature Switzerland},
	author = {Kim, Geewook and Hong, Teakgyu and Yim, Moonbin and Nam, {JeongYeon} and Park, Jinyoung and Yim, Jinyeong and Hwang, Wonseok and Yun, Sangdoo and Han, Dongyoon and Park, Seunghyun},
	editor = {Avidan, Shai and Brostow, Gabriel and Cissé, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
	urldate = {2025-08-28},
	date = {2022},
	langid = {english},
	doi = {10.1007/978-3-031-19815-1_29},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/ZAWUP572/Kim et al. - 2022 - OCR-Free Document Understanding Transformer.pdf:application/pdf},
	}

	@incollection{avidan_toward_2022,
	location = {Cham},
	title = {Toward Understanding {WordArt}: Corner-Guided Transformer for Scene Text Recognition},
	volume = {13688},
	isbn = {978-3-031-19814-4 978-3-031-19815-1},
	url = {https://link.springer.com/10.1007/978-3-031-19815-1_18},
	shorttitle = {Toward Understanding {WordArt}},
	pages = {303--321},
	booktitle = {Computer Vision – {ECCV} 2022},
	publisher = {Springer Nature Switzerland},
	author = {Xie, Xudong and Fu, Ling and Zhang, Zhifei and Wang, Zhaowen and Bai, Xiang},
	editor = {Avidan, Shai and Brostow, Gabriel and Cissé, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
	urldate = {2025-08-28},
	date = {2022},
	langid = {english},
	doi = {10.1007/978-3-031-19815-1_18},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/JCZBQNH9/Xie et al. - 2022 - Toward Understanding WordArt Corner-Guided Transformer for Scene Text Recognition.pdf:application/pdf},
	}

	@incollection{avidan_-okvqa_2022,
	location = {Cham},
	title = {A-{OKVQA}: A Benchmark for Visual Question Answering Using World Knowledge},
	volume = {13668},
	isbn = {978-3-031-20073-1 978-3-031-20074-8},
	url = {https://link.springer.com/10.1007/978-3-031-20074-8_9},
	shorttitle = {A-{OKVQA}},
	pages = {146--162},
	booktitle = {Computer Vision – {ECCV} 2022},
	publisher = {Springer Nature Switzerland},
	author = {Schwenk, Dustin and Khandelwal, Apoorv and Clark, Christopher and Marino, Kenneth and Mottaghi, Roozbeh},
	editor = {Avidan, Shai and Brostow, Gabriel and Cissé, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
	urldate = {2025-08-28},
	date = {2022},
	langid = {english},
	doi = {10.1007/978-3-031-20074-8_9},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/26DE43J2/Schwenk et al. - 2022 - A-OKVQA A Benchmark for Visual Question Answering Using World Knowledge.pdf:application/pdf},
	}

	@misc{li_multimodal_2024,
	title = {Multimodal {ArXiv}: A Dataset for Improving Scientific Comprehension of Large Vision-Language Models},
	url = {http://arxiv.org/abs/2403.00231},
	doi = {10.48550/arXiv.2403.00231},
	shorttitle = {Multimodal {ArXiv}},
	abstract = {Large vision-language models ({LVLMs}) excel across diverse tasks involving concrete images from natural scenes. However, their ability to interpret abstract figures, such as geometry shapes and scientific plots, remains limited due to a scarcity of training datasets in scientific domains. To fill this gap, we introduce Multimodal {ArXiv}, consisting of {ArXivCap} and {ArXivQA}, for enhancing {LVLMs} scientific comprehension. {ArXivCap} is a figure-caption dataset comprising 6.4M images and 3.9M captions, sourced from 572K {ArXiv} papers spanning various scientific domains. Drawing from {ArXivCap}, we introduce {ArXivQA}, a question-answering dataset generated by prompting {GPT}-4V based on scientific figures. {ArXivQA} greatly enhances open-sourced {LVLMs}' mathematical reasoning capabilities, achieving a 10.4{\textbackslash}\% absolute accuracy gain on a multimodal mathematical reasoning benchmark. Furthermore, employing {ArXivCap}, we devise four vision-to-text tasks for benchmarking {LVLMs}. Evaluation results with state-of-the-art {LVLMs} underscore their struggle with the nuanced semantics of academic figures, while domain-specific training yields substantial performance gains. Our error analysis uncovers misinterpretations of visual context, recognition errors, and the production of overly simplified captions by current {LVLMs}, shedding light on future improvements.},
	number = {{arXiv}:2403.00231},
	publisher = {{arXiv}},
	author = {Li, Lei and Wang, Yuqi and Xu, Runxin and Wang, Peiyi and Feng, Xiachong and Kong, Lingpeng and Liu, Qi},
	urldate = {2025-08-28},
	date = {2024-06-02},
	eprinttype = {arxiv},
	eprint = {2403.00231 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/MHNXGP3K/Li et al. - 2024 - Multimodal ArXiv A Dataset for Improving Scientific Comprehension of Large Vision-Language Models.pdf:application/pdf},
	}

	@online{noauthor_shreyanshu09block_diagram_2024,
	title = {shreyanshu09/Block\_Diagram · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/shreyanshu09/Block_Diagram},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-08-28},
	date = {2024-06-03},
	file = {Snapshot:/Users/luis/Zotero/storage/47SYTLWJ/Block_Diagram.html:text/html},
	}

	@online{noauthor_kamizuru00diagram_image_to_text_2024,
	title = {Kamizuru00/diagram\_image\_to\_text · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/Kamizuru00/diagram_image_to_text},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-08-28},
	date = {2024-10-22},
	file = {Snapshot:/Users/luis/Zotero/storage/258X5NAM/diagram_image_to_text.html:text/html},
	}

	@inproceedings{mathew_docvqa_2021,
	title = {Docvqa: A dataset for vqa on document images},
	url = {http://openaccess.thecvf.com/content/WACV2021/html/Mathew_DocVQA_A_Dataset_for_VQA_on_Document_Images_WACV_2021_paper.html},
	shorttitle = {Docvqa},
	pages = {2200--2209},
	booktitle = {Proceedings of the {IEEE}/{CVF} winter conference on applications of computer vision},
	author = {Mathew, Minesh and Karatzas, Dimosthenis and Jawahar, C. V.},
	urldate = {2025-08-28},
	date = {2021},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/BB92LCJL/Mathew et al. - 2021 - Docvqa A dataset for vqa on document images.pdf:application/pdf},
	}

	@inproceedings{wang_general_2020,
	title = {On the general value of evidence, and bilingual scene-text visual question answering},
	url = {http://openaccess.thecvf.com/content_CVPR_2020/html/Wang_On_the_General_Value_of_Evidence_and_Bilingual_Scene-Text_Visual_CVPR_2020_paper.html},
	pages = {10126--10135},
	booktitle = {Proceedings of the {IEEE}/{CVF} Conference on Computer Vision and Pattern Recognition},
	author = {Wang, Xinyu and Liu, Yuliang and Shen, Chunhua and Ng, Chun Chet and Luo, Canjie and Jin, Lianwen and Chan, Chee Seng and Hengel, Anton van den and Wang, Liangwei},
	urldate = {2025-08-28},
	date = {2020},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/Z7CA8MGY/Wang et al. - 2020 - On the general value of evidence, and bilingual scene-text visual question answering.pdf:application/pdf},
	}

	@inproceedings{mathew_infographicvqa_2022,
	title = {Infographicvqa},
	url = {http://openaccess.thecvf.com/content/WACV2022/html/Mathew_InfographicVQA_WACV_2022_paper.html},
	pages = {1697--1706},
	booktitle = {Proceedings of the {IEEE}/{CVF} Winter Conference on Applications of Computer Vision},
	author = {Mathew, Minesh and Bagal, Viraj and Tito, Rubèn and Karatzas, Dimosthenis and Valveny, Ernest and Jawahar, C. V.},
	urldate = {2025-08-28},
	date = {2022},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/ITMD36E7/Mathew et al. - 2022 - Infographicvqa.pdf:application/pdf},
	}

	@misc{chang_mapqa_2022,
	title = {{MapQA}: A Dataset for Question Answering on Choropleth Maps},
	url = {http://arxiv.org/abs/2211.08545},
	doi = {10.48550/arXiv.2211.08545},
	shorttitle = {{MapQA}},
	abstract = {Choropleth maps are a common visual representation for region-specific tabular data and are used in a number of different venues (newspapers, articles, etc). These maps are human-readable but are often challenging to deal with when trying to extract data for screen readers, analyses, or other related tasks. Recent research into Visual-Question Answering ({VQA}) has studied question answering on human-generated charts ({ChartQA}), such as bar, line, and pie charts. However, little work has paid attention to understanding maps; general {VQA} models, and {ChartQA} models, suffer when asked to perform this task. To facilitate and encourage research in this area, we present {MapQA}, a large-scale dataset of {\textasciitilde}800K question-answer pairs over {\textasciitilde}60K map images. Our task tests various levels of map understanding, from surface questions about map styles to complex questions that require reasoning on the underlying data. We present the unique challenges of {MapQA} that frustrate most strong baseline algorithms designed for {ChartQA} and general {VQA} tasks. We also present a novel algorithm, Visual Multi-Output Data Extraction based {QA} (V-{MODEQA}) for {MapQA}. V-{MODEQA} extracts the underlying structured data from a map image with a multi-output model and then performs reasoning on the extracted data. Our experimental results show that V-{MODEQA} has better overall performance and robustness on {MapQA} than the state-of-the-art {ChartQA} and {VQA} algorithms by capturing the unique properties in map question answering.},
	number = {{arXiv}:2211.08545},
	publisher = {{arXiv}},
	author = {Chang, Shuaichen and Palzer, David and Li, Jialin and Fosler-Lussier, Eric and Xiao, Ningchuan},
	urldate = {2025-08-28},
	date = {2022-11-15},
	eprinttype = {arxiv},
	eprint = {2211.08545 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/7YVYJ2MM/Chang et al. - 2022 - MapQA A Dataset for Question Answering on Choropleth Maps.pdf:application/pdf},
	}

	@inproceedings{mishra_ocr-vqa_2019,
	title = {Ocr-vqa: Visual question answering by reading text in images},
	url = {https://ieeexplore.ieee.org/abstract/document/8978122/},
	shorttitle = {Ocr-vqa},
	pages = {947--952},
	booktitle = {2019 international conference on document analysis and recognition ({ICDAR})},
	publisher = {{IEEE}},
	author = {Mishra, Anand and Shekhar, Shashank and Singh, Ajeet Kumar and Chakraborty, Anirban},
	urldate = {2025-09-01},
	date = {2019},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/HZHKC3K9/Mishra et al. - 2019 - Ocr-vqa Visual question answering by reading text in images.pdf:application/pdf},
	}

	@incollection{de_francisci_morales_pdf-vqa_2023,
	location = {Cham},
	title = {{PDF}-{VQA}: A New Dataset for Real-World {VQA} on {PDF} Documents},
	volume = {14174},
	isbn = {978-3-031-43426-6 978-3-031-43427-3},
	url = {https://link.springer.com/10.1007/978-3-031-43427-3_35},
	shorttitle = {{PDF}-{VQA}},
	pages = {585--601},
	booktitle = {Machine Learning and Knowledge Discovery in Databases: Applied Data Science and Demo Track},
	publisher = {Springer Nature Switzerland},
	author = {Ding, Yihao and Luo, Siwen and Chung, Hyunsuk and Han, Soyeon Caren},
	editor = {De Francisci Morales, Gianmarco and Perlich, Claudia and Ruchansky, Natali and Kourtellis, Nicolas and Baralis, Elena and Bonchi, Francesco},
	urldate = {2025-09-01},
	date = {2023},
	langid = {english},
	doi = {10.1007/978-3-031-43427-3_35},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/9P2QMB25/Ding et al. - 2023 - PDF-VQA A New Dataset for Real-World VQA on PDF Documents.pdf:application/pdf},
	}

	@inproceedings{wang_screen2words_2021,
	location = {Virtual Event {USA}},
	title = {Screen2Words: Automatic Mobile {UI} Summarization with Multimodal Learning},
	isbn = {978-1-4503-8635-7},
	url = {https://dl.acm.org/doi/10.1145/3472749.3474765},
	doi = {10.1145/3472749.3474765},
	shorttitle = {Screen2Words},
	eventtitle = {{UIST} '21: The 34th Annual {ACM} Symposium on User Interface Software and Technology},
	pages = {498--510},
	booktitle = {The 34th Annual {ACM} Symposium on User Interface Software and Technology},
	publisher = {{ACM}},
	author = {Wang, Bryan and Li, Gang and Zhou, Xin and Chen, Zhourong and Grossman, Tovi and Li, Yang},
	urldate = {2025-09-01},
	date = {2021-10-10},
	langid = {english},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/IGDFBN4I/Wang et al. - 2021 - Screen2Words Automatic Mobile UI Summarization with Multimodal Learning.pdf:application/pdf},
	}

	@misc{hsiao_screenqa_2025,
	title = {{ScreenQA}: Large-Scale Question-Answer Pairs over Mobile App Screenshots},
	url = {http://arxiv.org/abs/2209.08199},
	doi = {10.48550/arXiv.2209.08199},
	shorttitle = {{ScreenQA}},
	abstract = {We introduce {ScreenQA}, a novel benchmarking dataset designed to advance screen content understanding through question answering. The existing screen datasets are focused either on low-level structural and component understanding, or on a much higher-level composite task such as navigation and task completion for autonomous agents. {ScreenQA} attempts to bridge this gap. By annotating 86k question-answer pairs over the {RICO} dataset, we aim to benchmark the screen reading comprehension capacity, thereby laying the foundation for vision-based automation over screenshots. Our annotations encompass full answers, short answer phrases, and corresponding {UI} contents with bounding boxes, enabling four subtasks to address various application scenarios. We evaluate the dataset's efficacy using both open-weight and proprietary models in zero-shot, fine-tuned, and transfer learning settings. We further demonstrate positive transfer to web applications, highlighting its potential beyond mobile applications.},
	number = {{arXiv}:2209.08199},
	publisher = {{arXiv}},
	author = {Hsiao, Yu-Chung and Zubach, Fedir and Baechler, Gilles and Sunkara, Srinivas and Carbune, Victor and Lin, Jason and Wang, Maria and Zhu, Yun and Chen, Jindong},
	urldate = {2025-09-01},
	date = {2025-02-09},
	eprinttype = {arxiv},
	eprint = {2209.08199 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Human-Computer Interaction},
	file = {Preprint PDF:/Users/luis/Zotero/storage/VVGLSVIX/Hsiao et al. - 2025 - ScreenQA Large-Scale Question-Answer Pairs over Mobile App Screenshots.pdf:application/pdf},
	}

	@inproceedings{tanaka_slidevqa_2023,
	title = {Slidevqa: A dataset for document visual question answering on multiple images},
	volume = {37},
	url = {https://ojs.aaai.org/index.php/AAAI/article/view/26598},
	shorttitle = {Slidevqa},
	pages = {13636--13645},
	booktitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
	author = {Tanaka, Ryota and Nishida, Kyosuke and Nishida, Kosuke and Hasegawa, Taku and Saito, Itsumi and Saito, Kuniko},
	urldate = {2025-09-01},
	date = {2023},
	note = {Issue: 11},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/7F9ZXAY9/Tanaka et al. - 2023 - Slidevqa A dataset for document visual question answering on multiple images.pdf:application/pdf},
	}

	@inproceedings{biten_scene_2019,
	title = {Scene text visual question answering},
	url = {http://openaccess.thecvf.com/content_ICCV_2019/html/Biten_Scene_Text_Visual_Question_Answering_ICCV_2019_paper.html},
	pages = {4291--4301},
	booktitle = {Proceedings of the {IEEE}/{CVF} international conference on computer vision},
	author = {Biten, Ali Furkan and Tito, Ruben and Mafla, Andres and Gomez, Lluis and Rusinol, Marçal and Valveny, Ernest and Jawahar, C. V. and Karatzas, Dimosthenis},
	urldate = {2025-09-01},
	date = {2019},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/5XAZ5M3J/Biten et al. - 2019 - Scene text visual question answering.pdf:application/pdf},
	}

	@online{noauthor_sujet-aisujet-finance-qa-vision-100k_2024,
	title = {sujet-ai/Sujet-Finance-{QA}-Vision-100k · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/sujet-ai/Sujet-Finance-QA-Vision-100k},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-01},
	date = {2024-07-14},
	file = {Snapshot:/Users/luis/Zotero/storage/DMHCHC3P/Sujet-Finance-QA-Vision-100k.html:text/html},
	}

	@online{noauthor_jimmycartertextocr-gpt4v_2024,
	title = {jimmycarter/textocr-gpt4v · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/jimmycarter/textocr-gpt4v},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-01},
	date = {2024-03-29},
	}

	@inproceedings{singh_towards_2019,
	title = {Towards vqa models that can read},
	url = {http://openaccess.thecvf.com/content_CVPR_2019/html/Singh_Towards_VQA_Models_That_Can_Read_CVPR_2019_paper.html},
	pages = {8317--8326},
	booktitle = {Proceedings of the {IEEE}/{CVF} conference on computer vision and pattern recognition},
	author = {Singh, Amanpreet and Natarajan, Vivek and Shah, Meet and Jiang, Yu and Chen, Xinlei and Batra, Dhruv and Parikh, Devi and Rohrbach, Marcus},
	urldate = {2025-09-01},
	date = {2019},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/Q5FZE994/Singh et al. - 2019 - Towards vqa models that can read.pdf:application/pdf},
	}

	@misc{ye_ureader_2023,
	title = {{UReader}: Universal {OCR}-free Visually-situated Language Understanding with Multimodal Large Language Model},
	url = {http://arxiv.org/abs/2310.05126},
	doi = {10.48550/arXiv.2310.05126},
	shorttitle = {{UReader}},
	abstract = {Text is ubiquitous in our visual world, conveying crucial information, such as in documents, websites, and everyday photographs. In this work, we propose {UReader}, a first exploration of universal {OCR}-free visually-situated language understanding based on the Multimodal Large Language Model ({MLLM}). By leveraging the shallow text recognition ability of the {MLLM}, we only finetuned 1.2\% parameters and the training cost is much lower than previous work following domain-specific pretraining and finetuning paradigms. Concretely, {UReader} is jointly finetuned on a wide range of Visually-situated Language Understanding tasks via a unified instruction format. To enhance the visual text and semantic understanding, we further apply two auxiliary tasks with the same format, namely text reading and key points generation tasks. We design a shape-adaptive cropping module before the encoder-decoder architecture of {MLLM} to leverage the frozen low-resolution vision encoder for processing high-resolution images. Without downstream finetuning, our single model achieves state-of-the-art ocr-free performance in 8 out of 10 visually-situated language understanding tasks, across 5 domains: documents, tables, charts, natural images, and webpage screenshots. Codes and instruction-tuning datasets will be released.},
	number = {{arXiv}:2310.05126},
	publisher = {{arXiv}},
	author = {Ye, Jiabo and Hu, Anwen and Xu, Haiyang and Ye, Qinghao and Yan, Ming and Xu, Guohai and Li, Chenliang and Tian, Junfeng and Qian, Qi and Zhang, Ji and Jin, Qin and He, Liang and Lin, Xin Alex and Huang, Fei},
	urldate = {2025-09-01},
	date = {2023-10-08},
	eprinttype = {arxiv},
	eprint = {2310.05126 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/UDU6YRJL/Ye et al. - 2023 - UReader Universal OCR-free Visually-situated Language Understanding with Multimodal Large Language.pdf:application/pdf},
	}

	@inproceedings{tanaka_visualmrc_2021,
	title = {Visualmrc: Machine reading comprehension on document images},
	volume = {35},
	url = {https://ojs.aaai.org/index.php/AAAI/article/view/17635},
	shorttitle = {Visualmrc},
	pages = {13878--13888},
	booktitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
	author = {Tanaka, Ryota and Nishida, Kyosuke and Yoshida, Sen},
	urldate = {2025-09-01},
	date = {2021},
	note = {Issue: 15},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/8J543NK9/Tanaka et al. - 2021 - Visualmrc Machine reading comprehension on document images.pdf:application/pdf},
	}

	@misc{he_pathvqa_2020,
	title = {{PathVQA}: 30000+ Questions for Medical Visual Question Answering},
	url = {http://arxiv.org/abs/2003.10286},
	doi = {10.48550/arXiv.2003.10286},
	shorttitle = {{PathVQA}},
	abstract = {Is it possible to develop an "{AI} Pathologist" to pass the board-certified examination of the American Board of Pathology? To achieve this goal, the first step is to create a visual question answering ({VQA}) dataset where the {AI} agent is presented with a pathology image together with a question and is asked to give the correct answer. Our work makes the first attempt to build such a dataset. Different from creating general-domain {VQA} datasets where the images are widely accessible and there are many crowdsourcing workers available and capable of generating question-answer pairs, developing a medical {VQA} dataset is much more challenging. First, due to privacy concerns, pathology images are usually not publicly available. Second, only well-trained pathologists can understand pathology images, but they barely have time to help create datasets for {AI} research. To address these challenges, we resort to pathology textbooks and online digital libraries. We develop a semi-automated pipeline to extract pathology images and captions from textbooks and generate question-answer pairs from captions using natural language processing. We collect 32,799 open-ended questions from 4,998 pathology images where each question is manually checked to ensure correctness. To our best knowledge, this is the first dataset for pathology {VQA}. Our dataset will be released publicly to promote research in medical {VQA}.},
	number = {{arXiv}:2003.10286},
	publisher = {{arXiv}},
	author = {He, Xuehai and Zhang, Yichen and Mou, Luntian and Xing, Eric and Xie, Pengtao},
	urldate = {2025-09-01},
	date = {2020-03-07},
	eprinttype = {arxiv},
	eprint = {2003.10286 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/8IETYYKF/He et al. - 2020 - PathVQA 30000+ Questions for Medical Visual Question Answering.pdf:application/pdf},
	}

	@misc{zhang_pmc-vqa_2024,
	title = {{PMC}-{VQA}: Visual Instruction Tuning for Medical Visual Question Answering},
	url = {http://arxiv.org/abs/2305.10415},
	doi = {10.48550/arXiv.2305.10415},
	shorttitle = {{PMC}-{VQA}},
	abstract = {Medical Visual Question Answering ({MedVQA}) presents a significant opportunity to enhance diagnostic accuracy and healthcare delivery by leveraging artificial intelligence to interpret and answer questions based on medical images. In this study, we reframe the problem of {MedVQA} as a generation task that naturally follows the human-machine interaction and propose a generative-based model for medical visual understanding by aligning visual information from a pre-trained vision encoder with a large language model. We establish a scalable pipeline to construct a large-scale medical visual question-answering dataset, named {PMC}-{VQA}, which contains 227k {VQA} pairs of 149k images that cover various modalities or diseases. We train the proposed model on {PMC}-{VQA} and then fine-tune it on multiple public benchmarks, e.g., {VQA}-{RAD}, {SLAKE}, and Image-Clef-2019, significantly outperforming existing {MedVQA} models in generating relevant, accurate free-form answers. In addition, we propose a test set that has undergone manual verification, which is significantly more challenging, serving to better monitor the development of generative {MedVQA} methods. To facilitate comprehensive evaluation and comparison, we have maintained a leaderboard at https://paperswithcode.com/paper/pmc-vqa-visual-instruction-tuning-for-medical, offering a centralized resource for tracking progress and benchmarking state-of-the-art approaches. The {PMC}-{VQA} dataset emerges as a vital resource for the field of research, and the {MedVInT} presents a significant breakthrough in the area of {MedVQA}.},
	number = {{arXiv}:2305.10415},
	publisher = {{arXiv}},
	author = {Zhang, Xiaoman and Wu, Chaoyi and Zhao, Ziheng and Lin, Weixiong and Zhang, Ya and Wang, Yanfeng and Xie, Weidi},
	urldate = {2025-09-01},
	date = {2024-09-08},
	eprinttype = {arxiv},
	eprint = {2305.10415 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/HIEED5NH/Zhang et al. - 2024 - PMC-VQA Visual Instruction Tuning for Medical Visual Question Answering.pdf:application/pdf},
	}

	@article{lu_learn_2022,
	title = {Learn to explain: Multimodal reasoning via thought chains for science question answering},
	volume = {35},
	url = {https://proceedings.neurips.cc/paper_files/paper/2022/hash/11332b6b6cf4485b84afadb1352d3a9a-Abstract-Conference.html},
	shorttitle = {Learn to explain},
	pages = {2507--2521},
	journaltitle = {Advances in Neural Information Processing Systems},
	author = {Lu, Pan and Mishra, Swaroop and Xia, Tanglin and Qiu, Liang and Chang, Kai-Wei and Zhu, Song-Chun and Tafjord, Oyvind and Clark, Peter and Kalyan, Ashwin},
	urldate = {2025-09-01},
	date = {2022},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/25GW7BYS/Lu et al. - 2022 - Learn to explain Multimodal reasoning via thought chains for science question answering.pdf:application/pdf},
	}

	@inproceedings{kembhavi_are_2017,
	title = {Are you smarter than a sixth grader? textbook question answering for multimodal machine comprehension},
	url = {http://openaccess.thecvf.com/content_cvpr_2017/html/Kembhavi_Are_You_Smarter_CVPR_2017_paper.html},
	shorttitle = {Are you smarter than a sixth grader?},
	pages = {4999--5007},
	booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern recognition},
	author = {Kembhavi, Aniruddha and Seo, Minjoon and Schwenk, Dustin and Choi, Jonghyun and Farhadi, Ali and Hajishirzi, Hannaneh},
	urldate = {2025-09-01},
	date = {2017},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/CTU4JLWX/Kembhavi et al. - 2017 - Are you smarter than a sixth grader textbook question answering for multimodal machine comprehensio.pdf:application/pdf},
	}

	@misc{jia_visualwebinstruct_2025,
	title = {{VisualWebInstruct}: Scaling up Multimodal Instruction Data through Web Search},
	url = {http://arxiv.org/abs/2503.10582},
	doi = {10.48550/arXiv.2503.10582},
	shorttitle = {{VisualWebInstruct}},
	abstract = {Vision-Language Models have made significant progress on many perception-focused tasks. However, their progress on reasoning-focused tasks remains limited due to the lack of high-quality and diverse training data. In this work, we aim to address the scarcity of reasoning-focused multimodal datasets. We propose {VisualWebInstruct}, a novel approach that leverages search engines to create a diverse and high-quality dataset spanning multiple disciplines, including mathematics, physics, finance, and chemistry, etc. Starting with a meticulously selected set of 30,000 seed images, we employ Google Image Search to identify websites containing similar images. We collect and process {HTML} data from over 700K unique {URLs}. Through a pipeline of content extraction, filtering, and synthesis, we construct a dataset of approximately 900K question-answer ({QA}) pairs, with 40\% consisting of visual {QA} pairs and the remaining comprising text-based {QA} pairs. Models fine-tuned on {VisualWebInstruct} demonstrate significant performance improvements: (1) fine-tuning on Llava-{OV} results in 10-20 absolute points improvement across benchmarks, and (2) fine-tuning from {MAmmoTH}-{VL} yields a 5 absolute points gain across benchmarks. Our best model, {MAmmoTH}-{VL}2, achieves state-of-the-art performance within the 10B parameter class on {MMMU}-Pro (40.7), {MathVerse} (42.6), and {DynaMath} (55.7). These results highlight the effectiveness of our dataset in enhancing the reasoning capabilities of vision-language models for complex multimodal tasks.},
	number = {{arXiv}:2503.10582},
	publisher = {{arXiv}},
	author = {Jia, Yiming and Li, Jiachen and Yue, Xiang and Li, Bo and Nie, Ping and Zou, Kai and Chen, Wenhu},
	urldate = {2025-09-01},
	date = {2025-03-15},
	eprinttype = {arxiv},
	eprint = {2503.10582 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/SWZURDKE/Jia et al. - 2025 - VisualWebInstruct Scaling up Multimodal Instruction Data through Web Search.pdf:application/pdf},
	}

	@article{lau_dataset_2018,
	title = {A dataset of clinically generated visual questions and answers about radiology images},
	volume = {5},
	url = {https://www.nature.com/articles/sdata2018251},
	pages = {1--10},
	number = {1},
	journaltitle = {Scientific data},
	author = {Lau, Jason J. and Gayen, Soumya and Ben Abacha, Asma and Demner-Fushman, Dina},
	urldate = {2025-09-01},
	date = {2018},
	note = {Publisher: Nature Publishing Group},
	}

	@misc{zheng_opencodeinterpreter_2025,
	title = {{OpenCodeInterpreter}: Integrating Code Generation with Execution and Refinement},
	url = {http://arxiv.org/abs/2402.14658},
	doi = {10.48550/arXiv.2402.14658},
	shorttitle = {{OpenCodeInterpreter}},
	abstract = {The introduction of large language models has significantly advanced code generation. However, open-source models often lack the execution capabilities and iterative refinement of advanced systems like the {GPT}-4 Code Interpreter. To address this, we introduce {OpenCodeInterpreter}, a family of open-source code systems designed for generating, executing, and iteratively refining code. Supported by Code-Feedback, a dataset featuring 68K multi-turn interactions, {OpenCodeInterpreter} integrates execution and human feedback for dynamic code refinement. Our comprehensive evaluation of {OpenCodeInterpreter} across key benchmarks such as {HumanEval}, {MBPP}, and their enhanced versions from {EvalPlus} reveals its exceptional performance. Notably, {OpenCodeInterpreter}-33B achieves an accuracy of 83.2 (76.4) on the average (and plus versions) of {HumanEval} and {MBPP}, closely rivaling {GPT}-4's 84.2 (76.2) and further elevates to 91.6 (84.6) with synthesized human feedback from {GPT}-4. {OpenCodeInterpreter} brings the gap between open-source code generation models and proprietary systems like {GPT}-4 Code Interpreter.},
	number = {{arXiv}:2402.14658},
	publisher = {{arXiv}},
	author = {Zheng, Tianyu and Zhang, Ge and Shen, Tianhao and Liu, Xueling and Lin, Bill Yuchen and Fu, Jie and Chen, Wenhu and Yue, Xiang},
	urldate = {2025-09-01},
	date = {2025-01-07},
	eprinttype = {arxiv},
	eprint = {2402.14658 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Software Engineering},
	file = {Preprint PDF:/Users/luis/Zotero/storage/EP9274EY/Zheng et al. - 2025 - OpenCodeInterpreter Integrating Code Generation with Execution and Refinement.pdf:application/pdf},
	}

	@inproceedings{zhang_infinity_2024,
	location = {Boise {ID} {USA}},
	title = {Infinity {\textless}span style="font-variant:small-caps;"{\textgreater}Math:{\textless}/span{\textgreater} A Scalable Instruction Tuning Dataset in Programmatic Mathematical Reasoning},
	isbn = {979-8-4007-0436-9},
	url = {https://dl.acm.org/doi/10.1145/3627673.3679122},
	doi = {10.1145/3627673.3679122},
	shorttitle = {Infinity {\textless}span style="font-variant},
	eventtitle = {{CIKM} '24: The 33rd {ACM} International Conference on Information and Knowledge Management},
	pages = {5405--5409},
	booktitle = {Proceedings of the 33rd {ACM} International Conference on Information and Knowledge Management},
	publisher = {{ACM}},
	author = {Zhang, Bo-Wen and Yan, Yan and Li, Lin and Liu, Guang},
	urldate = {2025-09-01},
	date = {2024-10-21},
	langid = {english},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/L9GVRU2G/Zhang et al. - 2024 - Infinity Math A Scalable Instruction Tuning Dataset i.pdf:application/pdf},
	}

	@misc{yue_mammoth_2023,
	title = {{MAmmoTH}: Building Math Generalist Models through Hybrid Instruction Tuning},
	url = {http://arxiv.org/abs/2309.05653},
	doi = {10.48550/arXiv.2309.05653},
	shorttitle = {{MAmmoTH}},
	abstract = {We introduce {MAmmoTH}, a series of open-source large language models ({LLMs}) specifically tailored for general math problem-solving. The {MAmmoTH} models are trained on {MathInstruct}, our meticulously curated instruction tuning dataset. {MathInstruct} is compiled from 13 math datasets with intermediate rationales, six of which have rationales newly curated by us. It presents a unique hybrid of chain-of-thought ({CoT}) and program-of-thought ({PoT}) rationales, and also ensures extensive coverage of diverse fields in math. The hybrid of {CoT} and {PoT} not only unleashes the potential of tool use but also allows different thought processes for different math problems. As a result, the {MAmmoTH} series substantially outperform existing open-source models on nine mathematical reasoning datasets across all scales with an average accuracy gain between 16\% and 32\%. Remarkably, our {MAmmoTH}-7B model reaches 33\% on {MATH} (a competition-level dataset), which exceeds the best open-source 7B model ({WizardMath}) by 23\%, and the {MAmmoTH}-34B model achieves 44\% accuracy on {MATH}, even surpassing {GPT}-4's {CoT} result. Our work underscores the importance of diverse problem coverage and the use of hybrid rationales in developing superior math generalist models.},
	number = {{arXiv}:2309.05653},
	publisher = {{arXiv}},
	author = {Yue, Xiang and Qu, Xingwei and Zhang, Ge and Fu, Yao and Huang, Wenhao and Sun, Huan and Su, Yu and Chen, Wenhu},
	urldate = {2025-09-01},
	date = {2023-10-03},
	eprinttype = {arxiv},
	eprint = {2309.05653 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/VZPWS6RD/Yue et al. - 2023 - MAmmoTH Building Math Generalist Models through Hybrid Instruction Tuning.pdf:application/pdf},
	}

	@misc{amini_mathqa_2019,
	title = {{MathQA}: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms},
	url = {http://arxiv.org/abs/1905.13319},
	doi = {10.48550/arXiv.1905.13319},
	shorttitle = {{MathQA}},
	abstract = {We introduce a large-scale dataset of math word problems and an interpretable neural math problem solver that learns to map problems to operation programs. Due to annotation challenges, current datasets in this domain have been either relatively small in scale or did not offer precise operational annotations over diverse problem types. We introduce a new representation language to model precise operation programs corresponding to each math problem that aim to improve both the performance and the interpretability of the learned models. Using this representation language, our new dataset, {MathQA}, significantly enhances the {AQuA} dataset with fully-specified operational programs. We additionally introduce a neural sequence-to-program model enhanced with automatic problem categorization. Our experiments show improvements over competitive baselines in our {MathQA} as well as the {AQuA} dataset. The results are still significantly lower than human performance indicating that the dataset poses new challenges for future research. Our dataset is available at: https://math-qa.github.io/math-{QA}/},
	number = {{arXiv}:1905.13319},
	publisher = {{arXiv}},
	author = {Amini, Aida and Gabriel, Saadia and Lin, Peter and Koncel-Kedziorski, Rik and Choi, Yejin and Hajishirzi, Hannaneh},
	urldate = {2025-09-01},
	date = {2019-05-30},
	eprinttype = {arxiv},
	eprint = {1905.13319 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/NQ3E8ULH/Amini et al. - 2019 - MathQA Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms.pdf:application/pdf},
	}

	@misc{lai_step-dpo_2024,
	title = {Step-{DPO}: Step-wise Preference Optimization for Long-chain Reasoning of {LLMs}},
	url = {http://arxiv.org/abs/2406.18629},
	doi = {10.48550/arXiv.2406.18629},
	shorttitle = {Step-{DPO}},
	abstract = {Mathematical reasoning presents a significant challenge for Large Language Models ({LLMs}) due to the extensive and precise chain of reasoning required for accuracy. Ensuring the correctness of each reasoning step is critical. To address this, we aim to enhance the robustness and factuality of {LLMs} by learning from human feedback. However, Direct Preference Optimization ({DPO}) has shown limited benefits for long-chain mathematical reasoning, as models employing {DPO} struggle to identify detailed errors in incorrect answers. This limitation stems from a lack of fine-grained process supervision. We propose a simple, effective, and data-efficient method called Step-{DPO}, which treats individual reasoning steps as units for preference optimization rather than evaluating answers holistically. Additionally, we have developed a data construction pipeline for Step-{DPO}, enabling the creation of a high-quality dataset containing 10K step-wise preference pairs. We also observe that in {DPO}, self-generated data is more effective than data generated by humans or {GPT}-4, due to the latter's out-of-distribution nature. Our findings demonstrate that as few as 10K preference data pairs and fewer than 500 Step-{DPO} training steps can yield a nearly 3\% gain in accuracy on {MATH} for models with over 70B parameters. Notably, Step-{DPO}, when applied to Qwen2-72B-Instruct, achieves scores of 70.8\% and 94.0\% on the test sets of {MATH} and {GSM}8K, respectively, surpassing a series of closed-source models, including {GPT}-4-1106, Claude-3-Opus, and Gemini-1.5-Pro. Our code, data, and models are available at https://github.com/dvlab-research/Step-{DPO}.},
	number = {{arXiv}:2406.18629},
	publisher = {{arXiv}},
	author = {Lai, Xin and Tian, Zhuotao and Chen, Yukang and Yang, Senqiao and Peng, Xiangru and Jia, Jiaya},
	urldate = {2025-09-01},
	date = {2024-06-26},
	eprinttype = {arxiv},
	eprint = {2406.18629 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/M8WBKQ22/Lai et al. - 2024 - Step-DPO Step-wise Preference Optimization for Long-chain Reasoning of LLMs.pdf:application/pdf},
	}

	@online{noauthor_ai-monuminamath-cot_2025,
	title = {{AI}-{MO}/{NuminaMath}-{CoT} · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/AI-MO/NuminaMath-CoT},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-01},
	date = {2025-02-25},
	}

	@online{noauthor_tekniumopenhermes-25_2024,
	title = {teknium/{OpenHermes}-2.5 · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/teknium/OpenHermes-2.5},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-01},
	date = {2024-05-20},
	file = {Snapshot:/Users/luis/Zotero/storage/7CIKXUCL/OpenHermes-2.html:text/html},
	}

	@online{noauthor_open-orcaopenorca_2024,
	title = {Open-Orca/{OpenOrca} · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/Open-Orca/OpenOrca},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-01},
	date = {2024-09-10},
	file = {Snapshot:/Users/luis/Zotero/storage/HXFU3Z6R/OpenOrca.html:text/html},
	}

	@misc{mitra_orca-math_2024,
	title = {Orca-Math: Unlocking the potential of {SLMs} in Grade School Math},
	url = {http://arxiv.org/abs/2402.14830},
	doi = {10.48550/arXiv.2402.14830},
	shorttitle = {Orca-Math},
	abstract = {Mathematical word problem-solving has long been recognized as a complex task for small language models ({SLMs}). A recent study hypothesized that the smallest model size, needed to achieve over 80\% accuracy on the {GSM}8K benchmark, is 34 billion parameters. To reach this level of performance with smaller models, researcher often train {SLMs} to generate Python code or use tools to help avoid calculation errors. Additionally, they employ ensembling, where outputs of up to 100 model runs are combined to arrive at a more accurate result. Result selection is done using consensus, majority vote or a separate a verifier model used in conjunction with the {SLM}. Ensembling provides a substantial boost in accuracy but at a significant cost increase with multiple calls to the model (e.g., Phi-{GSM} uses top-48 to boost the performance from 68.2 to 81.5). In this work, we present Orca-Math, a 7-billion-parameter {SLM} based on the Mistral-7B, which achieves 86.81\% on {GSM}8k without the need for multiple model calls or the use of verifiers, code execution or any other external tools. Our approach has the following key elements: (1) A high quality synthetic dataset of 200K math problems created using a multi-agent setup where agents collaborate to create the data, (2) An iterative learning techniques that enables the {SLM} to practice solving problems, receive feedback on its solutions and learn from preference pairs incorporating the {SLM} solutions and the feedback. When trained with Supervised Fine-Tuning alone, Orca-Math achieves 81.50\% on {GSM}8k pass@1 metric. With iterative preference learning, Orca-Math achieves 86.81\% pass@1. Orca-Math surpasses the performance of significantly larger models such as {LLAMA}-2-70B, {WizardMath}-70B, Gemini-Pro, {ChatGPT}-3.5. It also significantly outperforms other smaller models while using much smaller data (hundreds of thousands vs. millions of problems).},
	number = {{arXiv}:2402.14830},
	publisher = {{arXiv}},
	author = {Mitra, Arindam and Khanpour, Hamed and Rosset, Corby and Awadallah, Ahmed},
	urldate = {2025-09-01},
	date = {2024-02-16},
	eprinttype = {arxiv},
	eprint = {2402.14830 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/AVMME2BG/Mitra et al. - 2024 - Orca-Math Unlocking the potential of SLMs in Grade School Math.pdf:application/pdf},
	}

	@online{noauthor_flytechpython-codes-25k_2024,
	title = {flytech/python-codes-25k · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/flytech/python-codes-25k},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-01},
	date = {2024-08-30},
	file = {Snapshot:/Users/luis/Zotero/storage/IUUZFH77/python-codes-25k.html:text/html},
	}

	@online{noauthor_qywuruozhiba_en_nodate,
	title = {qywu/ruozhiba\_en · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/qywu/ruozhiba_en},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-01},
	file = {Snapshot:/Users/luis/Zotero/storage/SZ4UTDT9/ruozhiba_en.html:text/html},
	}

	@misc{chen_theoremqa_2023,
	title = {{TheoremQA}: A Theorem-driven Question Answering dataset},
	url = {http://arxiv.org/abs/2305.12524},
	doi = {10.48550/arXiv.2305.12524},
	shorttitle = {{TheoremQA}},
	abstract = {The recent {LLMs} like {GPT}-4 and {PaLM}-2 have made tremendous progress in solving fundamental math problems like {GSM}8K by achieving over 90\% accuracy. However, their capabilities to solve more challenging math problems which require domain-specific knowledge (i.e. theorem) have yet to be investigated. In this paper, we introduce {TheoremQA}, the first theorem-driven question-answering dataset designed to evaluate {AI} models' capabilities to apply theorems to solve challenging science problems. {TheoremQA} is curated by domain experts containing 800 high-quality questions covering 350 theorems (e.g. Taylor's theorem, Lagrange's theorem, Huffman coding, Quantum Theorem, Elasticity Theorem, etc) from Math, Physics, {EE}\&{CS}, and Finance. We evaluate a wide spectrum of 16 large language and code models with different prompting strategies like Chain-of-Thoughts and Program-of-Thoughts. We found that {GPT}-4's capabilities to solve these problems are unparalleled, achieving an accuracy of 51\% with Program-of-Thoughts Prompting. All the existing open-sourced models are below 15\%, barely surpassing the random-guess baseline. Given the diversity and broad coverage of {TheoremQA}, we believe it can be used as a better benchmark to evaluate {LLMs}' capabilities to solve challenging science problems. The data and code are released in https://github.com/wenhuchen/{TheoremQA}.},
	number = {{arXiv}:2305.12524},
	publisher = {{arXiv}},
	author = {Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
	urldate = {2025-09-01},
	date = {2023-12-06},
	eprinttype = {arxiv},
	eprint = {2305.12524 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/PSP5N9I2/Chen et al. - 2023 - TheoremQA A Theorem-driven Question Answering dataset.pdf:application/pdf},
	}

	@online{noauthor_wizardlmteamwizardlm_evol_instruct_70k_2024,
	title = {{WizardLMTeam}/{WizardLM}\_evol\_instruct\_70k · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_70k},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-01},
	date = {2024-03-05},
	file = {Snapshot:/Users/luis/Zotero/storage/WR9D4WRG/WizardLM_evol_instruct_70k.html:text/html},
	}

	@article{mollahosseini_affectnet_2017,
	title = {Affectnet: A database for facial expression, valence, and arousal computing in the wild},
	volume = {10},
	url = {https://ieeexplore.ieee.org/abstract/document/8013713/},
	shorttitle = {Affectnet},
	pages = {18--31},
	number = {1},
	journaltitle = {{IEEE} Transactions on Affective Computing},
	author = {Mollahosseini, Ali and Hasani, Behzad and Mahoor, Mohammad H.},
	urldate = {2025-09-02},
	date = {2017},
	note = {Publisher: {IEEE}},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/7MRUJI93/Mollahosseini et al. - 2017 - Affectnet A database for facial expression, valence, and arousal computing in the wild.pdf:application/pdf},
	}

	@online{noauthor_laiongpt4v-dataset_2023,
	title = {laion/gpt4v-dataset · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/laion/gpt4v-dataset},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-02},
	date = {2023-11-05},
	}

	@incollection{leonardis_sharegpt4v_2025,
	location = {Cham},
	title = {{ShareGPT}4V: Improving Large Multi-modal Models with Better Captions},
	volume = {15075},
	isbn = {978-3-031-72642-2 978-3-031-72643-9},
	url = {https://link.springer.com/10.1007/978-3-031-72643-9_22},
	shorttitle = {{ShareGPT}4V},
	pages = {370--387},
	booktitle = {Computer Vision – {ECCV} 2024},
	publisher = {Springer Nature Switzerland},
	author = {Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and He, Conghui and Wang, Jiaqi and Zhao, Feng and Lin, Dahua},
	editor = {Leonardis, Aleš and Ricci, Elisa and Roth, Stefan and Russakovsky, Olga and Sattler, Torsten and Varol, Gül},
	urldate = {2025-09-02},
	date = {2025},
	langid = {english},
	doi = {10.1007/978-3-031-72643-9_22},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/YJM33647/Chen et al. - 2025 - ShareGPT4V Improving Large Multi-modal Models with Better Captions.pdf:application/pdf},
	}

	@misc{shi_math-llava_2024,
	title = {Math-{LLaVA}: Bootstrapping Mathematical Reasoning for Multimodal Large Language Models},
	url = {http://arxiv.org/abs/2406.17294},
	doi = {10.48550/arXiv.2406.17294},
	shorttitle = {Math-{LLaVA}},
	abstract = {Large language models ({LLMs}) have demonstrated impressive reasoning capabilities, particularly in textual mathematical problem-solving. However, existing open-source image instruction fine-tuning datasets, containing limited question-answer pairs per image, do not fully exploit visual information to enhance the multimodal mathematical reasoning capabilities of Multimodal {LLMs} ({MLLMs}). To bridge this gap, we address the lack of high-quality, diverse multimodal mathematical datasets by collecting 40K high-quality images with question-answer pairs from 24 existing datasets and synthesizing 320K new pairs, creating the {MathV}360K dataset, which enhances both the breadth and depth of multimodal mathematical questions. We introduce Math-{LLaVA}, a {LLaVA}-1.5-based model fine-tuned with {MathV}360K. This novel approach significantly improves the multimodal mathematical reasoning capabilities of {LLaVA}-1.5, achieving a 19-point increase and comparable performance to {GPT}-4V on {MathVista}'s minitest split, and yielding leading performance on Math-V and {MathVerse}. Furthermore, Math-{LLaVA} demonstrates enhanced generalizability, showing substantial improvements on the {MMMU} benchmark. Our research highlights the importance of dataset diversity and synthesis in advancing {MLLMs}' mathematical reasoning abilities. The code and data are available at: {\textbackslash}url\{https://github.com/{HZQ}950419/Math-{LLaVA}\}.},
	number = {{arXiv}:2406.17294},
	publisher = {{arXiv}},
	author = {Shi, Wenhao and Hu, Zhiqiang and Bin, Yi and Liu, Junhua and Yang, Yang and Ng, See-Kiong and Bing, Lidong and Lee, Roy Ka-Wei},
	urldate = {2025-09-02},
	date = {2024-10-08},
	eprinttype = {arxiv},
	eprint = {2406.17294 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/PUM5TJXX/Shi et al. - 2024 - Math-LLaVA Bootstrapping Mathematical Reasoning for Multimodal Large Language Models.pdf:application/pdf},
	}

	@online{noauthor_lmms-labllava-onevision-data_2025,
	title = {lmms-lab/{LLaVA}-{OneVision}-Data · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-02},
	date = {2025-06-18},
	file = {Snapshot:/Users/luis/Zotero/storage/ZX8AWDL6/LLaVA-OneVision-Data.html:text/html},
	}

	@misc{nassar_smoldocling_2025,
	title = {{SmolDocling}: An ultra-compact vision-language model for end-to-end multi-modal document conversion},
	url = {http://arxiv.org/abs/2503.11576},
	doi = {10.48550/arXiv.2503.11576},
	shorttitle = {{SmolDocling}},
	abstract = {We introduce {SmolDocling}, an ultra-compact vision-language model targeting end-to-end document conversion. Our model comprehensively processes entire pages by generating {DocTags}, a new universal markup format that captures all page elements in their full context with location. Unlike existing approaches that rely on large foundational models, or ensemble solutions that rely on handcrafted pipelines of multiple specialized models, {SmolDocling} offers an end-to-end conversion for accurately capturing content, structure and spatial location of document elements in a 256M parameters vision-language model. {SmolDocling} exhibits robust performance in correctly reproducing document features such as code listings, tables, equations, charts, lists, and more across a diverse range of document types including business documents, academic papers, technical reports, patents, and forms -- significantly extending beyond the commonly observed focus on scientific papers. Additionally, we contribute novel publicly sourced datasets for charts, tables, equations, and code recognition. Experimental results demonstrate that {SmolDocling} competes with other Vision Language Models that are up to 27 times larger in size, while reducing computational requirements substantially. The model is currently available, datasets will be publicly available soon.},
	number = {{arXiv}:2503.11576},
	publisher = {{arXiv}},
	author = {Nassar, Ahmed and Marafioti, Andres and Omenetti, Matteo and Lysak, Maksym and Livathinos, Nikolaos and Auer, Christoph and Morin, Lucas and Lima, Rafael Teixeira de and Kim, Yusik and Gurbuz, A. Said and Dolfi, Michele and Farré, Miquel and Staar, Peter W. J.},
	urldate = {2025-09-02},
	date = {2025-03-14},
	eprinttype = {arxiv},
	eprint = {2503.11576 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/4DHRST9C/Nassar et al. - 2025 - SmolDocling An ultra-compact vision-language model for end-to-end multi-modal document conversion.pdf:application/pdf},
	}

	@online{noauthor_jp1924vqaonbd_nodate,
	title = {jp1924/{VQAonBD} · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/jp1924/VQAonBD},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-02},
	file = {Snapshot:/Users/luis/Zotero/storage/LIY4I9DV/VQAonBD.html:text/html},
	}

	@misc{shridhar_alfworld_2021,
	title = {{ALFWorld}: Aligning Text and Embodied Environments for Interactive Learning},
	url = {http://arxiv.org/abs/2010.03768},
	doi = {10.48550/arXiv.2010.03768},
	shorttitle = {{ALFWorld}},
	abstract = {Given a simple request like Put a washed apple in the kitchen fridge, humans can reason in purely abstract terms by imagining action sequences and scoring their likelihood of success, prototypicality, and efficiency, all without moving a muscle. Once we see the kitchen in question, we can update our abstract plans to fit the scene. Embodied agents require the same abilities, but existing work does not yet provide the infrastructure necessary for both reasoning abstractly and executing concretely. We address this limitation by introducing {ALFWorld}, a simulator that enables agents to learn abstract, text based policies in {TextWorld} (C{\textbackslash}{\textasciicircum}ot{\textbackslash}'e et al., 2018) and then execute goals from the {ALFRED} benchmark (Shridhar et al., 2020) in a rich visual environment. {ALFWorld} enables the creation of a new {BUTLER} agent whose abstract knowledge, learned in {TextWorld}, corresponds directly to concrete, visually grounded actions. In turn, as we demonstrate empirically, this fosters better agent generalization than training only in the visually grounded environment. {BUTLER}'s simple, modular design factors the problem to allow researchers to focus on models for improving every piece of the pipeline (language understanding, planning, navigation, and visual scene understanding).},
	number = {{arXiv}:2010.03768},
	publisher = {{arXiv}},
	author = {Shridhar, Mohit and Yuan, Xingdi and Côté, Marc-Alexandre and Bisk, Yonatan and Trischler, Adam and Hausknecht, Matthew},
	urldate = {2025-09-02},
	date = {2021-03-14},
	eprinttype = {arxiv},
	eprint = {2010.03768 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Robotics},
	file = {Preprint PDF:/Users/luis/Zotero/storage/YTHQNPD6/Shridhar et al. - 2021 - ALFWorld Aligning Text and Embodied Environments for Interactive Learning.pdf:application/pdf},
	}

	@online{noauthor_reilxchinese-meme-description-dataset_2024,
	title = {{REILX}/chinese-meme-description-dataset · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/REILX/chinese-meme-description-dataset},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-02},
	date = {2024-10-24},
	file = {Snapshot:/Users/luis/Zotero/storage/AVRVGLH5/chinese-meme-description-dataset.html:text/html},
	}

	@misc{li_mimic-it_2023,
	title = {{MIMIC}-{IT}: Multi-Modal In-Context Instruction Tuning},
	url = {http://arxiv.org/abs/2306.05425},
	doi = {10.48550/arXiv.2306.05425},
	shorttitle = {{MIMIC}-{IT}},
	abstract = {High-quality instructions and responses are essential for the zero-shot performance of large language models on interactive natural language tasks. For interactive vision-language tasks involving intricate visual scenes, a large quantity of diverse and creative instruction-response pairs should be imperative to tune vision-language models ({VLMs}). Nevertheless, the current availability of vision-language instruction-response pairs in terms of quantity, diversity, and creativity remains limited, posing challenges to the generalization of interactive {VLMs}. Here we present {MultI}-Modal In-Context Instruction Tuning ({MIMIC}-{IT}), a dataset comprising 2.8 million multimodal instruction-response pairs, with 2.2 million unique instructions derived from images and videos. Each pair is accompanied by multi-modal in-context information, forming conversational contexts aimed at empowering {VLMs} in perception, reasoning, and planning. The instruction-response collection process, dubbed as Syphus, is scaled using an automatic annotation pipeline that combines human expertise with {GPT}'s capabilities. Using the {MIMIC}-{IT} dataset, we train a large {VLM} named Otter. Based on extensive evaluations conducted on vision-language benchmarks, it has been observed that Otter demonstrates remarkable proficiency in multi-modal perception, reasoning, and in-context learning. Human evaluation reveals it effectively aligns with the user's intentions. We release the {MIMIC}-{IT} dataset, instruction-response collection pipeline, benchmarks, and the Otter model.},
	number = {{arXiv}:2306.05425},
	publisher = {{arXiv}},
	author = {Li, Bo and Zhang, Yuanhan and Chen, Liangyu and Wang, Jinghao and Pu, Fanyi and Yang, Jingkang and Li, Chunyuan and Liu, Ziwei},
	urldate = {2025-09-02},
	date = {2023-06-08},
	eprinttype = {arxiv},
	eprint = {2306.05425 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Human-Computer Interaction},
	file = {Preprint PDF:/Users/luis/Zotero/storage/KAMRH6GA/Li et al. - 2023 - MIMIC-IT Multi-Modal In-Context Instruction Tuning.pdf:application/pdf;Snapshot:/Users/luis/Zotero/storage/LURADRB6/2306.html:text/html},
	}

	@misc{xu_aguvis_2025,
	title = {Aguvis: Unified Pure Vision Agents for Autonomous {GUI} Interaction},
	url = {http://arxiv.org/abs/2412.04454},
	doi = {10.48550/arXiv.2412.04454},
	shorttitle = {Aguvis},
	abstract = {Automating {GUI} tasks remains challenging due to reliance on textual representations, platform-specific action spaces, and limited reasoning capabilities. We introduce Aguvis, a unified vision-based framework for autonomous {GUI} agents that directly operates on screen images, standardizes cross-platform interactions and incorporates structured reasoning via inner monologue. To enable this, we construct Aguvis Data Collection, a large-scale dataset with multimodal grounding and reasoning annotations, and develop a two-stage training pipeline that separates {GUI} grounding from planning and reasoning. Experiments show that Aguvis achieves state-of-the-art performance across offline and real-world online benchmarks, marking the first fully autonomous vision-based {GUI} agent that operates without closed-source models. We open-source all datasets, models, and training recipes at https://aguvis-project.github.io to advance future research.},
	number = {{arXiv}:2412.04454},
	publisher = {{arXiv}},
	author = {Xu, Yiheng and Wang, Zekun and Wang, Junli and Lu, Dunjie and Xie, Tianbao and Saha, Amrita and Sahoo, Doyen and Yu, Tao and Xiong, Caiming},
	urldate = {2025-09-02},
	date = {2025-05-05},
	eprinttype = {arxiv},
	eprint = {2412.04454 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/RXYFFLVR/Xu et al. - 2025 - Aguvis Unified Pure Vision Agents for Autonomous GUI Interaction.pdf:application/pdf},
	}

	@inproceedings{shao_objects365_2019,
	location = {Seoul, Korea (South)},
	title = {Objects365: A Large-Scale, High-Quality Dataset for Object Detection},
	rights = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/{IEEE}.html},
	isbn = {978-1-7281-4803-8},
	url = {https://ieeexplore.ieee.org/document/9009553/},
	doi = {10.1109/ICCV.2019.00852},
	shorttitle = {Objects365},
	abstract = {In this paper, we introduce a new large-scale object detection dataset, Objects365, which has 365 object categories over 600K training images. More than 10 million, high-quality bounding boxes are manually labeled through a three-step, carefully designed annotation pipeline. It is the largest object detection dataset (with full annotation) so far and establishes a more challenging benchmark for the community. Objects365 can serve as a better feature learning dataset for localization-sensitive tasks like object detection and semantic segmentation. The Objects365 pre-trained models signiﬁcantly outperform {ImageNet} pre-trained models with 5.6 points gain (42 vs 36.4) based on the standard setting of 90K iterations on {COCO} benchmark. Even compared with much long training time like 540K iterations, our Objects365 pretrained model with 90K iterations still have 2.7 points gain (42 vs 39.3). Meanwhile, the ﬁnetuning time can be greatly reduced (up to 10 times) when reaching the same accuracy. Better generalization ability of Object365 has also been veriﬁed on {CityPersons}, {VOC} segmentation, and {ADE} tasks. The dataset as well as the pretrainedmodels have been released at www.objects365.org.},
	eventtitle = {2019 {IEEE}/{CVF} International Conference on Computer Vision ({ICCV})},
	pages = {8429--8438},
	booktitle = {2019 {IEEE}/{CVF} International Conference on Computer Vision ({ICCV})},
	publisher = {{IEEE}},
	author = {Shao, Shuai and Li, Zeming and Zhang, Tianyuan and Peng, Chao and Yu, Gang and Zhang, Xiangyu and Li, Jing and Sun, Jian},
	urldate = {2025-09-02},
	date = {2019-10},
	langid = {english},
	file = {PDF:/Users/luis/Zotero/storage/KT28EV4H/Shao et al. - 2019 - Objects365 A Large-Scale, High-Quality Dataset for Object Detection.pdf:application/pdf},
	}

	@misc{lindstrom_clevr-math_2022-1,
	title = {{CLEVR}-Math: A Dataset for Compositional Language, Visual and Mathematical Reasoning},
	url = {http://arxiv.org/abs/2208.05358},
	doi = {10.48550/arXiv.2208.05358},
	shorttitle = {{CLEVR}-Math},
	abstract = {We introduce {CLEVR}-Math, a multi-modal math word problems dataset consisting of simple math word problems involving addition/subtraction, represented partly by a textual description and partly by an image illustrating the scenario. The text describes actions performed on the scene that is depicted in the image. Since the question posed may not be about the scene in the image, but about the state of the scene before or after the actions are applied, the solver envision or imagine the state changes due to these actions. Solving these word problems requires a combination of language, visual and mathematical reasoning. We apply state-of-the-art neural and neuro-symbolic models for visual question answering on {CLEVR}-Math and empirically evaluate their performances. Our results show how neither method generalise to chains of operations. We discuss the limitations of the two in addressing the task of multi-modal word problem solving.},
	number = {{arXiv}:2208.05358},
	publisher = {{arXiv}},
	author = {Lindström, Adam Dahlgren and Abraham, Savitha Sam},
	urldate = {2025-09-02},
	date = {2022-08-10},
	eprinttype = {arxiv},
	eprint = {2208.05358 [cs]},
	keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/WLYZ5DF3/Lindström and Abraham - 2022 - CLEVR-Math A Dataset for Compositional Language, Visual and Mathematical Reasoning.pdf:application/pdf},
	}

	@inproceedings{chng_icdar2019_2019,
	title = {Icdar2019 robust reading challenge on arbitrary-shaped text-rrc-art},
	url = {https://ieeexplore.ieee.org/abstract/document/8978157/},
	pages = {1571--1576},
	booktitle = {2019 International Conference on Document Analysis and Recognition ({ICDAR})},
	publisher = {{IEEE}},
	author = {Chng, Chee Kheng and Liu, Yuliang and Sun, Yipeng and Ng, Chun Chet and Luo, Canjie and Ni, Zihan and Fang, {ChuanMing} and Zhang, Shuaitao and Han, Junyu and Ding, Errui},
	urldate = {2025-09-02},
	date = {2019},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/3CLGX47M/Chng et al. - 2019 - Icdar2019 robust reading challenge on arbitrary-shaped text-rrc-art.pdf:application/pdf},
	}

	@misc{veit_coco-text_2016,
	title = {{COCO}-Text: Dataset and Benchmark for Text Detection and Recognition in Natural Images},
	url = {http://arxiv.org/abs/1601.07140},
	doi = {10.48550/arXiv.1601.07140},
	shorttitle = {{COCO}-Text},
	abstract = {This paper describes the {COCO}-Text dataset. In recent years large-scale datasets like {SUN} and Imagenet drove the advancement of scene understanding and object recognition. The goal of {COCO}-Text is to advance state-of-the-art in text detection and recognition in natural images. The dataset is based on the {MS} {COCO} dataset, which contains images of complex everyday scenes. The images were not collected with text in mind and thus contain a broad variety of text instances. To reflect the diversity of text in natural scenes, we annotate text with (a) location in terms of a bounding box, (b) fine-grained classification into machine printed text and handwritten text, (c) classification into legible and illegible text, (d) script of the text and (e) transcriptions of legible text. The dataset contains over 173k text annotations in over 63k images. We provide a statistical analysis of the accuracy of our annotations. In addition, we present an analysis of three leading state-of-the-art photo Optical Character Recognition ({OCR}) approaches on our dataset. While scene text detection and recognition enjoys strong advances in recent years, we identify significant shortcomings motivating future work.},
	number = {{arXiv}:1601.07140},
	publisher = {{arXiv}},
	author = {Veit, Andreas and Matera, Tomas and Neumann, Lukas and Matas, Jiri and Belongie, Serge},
	urldate = {2025-09-02},
	date = {2016-06-19},
	eprinttype = {arxiv},
	eprint = {1601.07140 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {Preprint PDF:/Users/luis/Zotero/storage/XWFKBJR7/Veit et al. - 2016 - COCO-Text Dataset and Benchmark for Text Detection and Recognition in Natural Images.pdf:application/pdf},
	}

	@article{yuan_large_2019,
	title = {A Large Chinese Text Dataset in the Wild},
	volume = {34},
	issn = {1000-9000, 1860-4749},
	url = {http://link.springer.com/10.1007/s11390-019-1923-y},
	doi = {10.1007/s11390-019-1923-y},
	pages = {509--521},
	number = {3},
	journaltitle = {Journal of Computer Science and Technology},
	shortjournal = {J. Comput. Sci. Technol.},
	author = {Yuan, Tai-Ling and Zhu, Zhe and Xu, Kun and Li, Cheng-Jun and Mu, Tai-Jiang and Hu, Shi-Min},
	urldate = {2025-09-02},
	date = {2019-05},
	langid = {english},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/IHYU76H4/Yuan et al. - 2019 - A Large Chinese Text Dataset in the Wild.pdf:application/pdf},
	}

	@inproceedings{gervais_mathwriting_2025,
	location = {Toronto {ON} Canada},
	title = {{MathWriting}: A Dataset For Handwritten Mathematical Expression Recognition},
	isbn = {979-8-4007-1454-2},
	url = {https://dl.acm.org/doi/10.1145/3711896.3737436},
	doi = {10.1145/3711896.3737436},
	shorttitle = {{MathWriting}},
	eventtitle = {{KDD} '25: The 31st {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining},
	pages = {5459--5469},
	booktitle = {Proceedings of the 31st {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining V.2},
	publisher = {{ACM}},
	author = {Gervais, Philippe and Fadeeva, Anastasiia and Maksai, Andrii},
	urldate = {2025-09-02},
	date = {2025-08-03},
	langid = {english},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/HDVJVBH3/Gervais et al. - 2025 - MathWriting A Dataset For Handwritten Mathematical Expression Recognition.pdf:application/pdf},
	}

	@online{noauthor_im2latex_nodate,
	title = {Im2Latex},
	url = {https://sujayr91.github.io/Im2Latex/},
	urldate = {2025-09-02},
	file = {Im2Latex:/Users/luis/Zotero/storage/WHR6ZQ9Z/Im2Latex.html:text/html},
	}

	@incollection{fink_icdar_2023,
	location = {Cham},
	title = {{ICDAR} 2023 Competition on Structured Text Extraction from Visually-Rich Document Images},
	volume = {14188},
	isbn = {978-3-031-41678-1 978-3-031-41679-8},
	url = {https://link.springer.com/10.1007/978-3-031-41679-8_32},
	pages = {536--552},
	booktitle = {Document Analysis and Recognition - {ICDAR} 2023},
	publisher = {Springer Nature Switzerland},
	author = {Yu, Wenwen and Zhang, Chengquan and Cao, Haoyu and Hua, Wei and Li, Bohan and Chen, Huang and Liu, Mingyu and Chen, Mingrui and Kuang, Jianfeng and Cheng, Mengjun and Du, Yuning and Feng, Shikun and Hu, Xiaoguang and Lyu, Pengyuan and Yao, Kun and Yu, Yuechen and Liu, Yuliang and Che, Wanxiang and Ding, Errui and Liu, Cheng-Lin and Luo, Jiebo and Yan, Shuicheng and Zhang, Min and Karatzas, Dimosthenis and Sun, Xing and Wang, Jingdong and Bai, Xiang},
	editor = {Fink, Gernot A. and Jain, Rajiv and Kise, Koichi and Zanibbi, Richard},
	urldate = {2025-09-02},
	date = {2023},
	langid = {english},
	doi = {10.1007/978-3-031-41679-8_32},
	note = {Series Title: Lecture Notes in Computer Science},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/AKRGINTT/Yu et al. - 2023 - ICDAR 2023 Competition on Structured Text Extraction from Visually-Rich Document Images.pdf:application/pdf},
	}

	@online{noauthor_httpsai100talcomdataset_nodate,
	title = {https://ai.100tal.com/dataset},
	url = {https://ai.100tal.com/dataset},
	shorttitle = {https},
	abstract = {好未来{AI开放平台}，依托多年教育行业经验与海量行业数据优势，深耕教育领域人工智能技术创新，为广大教育行业伙伴提供领先的{AI能力与解决方案}，助力教育智能化发展},
	urldate = {2025-09-02},
	langid = {english},
	file = {Snapshot:/Users/luis/Zotero/storage/P7KI4UWZ/dataset.html:text/html},
	}

	@misc{poznanski_olmocr_2025,
	title = {{olmOCR}: Unlocking Trillions of Tokens in {PDFs} with Vision Language Models},
	url = {http://arxiv.org/abs/2502.18443},
	doi = {10.48550/arXiv.2502.18443},
	shorttitle = {{olmOCR}},
	abstract = {{PDF} documents have the potential to provide trillions of novel, high-quality tokens for training language models. However, these documents come in a diversity of types with differing formats and visual layouts that pose a challenge when attempting to extract and faithfully represent the underlying content for language model use. Traditional open source tools often produce lower quality extractions compared to vision language models ({VLMs}), but reliance on the best {VLMs} can be prohibitively costly (e.g., over 6,240 {USD} per million {PDF} pages for {GPT}-4o) or infeasible if the {PDFs} cannot be sent to proprietary {APIs}. We present {olmOCR}, an open-source toolkit for processing {PDFs} into clean, linearized plain text in natural reading order while preserving structured content like sections, tables, lists, equations, and more. Our toolkit runs a fine-tuned 7B vision language model ({VLM}) trained on {olmOCR}-mix-0225, a sample of 260,000 pages from over 100,000 crawled {PDFs} with diverse properties, including graphics, handwritten text and poor quality scans. {olmOCR} is optimized for large-scale batch processing, able to scale flexibly to different hardware setups and can convert a million {PDF} pages for only 176 {USD}. To aid comparison with existing systems, we also introduce {olmOCR}-Bench, a curated set of 1,400 {PDFs} capturing many content types that remain challenging even for the best tools and {VLMs}, including formulas, tables, tiny fonts, old scans, and more. We find {olmOCR} outperforms even top {VLMs} including {GPT}-4o, Gemini Flash 2 and Qwen-2.5-{VL}. We openly release all components of {olmOCR}: our fine-tuned {VLM} model, training code and data, an efficient inference pipeline that supports {vLLM} and {SGLang} backends, and benchmark {olmOCR}-Bench.},
	number = {{arXiv}:2502.18443},
	publisher = {{arXiv}},
	author = {Poznanski, Jake and Rangapur, Aman and Borchardt, Jon and Dunkelberger, Jason and Huff, Regan and Lin, Daniel and Rangapur, Aman and Wilhelm, Christopher and Lo, Kyle and Soldaini, Luca},
	urldate = {2025-09-02},
	date = {2025-07-02},
	eprinttype = {arxiv},
	eprint = {2502.18443 [cs]},
	keywords = {Computer Science - Computation and Language},
	file = {Preprint PDF:/Users/luis/Zotero/storage/7A9GRHK4/Poznanski et al. - 2025 - olmOCR Unlocking Trillions of Tokens in PDFs with Vision Language Models.pdf:application/pdf},
	}

	@inproceedings{bhushan_block_2022,
	title = {Block diagram-to-text: Understanding block diagram images by generating natural language descriptors},
	url = {https://aclanthology.org/2022.findings-aacl.15/},
	shorttitle = {Block diagram-to-text},
	pages = {153--168},
	booktitle = {Findings of the Association for Computational Linguistics: {AACL}-{IJCNLP} 2022},
	author = {Bhushan, Shreyanshu and Lee, Minho},
	urldate = {2025-09-02},
	date = {2022},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/ZZXQWUJE/Bhushan and Lee - 2022 - Block diagram-to-text Understanding block diagram images by generating natural language descriptors.pdf:application/pdf},
	}

	@online{noauthor_ifthandwriting_forms_nodate,
	title = {ift/handwriting\_forms · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/ift/handwriting_forms},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-02},
	file = {Snapshot:/Users/luis/Zotero/storage/3GQGDS9G/handwriting_forms.html:text/html},
	}

	@inproceedings{mathew_infographicvqa_2022-1,
	title = {Infographicvqa},
	url = {http://openaccess.thecvf.com/content/WACV2022/html/Mathew_InfographicVQA_WACV_2022_paper.html},
	pages = {1697--1706},
	booktitle = {Proceedings of the {IEEE}/{CVF} Winter Conference on Applications of Computer Vision},
	author = {Mathew, Minesh and Bagal, Viraj and Tito, Rubèn and Karatzas, Dimosthenis and Valveny, Ernest and Jawahar, C. V.},
	urldate = {2025-09-02},
	date = {2022},
	file = {Available Version (via Google Scholar):/Users/luis/Zotero/storage/SMXT36SE/Mathew et al. - 2022 - Infographicvqa.pdf:application/pdf},
	}

	@online{noauthor_mychen76invoices-and-receipts_ocr_v1_2025,
	title = {mychen76/invoices-and-receipts\_ocr\_v1 · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/mychen76/invoices-and-receipts_ocr_v1},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-02},
	date = {2025-07-31},
	file = {Snapshot:/Users/luis/Zotero/storage/CLSH4UY2/invoices-and-receipts_ocr_v1.html:text/html},
	}

	@inproceedings{sharma_task_2020,
	title = {Task report: Memotion analysis 1.0@ semeval 2020: The visuo-lingual metaphor},
	shorttitle = {Task report},
	pages = {759--773},
	booktitle = {Proceedings of the 14th International Workshop on Semantic Evaluation ({SemEval}-2020), Barcelona, Spain, Sep. Association for Computational Linguistics},
	author = {Sharma, Chhavi and Paka, William and Scott, Deepesh Bhageria and Das, Amitava and Poria, Soujanya and Chakraborty, Tanmoy and Gambäck, Björn},
	date = {2020},
	}

	@online{noauthor_anditoai2d-merged_nodate,
	title = {andito/ai2d-merged · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/andito/ai2d-merged},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-02},
	file = {Snapshot:/Users/luis/Zotero/storage/ZSP5BNSW/ai2d-merged.html:text/html},
	}

	@misc{zheng_opencodeinterpreter_2025-1,
	title = {{OpenCodeInterpreter}: Integrating Code Generation with Execution and Refinement},
	url = {http://arxiv.org/abs/2402.14658},
	doi = {10.48550/arXiv.2402.14658},
	shorttitle = {{OpenCodeInterpreter}},
	abstract = {The introduction of large language models has significantly advanced code generation. However, open-source models often lack the execution capabilities and iterative refinement of advanced systems like the {GPT}-4 Code Interpreter. To address this, we introduce {OpenCodeInterpreter}, a family of open-source code systems designed for generating, executing, and iteratively refining code. Supported by Code-Feedback, a dataset featuring 68K multi-turn interactions, {OpenCodeInterpreter} integrates execution and human feedback for dynamic code refinement. Our comprehensive evaluation of {OpenCodeInterpreter} across key benchmarks such as {HumanEval}, {MBPP}, and their enhanced versions from {EvalPlus} reveals its exceptional performance. Notably, {OpenCodeInterpreter}-33B achieves an accuracy of 83.2 (76.4) on the average (and plus versions) of {HumanEval} and {MBPP}, closely rivaling {GPT}-4's 84.2 (76.2) and further elevates to 91.6 (84.6) with synthesized human feedback from {GPT}-4. {OpenCodeInterpreter} brings the gap between open-source code generation models and proprietary systems like {GPT}-4 Code Interpreter.},
	number = {{arXiv}:2402.14658},
	publisher = {{arXiv}},
	author = {Zheng, Tianyu and Zhang, Ge and Shen, Tianhao and Liu, Xueling and Lin, Bill Yuchen and Fu, Jie and Chen, Wenhu and Yue, Xiang},
	urldate = {2025-09-02},
	date = {2025-01-07},
	eprinttype = {arxiv},
	eprint = {2402.14658 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Software Engineering},
	file = {Preprint PDF:/Users/luis/Zotero/storage/XWTXC8YA/Zheng et al. - 2025 - OpenCodeInterpreter Integrating Code Generation with Execution and Refinement.pdf:application/pdf},
	}

	@online{noauthor_sahil2801codealpaca-20k_2023,
	title = {sahil2801/{CodeAlpaca}-20k · Datasets at Hugging Face},
	url = {https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k},
	abstract = {We’re on a journey to advance and democratize artificial intelligence through open source and open science.},
	urldate = {2025-09-02},
	date = {2023-06-13},
	file = {Snapshot:/Users/luis/Zotero/storage/N3TPDDL8/CodeAlpaca-20k.html:text/html},
	}

	@misc{toshniwal_openmathinstruct-2_2024,
	title = {{OpenMathInstruct}-2: Accelerating {AI} for Math with Massive Open-Source Instruction Data},
	url = {http://arxiv.org/abs/2410.01560},
	doi = {10.48550/arXiv.2410.01560},
	shorttitle = {{OpenMathInstruct}-2},
	abstract = {Mathematical reasoning continues to be a critical challenge in large language model ({LLM}) development with significant interest. However, most of the cutting-edge progress in mathematical reasoning with {LLMs} has become {\textbackslash}emph\{closed-source\} due to lack of access to training data. This lack of data access limits researchers from understanding the impact of different choices for synthesizing and utilizing the data. With the goal of creating a high-quality finetuning ({SFT}) dataset for math reasoning, we conduct careful ablation experiments on data synthesis using the recently released {\textbackslash}texttt\{Llama3.1\} family of models. Our experiments show that: (a) solution format matters, with excessively verbose solutions proving detrimental to {SFT} performance, (b) data generated by a strong teacher outperforms equally-sized data generated by a weak student model, (c) {SFT} is robust to low-quality solutions, allowing for imprecise data filtering, and (d) question diversity is crucial for achieving data scaling gains. Based on these insights, we create the {OpenMathInstruct}-2 dataset, which consists of 14M question-solution pairs (\${\textbackslash}approx\$ 600K unique questions), making it nearly eight times larger than the previous largest open-source math reasoning dataset. Finetuning the {\textbackslash}texttt\{Llama-3.1-8B-Base\} using {OpenMathInstruct}-2 outperforms {\textbackslash}texttt\{Llama3.1-8B-Instruct\} on {MATH} by an absolute 15.9{\textbackslash}\% (51.9{\textbackslash}\% \${\textbackslash}rightarrow\$ 67.8{\textbackslash}\%). Finally, to accelerate the open-source efforts, we release the code, the finetuned models, and the {OpenMathInstruct}-2 dataset under a commercially permissive license.},
	number = {{arXiv}:2410.01560},
	publisher = {{arXiv}},
	author = {Toshniwal, Shubham and Du, Wei and Moshkov, Ivan and Kisacanin, Branislav and Ayrapetyan, Alexan and Gitman, Igor},
	urldate = {2025-09-02},
	date = {2024-10-05},
	eprinttype = {arxiv},
	eprint = {2410.01560 [cs]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Machine Learning},
	file = {Preprint PDF:/Users/luis/Zotero/storage/5JNXNX76/Toshniwal et al. - 2024 - OpenMathInstruct-2 Accelerating AI for Math with Massive Open-Source Instruction Data.pdf:application/pdf},
	}

	@misc{weyand2020googlelandmarksdatasetv2,
	title={Google Landmarks Dataset v2 -- A Large-Scale Benchmark for Instance-Level Recognition and Retrieval},
	author={Tobias Weyand and Andre Araujo and Bingyi Cao and Jack Sim},
	year={2020},
	eprint={2004.01804},
	archivePrefix={arXiv},
	primaryClass={cs.CV},
	url={https://arxiv.org/abs/2004.01804},
	}

	@misc{li2025eagle2buildingposttraining,
	title={Eagle 2: Building Post-Training Data Strategies from Scratch for Frontier Vision-Language Models},
	author={Zhiqi Li and Guo Chen and Shilong Liu and Shihao Wang and Vibashan VS and Yishen Ji and Shiyi Lan and Hao Zhang and Yilin Zhao and Subhashree Radhakrishnan and Nadine Chang and Karan Sapra and Amala Sanjay Deshmukh and Tuomas Rintamaki and Matthieu Le and Ilia Karmanov and Lukas Voegtle and Philipp Fischer and De-An Huang and Timo Roman and Tong Lu and Jose M. Alvarez and Bryan Catanzaro and Jan Kautz and Andrew Tao and Guilin Liu and Zhiding Yu},
	year={2025},
	eprint={2501.14818},
	archivePrefix={arXiv},
	primaryClass={cs.CV},
	url={https://arxiv.org/abs/2501.14818},
	}