% arXiv preprint (cs.CV). Braced words in the title keep their capitalisation
% under styles that convert titles to sentence case.
@misc{kuo2026vlmsneedvisiontransformers,
  author        = {Kuo, Shang-Jui Ray and Cascante-Bonilla, Paola},
  title         = {Do {VLMs} Need Vision Transformers? {Evaluating} State Space Models as Vision Encoders},
  year          = {2026},
  eprint        = {2603.19209},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2603.19209},
}