@unpublished{shi2025maestro,
 abstract = {Today's best-explored routes towards generalist robots center on collecting ever larger "observations-in, actions-out" robotics datasets to train large end-to-end models, copying a recipe that has worked for vision-language models (VLMs). We pursue a road less traveled: building generalist policies directly around VLMs by augmenting their general capabilities with specific robot capabilities encapsulated in a carefully curated set of perception, planning, and control modules. In Maestro, a VLM coding agent dynamically composes these modules into a programmatic policy for the current task and scenario. Maestro's architecture benefits from a streamlined closed-loop interface without many manually imposed structural constraints, and a comprehensive and diverse tool repertoire. As a result, it largely surpasses today's VLA models in zero-shot performance on challenging manipulation skills. Further, Maestro is easily extensible to incorporate new modules, easily editable to suit new embodiments such as a quadruped-mounted arm, and even easily adapts from minimal real-world experience through local code edits.},
 author = {Junyao Shi and Rujia Yang and Kaitian Chao and Selina Bingqing Wan and Yifei Shao and Jiahui Lei and Jianing Qian and Long Le and Pratik Chaudhari and Kostas Daniilidis and Chuan Wen and Dinesh Jayaraman},
 note = {Under review},
 title = {Maestro: Orchestrating Robotics Modules with Vision-Language Models for Zero-Shot Generalist Robots},
 year = {2025}
}
