@inproceedings{shi2024plug,
 abstract = {There have recently been large advances both in
pre-training visual representations for robotic control and
in segmenting unknown-category objects in general images. To
leverage these for improved robot learning, we propose
POCR, a new framework for building pre-trained
object-centric representations (OCR) for robotic control.
Building on theories of "what-where" representations in
psychology and computer vision, we use segmentations from a
pre-trained model to stably locate the various entities in
the scene across timesteps, capturing "where" information.
To each such segmented entity, we apply other pre-trained
models that build vector descriptions suitable for robotic
control tasks, thus capturing "what" the entity is. Our OCR
for control is therefore constructed by appropriately
combining the outputs of off-the-shelf pre-trained models,
with no new training. On various simulated and real robotic
tasks, we show that imitation policies for robotic
manipulators trained on our OCR achieve better performance
and systematic generalization than state-of-the-art
pre-trained representations for robotics, as well as prior
OCRs that are typically trained from scratch.},
 author = {Shi*, Junyao and Qian*, Jianing and Ma, Yecheng Jason and Jayaraman, Dinesh},
 booktitle = {IEEE International Conference on Robotics and Automation (ICRA)},
 title = {Plug-And-Play Object-Centric Representations From “What” and “Where” Foundation Models},
 url = {https://sites.google.com/view/pocr},
 year = {2024}
}
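
To make the abstract's "what-where" recipe concrete, the following is a minimal Python sketch of how such a plug-and-play composition could look; it is not from the paper's code release. segment_entities and encode_patch are hypothetical stand-ins for the off-the-shelf pre-trained segmenter ("where") and encoder ("what") the paper composes; their names and the toy logic inside them are assumptions for illustration only.

# Minimal sketch of a "what-where" object-centric representation:
# a segmenter yields per-entity masks kept in a fixed slot order across
# timesteps ("where"); a frozen encoder describes each masked entity
# ("what"); per-slot [what ; where] vectors are concatenated into one
# feature for an imitation policy. Both models here are toy placeholders.

import numpy as np

def segment_entities(image: np.ndarray, num_slots: int) -> list[np.ndarray]:
    """Stand-in for a pre-trained segmenter: one boolean mask per entity.
    Placeholder logic: split the image into vertical strips, one per slot."""
    h, w, _ = image.shape
    masks = []
    for i in range(num_slots):
        mask = np.zeros((h, w), dtype=bool)
        mask[:, i * w // num_slots:(i + 1) * w // num_slots] = True
        masks.append(mask)
    return masks

def encode_patch(image: np.ndarray, mask: np.ndarray) -> np.ndarray:
    """Stand-in for a frozen pre-trained encoder applied to one masked
    entity. Placeholder logic: mean color of the masked region."""
    pixels = image[mask]
    return pixels.mean(axis=0) if len(pixels) else np.zeros(image.shape[-1])

def pocr_like_representation(image: np.ndarray, num_slots: int = 4) -> np.ndarray:
    """Per-slot [what ; where] vectors, flattened into one control feature.
    "where" is summarized here by the normalized mask centroid and area."""
    h, w, _ = image.shape
    slots = []
    for mask in segment_entities(image, num_slots):
        what = encode_patch(image, mask)
        ys, xs = np.nonzero(mask)
        if len(ys):
            where = np.array([ys.mean() / h, xs.mean() / w, mask.mean()])
        else:
            where = np.zeros(3)
        slots.append(np.concatenate([what, where]))
    return np.concatenate(slots)

# Example: a random RGB frame mapped to a fixed-length object-centric feature.
frame = np.random.rand(64, 64, 3).astype(np.float32)
feat = pocr_like_representation(frame)
print(feat.shape)  # (4 slots * (3 "what" + 3 "where"),) = (24,)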

