---
---
@article{gao2024intentionnet,
title={IntentionNet: Map-Lite Visual Navigation at the Kilometre Scale},
author={Wei Gao and Bo Ai and Joel Loo and Vinay and David Hsu},
year={2024},
journal={arXiv},
url={https://arxiv.org/abs/2407.03122},
pdf={https://arxiv.org/abs/2407.03122},
selected={true},
preview={2024ijrr-kilo-nav-15x-480p.gif},
abstract={This work explores the challenges of creating a scalable and robust robot navigation system that can traverse both indoor and outdoor environments to reach distant goals. We propose a navigation system architecture called IntentionNet that employs a monolithic neural network as the low-level planner/controller, and uses a general interface that we call intentions to steer the controller. The paper proposes two types of intentions, Local Path and Environment (LPE) and Discretised Local Move (DLM), and shows that DLM is robust to significant metric positioning and mapping errors. The paper also presents Kilo-IntentionNet, an instance of the IntentionNet system using the DLM intention that is deployed on a Boston Dynamics Spot robot, and which successfully navigates through complex indoor and outdoor environments over distances of up to a kilometre with only noisy odometry.}
}
@article{DBLP:journals/corr/abs-2408-15903,
author = {Ruirui Chen and
Weifeng Jiang and
Chengwei Qin and
Ishaan Singh Rawal and
Cheston Tan and
Dongkyu Choi and
Bo Xiong and
Bo Ai},
title = {LLM-Based Multi-Hop Question Answering with Knowledge Graph Integration
in Evolving Environments},
journal = {EMNLP Findings},
year = {2024},
url = {https://doi.org/10.48550/arXiv.2408.15903},
pdf = {https://doi.org/10.48550/arXiv.2408.15903},
doi = {10.48550/ARXIV.2408.15903},
timestamp = {Sat, 28 Sep 2024 20:40:09 +0200},
preview = {2024EMNLP-GMeLLo.jpg},
}
@inproceedings{ai2024robopack,
title={RoboPack: Learning Tactile-Informed Dynamics Models for Dense Packing},
author={Bo Ai* and Stephen Tian* and Haochen Shi and Yixuan Wang and Cheston Tan and Yunzhu Li and Jiajun Wu},
booktitle={Robotics: Science and Systems (RSS)},
year={2024},
url={https://arxiv.org/abs/2407.01418},
bibtex_show={false},
pdf={https://arxiv.org/abs/2407.01418},
selected={true},
preview={2024RSS-RoboPack.gif},
html={https://robo-pack.github.io/},
note={Abridged in ICRA 2024 workshops
[ViTac](https://shanluo.github.io/ViTacWorkshops/),
[3DVRM](https://3d-manipulation-workshop.github.io/),
[Future Roadmap for Sensorimotor Skills](https://icra-manipulation-skill.github.io/), and RSS 2024 workshop
[Priors4Robots](https://sites.google.com/alora.tech/priors4robots24).},
abstract={Tactile feedback is critical for understanding the dynamics of both rigid and deformable objects in many manipulation tasks, such as non-prehensile manipulation and dense packing. We introduce an approach that combines visual and tactile sensing for robotic manipulation by learning a neural, tactile-informed dynamics model. Our proposed framework, RoboPack, employs a recurrent graph neural network to estimate object states, including particles and object-level latent physics information, from historical visuo-tactile observations and to perform future state predictions. Our tactile-informed dynamics model, learned from real-world data, can solve downstream robotics tasks with model-predictive control. We demonstrate our approach on a real robot equipped with a compliant Soft-Bubble tactile sensor on non-prehensile manipulation and dense packing tasks, where the robot must infer the physics properties of objects from direct and indirect interactions. Trained on only an average of 30 minutes of real-world interaction data per task, our model can perform online adaptation and make touch-informed predictions. Through extensive evaluations in both long-horizon dynamics prediction and real-world manipulation, our method demonstrates superior effectiveness compared to previous learning-based and physics-based simulation systems.}
}
@inproceedings{ai2023invariance,
author="Ai, Bo
and Wu, Zhanxin
and Hsu, David",
editor="Ang Jr, Marcelo H.
and Khatib, Oussama",
title="Invariance is Key to Generalization: Examining the Role of Representation in Sim-to-Real Transfer for Visual Navigation",
booktitle="International Symposium on Experimental Robotics (ISER)",
year="2023",
publisher="Springer Nature Switzerland",
address="Cham",
pages="69--80",
abstract="The data-driven approach to robot control has been gathering pace rapidly, yet generalization to unseen task domains remains a critical challenge. We argue that the key to generalization is representations that are (i) rich enough to capture all task-relevant information and (ii) invariant to superfluous variability between the training and the test domains. We experimentally study such a representation---containing both depth and semantic information---for visual navigation and show that it enables a control policy trained entirely in simulated indoor scenes to generalize to diverse real-world environments, both indoors and outdoors. Further, we show that our representation reduces the A-distance between the training and test domains, improving the generalization error bound as a result. Our proposed approach is scalable: the learned policy improves continuously, as the foundation models that it exploits absorb more diverse data during pre-training.",
isbn="978-3-031-63596-0",
pdf={https://arxiv.org/abs/2310.15020},
selected={true},
preview={2023ISER-SEER.gif},
note={Published within [Springer Proceedings in Advanced Robotics (SPAR)](https://link.springer.com/chapter/10.1007/978-3-031-63596-0_7).}
}
@inproceedings{wu2023integrating,
title={Integrating Common Sense and Planning with Large Language Models for Room Tidying},
author={Zhanxin Wu and Bo Ai and David Hsu},
booktitle={RSS 2023 Workshop on Learning for Task and Motion Planning},
year={2023},
url={https://openreview.net/forum?id=vuSI9mhDaBZ},
bibtex_show={false},
pdf={https://openreview.net/forum?id=vuSI9mhDaBZ},
preview={2023RSSW-TAMP.gif},
abstract={Do you want a personal housekeeper robot? This project seeks to endow robots with the capability of tidying up messy rooms with brief natural language descriptions of the environment. We address three key challenges: (i) incomplete map information in the description, (ii) commonsense understanding of object locations, and (iii) long-horizon planning and acting to achieve the objective. To tackle these challenges, we leverage Large Language Models' (LLMs) understanding of typical layouts of human-living environments and object locations, as well as programming and control skills for action execution. Specifically, we prompt ChatGPT to reconstruct complete map representations from partial descriptions, then generate a high-level action plan in the form of Python functions, and finally refine the plans into atomic actions executable by the robot. We show that our framework enables effective room rearrangement with limited human instruction guidance. On simulation and real-world maps, it is able to locate a place missing from the human description within three interactions with humans. In the simulation environment, it is capable of putting more than 80\% of household objects in their desired places. This study provides preliminary evidence that LLMs have common sense about the spatial layout of human-living environments and object arrangements, and connects this knowledge to robotics tasks.}
}
@inproceedings{ai2022deep,
author = {Bo Ai and
Wei Gao and
Vinay and
David Hsu},
title = {Deep Visual Navigation under Partial Observability},
booktitle = {International Conference on Robotics and Automation (ICRA)},
pages = {9439--9446},
publisher = {{IEEE}},
year = {2022},
url = {https://doi.org/10.1109/ICRA46639.2022.9811598},
doi = {10.1109/ICRA46639.2022.9811598},
timestamp = {Mon, 04 Dec 2023 21:29:46 +0100},
biburl = {https://dblp.org/rec/conf/icra/AiGVH22.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
bibtex_show={false},
pdf={https://arxiv.org/abs/2109.07752},
html={https://adacomp.comp.nus.edu.sg/inet/},
selected={true},
preview={2022ICRA-DECISION.gif},
abstract={How can a robot navigate successfully in rich and diverse environments, indoors or outdoors, along office corridors or trails on the grassland, on the flat ground or the staircase? To this end, this work aims to address three challenges: (i) complex visual observations, (ii) partial observability of local visual sensing, and (iii) multimodal robot behaviors conditioned on both the local environment and the global navigation objective. We propose to train a neural network (NN) controller for local navigation via imitation learning. To tackle complex visual observations, we extract multi-scale spatial representations through CNNs. To tackle partial observability, we aggregate multi-scale spatial information over time and encode it in LSTMs. To learn multimodal behaviors, we use a separate memory module for each behavior mode. Importantly, we integrate the multiple neural network modules into a unified controller that achieves robust performance for visual navigation in complex, partially observable environments. We implemented the controller on the quadrupedal Spot robot and evaluated it on three challenging tasks: adversarial pedestrian avoidance, blind-spot obstacle avoidance, and elevator riding. The experiments show that the proposed NN architecture significantly improves navigation performance.}
}
@inproceedings{ai2022whodunit,
author = {Bo Ai and
Yuchen Wang and
Yugin Tan and
Samson Tan},
editor = {Yulan He and
Heng Ji and
Yang Liu and
Sujian Li and
Chia{-}Hui Chang and
Soujanya Poria and
Chenghua Lin and
Wray L. Buntine and
Maria Liakata and
Hanqi Yan and
Zonghan Yan and
Sebastian Ruder and
Xiaojun Wan and
Miguel Arana{-}Catania and
Zhongyu Wei and
Hen{-}Hsen Huang and
Jheng{-}Long Wu and
Min{-}Yuh Day and
Pengfei Liu and
Ruifeng Xu},
title = {Whodunit? Learning to Contrast for Authorship Attribution},
booktitle = {International Joint Conference on Natural Language Processing (IJCNLP)},
pages = {1142--1157},
publisher = {Association for Computational Linguistics},
year = {2022},
url = {https://aclanthology.org/2022.aacl-main.84},
timestamp = {Tue, 29 Nov 2022 14:53:03 +0100},
biburl = {https://dblp.org/rec/conf/ijcnlp/AiWTT22.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
bibtex_show={false},
pdf={https://aclanthology.org/2022.aacl-main.84},
preview={2022IJCNLP-Contrax.png},
abstract={Authorship attribution is the task of identifying the author of a given text. The key is finding representations that can differentiate between authors. Existing approaches typically use manually designed features that capture a dataset's content and style, but these approaches are dataset-dependent and yield inconsistent performance across corpora. In this work, we propose \textit{learning} author-specific representations by fine-tuning pre-trained generic language representations with a contrastive objective (Contra-X). We show that Contra-X learns representations that form highly separable clusters for different authors. It advances the state-of-the-art on multiple human and machine authorship attribution benchmarks, enabling improvements of up to 6.8% over cross-entropy fine-tuning. However, we find that Contra-X improves overall accuracy at the cost of sacrificing performance for some authors. Resolving this tension will be an important direction for future work. To the best of our knowledge, we are the first to integrate contrastive learning with pre-trained language model fine-tuning for authorship attribution.}
}