index.html

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description"
        content="Foundation models-based deadlock resolution">
  <meta name="keywords" content="Multi-agent systems; LLMs; VLMs; Safe control; Control barrier functions; Graph neural networks">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Foundation models-based deadlock resolution</title>

  <!-- Web fonts. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <!-- Bulma framework, icon fonts, and the page's own stylesheet. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <!-- <link rel="icon" href="./static/images/favicon.png"> -->
  <link rel="icon" type="image/svg+xml" href="./static/images/favicon.svg" />
  <link rel="icon" type="image/png" href="./static/images/favicon.png" />

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <!-- <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script> -->

  <!--
    MathJax v2 only honours text/x-mathjax-config blocks that appear BEFORE the
    script that loads MathJax.js, so the configuration lives here rather than in
    the body. It enables $...$ as an inline-math delimiter, which the captions
    in the Evaluations section rely on.
  -->
  <script type="text/x-mathjax-config">
    MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});
  </script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.4/MathJax.js?config=default"></script>
</head>
<body>

<!-- Top navigation bar: home link plus a dropdown of related research. -->
<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <!--
      Burger shown by Bulma on narrow screens. data-target / aria-controls point
      at the menu it toggles; tabindex makes the href-less anchor focusable.
    -->
    <a role="button" class="navbar-burger" data-target="main-navbar-menu" tabindex="0"
       aria-label="menu" aria-expanded="false" aria-controls="main-navbar-menu">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
  <div class="navbar-menu" id="main-navbar-menu">
    <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
      <a class="navbar-item" href="https://kunalgarg.mit.edu/">
      <span class="icon">
          <i class="fas fa-home"></i>
      </span>
      </a>

      <div class="navbar-item has-dropdown is-hoverable">
        <a class="navbar-link">
          More Research
        </a>
        <div class="navbar-dropdown">
          <a class="navbar-item" href="https://realm.mit.edu">
            REALM Website
          </a>
          <a class="navbar-item" href="https://mit-realm.github.io/gcbfplus-website/">
            GCBF+: A Neural Graph Control Barrier Function Framework for Distributed Safe Multi-Agent Control
          </a>
          <a class="navbar-item" href="https://arxiv.org/abs/2311.13714">
            Survey: Learning Safe Control for Multi-Robot Systems: Methods, Verification, and Open Challenges
          </a>
        </div>
        <!-- <div class="navbar-dropdown">
          <a class="navbar-item" href="https://mit-realm.github.io/gcbf-website/">
            GCBFv0
          </a>
        </div> -->
      </div>
    </div>

  </div>
</nav>

<script>
  // Bulma ships no JavaScript, so the burger needs an explicit handler to
  // toggle "is-active" on itself and on the menu named by its data-target.
  document.querySelectorAll('.navbar-burger').forEach(function (burger) {
    var menu = document.getElementById(burger.dataset.target);
    if (!menu) {
      return;
    }

    function toggleMenu() {
      var isOpen = burger.classList.toggle('is-active');
      menu.classList.toggle('is-active', isOpen);
      // Keep the ARIA state in sync with what is actually shown.
      burger.setAttribute('aria-expanded', String(isOpen));
    }

    burger.addEventListener('click', toggleMenu);
    // role="button" promises keyboard activation with Enter and Space.
    burger.addEventListener('keydown', function (event) {
      if (event.key === 'Enter' || event.key === ' ') {
        event.preventDefault();
        toggleMenu();
      }
    });
  });
</script>


<!-- Title block: paper title, authors, affiliation, and resource buttons. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">Foundation Models to the Rescue: Deadlock Resolution in Multi-Robot Systems</h1>

          <!-- Author list; each name links to the author's home page. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><a href="https://kunalgarg.mit.edu/">Kunal Garg</a>,</span>
            <span class="author-block"><a href="https://syzhang092218-source.github.io/">Songyuan Zhang</a>,</span>
            <span class="author-block"><a href="https://aeroastro.mit.edu/realm/team/jake-arkin/">Jacob Arkin</a>,</span>
            <!-- <span class="author-block">
            <a href="https://groups.csail.mit.edu/rrg/index.php?n=Main.HomePage">Nicholas Roy</a></span> -->
            <span class="author-block"><a href="http://chuchu.mit.edu/">Chuchu Fan</a></span>
          </div>

          <!-- Affiliation. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">Massachusetts Institute of Technology</span>
          </div>

          <!-- Resource buttons: local PDF, arXiv entry, code repository. -->
          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- Paper PDF. -->
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark"
                   href="./files/Multi_agent_coordination_using_VLMs.pdf">
                  <span class="icon"><i class="fas fa-file-pdf"></i></span>
                  <span>Paper</span>
                </a>
              </span>
              <!-- arXiv entry. -->
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark"
                   href="https://arxiv.org/abs/2404.06413">
                  <span class="icon"><i class="ai ai-arxiv"></i></span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- Video links (disabled until the URLs exist). -->
              <!-- <span class="link-block">
                <a href="https://youtu.be/TODO"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-youtube"></i>
                  </span>
                  <span>Supplementary Video</span>
                </a>
              </span>
              <span class="link-block">
                <a href="https://youtu.be/TODO"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-youtube"></i>
                  </span>
                  <span>Presentation Video</span>
                </a>
              </span> -->
              <!-- Code repository. -->
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark"
                   href="https://github.com/MIT-REALM/VLM_gcbfplus">
                  <span class="icon"><i class="fab fa-github"></i></span>
                  <span>Code</span>
                </a>
              </span>
            </div>
          </div>

        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser: overview figure of the hierarchical control framework. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <div class="hero-body">
        <h2 class="title is-4">Overview of the hierarchical control framework where an LLM-based high-level planner assigns a leader for a multi-robot system, resulting in a leader-follower formation.</h2>
        <!-- Bulma's has-text-centered replaces the obsolete <center> element. -->
        <div class="has-text-centered">
          <img src="./figs/vlm_overview.png" width="70%"
               alt="Diagram of the hierarchical control framework: an LLM-based high-level planner assigns a leader for the multi-robot system, resulting in a leader-follower formation.">
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- Paper video. -->
    <!-- <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Video</h2>
        <div class="publication-video">
          <iframe src="https://www.youtube.com/embed/TODO?rel=0&amp;showinfo=0"
                  frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
        </div>
      </div>
    </div> -->
    <!--/ Paper video. -->

    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Multi-agent robotic systems (MRS) are prone to deadlocks in an obstacle environment where the robots can get stuck away from their desired locations under a smooth low-level control policy. Without an external intervention, often in terms of a high-level command, a low-level control policy cannot resolve such deadlocks. Utilizing the generalizability and low data requirements of foundation models, this paper explores the possibility of using text-based models, i.e., large language models (LLMs), and text-and-image-based models, i.e., vision-language models (VLMs) for deadlock resolution. We propose a hierarchical control framework in which a foundation model-based high-level planner resolves deadlocks by assigning a leader and a set of waypoints to the leader of the MRS. Then, a low-level distributed control policy based on graph neural networks is executed. We conduct extensive experiments on various MRS environments using the best available pre-trained LLMs and VLMs. We compare their performance with a grid-based planner, in terms of the effectiveness in assisting the MRS to reach their goal locations and the computational time. Our results illustrate that foundation models can assist MRS operating in complex obstacle-cluttered environments to resolve deadlocks efficiently. In particular, compared to grid-based planners, the foundation models have better performance in terms of goal-reaching rate and computational time for complex environments.
          </p>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
  </div>
</section>


<section class="section">

  <div class="container is-max-desktop">
    <!--
      Prompt engineering: the user prompts sent to the LLM and VLM planners.
      The prompt wording is reproduced verbatim and is intentionally not edited.
    -->
    <div class="column">
      <h2 class="title is-3">Prompt engineering</h2>

      <!-- LLM prompts. -->
      <div class="content">
        <div class="content has-text-justified">
          <b>
            User prompts used for querying LLMs are given below. 
          </b>
          <!-- One list item per prompt component. -->
          <ul>
            <li> 
              <b>Task description:</b> We are working with a multi-robot system navigating in an obstacle environment to reach their respective goal locations. The objective is to move the robots toward their goal locations while maintaining safety with obstacles, safety with each other, and inter-agent connectivity. Safety is based on agents maintaining a minimum inter-agent "safety radius" while connectivity is based on connected agents remaining within a "connectivity radius".  
              Your role as the helpful assistant to provide a high-level command when the system gets stuck near obstacles. The high-level command is in terms of a leader assignment for the multi-robot system and a direction of motion for the leader. 
              An optimal choice of leader and moving direction minimizes the traveling distance of agents toward their goals and maintains safety and connectivity.               
            </li>
            <li> 
              <b> 
                Environment state: 
              </b>
              The multi-robot environment description consists of the tuple: (Number of agents, Safety radius, Connectivity radius, Agent locations, Agent goals, Obstacles, Number of waypoints).
              The environment consists of robot Agents with information ("AgentId"=id, "current state"=(x,y), "goal location"=(xg,yg)). 
              The obstacles are represented as a bottom-left corner, its width, and height. The obstacles are represented as a list of tuples [(x,y,w,h)].
              In addition, there are global environment variables "Number of agents" = N, "Safety radius" = r, "Connectivity radius" = R.
              The task is to provide a leader assignment for the multi-robot system and a set of waypoints for the leader. The leader assignment is an integer value in the range (1, Number of agents) and the waypoints for the leader are (x,y) coordinates. The number of waypoints is described by the variable "Number of waypoints" = M.
            </li>
            <li> 
              <b> 
                Desired output: 
              </b>
              The expected output is a JSON format file with the keys "Leader" and "Waypoints". The key "Leader" can take integer values in the range (1, Number of agents) and "Waypoints" are of the form [(x1, y1), (x2, y2), ..., (xM, yM)].
              The waypoints are ordered in the sequence the leader should visit them. The first point should NOT be the current location of the leader. All the waypoints should be at least 2r distance from all the obstacles. 
              The waypoints should be such that the leader can move toward its goal location while maintaining safety with the obstacles. 
              The path connecting the leader and the waypoints should NOT intersect with any of the obstacles. 
              The waypoints should be in the free space of the environment, away from ALL the known obstacles. The waypoints can be chosen to wrap around the obstacles to allow the leader to move toward its goal location while evading the obstacles.
              If the leader cannot move directly in the direction of its goal location, the first waypoint should be to the left or right of the leader to avoid obstacles. 
              The consecutive waypoints should be such that the leader moves toward its goal location while maintaining safety with the obstacles.
            </li>
            <li>
              <b>Example Scenario</b>
              An example environment description is as follows.<br>
              <div class="has-text-centered">
                <img src="./figs/LLM_env_prompt.png" width="100%"
                     alt="Example environment description used in the LLM prompt.">
              </div>
            </li>
          </ul>

        </div>
      </div>

      <!-- VLM prompts. -->
      <div class="content">
        <div class="content has-text-justified">
          <b>
            User prompts used for querying VLMs are given below. 
          </b>
          <!-- One list item per prompt component. -->
          <ul>
            <li> 
              <b>Task description:</b> We are working with a multi-robot system navigating in an obstacle environment to reach their respective goal locations. The objective is to move the robots toward their goal locations while maintaining safety with obstacles, safety with each other, and inter-agent connectivity. Safety is based on agents maintaining a minimum inter-agent "safety radius" while connectivity is based on connected agents remaining within a "connectivity radius".  
              Your role as the helpful assistant to provide a high-level command when the system gets stuck near obstacles. The high-level command is in terms of a leader assignment for the multi-robot system and a direction of motion for the leader. 
              An optimal choice of leader and moving direction minimizes the traveling distance of agents toward their goals and maintains safety and connectivity. 
            </li>
            <li> 
              <b> 
                Environment state: 
              </b>
              The input image represents a grid world where the obstacles are given in black color.
              The location of the agents are given in blue color and the goal locations are given in green color. 
              The task is to provide a high-level command in terms of a leader assignment for the multi-robot system and a set of waypoints for the leader. The leader assignment is an integer value in the range (1, Number of agents) and the waypoints for the leader are (x,y) coordinates. The number of waypoints is described by the variable "Number of waypoints" = M.                   
            </li>
            <li> 
              <b> 
                Desired output: 
              </b>
              The expected output is a JSON format file with the keys "Leader" and "Waypoints". The key "Leader" can take integer values in the range (1, Number of agents) and "Waypoints" are of the form [(x1, y1), (x2, y2), ..., (xM, yM)].
              The leader should be assigned as the agent that can move freely in the environment. The leader should not be assigned to an agent that is blocked by obstacles or other agents.
              The waypoints are ordered in the sequence the leader should visit them. The first point should NOT be the current location of the leader. All the waypoints should be at least 2r distance from all the obstacles. 
              The consecutive waypoints should be such that the leader moves toward its goal location.
              The waypoints should be such that the leader can move toward its goal location while maintaining safety with the obstacles. 
              The path connecting the leader and the waypoints should NOT intersect with any of the obstacles. 
              The waypoints should be in the free space of the environment, away from ALL the known obstacles. The obstacles can be chosen to wrap around the obstacles to allow the leader to move toward its goal location while evading the obstacles.
              The leader assignment is based on agent being able to freely move. That means there should be no obstacle or other agents in its path connected to its goal. 
              If the leader cannot move directly in the direction of its goal location, the first waypoint should be to the left or right of the leader to avoid obstacles. The consecutive waypoints should be such that the leader moves toward its goal location while maintaining safety with the obstacles.
            </li>
            <li>
              <b>Example Scenario</b>
              An example environment description is as follows.<br>
              <div class="has-text-centered">
                <img src="./figs/VLM_input.png" width="50%"
                     alt="Example input image used in the VLM prompt: a grid world showing obstacles, agent locations, and goal locations.">
              </div>
            </li>
          </ul>

        </div>
      </div>
    </div>
    <!--/ Prompt engineering. -->

  </div>
</section>


<section class="section">

  <div class="container is-max-desktop">
    <!-- Evaluations: performance plots, distance plots, demo video, and ablations. -->
    <div class="column">
      <h2 class="title is-3">Evaluations</h2>

      <!-- Planner performance for 5, 25 and 50 agents. -->
      <div class="content">
        <div class="content has-text-justified">
          <b>
            Performance of various high-level planners for “Room” environments with $N=5$ agents (Top plots), “Maze” environments with $N=25$ agents (Middle plots), and “Maze” environments with $N=50$ agents (Bottom plots). 
            From left to right: 1) The bar shows the ratio of the trajectories where all the agents reach their goals over the total number of trajectories, and the orange dot shows the ratio of agents that reach their goals over all agents; 2) Box plot of the number of times the high-level planner intervened; 3) Box plot of the time spent for each high-level planner intervention; and 4) Box plot for the input + output token per intervention. In the box plots, the median values are in orange and the mean values are in green.
          </b>
          <div class="has-text-centered">
            <img src="./figs/plot_5_new.png" width="100%"
                 alt="Performance of the high-level planners in Room environments with 5 agents (top plots).">
          </div>

          <div class="has-text-centered">
            <img src="./figs/plot_25_new.png" width="100%"
                 alt="Performance of the high-level planners in Maze environments with 25 agents (middle plots).">
          </div>

          <div class="has-text-centered">
            <img src="./figs/plot_50_new.png" width="100%"
                 alt="Performance of the high-level planners in Maze environments with 50 agents (bottom plots).">
          </div>
        </div>
      </div>

      <!-- Distance traveled, followed by the demo video. -->
      <div class="content">
        <div class="content has-text-justified">
          <b>
            Comparison of distance traveled by agents under various high-level planners. 
          </b>
          <div class="has-text-centered">
            <img src="./figs/plot_5_dist.png" width="100%"
                 alt="Distance traveled by agents under various high-level planners, plot 1 of 3.">
          </div>

          <div class="has-text-centered">
            <img src="./figs/plot_25_dist.png" width="100%"
                 alt="Distance traveled by agents under various high-level planners, plot 2 of 3.">
          </div>

          <div class="has-text-centered">
            <img src="./figs/plot_50_dist.png" width="100%"
                 alt="Distance traveled by agents under various high-level planners, plot 3 of 3.">
          </div>
          <b> 
            LLM-based high-level planner in action. The LLM suggests a leader and a direction for it to move along when the agents get stuck in a deadlock. 
          </b>
          <div class="has-text-centered">
            <!-- Muted so autoplay is permitted; controls let visitors pause the loop. -->
            <video width="50%" autoplay muted loop controls playsinline>
              <source src="vids/LLM_in_action.mp4" type="video/mp4">
            </video>
          </div>

        </div>
      </div>

      <!-- Ablation: full versus partial environment information. -->
      <div class="content">
        <div class="content has-text-justified">
          <b>
            Ablation on environment information: 
          </b>
            Performance of various high-level planners for “Maze” environments with $N=50$ agents with all known environment information and partial information (the case with all known environment information is indicated with the suffix “-All”, e.g. “GPT4-VLM-All”). 
          <div class="has-text-centered">
            <img src="./figs/plot_50_comb.png" width="100%"
                 alt="Performance of the high-level planners in Maze environments with 50 agents, with all versus partial environment information.">
          </div>
        </div>
      </div>

      <!-- Ablation: single-leader versus multi-leader assignment. -->
      <div class="content">
        <div class="content has-text-justified">
          <b> Ablation on number of leaders: 
          </b>
          Performance of Claude3-Sonnet-VLM planner for “Maze” environments with $N=50$ agents and GPT3.5-LLM for “Maze” environment with $N=25$ with a single leader and multi-leader assignment (the case with one leader is indicated with suffix “-One”, and that with multi-leader with “-Multi”). 
          <div class="has-text-centered">
            <img src="./figs/plot_one_multi.png" width="100%"
                 alt="Performance of the planners with single-leader versus multi-leader assignment.">
          </div>
        </div>
      </div>
    </div>
    <!--/ Evaluations. -->

  </div>
</section>


<!-- <section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@article{TODO,
author={TODO, TODO and TODO, TODO},
journal={TODO},
title={TODO},
year={TODO},
volume={TODO},
number={TODO},
pages={TODO},
doi={TODO}}
}</code></pre>
  </div>
</section> -->


<!-- Page footer: license notice and template attribution. -->
<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">
    </div>
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This website is licensed under a <a rel="license"
                                                href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
            This webpage template is based on that used by <a href="https://github.com/nerfies/nerfies.github.io">Nerfies</a>. 
            We sincerely thank <a href="https://keunhong.com/">Keunhong Park</a> for developing and open-sourcing this template.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>


</body>
</html>