<!-- index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Document metadata for the RLHF-V project page. -->
  <meta charset="utf-8">
  <meta name="description"
        content="RLHF-V: Towards Trustworthy MLLMs via Behavior Alignment from Fine-grained Correctional Human Feedback">
  <meta name="keywords" content="RLHF-V, open-source, vision-language, MLLM">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>RLHF-V</title>

  <!-- Global site tag (gtag.js) - Google Analytics
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');
  </script> -->

  <meta name="google-site-verification" content="6lbYN1vX7A4sD8SrVniq84UEKyEUSBgxeP7d3FjuuK0">

  <!-- Web fonts and stylesheets. index.css is loaded last among the stylesheets. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">

  <!-- Single favicon declaration; the icon is a JPEG, so it is typed as image/jpeg. -->
  <link rel="icon" href="images/icon.jpg" type="image/jpeg">

  <!-- Scripts keep their original order and loading attributes. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>

  <!--
    Page-local styles, kept inside <head> where <style> is valid.
    In this file's static markup, #main, .box and .pic are only referenced by
    markup that is currently commented out.
  -->
  <style>

    #main{
        position: relative;
        width: 1200px;
    }

    .box{
        float: left;
        padding: 15px 0 0 15px;
/*        background-color: red;*/
    }

    .pic{
        width: 500px;
        padding: 10px;
        border: 1px solid #ccc;
        border-radius: 5px;
        background-color: #fff;
    }

    .pic img{
        width: 500px;
    }

  </style>
</head>


  <body>

<!-- Hero: paper title, author list, affiliations and resource links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">RLHF-V</h1>
          <h2 class="title is-3 publication-title">Towards Trustworthy MLLMs via Behavior Alignment from Fine-grained Correctional Human Feedback</h2>

          <!-- Authors: superscript digits refer to the numbered affiliations below; * marks correspondence. -->
          <div class="is-size-5">
            <span class="author-block">
              <a href="https://github.com/yiranyyu" style="color:#008AD7;font-weight:normal;">Tianyu Yu<sup>1</sup></a>,
            </span>
            <span class="author-block">
              <a href="https://yaoyuanthu.github.io/" style="color:#008AD7;font-weight:normal;">Yuan Yao<sup>2*</sup></a>,
            </span>
            <span class="author-block">
              Haoye Zhang<sup>1</sup>,
            </span>
            <span class="author-block">
              Taiwen He<sup>1</sup>,
            </span>
            <span class="author-block">
              Yifeng Han<sup>1</sup>,
            </span>
            <span class="author-block">
              <a href="https://cgq15.github.io" style="color:#008AD7;font-weight:normal;">Ganqu Cui<sup>1</sup></a>,
            </span>
            <span class="author-block">
              <a href="https://jameshujy.github.io/" style="color:#008AD7;font-weight:normal;">Jinyi Hu<sup>1</sup></a>,
            </span>
            <br>
            <span class="author-block">
              <a href="http://nlp.csai.tsinghua.edu.cn/~lzy/" style="color:#008AD7;font-weight:normal;">Zhiyuan Liu<sup>1*</sup></a>,
            </span>
            <span class="author-block">
              <a href="https://ke.sigs.tsinghua.edu.cn/main.psp" style="color:#008AD7;font-weight:normal;">Hai-Tao Zheng<sup>1*</sup></a>,
            </span>
            <span class="author-block">
              <!-- Separator comma added so the last two names do not run together. -->
              <a href="https://www.cs.tsinghua.edu.cn/csen/info/1180/4033.htm" style="color:#008AD7;font-weight:normal;">Maosong Sun<sup>1</sup></a>,
            </span>
            <span class="author-block">
              <a href="https://www.chuatatseng.com/" style="color:#008AD7;font-weight:normal;">Tat-Seng Chua<sup>2</sup></a>
            </span>
          </div>

          <br>
          <!-- Affiliations. The character reference is terminated with ';' (it was unterminated before). -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><b style="color:#F2A900; font-weight:normal">&#x25B6; </b>1. Tsinghua University </span>

            <br>
            <span class="author-block"><b style="color:#00A4EF; font-weight:normal">&#x25B6; </b>2. National University of Singapore </span>
            <!-- <span class="author-block"><b style="color:#00A4EF; font-weight:normal">&#x25B6 </b>Microsoft Research, Redmond; </span> -->
            <!-- <span class="author-block"><b style="color:#008AD7; font-weight:normal">&#x25B6 </b>Microsoft Cloud & AI </span> -->
          </div>

          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>*</sup>Correspondence</span>
            <!-- <span class="author-block"><sup>&#x2628;</sup>Equal Advisory Contribution, </span> -->
            <!-- <span class="author-block"><sup>&#x2691;</sup>Project Lead </span> -->
          </div>

          <br>
          <!--  <div class="is-size-5 publication-authors">
            <span class="author-block"><b style="color:#e08ba0; font-weight:normal"> <b>In CVPR2023</b> </b></span>
          </div> -->

          <!--
            Resource buttons. Every link opens in a new tab, so each carries
            rel="noopener"; icons are decorative and hidden from assistive
            technology because the adjacent text already names the link.
          -->
          <div class="column has-text-centered">
            <div class="publication-links">
              <span class="link-block">
                <a href="https://arxiv.org/abs/2312.00849" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv" aria-hidden="true"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>

              <span class="link-block">
                <a href="https://github.com/RLHF-V/RLHF-V" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github" aria-hidden="true"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>

<!--              <span class="link-block">-->
<!--                      <a href="https://huggingface.co/spaces/##" target="_blank"-->
<!--                         class="external-link button is-normal is-rounded is-dark">-->
<!--                      <span class="icon">-->
<!--                        🤗-->
<!--                      </span>-->
<!--                      <span>Space</span>-->
<!--                    </a>-->
<!--              </span>-->

              <!-- NOTE(review): the demo is served over plain HTTP from a raw IP address; switch to an HTTPS hostname if one exists. -->
              <span class="link-block">
                <a id="randomLink" href="http://120.92.209.146:8081" target="_blank" rel="noopener noreferrer"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa fa-play" aria-hidden="true"></i>
                  </span>
                  <span>Demo</span>
                </a>
              </span>

<!--              <span class="link-block">-->
<!--                <a href="https://youtu.be/###" target="_blank"-->
<!--                   class="external-link button is-normal is-rounded is-dark">-->
<!--                  <span class="icon">-->
<!--                    <i class="fab fa-youtube"></i>-->
<!--                  </span>-->
<!--                  <span>Video</span>-->
<!--                  </a>-->
<!--              </span>-->

              <span class="link-block">
                <a href="https://huggingface.co/datasets/HaoyeZhang/RLHF-V-Dataset" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa fa-database" aria-hidden="true"></i>
                  </span>
                  <span>Dataset</span>
                </a>
              </span>

              <span class="link-block">
                <a href="https://huggingface.co/openbmb/RLHF-V" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fa fa-laugh" aria-hidden="true"></i>
                  </span>
                  <span>Model</span>
                </a>
              </span>

            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- <section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <div class="content has-text-justified">
          <p>
            Thanks for your interest in our work. Currently, the number of users has exceeded our expectations. We provide <strong><font color="#008AD7">alternative demo links</font></strong> here:
            <a href="https://b2517615b965687635.gradio.live" target="_blank">Demo1</a>
            <a href="https://c8de8ff74b6a6c6a9b.gradio.live" target="_blank">Demo2</a>
            <a href="https://90bc0bac96e6457e8f.gradio.live" target="_blank">Demo3</a>
            <a href="https://cd772059965a71f9e6.gradio.live" target="_blank">Demo4</a>
            <a href="https://48da7e23bcadec7551.gradio.live" target="_blank">Demo5</a>
            <a href="https://687d119023cd37e5fb.gradio.live" target="_blank">Demo6</a>
            <a href="https://0810e8582bcad31944.gradio.live" target="_blank">Demo7</a>
            <a href="https://31c7cdb7e3594e851e.gradio.live" target="_blank">Demo8</a>

            <strong><font>News</font></strong>: We now provide a pretrained MiniGPT-4 aligned with <strong><font color="#008AD7">Vicuna-7B</font></strong>! The demo GPU memory consumption now can be <strong><font color="#008AD7">as low as 12GB</font></strong>.
            <br>
            </p>
        </div>
      </div>
    </div>
</section>
 -->

<!-- <link rel="stylesheet" href="js/ft-carousel.css" />
<script src="js/jquery.min.js"></script>
<script src="js/ft-carousel.min.js"></script>
<script type="text/javascript">
  $("#carousel_1").FtCarousel();

  $("#carousel_2").FtCarousel({
    index: 1,
    auto: false
  });

  $("#carousel_3").FtCarousel({
    index: 0,
    auto: true,
    time: 3000,
    indicators: false,
    buttons: true
  });
</script> -->

<!--
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div id="results-carousel" class="carousel results-carousel">
       <div class="item">
        <img src="demos/wop_2.png" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
        </h2>
      </div>
      <div class="item">
        <img src="demos/cook_1.png" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
        </h2>
      </div>
      <div class="item">
        <img src="demos/fix_1.png" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
       </h2>
     </div>
     <div class="item">
      <img src="demos/rhyme_1.png" alt="MY ALT TEXT"/>
      <h2 class="subtitle has-text-centered">
      </h2>
    </div>
  </div>
</div>
</div>
</section>
 -->

<!-- Stylesheet and script for the simple swiper widget; the markup and SimSwiper call that would use them are commented out in this file. -->
<link href="js/simple_style.css" rel="stylesheet">
<script src="js/simple_swiper.js"></script>


<!-- <div class="app">
  <div id="swiper-demo" class="simple-swiper-container">
    <a id="prev" class="btn btn-prev"></a>
    <a id="next" class="btn btn-next"></a>
    <div class="pagination"></div>
  </div>
</div>
<p id="index"></p>

<script type="text/javascript">
  new SimSwiper("#swiper-demo", {
    autoplay: 4000,
    duration: 300,
    easing: 'ease',
    button: {
      prev: "#prev", // previous / next buttons
      next: "#next"
    },
    pagination: {
      el: '.pagination',
      click: true // whether the pagination items are clickable
    },
    // carousel image data
    data: [{
      index: 0,
      href: '#',
      src: 'demos/wop_2.png'
    }, {
      index: 1,
      href: '#',
      src: 'demos/cook_1.png'
    }, {
      index: 2,
      href: '#',
      src: 'demos/fix_1.png'
    }, {
      index: 3,
      href: '#',
      src: 'demos/rhyme_1.png'
    }]
  });
</script> -->


<!-- Abstract: one introductory paragraph followed by a list of the four key points. -->
<section class="section interpolation-panel">
  <div class="container">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-10">
        <br>
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified" style="font-size: large;">
          <!-- The paragraph is closed before the list: a <ul> cannot be nested inside a <p>. -->
          <p>
            Existing Multimodal Large Language Models prevalently suffer from serious <b>hallucination</b> problems, generating text that is not factually grounded in associated images. Our <b>RLHF-V framework</b> enhances MLLM trustworthiness via behavior alignment from fine-grained correctional human feedback.
          </p>
          <ul>
            <li>
              <b style="font-family:Arial, Helvetica, sans-serif">Fine-Grained and Diverse Human Preference Data</b>: <span style="font-size: 95%;">We collect 1.4K fine-grained human feedback consisting of 3.7k pieces of segment-level corrections, covering hallucination types including objects (41.2%), positions (20.3%), numbers (16.5%), attributes (10%), actions (5.3%), and others (6.8%).</span>
            </li>
            <li>
              <b style="font-family:Arial, Helvetica, sans-serif">High Data Efficiency and Scalability</b>: <span style="font-size: 95%;">With just 1.4K annotated data, we achieve a 34.8% reduction in model hallucinations. Moreover, the decrease in hallucinations becomes more significant as more data is used.</span>
            </li>
            <li>
              <b style="font-family:Arial, Helvetica, sans-serif">Enhanced Performance and Computational Efficiency</b>: <span style="font-size: 95%;">Our fine-grained correctional human feedback data can better credit the desired behavior, allowing efficient training in 1 hour on 8 A100 GPUs to achieve promising results.</span>
            </li>
            <li>
              <b style="font-family:Arial, Helvetica, sans-serif">Outstanding Trustworthiness without Compromising Helpfulness</b>: <span style="font-size: 95%;">Our model surpasses existing open-source MLLMs in reducing hallucination rates, mitigates hallucination from over-generalization, and maintains informativeness. Surprisingly, RLHF-V is even more resistant to the over-generalization problem compared with <a href="https://openai.com/research/gpt-4v-system-card">GPT-4V</a>.</span>
            </li>
          </ul>
<!--             <br>
            The
            </b> -->
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
    <!-- <img id="model" width="100%" src="images/rlhf-v-main_exp.jpg"> -->
       <!--  <h3 class="subtitle has-text-centered">
          <p style="font-family:Times New Roman"></p>
        </h3> -->
    <br>
    <!-- Paper video. -->
    <!-- <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Video</h2>
        <div class="publication-video">
          <iframe width=“560” height=“315" src=“https://www.youtube.com/embed/__tftoxpBAw” title=“YouTube video player” frameborder=“0” allow=“accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share” allowfullscreen></iframe>
        </div>
      </div>
    </div> -->
  </div>
</section>

<!-- <section class="hero is-small is-light">
    <div class="hero-body"> -->

<!--
        <div class="container">
            <h2 class="title has-text-centered">Video Presentation</h2>
            <div class="columns is-centered has-text-centered">
                <div class="column is-four-fifths">

                    <div class="publication-video">
                        <iframe width="560" height="315" src="https://www.youtube.com/embed/__tftoxpBAw" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>
                    </div>
                </div>
            </div>
        </div>
 -->


<!--     </div>
</section> -->


    <!--/ Demo. -->
    <!-- <br>
    <br>

    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Demo</h2>
      </div>
    </div>

    <div class="column is-full-width">
      <div class="columns is-centered">
        <img id="teaser" width="90%" src="images/demo6_AdobeExpress.gif">
      </div>
      <div class="columns is-centered">
      <h1>
        <p style="font-family:Times New Roman"><b>X-GPT: Connecting generalist X-Decoder with GPT-3</b>
      </h1>
      </div>
    </div>

    <br>

    <div class="column is-full-width">
      <div class="columns is-centered">
        <img id="teaser" width="90%" src="images/inpaint.gif">
      </div>
      <div class="columns is-centered">
      <h1>
        <p style="font-family:Times New Roman"><b>Instruct-X-Decoder: Object-centric instructional image editing</b>
      </h1>
      </div>
    </div> -->

<!-- Method and Highlights: short descriptions, each followed by its figure. -->
<section class="section">
  <div class="container is-max-desktop">
    <br>
    <br>
    <!-- Method. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-six-fifths">
        <h2 class="title is-3">Method</h2>
        <div class="content has-text-justified">
          <p>
            <b>The proposed RLHF-V framework</b>:
          </p>
          <p>
            We collect <a href="https://huggingface.co/datasets/HaoyeZhang/RLHF-V-Hall_v0/tree/main">1.4k fine-grained dense feedback data</a> by asking human annotators to correct the hallucinated segments in model responses.

            The training takes only 1 hour with 8 A100 GPUs to get <a href="https://huggingface.co/openbmb/RLHF-V_v0">RLHF-V-13B</a> which is initialized from our <a href="https://huggingface.co/Yirany/RLHF-V_v0_SFT/tree/main">RLHF-V_SFT-13B</a>.
          </p>
          <ul>
            <!-- <li>It has two types of queries (latent queries and text queries) and outputs (semantic outputs and pixel-level outputs).</li>
            <li>It uses a single text encoder for all text corpus, ranging from class concepts, referring phrases to image captions.</li>
            <li>It decouples image and text encoder to accomadate cross-image tasks (e.g., image-text retrieval) and within-image tasks (e.g., segmentation and captioning).</li> -->

          </ul>
        </div>
        <!--
          NOTE(review): the four figures in this section used to share id="model".
          Only the first keeps it; the others now have unique ids. If a stylesheet
          or script targets #model for all figures, switch that hook to a class.
        -->
        <img id="model" width="100%" src="images/rlhf-v_framework.jpg" alt="Illustration of the RLHF-V framework">
       <!--  <h3 class="subtitle has-text-centered">
          <p style="font-family:Times New Roman"></p>
        </h3> -->
        <br>
        <br>

      </div>
    </div>
    <br>
    <br>
    <!-- Highlights. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-six-fifths">
        <h2 class="title is-3">Highlights</h2>
        <br>
        <div class="content has-text-justified">
          <p>
            <b>Low hallucination rate while being informative</b>:
          </p>
          <ul>
            <!-- <li>It has two types of queries (latent queries and text queries) and outputs (semantic outputs and pixel-level outputs).</li>
            <li>It uses a single text encoder for all text corpus, ranging from class concepts, referring phrases to image captions.</li>
            <li>It decouples image and text encoder to accomadate cross-image tasks (e.g., image-text retrieval) and within-image tasks (e.g., segmentation and captioning).</li> -->

          </ul>
        </div>
        <img id="model-main-results" width="80%" src="images/rlhf-v-main_exp.jpg" alt="Main experimental results of RLHF-V">
       <!--  <h3 class="subtitle has-text-centered">
          <p style="font-family:Times New Roman"></p>
        </h3> -->
        <br>
        <br>

        <div class="content has-text-justified">
          <p>
            <b>Data-efficient and showing good scaling results</b>:
          </p>
        </div>
        <img id="model-data-scaling" width="40%" src="images/data_scaling.png" alt="Data scaling results of RLHF-V">

        <br>
        <br>

        <div class="content has-text-justified">
          <p>
            <b>More resistant to over-generalization</b>:
          </p>
        </div>
        <img id="model-over-generalization" width="80%" src="images/over-generalization.jpg" alt="Over-generalization results of RLHF-V">

        <br>
        <br>

      </div>
    </div>

  </div>
</section>


<!-- Citation block. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <div class="columns is-centered has-text-centered">
      <h2 class="title is-3">BibTeX</h2>
    </div>
    <!-- The entry starts directly after <code> and ends directly before </code>, so the rendered box has no blank first or last line. -->
    <pre><code>@article{2023rlhf-v,
  author  = {Tianyu Yu and Yuan Yao and Haoye Zhang and Taiwen He and Yifeng Han and Ganqu Cui and Jinyi Hu and Zhiyuan Liu and Hai-Tao Zheng and Maosong Sun and Tat-Seng Chua},
  title   = {RLHF-V: Towards Trustworthy MLLMs via Behavior Alignment from Fine-grained Correctional Human Feedback},
  journal = {arxiv},
  year    = {2023},
}</code></pre>
  </div>
  <br>
</section>


<!-- Examples: each caption is followed by the screenshot(s) it describes. -->
<section class="section">
  <!-- Results. -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-six-fifths">
      <h2 class="title is-3">Examples</h2>
    </div>
  </div>
  <!--/ Results. -->
  <div class="container is-max-desktop">

    <!-- <section class="section"> -->
    <!-- <div id="main">
      <div align="center" ><div align="center" ><img src="demos/case1.png" width="75%" alt=""></div></div>
      <div align="center" ><div align="center" ><img src="demos/case2.png" width="75%" alt=""></div></div>
      <div align="center" ><div align="center" ><img src="demos/case3-1.png" width="75%" alt=""></div></div>
      <div align="center" ><div align="center" ><img src="demos/case3-2.png" width="75%" alt=""></div></div>
      <div align="center" ><div align="center" ><img src="demos/case4.png" width="75%" alt=""></div></div>
      <div align="center" ><div align="center" ><img src="demos/case5.png" width="75%" alt=""></div></div>
      <div class="box"><div class="pic"><img src="demos/p7.png" alt=""></div></div>
      <div class="box"><div class="pic"><img src="demos/p8.png" alt=""></div></div>
      <div class="box"><div class="pic"><img src="demos/p9.png" alt=""></div></div>
    </div> -->

    <div class="columns is-centered has-text-centered">
      <div class="column is-six-fifths">
        <!-- The screenshots carry content, so each gets alt text derived from its caption. -->
        <div class="content has-text-justified">
          <ul>
            <li><b>Short-form QA</b>: RLHF-V can give a more trustworthy answer in short-form QA.</li>
          </ul>
        </div>
        <img src="demos/case1.png" width="90%" alt="Example case: short-form QA">
        <div class="content has-text-justified">
          <ul>
            <li><b>Long-form QA</b>: RLHF-V can generate informative image descriptions with fewer hallucinations.</li>
          </ul>
        </div>
        <img src="demos/case2.png" width="90%" alt="Example case: long-form QA, image description">
        <div class="content has-text-justified">
          <ul>
            <li><b>Long-form QA</b>: RLHF-V can provide detailed reasoning with fewer hallucinations.</li>
          </ul>
        </div>
        <img src="demos/case3-1.png" width="90%" alt="Example case: long-form QA, detailed reasoning (part 1)">
        <img src="demos/case3-2.png" width="90%" alt="Example case: long-form QA, detailed reasoning (part 2)">
        <div class="content has-text-justified">
          <ul>
            <li><b>Long-form QA</b>: RLHF-V is more resistant to over-generalization.</li>
          </ul>
        </div>
        <img src="demos/case4.png" width="90%" alt="Example case: long-form QA, over-generalization (first example)">
        <div class="content has-text-justified">
          <ul>
            <li><b>Long-form QA</b>: RLHF-V is more resistant to over-generalization.</li>
          </ul>
        </div>
        <img src="demos/case5.png" width="90%" alt="Example case: long-form QA, over-generalization (second example)">
      </div>
      <!-- <div class="box"><div class="pic"><img src="demos/p7.png" alt=""></div></div>
      <div class="box"><div class="pic"><img src="demos/p8.png" alt=""></div></div>
      <div class="box"><div class="pic"><img src="demos/p9.png" alt=""></div></div> -->
    </div>
  </div>
</section>


<!--
<section class="section" id="Acknowledgement">
  <div class="container is-max-desktop content">
    <h2 class="title is-3">Acknowledgement</h2>
    <p>
      This website is adapted from <a
      href="https://github.com/nerfies/nerfies.github.io">Nerfies</a>, licensed under a <a rel="license"
                                          href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
      Commons Attribution-ShareAlike 4.0 International License</a>.
    </p>
  </div>
</section>
 -->


<!--
  Scripts loaded at the end of <body>, after all page content has been parsed.
  NOTE(review): js/index.js is a different path from ./static/js/index.js,
  which is loaded in <head>; confirm both files are needed.
-->
<script src="js/Underscore-min.js"></script>
<script src="js/index.js"></script>


</body>

</html>