<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners; these should be filled in appropriately, as they are your "business card" -->
<!-- Replace the content tag with appropriate information -->
<meta name="description" content="">
<meta property="og:title" content="[CVPR 2024] Language Models as Black-Box Optimizers for Vision-Language Models" />
<meta property="og:description" content="" />
<meta property="og:url" content="" />
<!-- Path to banner image; it should be at the path listed below. Optimal dimensions are 1200x630 -->
<meta property="og:image" content="static/image/your_banner_image.png" />
<meta property="og:image:width" content="1200" />
<meta property="og:image:height" content="630" />
<meta name="twitter:title" content="Language Models as Black-Box Optimizers for Vision-Language Models">
<meta name="twitter:description" content="">
<!-- Path to banner image; it should be at the path listed below. Optimal dimensions are 1200x600 -->
<meta name="twitter:image" content="static/images/twitter_banner_image.png">
<meta name="twitter:card" content="summary_large_image">
<!-- Keywords for your paper to be indexed by-->
<meta name="keywords" content="KEYWORDS SHOULD BE PLACED HERE">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>[CVPR 2024] Language Models as Black-Box Optimizers for Vision-Language Models</title>
<!-- <link rel="icon" type="image/x-icon" href="static/images/favicon.ico"> -->
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script src="static/js/bulma-carousel.min.js"></script>
<script src="static/js/bulma-slider.min.js"></script>
<script src="static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title"><span style="color: green;">[CVPR 2024]</span></h1>
<h1 class="title is-1 publication-title">Language Models as Black-Box Optimizers for Vision-Language Models
</h1>
<div class="is-size-5 publication-authors">
<!-- Paper authors -->
<span class="author-block">
<a target="_blank">Shihong Liu</a><sup>*</sup>,</span>
<span class="author-block">
<a target="_blank">Samuel Yu</a><sup>*</sup>,</span>
<span class="author-block">
<a target="_blank">Zhiqiu Lin</a><sup>*</sup>,</span>
<span class="author-block">
<a target="_blank">Ryan Lee</a>,</span>
<span class="author-block">
<a target="_blank">Tiffany Ling</a>,</span>
<span class="author-block">
<a target="_blank">Deepak Pathak</a>,</span>
<span class="author-block">
<a target="_blank">Deva Ramanan</a>
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block">Carnegie Mellon University<br></span>
<span class="eql-cntrb"><small><br><sup>*</sup>Indicates Equal Contribution</small></span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- Arxiv PDF link -->
<span class="link-block">
<a href="https://arxiv.org/pdf/2309.05950.pdf" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<!-- Github link -->
<span class="link-block">
<a href="https://github.com/shihongl1998/LLM-as-a-blackbox-optimizer" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<!-- ArXiv abstract Link -->
<span class="link-block">
<a href="https://arxiv.org/abs/2309.05950" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Paper abstract -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Vision-language models (VLMs) pre-trained on web-scale datasets have demonstrated remarkable capabilities
on downstream tasks when fine-tuned with minimal data. However, many VLMs rely on proprietary data and are
not open-source, which restricts the use of white-box approaches for fine-tuning. As such, we aim to
develop a black-box approach to optimize VLMs through natural language prompts, thereby avoiding the need
to access model parameters, feature embeddings, or even output logits. We propose employing chat-based
LLMs to search for the best text prompt for VLMs. Specifically, we adopt an automatic “hill-climbing”
procedure that converges to an effective prompt by evaluating the performance of current prompts and
asking LLMs to refine them based on textual feedback, all within a conversational process without
a human in the loop. In a challenging 1-shot image classification setup, our simple approach surpasses the
white-box continuous prompting method (CoOp) by an average of 1.5% across 11 datasets including ImageNet.
Our approach also outperforms both human-engineered and LLM-generated prompts. We highlight the
advantage of conversational feedback that incorporates both positive and negative prompts, suggesting
that LLMs can utilize the implicit “gradient” direction in textual feedback for a more efficient search.
In addition, we find that the text prompts generated through our strategy are not only more interpretable
but also transfer well across different VLM architectures in a black-box manner. Lastly, we demonstrate
our framework on a state-of-the-art black-box VLM (DALL-E 3) for text-to-image optimization.
</p>
</div>
</div>
</div>
</div>
</section>
<!-- End paper abstract -->
<!-- Method Overview -->
<section class="section hero is-light2">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Methods Overview</h2>
<div class="content has-text-justified">
<div class="item">
<!-- Your image here -->
<img src="static/images/promptgpt.png" alt="Image illustrating ChatGPT interaction with VLMs"
style="width: 850px; height: auto; display: block; margin: 0 auto;">
<p style="text-align: justify; font-size: 16px; line-height: 1.5; margin-top: 10px; color: #333;">
Similar to how human prompt engineers iteratively test and refine prompts, we employ ChatGPT to
continuously optimize prompts for vision-language models (VLMs). Our iterative approach assesses the
performance of ChatGPT-generated prompts on a few-shot dataset (highlighted in <span
style="color: blue;">blue</span>) and provides feedback (marked in <span
style="color: violet;">violet</span>) to ChatGPT through simple conversations, as depicted in the
illustrative figure. This straightforward method delivers state-of-the-art results for one-shot image
classification across 11 datasets using CLIP, operating in a black-box manner without accessing model
weights, feature embeddings, or output logits. Remarkably, our approach outperforms both white-box
methods, such as gradient-based continuous prompting (CoOp), and human-engineered prompts in this
extremely low-shot scenario. The figure shows a typical conversation through ChatGPT's web interface;
our code implementation follows the same pattern via the ChatGPT API.
</p>
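<p style="text-align: justify; font-size: 16px; line-height: 1.5; margin-top: 10px; color: #333;">
The sketch below illustrates the conversational hill-climbing loop described above. It is a minimal,
hypothetical example rather than our exact implementation: the <code>evaluate_prompt</code> scorer
(few-shot CLIP accuracy), the model name, and the feedback wording are all illustrative assumptions.
</p>
<pre><code># Minimal sketch of the conversational "hill-climbing" prompt search.
# Assumes a hypothetical evaluate_prompt(prompt) -> float helper that
# scores a prompt template by few-shot CLIP accuracy.
from openai import OpenAI

client = OpenAI()

def hill_climb(initial_prompt, evaluate_prompt, steps=10):
    best_prompt, best_acc = initial_prompt, evaluate_prompt(initial_prompt)
    history = [{"role": "system",
                "content": "You write prompt templates for image classification."}]
    for _ in range(steps):
        # Textual feedback on the current best prompt serves as an
        # implicit "gradient" direction for the LLM's next proposal.
        history.append({"role": "user",
                        "content": f"The prompt '{best_prompt}' scored "
                                   f"{best_acc:.1%}. Propose a better prompt."})
        candidate = client.chat.completions.create(
            model="gpt-3.5-turbo", messages=history,
        ).choices[0].message.content.strip()
        history.append({"role": "assistant", "content": candidate})
        acc = evaluate_prompt(candidate)
        if acc > best_acc:  # keep the candidate only if it improves accuracy
            best_prompt, best_acc = candidate, acc
    return best_prompt, best_acc
</code></pre>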
</div>
</div>
</div>
</div>
</div>
</section>
<!-- End Method Overview -->
<!-- Text-to-Image generation -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Optimizing Text-to-Image (T2I) Generation</h2>
<div class="content has-text-justified">
<div class="item">
<!-- Your image here -->
<img src="static/images/dalle3.png" alt="Image showing DALL-E 3 output" />
<p style="text-align: justify; font-size: 16px; line-height: 1.5; color: #333;">
We apply our framework to optimize prompts for the state-of-the-art black-box generative VLM
DALL-E 3, using the chat-based multimodal LLM GPT-4V as the optimizer. For complicated user queries that
DALL-E 3 may initially fail to depict correctly, we send the generated image (highlighted in <span
style="color: violet;">violet</span>) along with the current prompt to GPT-4V to ask for feedback on
improvements (highlighted in <span style="color: red;">red</span>) and then generate a new prompt
(highlighted in <span style="color: blue;">blue</span>). We show that such a simple framework is
surprisingly effective at correcting DALL-E 3's mistakes on some challenging Winoground text queries
that involve action, logical, and spatial reasoning. We open-source our code at <a
href="https://github.com/shihongl1998/LLM-as-a-blackbox-optimizer" target="_blank"
style="color: #0066cc; text-decoration: none;">this link</a>
to facilitate future research on AI-driven content generation.
</p>
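<p style="text-align: justify; font-size: 16px; line-height: 1.5; color: #333;">
As a rough sketch of one such feedback round, assuming the OpenAI API for both models (the model
names, the message wording, and the choice to use GPT-4V's reply verbatim as the next prompt are
simplifying assumptions, not our exact implementation):
</p>
<pre><code># Hypothetical sketch of T2I prompt optimization with GPT-4V feedback.
from openai import OpenAI

client = OpenAI()

def optimize_t2i(query, rounds=3):
    prompt = query
    for _ in range(rounds):
        # Generate an image from the current prompt with DALL-E 3.
        image_url = client.images.generate(
            model="dall-e-3", prompt=prompt, n=1).data[0].url
        # Show GPT-4V the image and ask it to critique and rewrite the prompt.
        reply = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": [
                {"type": "text",
                 "text": f"Target: '{query}'. Current prompt: '{prompt}'. "
                         "Rewrite the prompt so the image matches the target. "
                         "Reply with the new prompt only."},
                {"type": "image_url", "image_url": {"url": image_url}},
            ]}],
        )
        prompt = reply.choices[0].message.content.strip()
    return prompt
</code></pre>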
</div>
<section class="hero is-small">
<div class="hero-body">
<div class="container">
<div id="results-carousel" class="carousel results-carousel">
<div class="item">
<div class="query-container">
<p class="user-query">The unmasked wrestler
hits the masked wrestler.</p>
</div>
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/2-1-1.png" alt="1" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/2-1-n.png" alt="2" />
<p class="subtitle-text">Optimized Image</p>
</div>
</div>
</div>
<div class="item">
<div class="query-container">
<p class="user-query">The person with earrings pays the person without earrings.</p>
</div>
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/4-1-1.png" alt="1" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/4-1-n.png" alt="2" />
<p class="subtitle-text">Optimized Image</p>
</div>
</div>
</div>
<div class="item">
<div class="query-container">
<p class="user-query">A bird eats a snake.</p>
</div>
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/5-1-1.png" alt="1" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/5-1-n.png" alt="2" />
<p class="subtitle-text">Optimized Image</p>
</div>
</div>
</div>
<div class="item">
<div class="query-container">
<p class="user-query">A shorter person is covering the eyes of a taller person.</p>
</div>
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/308-1-1.png" alt="1" style="width: 1230px;" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/308-1-n.png" alt="2" />
<p class="subtitle-text">Optimized Image</p>
</div>
</div>
</div>
<div class="item">
<div class="query-container">
<p class="user-query">There is less milk than orange juice.</p>
</div>
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/show_example2_init.png" alt="1" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/show_example2_final.png" alt="2" />
<p class="subtitle-text">Optimized Image</p>
</div>
</div>
</div>
<div class="item">
<div class="query-container">
<p class="user-query">Getting a horse wet.</p>
</div>
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/show_example6_init.png" alt="1" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/show_example6_final.png" alt="2" />
<p class="subtitle-text">Optimized Image</p>
</div>
</div>
</div>
<div class="item">
<div class="query-container">
<p class="user-query">Some are parking in a train.</p>
</div>
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/show_example7_init.png" alt="1" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/show_example7_final.png" alt="2" />
<p class="subtitle-text">Optimized Image</p>
</div>
</div>
</div>
<div class="item">
<div class="query-container">
<p class="user-query">The white wall will soon be painted blue.</p>
</div>
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/show_example8_init.png" alt="1" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/show_example8_final.png" alt="2" />
<p class="subtitle-text">Optimized Image</p>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
</div>
</div>
</div>
</div>
</section>
<!-- End Text-to-Image generation -->
<!-- Prompt Inversion -->
<section class="section hero is-light2">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Prompt Inversion using Chat-based Multimodal LLMs</h2>
<div class="content has-text-justified">
<p>
Given a user-specified reference (query) image, our framework reverse-engineers a prompt that makes DALL-E
generate the same object or scene as in the query image. This enables users to easily make customizations,
such as having the character in a reference image perform various actions or change scenes.
</p>
<div class="item">
<!-- Your image here -->
<img src="static/images/dalle3_inversion_new.png" alt="Visualization of Prompt Inversion Process"
style="max-width: 95%; height: auto; display: block; margin: 0 auto;">
<p style="text-align: justify; font-size: 16px; line-height: 1.5; margin-top: 10px; color: #333;">
We apply our framework to reverse-engineer a text prompt that reproduces the user-queried image. We
send the generated image (highlighted in <span style="color: violet;">violet</span>) along with the
original image to GPT-4V to ask for feedback on improvements (highlighted in <span
style="color: red;">red</span>) and then generate a new prompt (highlighted in <span
style="color: blue;">blue</span>).
</p>
</div>
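<p style="text-align: justify; font-size: 16px; line-height: 1.5; margin-top: 10px; color: #333;">
A minimal sketch of the inversion loop, under the same assumptions as above (model names and
message wording are illustrative, not our exact implementation):
</p>
<pre><code># Hypothetical sketch of prompt inversion against a reference image.
from openai import OpenAI

client = OpenAI()

def ask_gpt4v(text, image_urls):
    # Send text plus one or more images to GPT-4V and return its reply.
    content = [{"type": "text", "text": text}]
    content += [{"type": "image_url", "image_url": {"url": u}} for u in image_urls]
    reply = client.chat.completions.create(
        model="gpt-4o", messages=[{"role": "user", "content": content}])
    return reply.choices[0].message.content.strip()

def invert_prompt(query_image_url, rounds=3):
    # Seed: describe the reference image as a text-to-image prompt.
    prompt = ask_gpt4v("Describe this image as a detailed DALL-E prompt.",
                       [query_image_url])
    for _ in range(rounds):
        generated_url = client.images.generate(
            model="dall-e-3", prompt=prompt, n=1).data[0].url
        # Compare the generated image against the reference and revise the prompt.
        prompt = ask_gpt4v(
            "The first image is the reference; the second was generated from "
            f"the prompt '{prompt}'. Rewrite the prompt so the generated image "
            "matches the reference. Reply with the prompt only.",
            [query_image_url, generated_url])
    return prompt
</code></pre>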
<section class="hero is-small">
<div class="hero-body">
<div class="container">
<div id="results-carousel" class="carousel results-carousel">
<div class="item">
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/rhino_original.png" alt="1" />
<p class="subtitle-text">User Query</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/rhino_initial.png" alt="2" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/rhino_final.png" alt="3" />
<p class="subtitle-text">Final Image</p>
</div>
</div>
<h2 class="subtitle has-text-centered">
Create a digital artwork of a stylized, geometric rhinoceros head with a dynamic array of sharp,
crystalline facets in a monochromatic palette of black, white, and gray. The design should
feature intricate shadows and highlights to produce a three-dimensional illusion, with a focus
on accurately representing the creature's contours and muscle structure. Adjust the composition
to show the rhinoceros head from a frontal perspective, ensuring that both the horn and the ears
are symmetrically aligned in the center. Emphasize the geometric nature of the facets by making
them more pronounced and varied in shape, creating a complex mosaic that captures the interplay
of light and shadow. Add a slight glow to the edges of the facets to enhance the
three-dimensional effect and the metallic quality of the artwork. Display the rhinoceros head
against a pitch-black background, with a light source positioned to cast dramatic, high-contrast
illumination that emphasizes its multifaceted texture. Incorporate a subtle reflective sheen on
the surface to suggest a sleek, metallic finish, and ensure the rhinoceros's eye is detailed and
expressive, contributing to the overall lifelike appearance of the artwork.
</h2>
</div>
<div class="item">
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/orange_original.png" alt="1" />
<p class="subtitle-text">User Query</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/orange_initial.png" alt="2" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/orange_final.png" alt="3" />
<p class="subtitle-text">Final Image</p>
</div>
</div>
<h2 class="subtitle has-text-centered">
A hyper-realistic full slice of an orange with intricate details, including the textured pulp
and clearly defined rind, positioned off-center on a reflective gradient surface transitioning
from white to dark. The orange's juicy texture is accentuated by a dynamic splash of juice, with
droplets captured mid-air, creating an energetic and lively scene. The lighting is dramatic and
contrasting, with a spotlight effect casting a pronounced shadow to one side to enhance the
three-dimensional effect and emphasize the vibrant orange color. Include a clear reflection on
the surface and a small stem attached to the orange slice to underscore the realism and
freshness. Enhance the composition by ensuring the orange slice is angled slightly, with the
splash of juice originating from the lower right side, to add a sense of motion and vitality.
</h2>
</div>
<div class="item">
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/knight_original.png" alt="1" />
<p class="subtitle-text">User Query</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/knight_initial.png" alt="2" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/knight_final.png" alt="3" />
<p class="subtitle-text">Final Image</p>
</div>
</div>
<h2 class="subtitle has-text-centered">
A medieval knight in full armor stands with a shield, the dark background highlighting his
silhouette against a subtle warm glow. His helmet features a visor with a single vertical slit,
and his armor includes a chainmail coif beneath a segmented plate gorget and articulated plate
gauntlets, with layered plate armor and flared ridged pauldrons. The knight's shield is centered
and bears a detailed, embossed golden fleur-de-lis on a field of weathered steel, surrounded by
rivets. The vibrant orange cloak drapes over both shoulders and behind his back, adding a touch
of regal color to the composition. His stance is grounded and balanced, with his left arm
extended, presenting the shield, and his right hand resting on the pommel of his sword, exuding
a calm and noble demeanor.
</h2>
</div>
<div class="item">
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/dove_original.png" alt="1" />
<p class="subtitle-text">User Query</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/dove_initial.png" alt="2" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/dove_final.png" alt="3" />
<p class="subtitle-text">Final Image</p>
</div>
</div>
<h2 class="subtitle has-text-centered">
Create a stylized illustration of a dove in flight, with feathers that transition smoothly
through a spectrum of colors including red, orange, yellow, green, blue, indigo, and violet. The
dove's plumage should resemble a dynamic, three-dimensional arrangement of vibrant, overlapping
feathers, giving a sense of movement and freedom. The style should be a fusion of semi-realistic
and digital art, with a focus on vivid colors and a clean, light background that emphasizes the
artwork's lively and spirited nature. Adjust the feather arrangement to be more structured and
flame-like, with the feathers at the tips being more elongated and pointed to enhance the sense
of elegance and flow.
</h2>
</div>
<div class="item">
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/dino_original.png" alt="1" />
<p class="subtitle-text">User Query</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/dino_initial.png" alt="2" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/dino_final.png" alt="3" />
<p class="subtitle-text">Final Image</p>
</div>
</div>
<h2 class="subtitle has-text-centered">
Create an illustration of a stylized, geometric dinosaur with a textured body in two shades of
green: a lighter green for the main body and a darker green for the spiky plates along its back.
The dinosaur should have a friendly demeanor, with a long, curved tail and a smooth, rounded
head featuring two small, circular white eyes with black pupils. It should stand on two legs
with small, rounded feet, each with three visible toes. The background should be a flat, light
beige color, with a simple, elongated shadow extending to the right of the dinosaur, indicating
a soft light source to the left.
</h2>
</div>
<div class="item">
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/show_example3_original.png" alt="1" />
<p class="subtitle-text">User Query</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/show_example3_init.png" alt="2" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/show_example3_final.png" alt="3" />
<p class="subtitle-text">Final Image</p>
</div>
</div>
<h2 class="subtitle has-text-centered">
Generate an image of a cartoon-style polar bear with gleefully closed eyes and a wide, toothy
grin, revealing just a hint of its tongue. The bear should look exuberant, standing on its hind
legs with arms open wide as if ready for a hug. The bear's fur should appear extremely soft and
fluffy, with a pronounced blush of rosy pink on both cheeks and belly, enhancing its charm.
Adorn the bear with a cozy, chunky-knit scarf, vibrant red with prominent, horizontal white
stripes, stylishly wrapped around its neck and draping with a dense tassel fringe at the ends.
Situate the bear against a gentle pastel pink backdrop, scattered with delicate, small
snowflakes, conveying the splendor and coziness of festive winter cheer.
</h2>
</div>
<div class="item">
<!-- Image container -->
<div class="images-container">
<div class="image-with-subtitle">
<img src="static/images/show_example5_original.png" alt="1" />
<p class="subtitle-text">User Query</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/show_example5_init.png" alt="2" />
<p class="subtitle-text">Initial Image</p>
</div>
<div class="image-with-subtitle">
<img src="static/images/show_example5_final.png" alt="3" />
<p class="subtitle-text">Final Image</p>
</div>
</div>
<h2 class="subtitle has-text-centered">
An anthropomorphic duck standing confidently with hands on hips, styled as a classic film noir
detective. The duck has a calm and cool expression, wearing a tan detective's fedora hat and a
matching double-breasted trench coat, buttoned up, with a broad collar, epaulets, and a belted
waist. The character has a white shirt and a patterned tie with a diagonal stripe design
underneath. The character has orange webbed feet and a large, prominent beak. The lighting is
dramatic, with a strong contrast between light and shadow, creating a focused shadow on the
background that mimics the character's silhouette. The overall color palette is warm with a
gentle light source coming from the side, casting the background in a gradient from warm beige
to shadows, giving the image a mysterious and dramatic appearance.
</h2>
</div>
</div>
</div>
</div>
</section>
<h3 class="subtitle has-text-centered">
Image customization based on inverted images
</h3>
<table>
<tr>
<th>User Query</th>
<th>Inverted Image</th>
<th>Example 1</th>
<th>Example 2</th>
<th>Example 3</th>
<th>Example 4</th>
<th>Example 5</th>
</tr>
<tr>
<td><img src="static/images/shiba_original.png" alt="User Query Image"></td>
<td><img src="static/images/shiba_final.png" alt="Inverted Image"></td>
<td>
<img src="static/images/shiba_give_the_dog_a_cat_friend.png" alt="Example 1">
<div class="caption">Give the dog a cat friend.</div>
</td>
<td>
<img src="static/images/shiba_make_the_dog_be_in_the_middle_of_a_jump.png" alt="Example 2">
<div class="caption">Make the dog be in the middle of a jump.</div>
</td>
<td>
<img src="static/images/shiba_make_the_dog_do_a_handstand.png" alt="Example 3">
<div class="caption">Make the dog do a handstand.</div>
</td>
<td>
<img src="static/images/shiba_make_the_dog_lie_down_on_its_side.png" alt="Example 4">
<div class="caption">Make the dog lie down on its side.</div>
</td>
<td>
<img src="static/images/shiba_make_the_dog_swim_in_water.png" alt="Example 5">
<div class="caption">Make the dog swim in water.</div>
</td>
</tr>
<!-- Repeat the row structure for each additional set of queries and images -->
<tr>
<td><img src="static/images/owl_original.png" alt="User Query Image"></td>
<td><img src="static/images/owl_final.png" alt="Inverted Image"></td>
<td>
<img src="static/images/owl_make_the_owl_fight_a_hawk.png" alt="Example 1">
<div class="caption">Make the owl fight a hawk.</div>
</td>
<td>
<img src="static/images/owl_make_the_owl_flap_its_wings.png" alt="Example 2">
<div class="caption">Make the owl flap its wings.</div>
</td>
<td>
<img src="static/images/owl_make_the_owl_fully_green.png" alt="Example 3">
<div class="caption">Make the owl fully green.</div>
</td>
<td>
<img src="static/images/owl_make_the_owl_stand_in_front_of_the_moon.png" alt="Example 4">
<div class="caption">Make the owl stand in front of the moon.</div>
</td>
<td>
<img src="static/images/owl_make_the_owl_walk_in_the_city.png" alt="Example 5">
<div class="caption">Make the owl walk in the city.</div>
</td>
</tr>
</table>
</div>
</div>
</div>
</div>
</section>
<!-- End Prompt Inversion -->
<!-- Quantitative Results -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Comparison of our method with other baselines on one-shot classification tasks.</h2>
<div class="content has-text-justified">
<div class="item">
<!-- Your image here -->
<img src="static/images/table.png" alt="Image showing DALL-E 3 output" />
<p style="text-align: justify; font-size: 16px; line-height: 1.5; color: #333;">
We report the average accuracy of each method across three folds, optimized using 1-shot training sets.
We bold the best black-box result for each dataset, and underline the second best result. First, we note
that our approach effectively improves upon the initial prompts selected from LAIONCOCO-1M, from 56%
to 61%. Our approach is also competitive against the best human-engineered prompts released by OpenAI,
which were searched using test-set performance. Additionally, we show that using both positive and negative
prompts improves the overall accuracy by 1%. For reference, we report oracle white-box approaches in
gray. Remarkably, we also surpass white-box solutions such as WiSE-FT and CoOp by 1.5%. These
methods require either gradient-based fine-tuning (CoOp/WiSE-FT/Cross-Modal) or prompt ensembling using
output logits (DCLIP). While our approach is less effective than the SOTA white-box method (Cross-Modal
Adaptation), we stress that our black-box setup is significantly more challenging, because we restrict
the optimization space to natural language and do not access the pre-trained weights, model
architectures, feature embeddings, or output logits of VLMs.
</p>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- End Quantitative Results -->
<!-- Qualitative Results -->
<section class="section hero is-light2">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Example resulting templates on each dataset.</h2>
<div class="content has-text-justified">
<div class="item">
<!-- Your image here -->
<table>
<tr>
<th>Dataset</th>
<th>Example of Top Templates</th>
</tr>
<tr>
<td>Caltech </td>
<td>An image of a {} with a <strong>blurred</strong> background that emphasizes the subject</td>
</tr>
<tr>
<td>DTD </td>
<td>The essential elements of {} are <strong>amplified with visual simplicity</strong></td>
</tr>
<tr>
<td>EuroSAT</td>
<td>A <strong>top-down</strong> view of {} arranged in a pattern {}</td>
</tr>
<tr>
<td>Aircraft</td>
<td>A clear, high-quality image of a single {} with a <strong>white background</strong> </td>
</tr>
<tr>
<td>Food</td>
<td>A {} featuring diverse <strong>cuisine</strong> and ingredients</td>
</tr>
<tr>
<td>ImageNet</td>
<td>An image of a {} with bright and <strong>natural lighting</strong></td>
</tr>
<tr>
<td>Flowers</td>
<td>A clear and <strong>vivid</strong> photograph of the {} in its <strong>natural setting</strong>
</td>
</tr>
<tr>
<td>Pets</td>
<td>A {} with distinct and <strong>recognizable</strong> characteristics</td>
</tr>
<tr>
<td>Cars</td>
<td>A {} featuring a wide range of <strong>color options</strong> for easy selection</td>
</tr>
<tr>
<td>SUN</td>
<td>A high-resolution photo of a {} with clear background and natural <strong>lighting</strong></td>
</tr>
<tr>
<td>UCF</td>
<td>A black and white photo of a {} <strong>in motion</strong></td>
</tr>
</table>
<p style="text-align: justify; font-size: 16px; line-height: 1.5; color: #333;">
Although we do not provide ChatGPT with any information about the target dataset, we observe that
the resulting templates are remarkably similar to human-engineered templates, with many domain-specific
details such as “motion” and “cuisine”, and stylistic elements such as “bright and natural lighting”.
</p>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- End Qualitative Results -->
<!--BibTex citation -->
<section class="section hero is-light" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@inproceedings{liu2023language,
title={Language models as black-box optimizers for vision-language models},
author={Liu, Shihong and Lin, Zhiqiu and Yu, Samuel and Lee, Ryan and Ling, Tiffany and Pathak, Deepak and Ramanan, Deva},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
year={2024}
}</code></pre>
</div>
</section>
<!--End BibTex citation -->
</body>
</html>