From eb1e321ad47a01e00ce80df607946d6d566930ff Mon Sep 17 00:00:00 2001
From: Omar Khattab
Date: Sat, 28 Dec 2024 13:38:43 -0800
Subject: [PATCH] Add AlfWorld dataset and tutorial. Improve BootstrapFT. (#1996)

---
 docs/docs/tutorials/games/index.ipynb  | 656 +++++++++++++++++++++++++
 dspy/datasets/__init__.py              |   1 +
 dspy/datasets/alfworld/__init__.py     |   1 +
 dspy/datasets/alfworld/alfworld.py     | 149 ++++
 dspy/datasets/alfworld/base_config.yml | 145 ++++
 dspy/teleprompt/bootstrap_finetune.py  |  79 +--
 6 files changed, 997 insertions(+), 34 deletions(-)
 create mode 100644 docs/docs/tutorials/games/index.ipynb
 create mode 100644 dspy/datasets/alfworld/__init__.py
 create mode 100644 dspy/datasets/alfworld/alfworld.py
 create mode 100644 dspy/datasets/alfworld/base_config.yml

diff --git a/docs/docs/tutorials/games/index.ipynb b/docs/docs/tutorials/games/index.ipynb
new file mode 100644
index 000000000..e6b794353
--- /dev/null
+++ b/docs/docs/tutorials/games/index.ipynb
@@ -0,0 +1,656 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Tutorial: Fine-tuning Agents\n",
+    "\n",
+    "Let's walk through a quick example of optimizing the _language model weights_ (i.e., fine-tuning) inside a DSPy module that represents a ReAct agent playing a game with 50-step tasks.\n",
+    "\n",
+    "### Install dependencies and download data\n",
+    "\n",
+    "Install the latest DSPy via `pip install -U --pre dspy` and follow along. This tutorial uses the AlfWorld dataset, which requires DSPy 2.6.0 (pre-release).\n",
+    "\n",
+    "You will also need the following dependencies:\n",
+    "\n",
+    "```shell\n",
+    "> pip install -U alfworld==0.3.5 multiprocess\n",
+    "> alfworld-download\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set up the language models\n",
+    "\n",
+    "Our goal is to enable `gpt-4o-mini` to play the AlfWorld household game proficiently, without tinkering with string prompts or example trajectories by hand.\n",
+    "\n",
+    "Though it's not strictly necessary, we'll make our job a little easier by using the larger `gpt-4o` for prompt optimization and as the teacher for fine-tuning our small `gpt-4o-mini` agent."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dspy\n",
+    "\n",
+    "gpt4o_mini = dspy.LM('gpt-4o-mini-2024-07-18')\n",
+    "gpt4o = dspy.LM('openai/gpt-4o')\n",
+    "dspy.configure(experimental=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's load 200 training and 200 development tasks from AlfWorld. The dataset is much larger, but a small number of examples will keep this tutorial's total runtime to 1-2 hours, including fine-tuning.\n",
+    "\n",
+    "With just 200 training tasks, we'll teach 4o-mini to go from 15% (it can barely play the game) to 72%. If you use 500 tasks and retain the demonstrations during fine-tuning, you can easily push that to 82%."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(200, 200)"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from dspy.datasets.alfworld import AlfWorld\n",
+    "\n",
+    "alfworld = AlfWorld()\n",
+    "trainset, devset = alfworld.trainset[:200], alfworld.devset[-200:]\n",
+    "len(trainset), len(devset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before we proceed, let's view an example of this task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-= Welcome to TextWorld, ALFRED! =-\n",
+      "\n",
+      "You are in the middle of a room. Looking quickly around you, you see a countertop 1, a drawer 8, a drawer 7, a drawer 6, a drawer 5, a drawer 4, a drawer 3, a drawer 2, a drawer 1, a garbagecan 1, a handtowelholder 1, a sinkbasin 2, a sinkbasin 1, a toilet 1, a toiletpaperhanger 1, and a towelholder 1.\n",
+      "\n",
+      "Your task is to: put a clean soapbar in garbagecan.\n"
+     ]
+    }
+   ],
+   "source": [
+    "example = trainset[0]\n",
+    "\n",
+    "with alfworld.POOL.session() as env:\n",
+    "    task, info = env.init(**example.inputs())\n",
+    "\n",
+    "print(task)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Defining the Agent program\n",
+    "\n",
+    "The agent is a pretty simple `dspy.Module` with one sub-module called `self.react`.\n",
+    "\n",
+    "This sub-module consumes a definition of a specific `task`, its previous `trajectory`, and a list of\n",
+    "`possible_actions` it can take. It simply responds with the next action.\n",
+    "\n",
+    "In the `forward` method, we initialize an environment for the given task `idx` and loop for up to `self.max_iters` steps,\n",
+    "repeatedly invoking the `self.react` module to take the next action."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Agent(dspy.Module):\n",
+    "    def __init__(self, max_iters=50, verbose=False):\n",
+    "        self.max_iters = max_iters\n",
+    "        self.verbose = verbose\n",
+    "        self.react = dspy.Predict(\"task, trajectory, possible_actions: list[str] -> action\")\n",
+    "\n",
+    "    def forward(self, idx):\n",
+    "        with alfworld.POOL.session() as env:\n",
+    "            trajectory = []\n",
+    "            task, info = env.init(idx)\n",
+    "            if self.verbose:\n",
+    "                print(f\"Task: {task}\")\n",
+    "\n",
+    "            for _ in range(self.max_iters):\n",
+    "                trajectory_ = \"\\n\".join(trajectory)\n",
+    "                possible_actions = info[\"admissible_commands\"][0] + [\"think: ${...thoughts...}\"]\n",
+    "                prediction = self.react(task=task, trajectory=trajectory_, possible_actions=possible_actions)\n",
+    "                trajectory.append(f\"> {prediction.action}\")\n",
+    "\n",
+    "                if prediction.action.startswith(\"think:\"):\n",
+    "                    trajectory.append(\"OK.\")\n",
+    "                    continue\n",
+    "\n",
+    "                obs, reward, done, info = env.step(prediction.action)\n",
+    "                obs, reward, done = obs[0], reward[0], done[0]\n",
+    "                trajectory.append(obs)\n",
+    "\n",
+    "                if self.verbose:\n",
+    "                    print(\"\\n\".join(trajectory[-2:]))\n",
+    "\n",
+    "                if done:\n",
+    "                    break\n",
+    "\n",
+    "        assert reward == int(info[\"won\"][0]), (reward, info[\"won\"][0])\n",
+    "        return dspy.Prediction(trajectory=trajectory, success=reward)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Aside: If you wanted to include instructions for your agent...\n",
+    "\n",
+    "Above, we opted to keep the agent super simple, without even providing short instructions that describe the task.\n",
+    "\n",
+    "In principle, you can copy a short definition of the AlfWorld task (based on Yao et al., 2022) and use that as the\n",
+    "instruction for your agent.
This is not inherently essential, but it helps illustrate the role that\n", + "instructions play in DSPy: they're not for coercing the model to exhibit a certain behavior, but they're there to\n", + "describe the fundamentals of the task in a straightforward, human-readable way.\n", + "\n", + "If you want to do that, you can simply replace this:\n", + "\n", + "```python\n", + "self.react = dspy.Predict(\"task, trajectory, possible_actions: list[str] -> action\")\n", + "```\n", + "\n", + "with this:\n", + "\n", + "```python\n", + "INSTRUCTIONS = \"\"\"\n", + "Interact with a simulated household to achieve a high-level goal. Make sure to plan, track subgoals,\n", + "determine likely locations for common household items (e.g. desklamps will likely be on desks, shelfs, or dressers),\n", + "and explore systematically (e.g. check all desks one by one for desklamp).\n", + "\"\"\".strip()\n", + "\n", + "self.react = dspy.Predict(dspy.Signature(\"task, trajectory, possible_actions: list[str] -> action\", INSTRUCTIONS))\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Zero-shot evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's try this simple program, prior to any optimization work." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task: -= Welcome to TextWorld, ALFRED! =-\n", + "\n", + "You are in the middle of a room. Looking quickly around you, you see a countertop 1, a drawer 8, a drawer 7, a drawer 6, a drawer 5, a drawer 4, a drawer 3, a drawer 2, a drawer 1, a garbagecan 1, a handtowelholder 1, a sinkbasin 2, a sinkbasin 1, a toilet 1, a toiletpaperhanger 1, and a towelholder 1.\n", + "\n", + "Your task is to: put a clean soapbar in garbagecan.\n", + "> go to countertop 1\n", + "You arrive at countertop 1. On the countertop 1, you see a candle 1, a soapbar 1, a soapbottle 2, a soapbottle 1, and a spraybottle 1.\n", + "> take soapbar 1 from countertop 1\n", + "You pick up the soapbar 1 from the countertop 1.\n", + "> go to garbagecan 1\n", + "You arrive at garbagecan 1. On the garbagecan 1, you see nothing.\n", + "> move soapbar 1 to garbagecan 1\n", + "You move the soapbar 1 to the garbagecan 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> take soapbar 1 from garbagecan 1\n", + "You pick up the soapbar 1 from the garbagecan 1.\n", + "> move soapbar 1 to garbagecan 1\n", + "You move the soapbar 1 to the garbagecan 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> look\n", + "You are facing the garbagecan 1. 
Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n", + "> examine garbagecan 1\n", + "On the garbagecan 1, you see a soapbar 1.\n", + "> look\n", + "You are facing the garbagecan 1. Next to it, you see nothing.\n" + ] + }, + { + "data": { + "text/plain": [ + "Prediction(\n", + " trajecotry=['> go to countertop 1', 'You arrive at countertop 1. On the countertop 1, you see a candle 1, a soapbar 1, a soapbottle 2, a soapbottle 1, and a spraybottle 1.', '> take soapbar 1 from countertop 1', 'You pick up the soapbar 1 from the countertop 1.', '> go to garbagecan 1', 'You arrive at garbagecan 1. On the garbagecan 1, you see nothing.', '> move soapbar 1 to garbagecan 1', 'You move the soapbar 1 to the garbagecan 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. 
Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> take soapbar 1 from garbagecan 1', 'You pick up the soapbar 1 from the garbagecan 1.', '> move soapbar 1 to garbagecan 1', 'You move the soapbar 1 to the garbagecan 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.', '> examine garbagecan 1', 'On the garbagecan 1, you see a soapbar 1.', '> look', 'You are facing the garbagecan 1. Next to it, you see nothing.'],\n", + " success=0\n", + ")" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent_4o = Agent()\n", + "agent_4o.set_lm(gpt4o)\n", + "agent_4o.verbose = True\n", + "\n", + "agent_4o(**example.inputs())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Okay, in this case it couldn't solve this example! Now, let's check the average quality of 4o and 4o-mini." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metric = lambda x, y, trace=None: y.success\n",
+    "evaluate = dspy.Evaluate(devset=devset, metric=metric, display_progress=True, num_threads=16)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Metric: 115.00 / 200 (57.5%): 100%|██████████| 200/200 [06:14<00:00, 1.87s/it]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/12/28 11:10:25 INFO dspy.evaluate.evaluate: Average Metric: 115 / 200 (57.5%)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "57.5"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "agent_4o.verbose = False\n",
+    "evaluate(agent_4o)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Metric: 30.00 / 200 (15.0%): 100%|██████████| 200/200 [08:33<00:00, 2.57s/it]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/12/28 11:18:59 INFO dspy.evaluate.evaluate: Average Metric: 30 / 200 (15.0%)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "15.0"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "agent_4o_mini = Agent()\n",
+    "agent_4o_mini.set_lm(gpt4o_mini)\n",
+    "\n",
+    "evaluate(agent_4o_mini)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Out of the box, on this task, 4o is decent (58% success rate) while 4o-mini struggles (15% success rate).\n",
+    "\n",
+    "Let's apply the following strategy:\n",
+    "\n",
+    "1. We'll optimize the _prompts_ for gpt-4o in a lightweight way.\n",
+    "2. We'll then use this prompt-optimized agent as a teacher to fine-tune gpt-4o-mini on the task. This will increase its quality from 15% to 72% (or 82% if you use 500 trainset examples)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prompt-optimizing GPT-4o"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimizer = dspy.MIPROv2(metric=metric, auto=\"light\", num_threads=16, prompt_model=gpt4o)\n",
+    "\n",
+    "config = dict(max_bootstrapped_demos=1, max_labeled_demos=0, minibatch_size=40)\n",
+    "optimized_4o = optimizer.compile(agent_4o, trainset=trainset, **config, requires_permission_to_run=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Fine-tuning GPT-4o-mini\n",
+    "\n",
+    "For fine-tuning, we'll need a teacher program (`optimized_4o` above) and a student program derived from it (`student_4o_mini` below)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "student_4o_mini = optimized_4o.deepcopy()\n", + "student_4o_mini.set_lm(gpt4o_mini)\n", + "# student_4o_mini.react.demos = [] # you can optionally reset the demos" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = dspy.BootstrapFinetune(metric=metric, num_threads=16)\n", + "finetuned_4o_mini = optimizer.compile(student_4o_mini, teacher=optimized_4o, trainset=trainset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate the finetuned GPT-4o-mini agent" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 143.00 / 200 (71.5%): 100%|██████████| 200/200 [03:15<00:00, 1.05it/s]" + ] + } + ], + "source": [ + "evaluate(finetuned_4o_mini)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Having done all this optimization, let's save our program so we can use it later! This will keep a reference to the fine-tuned model as well, as long as it continued to exist with the same identifier at the provider side." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "finetuned_4o_mini.save('finetuned_4o_mini_001.pkl')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now check out one task using our fine-tuned agent program!" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task: -= Welcome to TextWorld, ALFRED! =-\n", + "\n", + "You are in the middle of a room. Looking quickly around you, you see a armchair 1, a cabinet 1, a drawer 21, a drawer 20, a drawer 19, a drawer 18, a drawer 17, a drawer 16, a drawer 15, a drawer 14, a drawer 13, a drawer 12, a drawer 11, a drawer 10, a drawer 9, a drawer 8, a drawer 7, a drawer 6, a drawer 5, a drawer 4, a drawer 3, a drawer 2, a drawer 1, a dresser 1, a garbagecan 1, a sidetable 5, a sidetable 4, a sidetable 3, a sidetable 2, a sidetable 1, and a sofa 1.\n", + "\n", + "Your task is to: put some box on dresser.\n", + "> go to cabinet 1\n", + "You arrive at cabinet 1. On the cabinet 1, you see nothing.\n", + "> go to dresser 1\n", + "You arrive at dresser 1. On the dresser 1, you see a book 1, a newspaper 1, a remotecontrol 1, a statue 3, and a television 1.\n", + "> look\n", + "You are facing the dresser 1. Next to it, you see nothing.\n", + "> go to sidetable 1\n", + "You arrive at sidetable 1. On the sidetable 1, you see a cellphone 1, and a desklamp 1.\n", + "> go to sidetable 2\n", + "You arrive at sidetable 2. On the sidetable 2, you see a box 2.\n", + "> take box 2 from sidetable 2\n", + "You pick up the box 2 from the sidetable 2.\n", + "> go to dresser 1\n", + "You arrive at dresser 1. On the dresser 1, you see a book 1, a newspaper 1, a remotecontrol 1, a statue 3, and a television 1.\n", + "> move box 2 to dresser 1\n", + "You move the box 2 to the dresser 1.\n" + ] + }, + { + "data": { + "text/plain": [ + "Prediction(\n", + " trajecotry=['> go to cabinet 1', 'You arrive at cabinet 1. On the cabinet 1, you see nothing.', '> go to dresser 1', 'You arrive at dresser 1. 
On the dresser 1, you see a book 1, a newspaper 1, a remotecontrol 1, a statue 3, and a television 1.', '> look', 'You are facing the dresser 1. Next to it, you see nothing.', '> go to sidetable 1', 'You arrive at sidetable 1. On the sidetable 1, you see a cellphone 1, and a desklamp 1.', '> go to sidetable 2', 'You arrive at sidetable 2. On the sidetable 2, you see a box 2.', '> take box 2 from sidetable 2', 'You pick up the box 2 from the sidetable 2.', '> go to dresser 1', 'You arrive at dresser 1. On the dresser 1, you see a book 1, a newspaper 1, a remotecontrol 1, a statue 3, and a television 1.', '> move box 2 to dresser 1', 'You move the box 2 to the dresser 1.'],\n", + " success=1\n", + ")" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "finetuned_4o_mini.verbose = True\n", + "finetuned_4o_mini(**devset[0].inputs())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to load and use the agent program, you can do that as follows." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "loaded = Agent()\n", + "loaded.load('finetuned_4o_mini_001.pkl')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "jun2024_py310", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dspy/datasets/__init__.py b/dspy/datasets/__init__.py index 582be6e29..d9d8c510f 100644 --- a/dspy/datasets/__init__.py +++ b/dspy/datasets/__init__.py @@ -3,6 +3,7 @@ from dspy.datasets.dataset import Dataset from dspy.datasets.hotpotqa import HotPotQA from dspy.datasets.math import MATH +from dspy.datasets.alfworld import AlfWorld __all__ = [ "Colors", diff --git a/dspy/datasets/alfworld/__init__.py b/dspy/datasets/alfworld/__init__.py new file mode 100644 index 000000000..9a1bc42d5 --- /dev/null +++ b/dspy/datasets/alfworld/__init__.py @@ -0,0 +1 @@ +from dspy.datasets.alfworld.alfworld import AlfWorld \ No newline at end of file diff --git a/dspy/datasets/alfworld/alfworld.py b/dspy/datasets/alfworld/alfworld.py new file mode 100644 index 000000000..8a78a3e01 --- /dev/null +++ b/dspy/datasets/alfworld/alfworld.py @@ -0,0 +1,149 @@ +import os +import queue +import random + +def env_worker(inq, outq): + """ + Worker process: creates a single AlfredTWEnv instance, + handles 'init' (with task idx) and 'step' (with action). + """ + + try: + import io + import yaml + import alfworld.agents.environment as environment + from contextlib import redirect_stdout, redirect_stderr + except ImportError: + raise ImportError("alfworld is not installed. 
" \ + "Please install it via `pip install alfworld==0.3.5` then run `alfworld-download`.") + + buf = io.StringIO() + base_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(base_dir, 'base_config.yml') + + with open(config_path) as f: + config = yaml.safe_load(f) + + with redirect_stdout(buf), redirect_stderr(buf): + base_env = environment.AlfredTWEnv(config, train_eval="train") + + env = None + while True: + cmd, data = inq.get() + if cmd == 'init': + env = base_env.init_env(batch_size=1) + env.skip(data) + task_def, info = env.reset() + outq.put((task_def[0], info)) + elif cmd == 'step': + obs, rew, done, info = env.step([data]) + outq.put((obs, rew, done, info)) + elif cmd == 'close': + outq.put('CLOSED') + break + else: + outq.put('UNKNOWN_CMD') + + +class EnvPool: + """ + Pool of processes, each with a unique env_worker. + Acquire a worker using a context manager for safe usage: + with pool.session() as sess: + sess.init(5) # init with idx=5 + obs, rew, done, info = sess.step("go north") + ... + """ + def __init__(self, size=2): + self.size = size + self.workers = [] + self.available = queue.Queue() + + try: + import multiprocess as mp + except ImportError: + raise ImportError("multiprocess is not installed. " \ + "Please install it via `pip install multiprocess`.") + + # Must call set_start_method('spawn') here, before creating any processes + try: + mp.set_start_method("spawn", force=True) + except RuntimeError: + # If it's already set, ignore + pass + + ctx = mp.get_context("spawn") + for i in range(size): + inq = ctx.Queue() + outq = ctx.Queue() + p = ctx.Process(target=env_worker, args=(inq, outq), daemon=True) + p.start() + self.workers.append((inq, outq, p)) + self.available.put(i) + + def _acquire(self): + wid = self.available.get() + return wid, self.workers[wid][0], self.workers[wid][1] + + def _release(self, wid): + self.available.put(wid) + + def close_all(self): + """Close all processes in the pool.""" + while not self.available.empty(): + wid = self.available.get() + inq, outq, proc = self.workers[wid] + inq.put(('close', None)) + outq.get() # Wait 'CLOSED' + inq.close() + outq.close() + proc.join() + + def session(self): + """Context manager that acquires/releases a single worker.""" + return _EnvSession(self) + + +class _EnvSession: + """ + A context manager that acquires a worker from the pool, + provides .init(idx) and .step(action), then releases the worker. 
+ """ + def __init__(self, pool: EnvPool): + self.pool = pool + self.wid = None + self.inq = None + self.outq = None + + def __enter__(self): + self.wid, self.inq, self.outq = self.pool._acquire() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.pool._release(self.wid) + + def init(self, idx): + self.inq.put(('init', idx)) + return self.outq.get() # (task_def, info) + + def step(self, action): + self.inq.put(('step', action)) + return self.outq.get() # (obs, rew, done, info) + + +class AlfWorld: + def __init__(self, max_threads=20): + self.POOL = EnvPool(size=max_threads) + + import dspy + dataset = [dspy.Example(idx=idx).with_inputs('idx') for idx in range(3500)] + random.Random(0).shuffle(dataset) + + trainset, devset = dataset[:3000], dataset[-500:] + assert len(trainset) + len(devset) <= len(dataset) + + self.trainset = trainset + self.devset = devset + + def __del__(self): + self.POOL.close_all() diff --git a/dspy/datasets/alfworld/base_config.yml b/dspy/datasets/alfworld/base_config.yml new file mode 100644 index 000000000..03c6d146b --- /dev/null +++ b/dspy/datasets/alfworld/base_config.yml @@ -0,0 +1,145 @@ +dataset: + data_path: '$ALFWORLD_DATA/json_2.1.1/train' + eval_id_data_path: '$ALFWORLD_DATA/json_2.1.1/valid_seen' # null/None to disable + eval_ood_data_path: '$ALFWORLD_DATA/json_2.1.1/valid_unseen' # null/None to disable + num_train_games: -1 # max training games (<=0 indicates full dataset) + num_eval_games: -1 # max evaluation games (<=0 indicates full dataset) + +logic: + domain: '$ALFWORLD_DATA/logic/alfred.pddl' # PDDL domain file that defines the world dynamics + grammar: '$ALFWORLD_DATA/logic/alfred.twl2' # Grammar file that defines the text feedbacks + +env: + type: 'AlfredTWEnv' # 'AlfredTWEnv' or 'AlfredThorEnv' or 'AlfredHybrid' + regen_game_files: False # check if game is solvable by expert and save to game.tw-pddl file + domain_randomization: False # shuffle Textworld print order and object id nums + task_types: [1, 2, 3, 4, 5, 6] # task-type ids: 1 - Pick & Place, 2 - Examine in Light, 3 - Clean & Place, 4 - Heat & Place, 5 - Cool & Place, 6 - Pick Two & Place + expert_timeout_steps: 150 # max steps before timeout for expert to solve the task + expert_type: "handcoded" # 'handcoded' or 'downward'. 
Note: the downward planner is very slow for real-time use + goal_desc_human_anns_prob: 0.0 # prob of using human-annotated goal language instead of templated goals (1.0 indicates all human annotations from ALFRED) + + hybrid: + start_eps: 100000 # starting episode of hybrid training, tw-only training upto this point + thor_prob: 0.5 # prob of AlfredThorEnv during hybrid training + eval_mode: "tw" # 'tw' or 'thor' - env used for evaluation during hybrid training + + thor: + screen_width: 300 # width of THOR window + screen_height: 300 # height of THOR window + smooth_nav: False # smooth rotations, looks, and translations during navigation (very slow) + save_frames_to_disk: False # save frame PNGs to disk (useful for making videos) + save_frames_path: './videos/' # path to save frame PNGs + +controller: + type: 'oracle' # 'oracle' or 'oracle_astar' or 'mrcnn' or 'mrcnn_astar' (aka BUTLER) + debug: False + load_receps: True # load receptacle locations from precomputed dict (if available) + +mask_rcnn: + pretrained_model_path: '$ALFWORLD_DATA/detectors/mrcnn.pth' + +general: + random_seed: 42 + use_cuda: True # disable this when running on machine without cuda + visdom: False # plot training/eval curves, run with visdom server + task: 'alfred' + training_method: 'dagger' # 'dqn' or 'dagger' + save_path: './training/' # path to save pytorch models + observation_pool_capacity: 3 # k-size queue, 0 indicates no observation + hide_init_receptacles: False # remove initial observation containing navigable receptacles + + training: + batch_size: 10 + max_episode: 50000 + smoothing_eps: 0.1 + optimizer: + learning_rate: 0.001 + clip_grad_norm: 5 + + evaluate: + run_eval: True + batch_size: 10 + env: + type: "AlfredTWEnv" + + checkpoint: + report_frequency: 1000 # report every N episode + experiment_tag: 'test' # name of experiment + load_pretrained: False # during test, enable this so that the agent load your pretrained model + load_from_tag: 'not loading anything' # name of pre-trained model to load in save_path + + model: + encoder_layers: 1 + decoder_layers: 1 + encoder_conv_num: 5 + block_hidden_dim: 64 + n_heads: 1 + dropout: 0.1 + block_dropout: 0.1 + recurrent: True + +rl: + action_space: "admissible" # 'admissible' (candidates from text engine) or 'generation' (seq2seq-style generation) or 'beam_search_choice' or 'exhaustive' (not working) + max_target_length: 20 # max token length for seq2seq generation + beam_width: 10 # 1 means greedy + generate_top_k: 3 + + training: + max_nb_steps_per_episode: 50 # terminate after this many steps + learn_start_from_this_episode: 0 # delay updates until this epsiode + target_net_update_frequency: 500 # sync target net with online net per this many epochs + + replay: + accumulate_reward_from_final: True + count_reward_lambda: 0.0 # 0 to disable + novel_object_reward_lambda: 0.0 # 0 to disable + discount_gamma_game_reward: 0.9 + discount_gamma_count_reward: 0.5 + discount_gamma_novel_object_reward: 0.5 + replay_memory_capacity: 500000 # adjust this depending on your RAM size + replay_memory_priority_fraction: 0.5 + update_per_k_game_steps: 5 + replay_batch_size: 64 + multi_step: 3 + replay_sample_history_length: 4 + replay_sample_update_from: 2 + + epsilon_greedy: + noisy_net: False # if this is true, then epsilon greedy is disabled + epsilon_anneal_episodes: 1000 # -1 if not annealing + epsilon_anneal_from: 0.3 + epsilon_anneal_to: 0.1 + +dagger: + action_space: "generation" # 'admissible' (candidates from text engine) or 'generation' (seq2seq-style 
generation) or 'exhaustive' (not working) + max_target_length: 20 # max token length for seq2seq generation + beam_width: 10 # 1 means greedy + generate_top_k: 5 + unstick_by_beam_search: False # use beam-search for failed actions, set True during evaluation + + training: + max_nb_steps_per_episode: 50 # terminate after this many steps + + fraction_assist: + fraction_assist_anneal_episodes: 50000 + fraction_assist_anneal_from: 1.0 + fraction_assist_anneal_to: 0.01 + + fraction_random: + fraction_random_anneal_episodes: 0 + fraction_random_anneal_from: 0.0 + fraction_random_anneal_to: 0.0 + + replay: + replay_memory_capacity: 500000 + update_per_k_game_steps: 5 + replay_batch_size: 64 + replay_sample_history_length: 4 + replay_sample_update_from: 2 + +vision_dagger: + model_type: "resnet" # 'resnet' (whole image features) or 'maskrcnn_whole' (whole image MaskRCNN feats) or 'maskrcnn' (top k MaskRCNN detection feats) or 'no_vision' (zero vision input) + resnet_fc_dim: 64 + maskrcnn_top_k_boxes: 10 # top k box features + use_exploration_frame_feats: False # append feats from initial exploration (memory intensive!) + sequence_aggregation_method: "average" # 'sum' or 'average' or 'rnn' \ No newline at end of file diff --git a/dspy/teleprompt/bootstrap_finetune.py b/dspy/teleprompt/bootstrap_finetune.py index af75c7249..c69694625 100644 --- a/dspy/teleprompt/bootstrap_finetune.py +++ b/dspy/teleprompt/bootstrap_finetune.py @@ -65,12 +65,16 @@ def compile(self, student: Program, trainset: List[Example], teacher: Optional[P # environments. print("[BootstrapFinetune] Preparing the student and teacher programs...") student = prepare_student(student) - teacher = prepare_teacher(student, teacher) + teachers = teacher if isinstance(teacher, list) else [teacher] + teachers = [prepare_teacher(student, teacher) for teacher in teachers] set_missing_predictor_lms(student) - set_missing_predictor_lms(teacher) print("[BootstrapFinetune] Bootstrapping data...") - trace_data = bootstrap_trace_data(program=teacher, dataset=trainset, metric=self.metric, num_threads=self.num_threads) + trace_data = [] + + for teacher in teachers: + set_missing_predictor_lms(teacher) + trace_data += bootstrap_trace_data(program=teacher, dataset=trainset, metric=self.metric, num_threads=self.num_threads) print("[BootstrapFinetune] Preparing the train data...") key_to_data = {} @@ -179,46 +183,53 @@ def bootstrap_trace_data( # Return a list of dicts with the following keys: # example_ind, example, prediction, trace, and score (if metric != None) evaluator = Evaluate( - devset=dataset, num_threads=num_threads, display_progress=True, + devset=dataset, num_threads=num_threads, display_progress=True, return_outputs=True, provide_traceback=True # TODO(check with team) ) - # TODO(PR): Should "trace" not be included in the lambda function? 
- _metric = metric if metric else lambda example, prediction: 1 - evaluator(program, metric=_metric) + + def wrapped_metric(example, prediction, trace=None): + prediction, _ = prediction + return metric(example, prediction, trace) if metric else True + + def wrapped_program(**kwargs): + with dspy.context(trace=[]): + return program(**kwargs), dspy.settings.trace.copy() + + _, outputs = evaluator(wrapped_program, metric=wrapped_metric) data = [] - for example_ind, example in enumerate(dataset): - data_dict = bootstrap_trace_data_one_example( - example=example, program=program, metric=metric - ) - data_dict["example_ind"] = example_ind + for example_ind, (example, prediction, score) in enumerate(outputs): + prediction, trace = prediction + data_dict = dict(example=example, prediction=prediction, trace=trace, example_ind=example_ind) + if metric: + data_dict["score"] = score data.append(data_dict) return data -# TODO(PR) check with team -def bootstrap_trace_data_one_example( - example: Example, - program: Program, - metric: Optional[Callable] = None -) -> Dict[str, Any]: - # Return a dict with the following keys: - # example, prediction, trace, and score (if metric != None) - with dspy.context(trace=[]): - prediction = program(**example.inputs()) - trace = dspy.settings.trace - score = metric(example, prediction, trace) if metric else None - - data_dict = dict( - example=example, - prediction=prediction, - trace=trace, - ) - if metric: - data_dict["score"] = score +# # TODO(PR) check with team +# def bootstrap_trace_data_one_example( +# example: Example, +# program: Program, +# metric: Optional[Callable] = None +# ) -> Dict[str, Any]: +# # Return a dict with the following keys: +# # example, prediction, trace, and score (if metric != None) +# with dspy.context(trace=[]): +# prediction = program(**example.inputs()) +# trace = dspy.settings.trace +# score = metric(example, prediction, trace) if metric else None + +# data_dict = dict( +# example=example, +# prediction=prediction, +# trace=trace, +# ) +# if metric: +# data_dict["score"] = score - return data_dict +# return data_dict # Note: Shared below are useful functions for preparing student/teacher programs @@ -273,7 +284,7 @@ def assert_structural_equivalency(program1: object, program2: object): assert name1 == name2, err assert isinstance(pred1, Predict) assert isinstance(pred2, Predict) - assert pred1.signature.equals(pred2.signature) + # assert pred1.signature.equals(pred2.signature) def assert_no_shared_predictor(program1: Program, program2: Program):