
Merge pull request #25 from yuntongzhang/fresh-issue
Add a mode for running on new GitHub issues
yuntongzhang authored Apr 17, 2024
2 parents e474d48 + 68ac175 commit 1e89e9c
Showing 11 changed files with 577 additions and 265 deletions.
36 changes: 33 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -77,7 +77,37 @@ In the docker container, set the `OPENAI_KEY` env var to your [OpenAI key](https
export OPENAI_KEY=xx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
```

### Set up one or more tasks in SWE-bench
### (Fresh issue mode) Set up and run on new GitHub issues

> [!NOTE]
> This section is for running AutoCodeRover on new GitHub issues. For running it on SWE-bench tasks, refer to [SWE-bench mode](#swe-bench-mode-set-up-and-run-on-swe-bench-tasks).

If you want to use AutoCodeRover for new GitHub issues in a project, prepare the following:

- Link to clone the project (used for `git clone ...`).
- Commit hash of the project version for AutoCodeRover to work on (used for `git checkout ...`).
- Link to the GitHub issue page.

Then, in the docker container (or your local copy of AutoCodeRover), run the following commands to set up the target project
and generate a patch:

```
cd /opt/auto-code-rover
conda activate auto-code-rover
PYTHONPATH=. python app/main.py --mode fresh_issue --output-dir output --setup-dir setup --model gpt-4-0125-preview --model-temperature 0.2 --fresh-task-id <task id> --clone-link <link for cloning the project> --commit-hash <any version that has the issue> --issue-link <link to issue page>
```

The `<task id>` can be any string used to identify this issue.

If patch generation is successful, the path to the generated patch will be printed at the end.
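For example, a hypothetical invocation could look like the following (the task id, clone link, commit hash, and issue link below are placeholders for illustration; substitute your own values):

```
cd /opt/auto-code-rover
conda activate auto-code-rover
PYTHONPATH=. python app/main.py --mode fresh_issue --output-dir output --setup-dir setup --model gpt-4-0125-preview --model-temperature 0.2 --fresh-task-id my-issue-1 --clone-link https://github.com/owner/repo.git --commit-hash abc1234 --issue-link https://github.com/owner/repo/issues/1
```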


### (SWE-bench mode) Set up and run on SWE-bench tasks

> [!NOTE]
> This section is for running AutoCodeRover on SWE-bench tasks. For running it on new GitHub issues, refer to [Fresh issue mode](#fresh-issue-mode-set-up-and-run-on-new-github-issues).

#### Set up

In the docker container, we need to first set up the tasks to run in SWE-bench (e.g., `django__django-11133`). The list of all tasks can be found in [`conf/swe_lite_tasks.txt`](conf/swe_lite_tasks.txt).

@@ -108,7 +138,7 @@ A conda environment will also be created for this task instance.

_If you want to set up multiple tasks together, put their ids in `tasks.txt` and follow the same steps._

### Run a single task
#### Run a single task

Before running the task (`django__django-11133` here), make sure it has been set up as mentioned [above](#set-up-one-or-more-tasks-in-swe-bench).

@@ -120,7 +150,7 @@ PYTHONPATH=. python app/main.py --enable-layered --model gpt-4-0125-preview --se

The output of the run can then be found in `output/`. For example, the patch generated for `django__django-11133` can be found at a location like this: `output/applicable_patch/django__django-11133_yyyy-MM-dd_HH-mm-ss/extracted_patch_1.diff` (the date-time field in the directory name will be different depending on when the experiment was run).

### Run multiple tasks
#### Run multiple tasks

First, put the ids of all tasks to run in a file, one per line. Supposing this file is `tasks.txt`, the tasks can be run with

38 changes: 24 additions & 14 deletions app/api/manage.py
@@ -70,15 +70,15 @@ def __init__(
task_id: str,
project_path: str,
commit: str,
env_name: str,
repo_name: str,
pre_install_cmds: list[str],
install_cmd: str,
test_cmd: str,
test_patch: str,
testcases_passing: list[str],
testcases_failing: list[str],
output_dir: str,
env_name: str | None = None,
repo_name: str | None = None,
pre_install_cmds: list[str] | None = None,
install_cmd: str | None = None,
test_cmd: str | None = None,
test_patch: str | None = None,
testcases_passing: list[str] | None = None,
testcases_failing: list[str] | None = None,
do_install: bool = False,
import_root: str = "src",
):
@@ -90,16 +90,22 @@ def __init__(
self.env_name = env_name
self.repo_name = repo_name
# additional installation commands after setup was done
self.pre_install_cmds: list[str] = pre_install_cmds
self.pre_install_cmds: list[str] = (
[] if pre_install_cmds is None else pre_install_cmds
)
self.install_cmd: str = install_cmd
# command to run tests
self.test_cmd: str = test_cmd
# the patch to testcases
self.test_patch: str = test_patch
# names of the passing testcases for this issue
self.testcases_passing: list[str] = testcases_passing
self.testcases_passing: list[str] = (
[] if testcases_passing is None else testcases_passing
)
# names of the failing testcases for this issue
self.testcases_failing: list[str] = testcases_failing
self.testcases_failing: list[str] = (
[] if testcases_failing is None else testcases_failing
)
# where to write our output
self.output_dir = os.path.abspath(output_dir)

@@ -118,11 +124,15 @@ def __init__(
self.do_install()

# apply the test modifications to this task
self.apply_test_patch()
if self.test_patch is not None:
self.apply_test_patch()

# commit the current changes, so that resetting later do not erase them
with apputils.cd(self.project_path):
apputils.repo_commit_current_changes(self.logger)
if do_install or self.test_patch is not None:
# this means we have applied some changes to the repo before
# starting the actual workflow
with apputils.cd(self.project_path):
apputils.repo_commit_current_changes(self.logger)

# build search manager
self.search_manager = SearchManager(self.project_path)
91 changes: 91 additions & 0 deletions app/fresh_issue/common.py
@@ -0,0 +1,91 @@
import json
import os
import shutil
from os.path import join as pjoin

from app import utils as apputils
from app.fresh_issue import github


class FreshTask:
"""
Encapsulate everything required to run ACR on a fresh issue from the internet.
"""

def __init__(
self,
task_id: str,
clone_link: str,
commit_hash: str,
issue_link: str,
setup_dir: str,
task_output_dir: str,
):
self.task_id = task_id
self.clone_link = clone_link
self.commit_hash = commit_hash
self.issue_link = issue_link
# where to store output of ACR
self.task_output_dir = task_output_dir
# where the project source code is located
self.project_dir = self.setup_task_local(setup_dir)
self.problem_stmt, self.created_at = self.prepare_issue()
self.write_meta_file()

def setup_task_local(self, setup_dir: str):
"""
Clone and check out the target project locally.
"""
# we are going to clone to this path - make sure it is not there yet
cloned_path = pjoin(setup_dir, self.task_id)
if os.path.isdir(cloned_path):
print(
f"Path {cloned_path} already exists. Removing it to get a fresh clone."
)
shutil.rmtree(cloned_path)
# really clone the repo
cloned_path = apputils.clone_repo_and_checkout(
self.clone_link, self.commit_hash, setup_dir, self.task_id
)
print(f"Cloned source code to {cloned_path}.")
return cloned_path

def prepare_issue(self):
"""
Prepare problem statement from the online issue report.
"""
if "github.com" in self.issue_link:
retrieved_issue = github.get_github_issue_info(self.issue_link)
if retrieved_issue is None:
raise Exception(
f"Failed to retrieve issue information from {self.issue_link}"
)
else:
title, body, created_at = retrieved_issue
problem_stmt = f"{title}\n{body}"
# save this issue into a file for reference
problem_stmt_file = pjoin(self.task_output_dir, "problem_statement.txt")
with open(problem_stmt_file, "w") as f:
f.write(problem_stmt)
return problem_stmt, created_at
else:
raise NotImplementedError("Only GitHub issues are supported for now.")

def write_meta_file(self):
"""
Write a meta file for compatibility reasons with the swe-bench mode.
"""
meta_file = pjoin(self.task_output_dir, "meta.json")
meta = {
"task_info": {
"base_commit": self.commit_hash,
"created_at": self.created_at,
"problem_statement": self.problem_stmt,
"instance_id": self.task_id,
},
"setup_info": {
"repo_path": self.project_dir,
},
}
with open(meta_file, "w") as f:
json.dump(meta, f, indent=4)
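The `meta.json` written by `write_meta_file` is what lets the rest of the pipeline treat a fresh issue like a SWE-bench task. A self-contained sketch of the file's shape (all field values below are illustrative):

```python
import json
import tempfile
from os.path import join as pjoin

# Recreate the meta.json layout produced by FreshTask.write_meta_file,
# with placeholder values; the swe-bench-mode code path reads these keys.
meta = {
    "task_info": {
        "base_commit": "abc1234",
        "created_at": "2024-04-17T00:00:00Z",
        "problem_statement": "Issue title\nIssue body",
        "instance_id": "example-task",
    },
    "setup_info": {
        "repo_path": "/tmp/setup/example-task",
    },
}

out_dir = tempfile.mkdtemp()
meta_file = pjoin(out_dir, "meta.json")
with open(meta_file, "w") as f:
    json.dump(meta, f, indent=4)

# Reading the file back yields the same structure.
with open(meta_file) as f:
    loaded = json.load(f)
print(loaded["task_info"]["instance_id"])  # → example-task
```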
22 changes: 22 additions & 0 deletions app/fresh_issue/github.py
@@ -0,0 +1,22 @@
import requests


def get_github_issue_info(issue_url: str) -> tuple[str, str, str] | None:
# Extract owner, repo, and issue number from the URL
# Example issue URL: https://github.com/owner/repo/issues/123
_, owner, repo, _, issue_number = issue_url.rsplit("/", 4)

api_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}"
response = requests.get(api_url)

if response.status_code == 200:
issue_info = response.json()
# Extract relevant information from the issue
title = issue_info["title"]
body = issue_info["body"]
created_at = issue_info["created_at"]

return title, body, created_at
else:
print(f"Failed to fetch issue information: {response.status_code}")
return None
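The `rsplit("/", 4)` call above peels off the last four path segments of the issue URL, so the owner, repo name, and issue number fall out without a URL-parsing library. A quick self-contained check (the URL is illustrative):

```python
# Split a GitHub issue URL into its parts, as get_github_issue_info does.
issue_url = "https://github.com/yuntongzhang/auto-code-rover/issues/25"
_, owner, repo, _, issue_number = issue_url.rsplit("/", 4)

# Rebuild the corresponding REST API endpoint from the extracted parts.
api_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}"
print(api_url)
# → https://api.github.com/repos/yuntongzhang/auto-code-rover/issues/25
```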
9 changes: 1 addition & 8 deletions app/globals.py
@@ -5,13 +5,6 @@
# Overall output directory for results
output_dir: str = ""

# whether to start conversation from fresh, or load from a conversation history.
# If None, start from fresh.
# If not None, continue from the conversation history stored in <file>.
# <file> is the value of this variable, and should points to a json file
# containing the past conversation history.
load_cache: str | None = None

# the model to use
model: str = "gpt-3.5-turbo-0125"

@@ -26,7 +19,7 @@
enable_sbfl: bool = False

# whether to perform layered search
enable_layered: bool = False
enable_layered: bool = True

# whether to perform our own validation
enable_validation: bool = False
