Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: repo to markdown #1061

Merged
merged 9 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
### Python template

# Byte-compiled / optimized / DLL files
__pycache__/
__pycache__
*.py[cod]
*$py.class

Expand Down
19 changes: 19 additions & 0 deletions metagpt/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import importlib
import inspect
import json
import mimetypes
import os
import platform
import re
Expand Down Expand Up @@ -834,3 +835,21 @@ def log_and_reraise(retry_state: RetryCallState):
"""
)
raise retry_state.outcome.exception()


def get_markdown_codeblock_type(filename: str) -> str:
"""Return the markdown code-block type corresponding to the file extension."""
mime_type, _ = mimetypes.guess_type(filename)
mappings = {
"text/x-shellscript": "bash",
"text/x-c++src": "cpp",
"text/css": "css",
"text/html": "html",
"text/x-java": "java",
"application/javascript": "javascript",
"application/json": "json",
"text/x-python": "python",
"text/x-ruby": "ruby",
"application/sql": "sql",
}
return mappings.get(mime_type, "text")
80 changes: 80 additions & 0 deletions metagpt/utils/repo_to_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file provides functionality to convert a local repository into a markdown representation.
"""
from __future__ import annotations

import mimetypes
from pathlib import Path

from gitignore_parser import parse_gitignore

from metagpt.logs import logger
from metagpt.utils.common import aread, awrite, get_markdown_codeblock_type, list_files
from metagpt.utils.tree import tree


async def repo_to_markdown(repo_path: str | Path, output: str | Path = None, gitignore: str | Path = None) -> str:
"""
Convert a local repository into a markdown representation.

This function takes a path to a local repository and generates a markdown representation of the repository structure,
including directory trees and file listings.

Args:
repo_path (str | Path): The path to the local repository.
output (str | Path, optional): The path to save the generated markdown file. Defaults to None.
gitignore (str | Path, optional): The path to the .gitignore file. Defaults to None.

Returns:
str: The markdown representation of the repository.
"""
repo_path = Path(repo_path)
gitignore = Path(gitignore or Path(__file__).parent / "../../.gitignore").resolve()

markdown = await _write_dir_tree(repo_path=repo_path, gitignore=gitignore)

gitignore_rules = parse_gitignore(full_path=str(gitignore))
markdown += await _write_files(repo_path=repo_path, gitignore_rules=gitignore_rules)

if output:
await awrite(filename=str(output), data=markdown, encoding="utf-8")
return markdown


async def _write_dir_tree(repo_path: Path, gitignore: Path) -> str:
try:
content = tree(repo_path, gitignore, run_command=True)
except Exception as e:
logger.info(f"{e}, using safe mode.")
content = tree(repo_path, gitignore, run_command=False)

doc = f"## Directory Tree\n```text\n{content}\n```\n---\n\n"
return doc


async def _write_files(repo_path, gitignore_rules) -> str:
filenames = list_files(repo_path)
markdown = ""
for filename in filenames:
if gitignore_rules(str(filename)):
continue
markdown += await _write_file(filename=filename, repo_path=repo_path)
return markdown


async def _write_file(filename: Path, repo_path: Path) -> str:
relative_path = filename.relative_to(repo_path)
markdown = f"## {relative_path}\n"

mime_type, _ = mimetypes.guess_type(filename.name)
if "text/" not in mime_type:
logger.info(f"Ignore content: {filename}")
markdown += "<binary file>\n---\n\n"
return markdown
content = await aread(filename, encoding="utf-8")
content = content.replace("```", "\\`\\`\\`").replace("---", "\\-\\-\\-")
code_block_type = get_markdown_codeblock_type(filename.name)
markdown += f"```{code_block_type}\n{content}\n```\n---\n\n"
return markdown
140 changes: 140 additions & 0 deletions metagpt/utils/tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2024/3/11
@Author : mashenquan
@File : tree.py
@Desc : Implement the same functionality as the `tree` command.
Example:
>>> print_tree(".")
utils
+-- serialize.py
+-- project_repo.py
+-- tree.py
+-- mmdc_playwright.py
+-- cost_manager.py
+-- __pycache__
| +-- __init__.cpython-39.pyc
| +-- redis.cpython-39.pyc
| +-- singleton.cpython-39.pyc
| +-- embedding.cpython-39.pyc
| +-- make_sk_kernel.cpython-39.pyc
| +-- file_repository.cpython-39.pyc
+-- file.py
+-- save_code.py
+-- common.py
+-- redis.py
"""
from __future__ import annotations

import subprocess
from pathlib import Path
from typing import Callable, Dict, List

from gitignore_parser import parse_gitignore


def tree(root: str | Path, gitignore: str | Path = None, run_command: bool = False) -> str:
"""
Recursively traverses the directory structure and prints it out in a tree-like format.

Args:
root (str or Path): The root directory from which to start traversing.
gitignore (str or Path): The filename of gitignore file.
run_command (bool): Whether to execute `tree` command. Execute the `tree` command and return the result if True,
otherwise execute python code instead.

Returns:
str: A string representation of the directory tree.

Example:
>>> tree(".")
utils
+-- serialize.py
+-- project_repo.py
+-- tree.py
+-- mmdc_playwright.py
+-- __pycache__
| +-- __init__.cpython-39.pyc
| +-- redis.cpython-39.pyc
| +-- singleton.cpython-39.pyc
+-- parse_docstring.py

>>> tree(".", gitignore="../../.gitignore")
utils
+-- serialize.py
+-- project_repo.py
+-- tree.py
+-- mmdc_playwright.py
+-- parse_docstring.py

>>> tree(".", gitignore="../../.gitignore", run_command=True)
utils
├── serialize.py
├── project_repo.py
├── tree.py
├── mmdc_playwright.py
└── parse_docstring.py


"""
root = Path(root).resolve()
if run_command:
return _execute_tree(root, gitignore)

git_ignore_rules = parse_gitignore(gitignore) if gitignore else None
dir_ = {root.name: _list_children(root=root, git_ignore_rules=git_ignore_rules)}
v = _print_tree(dir_)
return "\n".join(v)


def _list_children(root: Path, git_ignore_rules: Callable) -> Dict[str, Dict]:
dir_ = {}
for i in root.iterdir():
if git_ignore_rules and git_ignore_rules(str(i)):
continue
try:
if i.is_file():
dir_[i.name] = {}
else:
dir_[i.name] = _list_children(root=i, git_ignore_rules=git_ignore_rules)
except (FileNotFoundError, PermissionError, OSError):
dir_[i.name] = {}
return dir_


def _print_tree(dir_: Dict[str:Dict]) -> List[str]:
ret = []
for name, children in dir_.items():
ret.append(name)
if not children:
continue
lines = _print_tree(children)
for j, v in enumerate(lines):
if v[0] not in ["+", " ", "|"]:
ret = _add_line(ret)
row = f"+-- {v}"
else:
row = f" {v}"
ret.append(row)
return ret


def _add_line(rows: List[str]) -> List[str]:
for i in range(len(rows) - 1, -1, -1):
v = rows[i]
if v[0] != " ":
return rows
rows[i] = "|" + v[1:]
return rows


def _execute_tree(root: Path, gitignore: str | Path) -> str:
args = ["--gitfile", str(gitignore)] if gitignore else []
try:
result = subprocess.run(["tree"] + args + [str(root)], capture_output=True, text=True, check=True)
if result.returncode != 0:
raise ValueError(f"tree exits with code {result.returncode}")
return result.stdout
except subprocess.CalledProcessError as e:
raise e
25 changes: 25 additions & 0 deletions tests/metagpt/utils/test_repo_to_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import uuid
from pathlib import Path

import pytest

from metagpt.utils.repo_to_markdown import repo_to_markdown


@pytest.mark.parametrize(
["repo_path", "output"],
[(Path(__file__).parent.parent, Path(__file__).parent.parent.parent / f"workspace/unittest/{uuid.uuid4().hex}.md")],
)
@pytest.mark.asyncio
async def test_repo_to_markdown(repo_path: Path, output: Path):
markdown = await repo_to_markdown(repo_path=repo_path, output=output)
assert output.exists()
assert markdown

output.unlink(missing_ok=True)


if __name__ == "__main__":
pytest.main([__file__, "-s"])
64 changes: 64 additions & 0 deletions tests/metagpt/utils/test_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from pathlib import Path
from typing import List

import pytest

from metagpt.utils.tree import _print_tree, tree


@pytest.mark.parametrize(
("root", "rules"),
[
(str(Path(__file__).parent / "../.."), None),
(str(Path(__file__).parent / "../.."), str(Path(__file__).parent / "../../../.gitignore")),
],
)
def test_tree(root: str, rules: str):
v = tree(root=root, gitignore=rules)
assert v


@pytest.mark.parametrize(
("root", "rules"),
[
(str(Path(__file__).parent / "../.."), None),
(str(Path(__file__).parent / "../.."), str(Path(__file__).parent / "../../../.gitignore")),
],
)
def test_tree_command(root: str, rules: str):
v = tree(root=root, gitignore=rules, run_command=True)
assert v


@pytest.mark.parametrize(
("tree", "want"),
[
({"a": {"b": {}, "c": {}}}, ["a", "+-- b", "+-- c"]),
({"a": {"b": {}, "c": {"d": {}}}}, ["a", "+-- b", "+-- c", " +-- d"]),
(
{"a": {"b": {"e": {"f": {}, "g": {}}}, "c": {"d": {}}}},
["a", "+-- b", "| +-- e", "| +-- f", "| +-- g", "+-- c", " +-- d"],
),
(
{"h": {"a": {"b": {"e": {"f": {}, "g": {}}}, "c": {"d": {}}}, "i": {}}},
[
"h",
"+-- a",
"| +-- b",
"| | +-- e",
"| | +-- f",
"| | +-- g",
"| +-- c",
"| +-- d",
"+-- i",
],
),
],
)
def test__print_tree(tree: dict, want: List[str]):
v = _print_tree(tree)
assert v == want


if __name__ == "__main__":
pytest.main([__file__, "-s"])
Loading