diff --git a/python/composio/tools/local/base/utils/__init__.py b/python/composio/tools/local/base/utils/__init__.py index 8b4fe30a8d5..ca61bb58eb4 100644 --- a/python/composio/tools/local/base/utils/__init__.py +++ b/python/composio/tools/local/base/utils/__init__.py @@ -1,5 +1,2 @@ -from .grep_ast import TreeContext -from .grep_utils import get_files_excluding_gitignore, grep_util from .parser import filename_to_lang -from .repomap import RepoMap from .utils import get_mtime, get_rel_fname, print_if_verbose, split_path, token_count diff --git a/python/composio/tools/local/base/utils/grep_ast.py b/python/composio/tools/local/base/utils/grep_ast.py deleted file mode 100644 index b5dd4461e78..00000000000 --- a/python/composio/tools/local/base/utils/grep_ast.py +++ /dev/null @@ -1,303 +0,0 @@ -# This file is based on code from -# https://github.com/paul-gauthier/aider/blob/main/aider/repomap.py - -import re - -from composio.tools.local.base.utils.parser import filename_to_lang - - -class TreeContext: - def __init__( - self, - filename, # Name of the file being processed - code, # Content of the file as a string - color=False, # Whether to use color highlighting in the output - verbose=False, # Whether to include additional detailed information in the output - line_number=False, # Whether to display line numbers in the output - parent_context=True, # Whether to include the enclosing scope (e.g., class or function) of a match - child_context=True, # Whether to include nested scopes (e.g., inner functions or classes) of a match - last_line=True, # Whether to include the closing line of each scope (e.g., end of function or class) - margin=3, # Number of additional lines to include before and after matches for context - mark_lois=True, # Whether to highlight or mark the specific lines of interest within matches - header_max=10, # Maximum number of lines to include when showing the header/beginning of a scope - show_top_of_file_parent_scope=True, # Whether to display the top-level scope of the file (e.g., module-level code) - loi_pad=1, # Number of extra lines to include around specifically marked lines of interest - ): - # Initialize TreeContext with various parameters - self.filename = filename - self.color = color - self.verbose = verbose - self.line_number = line_number - self.last_line = last_line - self.margin = margin - self.mark_lois = mark_lois - self.header_max = header_max - self.loi_pad = loi_pad - self.show_top_of_file_parent_scope = show_top_of_file_parent_scope - self.done_parent_scopes = set() - self.parent_context = parent_context - self.child_context = child_context - - # Determine the language of the file - lang = filename_to_lang(filename) - - from tree_sitter_languages import get_parser # pylint: disable=C0415 - - # Get parser based on file extension - parser = get_parser(lang) - tree = parser.parse(bytes(code, "utf8")) - - # Split the code into lines - self.lines = code.splitlines() - self.num_lines = len(self.lines) + 1 - - # Initialize data structures for storing information about the code - self.output_lines = {} # color lines, with highlighted matches - self.scopes = [ - set() for _ in range(self.num_lines) - ] # Which scopes is each line part of? 
- self.header = [ - [] for _ in range(self.num_lines) - ] # Which lines serve as a short "header" for the scope starting on that line - self.nodes = [[] for _ in range(self.num_lines)] - - # Walk the AST tree - root_node = tree.root_node - self.walk_tree(root_node) - scope_width = 0 - # Process headers and scopes - if self.verbose: - scope_width = max( - len(str(set(self.scopes[i]))) for i in range(self.num_lines - 1) - ) - for i in range(self.num_lines): - header = sorted(self.header[i]) - if self.verbose and i < self.num_lines - 1: - scopes = str(sorted(set(self.scopes[i]))) - print(f"{scopes.ljust(scope_width)}", i, self.lines[i]) - - if len(header) > 1: - size, head_start, head_end = header[0] - if size > self.header_max: - head_end = head_start + self.header_max - else: - head_start = i - head_end = i + 1 - - self.header[i] = head_start, head_end - - # Initialize sets for tracking lines to show and lines of interest - self.show_lines = set() - self.lines_of_interest = set() - - def grep(self, pat, ignore_case): - # Search for pattern in lines and highlight matches if color is enabled - found = set() - - # Compile the regex pattern once before the loop - flags = re.IGNORECASE if ignore_case else 0 - compiled_pat = re.compile(pat, flags) - - for i, line in enumerate(self.lines): - if compiled_pat.search(line): - if self.color: - highlighted_line = compiled_pat.sub( - lambda match: f"\033[1;31m{match.group()}\033[0m", line - ) - self.output_lines[i] = highlighted_line - found.add(i) - return found - - def add_lines_of_interest(self, line_nums): - # Add lines of interest to the set - self.lines_of_interest.update(line_nums) - - def add_context(self): - # Add context around lines of interest - if not self.lines_of_interest: - return - - self.done_parent_scopes = set() - - self.show_lines = set(self.lines_of_interest) - - # Add padding around lines of interest - if self.loi_pad: - for line in self.show_lines.copy(): - new_lines = range( - max(0, line - self.loi_pad), - min(self.num_lines, line + self.loi_pad + 1), - ) - self.show_lines.update(new_lines) - - # Add the bottom line and its parent context if required - if self.last_line: - bottom_line = self.num_lines - 2 - self.show_lines.add(bottom_line) - self.add_parent_scopes(bottom_line) - - # Add parent context for lines of interest if required - if self.parent_context: - for i in set(self.lines_of_interest): - self.add_parent_scopes(i) - - # Add child context for lines of interest if required - if self.child_context: - for i in set(self.lines_of_interest): - self.add_child_context(i) - - # Add top margin lines - if self.margin: - self.show_lines.update(range(self.margin)) - - # Close small gaps between shown lines - self.close_small_gaps() - - def add_child_context(self, i): - # Add context for child nodes - if not self.nodes[i]: - return - - last_line = self.get_last_line_of_scope(i) - size = last_line - i - if size < 5: - self.show_lines.update(range(i, last_line + 1)) - return - - children = [] - for node in self.nodes[i]: - children += self.find_all_children(node) - - children = sorted( - children, - key=lambda node: node.end_point[0] - node.start_point[0], - reverse=True, - ) - - currently_showing = len(self.show_lines) - max_to_show = 25 - min_to_show = 5 - percent_to_show = 0.10 - max_to_show = max(min(size * percent_to_show, max_to_show), min_to_show) - - for child in children: - if len(self.show_lines) > currently_showing + max_to_show: - break - child_start_line = child.start_point[0] - self.add_parent_scopes(child_start_line) 
- - def find_all_children(self, node): - # Recursively find all children of a node - children = [node] - for child in node.children: - children += self.find_all_children(child) - return children - - def get_last_line_of_scope(self, i): - # Get the last line of the scope starting at line i - last_line = max(node.end_point[0] for node in self.nodes[i]) - return last_line - - def close_small_gaps(self): - # Close small gaps between shown lines - closed_show = set(self.show_lines) - sorted_show = sorted(self.show_lines) - for i in range(len(sorted_show) - 1): - if sorted_show[i + 1] - sorted_show[i] == 2: - closed_show.add(sorted_show[i] + 1) - - # Pick up adjacent blank lines - for i, _ in enumerate(self.lines): - if i not in closed_show: - continue - if ( - self.lines[i].strip() - and i < self.num_lines - 2 - and not self.lines[i + 1].strip() - ): - closed_show.add(i + 1) - - self.show_lines = closed_show - - def format(self): - # Format the output with line numbers, colors, and markers - if not self.show_lines: - return "" - - output = "" - if self.color: - # reset - output += "\033[0m\n" - - dots = not (0 in self.show_lines) - for i, line in enumerate(self.lines): - if i not in self.show_lines: - if dots: - if self.line_number: - output += "...⋮...\n" - else: - output += "⋮...\n" - dots = False - continue - - if i in self.lines_of_interest and self.mark_lois: - spacer = "█" - if self.color: - spacer = f"\033[31m{spacer}\033[0m" - else: - spacer = "│" - - line_output = f"{spacer}{self.output_lines.get(i, line)}" - if self.line_number: - line_output = f"{i + 1:3}" + line_output - output += line_output + "\n" - - dots = True - - return output - - def add_parent_scopes(self, i): - # Add parent scopes for a given line - if i in self.done_parent_scopes: - return - self.done_parent_scopes.add(i) - - for line_num in self.scopes[i]: - head_start, head_end = self.header[line_num] - if head_start > 0 or self.show_top_of_file_parent_scope: - self.show_lines.update(range(head_start, head_end)) - - if self.last_line: - last_line = self.get_last_line_of_scope(line_num) - self.add_parent_scopes(last_line) - - def walk_tree(self, node, depth=0): - # Recursively walk the AST tree and populate data structures - start = node.start_point - end = node.end_point - - start_line = start[0] - end_line = end[0] - size = end_line - start_line - - self.nodes[start_line].append(node) - - if self.verbose and node.is_named: - print( - " " * depth, - node.type, - f"{start_line}-{end_line}={size + 1}", - node.text.splitlines()[0], - self.lines[start_line], - ) - - if size: - self.header[start_line].append((size, start_line, end_line)) - - for i in range(start_line, end_line + 1): - self.scopes[i].add(start_line) - - for child in node.children: - self.walk_tree(child, depth + 1) - - return start_line, end_line diff --git a/python/composio/tools/local/base/utils/grep_utils.py b/python/composio/tools/local/base/utils/grep_utils.py deleted file mode 100644 index 05ad5fec4e4..00000000000 --- a/python/composio/tools/local/base/utils/grep_utils.py +++ /dev/null @@ -1,126 +0,0 @@ -# This file is based on code from -# https://github.com/paul-gauthier/grep-ast/blob/main/grep_ast/grep_ast.py - -import os -from pathlib import Path - -from composio.tools.local.base.utils.grep_ast import TreeContext -from composio.tools.local.base.utils.parser import filename_to_lang - - -def get_files_excluding_gitignore(root_path, no_gitignore=False, file_patterns=None): - """ - Get all files matching the given patterns in the root path, 
excluding those specified in .gitignore. - - :param root_path: The root directory to start searching from. - :param no_gitignore: If True, ignore .gitignore file. - :param file_patterns: A list of file patterns to match. Defaults to ["*.py", "*.md"]. - :return: A list of file paths matching the patterns. - """ - root_path = Path(root_path).resolve() - gitignore = None - - if not no_gitignore: - # Check root_path first - potential_gitignore = root_path / ".gitignore" - if potential_gitignore.exists(): - gitignore = potential_gitignore - else: - # Then check parent directories - for parent in root_path.parents: - potential_gitignore = parent / ".gitignore" - if potential_gitignore.exists(): - gitignore = potential_gitignore - break - - import pathspec # TODO: simplify import # pylint: disable=C0415 - - if gitignore: - with open(gitignore, "r", encoding="utf-8") as f: - spec = pathspec.PathSpec.from_lines("gitwildmatch", f) - else: - spec = pathspec.PathSpec.from_lines("gitwildmatch", []) - - if file_patterns is None: - file_patterns = ["*.[pP][yY]", "*.[mM][dD]", "*.[rR][sS][tT]", "*.[tT][xX][tT]"] - - files = [] - for pattern in file_patterns: - for path in root_path.rglob(pattern): - # Exclude .git and other version control system folders - if any(part.startswith(".") and part != "." for part in path.parts): - continue - if path.is_file() and not spec.match_file(path): - files.append(str(path)) - - return files - - -# callable utility which works the same way as main. -def grep_util( - pattern, - filenames, - encoding="utf8", - color=None, - verbose=False, - line_number=True, - ignore_case=True, - no_gitignore=False, -): - results = [] - - for filename in filenames: - if os.path.isdir(filename): - dir_files = get_files_excluding_gitignore(filename, no_gitignore) - for file in dir_files: - results.extend( - process_file( - file, - pattern, - encoding, - ignore_case, - color, - verbose, - line_number, - ) - ) - else: - results.extend( - process_file( - filename, - pattern, - encoding, - ignore_case, - color, - verbose, - line_number, - ) - ) - - return results - - -def process_file(filename, pattern, encoding, ignore_case, color, verbose, line_number): - file_results = [] - try: - with open(filename, "r", encoding=encoding) as f: - content = f.read() - except UnicodeDecodeError: - return file_results - - lang = filename_to_lang(filename) - - if lang: - try: - tc = TreeContext( - filename, content, color=color, verbose=verbose, line_number=line_number - ) - loi = tc.grep(pattern, ignore_case) - if loi: - tc.add_lines_of_interest(loi) - tc.add_context() - file_results.append({"filename": filename, "matches": tc.format()}) - except ValueError: - pass # Skip files that can't be parsed - - return file_results diff --git a/python/composio/tools/local/base/utils/repomap.py b/python/composio/tools/local/base/utils/repomap.py deleted file mode 100644 index fd865a07d00..00000000000 --- a/python/composio/tools/local/base/utils/repomap.py +++ /dev/null @@ -1,626 +0,0 @@ -# Import necessary libraries -import math -import os -import shutil -from collections import Counter, defaultdict, namedtuple -from importlib import resources -from pathlib import Path - -from composio.tools.local.base.utils.grep_ast import TreeContext -from composio.tools.local.base.utils.parser import filename_to_lang -from composio.tools.local.base.utils.utils import ( - get_mtime, - get_rel_fname, - print_if_verbose, - token_count, -) - - -# Define a named tuple for storing tag information -Tag = namedtuple("Tag", ["rel_fname", 
"fname", "line", "name", "kind"]) - -""" -RepoMap: Generates a structured view of a code repository. - -Key components: -1. chat_fnames: Files active in the current conversation. - - Given high personalization priority in ranking. - - Excluded from final repo map output. - -2. mentioned_fnames: Files of particular interest. - - Given medium personalization priority in ranking. - - Included in final repo map output. - -3. other_fnames: All other repository files. - - Given low personalization priority. - - Included in output if relevant or space allows. - -Process: -1. Collect tags (definitions and references) from all files. -2. Build a graph representing relationships between files and identifiers. -3. Use PageRank with personalization to rank files and identifiers. -4. Generate a tree-like structure of the most important elements. -5. Optimize the output to fit within a specified token limit. - -The resulting repo map provides a context-aware overview of the repository, -emphasizing relevant files and code structures based on the current conversation -and mentioned points of interest. -""" - - -class RepoMap: - # Class variables for caching - CACHE_VERSION = 1 - TAGS_CACHE_DIR = f".composio.tags.cache.v{CACHE_VERSION}" - - cache_missing = False - warned_files: set[str] = set() - - def __init__( - self, - map_tokens=10240, - root=None, - repo_content_prefix=None, - verbose=False, - max_context_window=10000, - ): - """ - Initialize the RepoMap object. - - :param map_tokens: Maximum number of tokens for the repo map - :param root: Root directory of the repository - :param repo_content_prefix: Prefix for repo content - :param verbose: Enable verbose output - :param max_context_window: Maximum context window size - """ - - self.verbose = verbose - - # Set root directory - if not root: - root = os.getcwd() - self.root = root - - # Load tags cache - self.load_tags_cache() - - self.max_map_tokens = map_tokens - self.max_context_window = max_context_window - self.TAGS_CACHE = None - self.tree_cache = {} - self.token_count = token_count - self.repo_content_prefix = repo_content_prefix - - def get_repo_map( - self, chat_files, other_files, mentioned_fnames=None, mentioned_idents=None - ): - """ - Generate a repository map based on the given files and mentions. 
- - :param chat_files: Files currently in the chat - :param other_files: Other files in the repository - :param mentioned_fnames: Mentioned file names - :param mentioned_idents: Mentioned identifiers - :return: Repository map as a string - """ - # Early exit conditions - if self.max_map_tokens <= 0 or not other_files: - print_if_verbose( - "Exiting repo-map due to max_map_tokens <= 0 or no other_files", - self.verbose, - ) - return "error" - - # Initialize mentioned sets if not provided - mentioned_fnames = mentioned_fnames or set() - mentioned_idents = mentioned_idents or set() - - max_map_tokens = self.max_map_tokens - - # Adjust max_map_tokens when no files are in the chat - MUL = 8 - padding = 4096 - if max_map_tokens and self.max_context_window: - target = min(max_map_tokens * MUL, self.max_context_window - padding) - else: - target = 0 - if not chat_files and self.max_context_window and target > 0: - max_map_tokens = target - - try: - # Generate ranked tags map - files_listing = self.get_ranked_tags_map( - chat_files, - other_files, - max_map_tokens, - mentioned_fnames, - mentioned_idents, - ) - except RecursionError: - # Handle recursion error (possibly due to large git repo) - self.max_map_tokens = 0 - print_if_verbose("Exiting repo-map due to RecursionError", self.verbose) - return "error" - - if not files_listing: - print_if_verbose( - "Exiting repo-map due to empty files_listing", self.verbose - ) - return "error" - - # Count tokens in the files listing - num_tokens = self.token_count(files_listing) - print_if_verbose(f"Repo-map: {num_tokens / 1024:.1f} k-tokens", self.verbose) - - # Prepare repo content string - other = "other " if chat_files else "" - if self.repo_content_prefix: - repo_content = self.repo_content_prefix.format(other=other) - else: - repo_content = "" - - repo_content += files_listing - - return repo_content - - def load_tags_cache(self): - from diskcache import Cache # pylint: disable=C0415 - - path = Path(self.root) / self.TAGS_CACHE_DIR - if not path.exists(): - self.cache_missing = True - self.TAGS_CACHE = Cache(path) - - def get_tags(self, fname: str, rel_fname: str) -> list[Tag]: - """ - Get tags for a file, using cache if available. - - :param fname: Absolute file name - :param rel_fname: Relative file name - :return: List of tags - """ - file_mtime = get_mtime(fname) - if file_mtime is None: - print_if_verbose( - f"Warning: Unable to get modification time for {fname}, skipping", - self.verbose, - ) - return [] - - if self.TAGS_CACHE is None: - print_if_verbose( - "Warning: Tags cache is not initialized, something went wrong", - self.verbose, - ) - return [] - - cache_key = fname - cache_data = self.TAGS_CACHE.get(cache_key) - - if cache_data is not None and cache_data.get("mtime") == file_mtime: - return cache_data.get("data", []) - - # Cache miss or outdated: generate new tags - data = list(self.get_tags_raw(fname, rel_fname)) - - # Update cache - self.TAGS_CACHE.set(cache_key, {"mtime": file_mtime, "data": data}) - return data - - def get_tags_raw(self, fname, rel_fname): - """ - Generate tags for a file using tree-sitter and pygments. 
- - :param fname: Absolute file name - :param rel_fname: Relative file name - :yield: Tag objects - """ - lang = filename_to_lang(fname) - if not lang: - print_if_verbose( - "Exiting get_tags_raw due to no language detected", self.verbose - ) - return - - from tree_sitter_languages import ( # pylint: disable=C0415 - get_language, - get_parser, - ) - - language = get_language(lang) - parser = get_parser(lang) - - # Load tags query - try: - scm_fname = resources.files(__package__).joinpath( - "queries", f"tree-sitter-{lang}-tags.scm" - ) - except KeyError: - print_if_verbose( - "Exiting get_tags_raw due to KeyError in loading tags query", - self.verbose, - ) - return - - if not scm_fname.is_file(): - print_if_verbose( - "Exiting get_tags_raw due to non-existent query_scm", self.verbose - ) - return - query_scm = scm_fname.read_text() - - # Parse code - with open(fname, "r", encoding="utf-8") as file: - code = file.read().strip() - - if not code: - print_if_verbose("Exiting get_tags_raw due to empty code", self.verbose) - return - - tree = parser.parse(bytes(code, "utf-8")) - - # Run tags query - query = language.query(query_scm) - captures = query.captures(tree.root_node) - - saw = set() - for node, tag in captures: - if tag.startswith("name.definition."): - kind = "def" - elif tag.startswith("name.reference."): - kind = "ref" - else: - continue - saw.add(kind) - - yield Tag( - rel_fname=rel_fname, - fname=fname, - name=node.text.decode("utf-8"), - kind=kind, - line=node.start_point[0], - ) - - # If no references found, use pygments for additional tagging - if "ref" in saw: - print_if_verbose( - "Exiting get_tags_raw after processing references", self.verbose - ) - print_if_verbose(fname, self.verbose) - return - if "def" not in saw: - print_if_verbose( - "Exiting get_tags_raw due to no definitions found", self.verbose - ) - return - - from pygments.util import ClassNotFound # pylint: disable=C0415 - - try: - from pygments.lexers import ( # pylint: disable=C0415 - guess_lexer_for_filename, - ) - from pygments.token import Token # pylint: disable=C0415 - - lexer = guess_lexer_for_filename(fname, code) - tokens = [ - token[1] for token in lexer.get_tokens(code) if token[0] in Token.Name - ] - except ClassNotFound: - print_if_verbose( - "Exiting get_tags_raw due to ClassNotFound in lexer guessing", - self.verbose, - ) - tokens = code.split() - - for token in tokens: - yield Tag( - rel_fname=rel_fname, - fname=fname, - name=token, - kind="ref", - line=-1, - ) - - def get_ranked_tags( - self, chat_fnames, other_fnames, mentioned_fnames, mentioned_idents - ): # pylint: disable=R0915 - """ - Generate ranked tags for files in the repository. 
- - :param chat_fnames: Files currently in the chat - :param other_fnames: Other files in the repository - :param mentioned_fnames: Mentioned file names - :param mentioned_idents: Mentioned identifiers - :return: List of ranked tags - """ - import networkx as nx # pylint: disable=C0415 - - defines = defaultdict(set) - references = defaultdict(list) - definitions = defaultdict(set) - - personalization = {} - fnames = sorted(set(chat_fnames).union(set(other_fnames))) - chat_rel_fnames = set() - - # Improved personalization logic - chat_weight = 10.0 - mentioned_weight = 5.0 - other_weight = 1.0 - - total_weight = ( - len(chat_fnames) * chat_weight - + len(mentioned_fnames) * mentioned_weight - + len(other_fnames) * other_weight - ) - self.cache_missing = False - - # Process each file - for fname in fnames: - if not Path(fname).is_file(): - if fname not in self.warned_files: - if Path(fname).exists(): - print_if_verbose( - f"Repo-map can't include {fname}, it is not a normal file", - self.verbose, - ) - else: - print_if_verbose( - f"Repo-map can't include {fname}, it no longer exists", - self.verbose, - ) - self.warned_files.add(fname) - continue - - rel_fname = get_rel_fname(self.root, fname) - if fname in chat_fnames: - personalization[rel_fname] = chat_weight / total_weight - chat_rel_fnames.add(rel_fname) - elif rel_fname in mentioned_fnames: - personalization[rel_fname] = mentioned_weight / total_weight - else: - personalization[rel_fname] = other_weight / total_weight - - tags = self.get_tags(fname, rel_fname) - if tags is None: - continue - - for tag in tags: - if tag.kind == "def": - defines[tag.name].add(rel_fname) - key = (rel_fname, tag.name) - definitions[key].add(tag) - if tag.kind == "ref": - references[tag.name].append(rel_fname) - - # If no references, use definitions as references - if not references: - references = dict((k, list(v)) for k, v in defines.items()) - - idents = set(defines.keys()).intersection(set(references.keys())) - - # Create graph - G = nx.MultiDiGraph() - - for ident in idents: - definers = defines[ident] - mul = ( - 2.0 - if ident in mentioned_idents - else 0.5 if ident.startswith("_") else 1.0 - ) - - for referencer, num_refs in Counter(references[ident]).items(): - for definer in definers: - # Scale down high-frequency mentions - num_refs = math.sqrt(num_refs) - G.add_edge(referencer, definer, weight=mul * num_refs, ident=ident) - - # Calculate PageRank - try: - ranked = nx.pagerank(G, weight="weight", personalization=personalization) - except ZeroDivisionError: - print_if_verbose( - "Exiting get_ranked_tags due to ZeroDivisionError in PageRank calculation", - self.verbose, - ) - return [] - - # Distribute rank across edges - ranked_definitions = defaultdict(float) - for src in G.nodes: - src_rank = ranked[src] - total_weight = sum( - data["weight"] for _src, _dst, data in G.out_edges(src, data=True) - ) - for _src, dst, data in G.out_edges(src, data=True): - data["rank"] = src_rank * data["weight"] / total_weight - ident = data["ident"] - ranked_definitions[(dst, ident)] += data["rank"] - - ranked_tags = [] - ranked_definitions = sorted( - ranked_definitions.items(), reverse=True, key=lambda x: x[1] - ) - - # Generate ranked tags - for (fname, ident), rank in ranked_definitions: - if fname not in chat_rel_fnames: - ranked_tags.extend(definitions.get((fname, ident), [])) - - rel_other_fnames_without_tags = set( - get_rel_fname(self.root, fname) for fname in other_fnames - ) - - fnames_already_included = set(rt[0] for rt in ranked_tags) - - # Add 
remaining files to ranked tags - top_rank = sorted( - [(rank, node) for (node, rank) in ranked.items()], reverse=True - ) - for rank, fname in top_rank: - if fname in rel_other_fnames_without_tags: - rel_other_fnames_without_tags.remove(fname) - if fname not in fnames_already_included: - ranked_tags.append((fname,)) - - for fname in rel_other_fnames_without_tags: - ranked_tags.append((fname,)) - - return ranked_tags - - def get_ranked_tags_map( - self, - chat_fnames, - other_fnames=None, - max_map_tokens=None, - mentioned_fnames=None, - mentioned_idents=None, - ): - """ - Generate a ranked tags map for the repository. - - :param chat_fnames: Files currently in the chat - :param other_fnames: Other files in the repository - :param max_map_tokens: Maximum number of tokens for the map - :param mentioned_fnames: Mentioned file names - :param mentioned_idents: Mentioned identifiers - :return: Formatted string of the ranked tags map - """ - # print("Starting get_ranked_tags_map") - if not other_fnames: - other_fnames = [] - if not max_map_tokens: - max_map_tokens = self.max_map_tokens - - mentioned_fnames = mentioned_fnames or set() - mentioned_idents = mentioned_idents or set() - - ranked_tags = self.get_ranked_tags( - chat_fnames, other_fnames, mentioned_fnames, mentioned_idents - ) - - num_tags = len(ranked_tags) - lower_bound = 0 - upper_bound = num_tags - best_tree = None - best_tree_tokens = 0 - - chat_rel_fnames = [get_rel_fname(self.root, fname) for fname in chat_fnames] - middle = min(max_map_tokens // 25, num_tags) - - self.tree_cache = {} - - while lower_bound <= upper_bound: - tree = self.to_tree(ranked_tags[:middle], chat_rel_fnames) - num_tokens = self.token_count(tree) - - if best_tree_tokens < num_tokens < max_map_tokens: - best_tree = tree - best_tree_tokens = num_tokens - - if num_tokens < max_map_tokens: - lower_bound = middle + 1 - else: - upper_bound = middle - 1 - - middle = (lower_bound + upper_bound) // 2 - - return best_tree - - def render_tree(self, abs_fname, rel_fname, lois): - key = (rel_fname, tuple(sorted(lois))) - - if key in self.tree_cache: - return self.tree_cache[key] - - # use python to read the file - with open(abs_fname, "r", encoding="utf-8") as file: - code = file.read() - if not code.endswith("\n"): - code += "\n" - context = TreeContext( - rel_fname, - code, - color=False, - line_number=False, - child_context=False, - last_line=False, - margin=0, - mark_lois=False, - loi_pad=0, - # header_max=30, - show_top_of_file_parent_scope=False, - ) - - context.add_lines_of_interest(lois) - context.add_context() - res = context.format() - self.tree_cache[key] = res - return res - - def to_tree(self, tags, chat_rel_fnames): - if not tags: - return "" - - tags = [tag for tag in tags if tag[0] not in chat_rel_fnames] - tags = sorted(tags) - - cur_fname = None - cur_abs_fname = None - lois = None - output = "" - - # add a bogus tag at the end so we trip the this_fname != cur_fname... - dummy_tag = (None,) - for tag in tags + [dummy_tag]: - this_rel_fname = tag[0] - - # ... here ... 
to output the final real entry in the list - if this_rel_fname != cur_fname: - if lois is not None: - output += "\n" - if cur_fname is not None: - output += cur_fname + ":\n" - lang = filename_to_lang(cur_abs_fname) - if lang: - output += self.render_tree(cur_abs_fname, cur_fname, lois) - if lang is None: - # print("Skipping : ", cur_abs_fname) - continue - lois = None - elif cur_fname: - output += "\n" + cur_fname + "\n" - if isinstance(tag, Tag): - lois = [] - cur_abs_fname = tag.fname - cur_fname = this_rel_fname - - if lois is not None: - lois.append(tag.line) - - # truncate long lines, in case we get minified js or something else crazy - output = "\n".join([line[:100] for line in output.splitlines()]) + "\n" - - return output - - def delete_cache(self): - """Delete the tags cache.""" - cache_path = Path(self.root) / self.TAGS_CACHE_DIR - # print("Deleting cache: ", cache_path) - if cache_path.exists(): - # Remove all files and subdirectories - for item in cache_path.glob("*"): - if item.is_file(): - item.unlink() - elif item.is_dir(): - shutil.rmtree(item) - # print(f"Cache contents deleted: {cache_path}") - else: - # print("No cache found to delete.") - from diskcache import Cache # pylint: disable=C0415 - - # Reset the cache object - self.TAGS_CACHE = Cache(cache_path) - self.cache_missing = True diff --git a/python/composio/tools/local/codeanalysis/actions/create_codemap.py b/python/composio/tools/local/codeanalysis/actions/create_codemap.py index 96e31e5dd6a..678b7a5d5ab 100644 --- a/python/composio/tools/local/codeanalysis/actions/create_codemap.py +++ b/python/composio/tools/local/codeanalysis/actions/create_codemap.py @@ -1,6 +1,5 @@ import json import os -import shutil from enum import Enum from pathlib import Path from typing import Any, Dict @@ -9,11 +8,7 @@ from composio.tools.base.exceptions import ExecutionFailed from composio.tools.base.local import LocalAction -from composio.tools.local.codeanalysis.constants import ( - CODE_MAP_CACHE, - FQDN_FILE, - TREE_SITTER_FOLDER, -) +from composio.tools.local.codeanalysis.constants import CODE_MAP_CACHE, FQDN_FILE from composio.tools.local.codeanalysis.tool_utils import retry_handler from composio.utils.logging import get as get_logger @@ -91,7 +86,6 @@ def execute( repo_name = os.path.basename(self.REPO_DIR) self.save_dir = f"{CODE_MAP_CACHE}/{repo_name}" os.makedirs(self.save_dir, exist_ok=True) - os.makedirs(TREE_SITTER_FOLDER, exist_ok=True) self.fqdn_cache_file = os.path.join(self.save_dir, FQDN_FILE) self._process(status, metadata) @@ -172,7 +166,6 @@ def create_index(self, is_python: bool): embedder.get_vector_store_from_chunks(self.REPO_DIR, documents, ids, metadatas) logger.info(f"Successfully created index for {len(python_files)} files.") - shutil.rmtree(TREE_SITTER_FOLDER) def load_all_fqdns(self): """ diff --git a/python/composio/tools/local/codeanalysis/chunker.py b/python/composio/tools/local/codeanalysis/chunker.py index 0221bab64bc..226323bafda 100644 --- a/python/composio/tools/local/codeanalysis/chunker.py +++ b/python/composio/tools/local/codeanalysis/chunker.py @@ -1,11 +1,11 @@ -import os import re -import subprocess from typing import Any, Dict, List, Tuple, Union +import tree_sitter_python as tspython from tree_sitter import Language, Parser -from composio.tools.local.codeanalysis.constants import TREE_SITTER_FOLDER + +PY_LANGUAGE = Language(tspython.language()) class Span: @@ -224,37 +224,12 @@ class Chunking: language (Language): The loaded Python language for tree-sitter parsing. 
Methods: - _setup_tree_sitter(): Sets up the tree-sitter environment. - _load_language(): Loads the Python language for tree-sitter. chunk(): Chunks the given file content into smaller pieces. """ def __init__(self, repo_dir: str): - self._setup_tree_sitter() - self.language = self._load_language() self.repo_dir = repo_dir - def _setup_tree_sitter(self): - python_repo = f"{TREE_SITTER_FOLDER}/python" - if not os.path.exists(python_repo): - subprocess.run( - [ - "git", - "clone", - "https://github.com/tree-sitter/tree-sitter-python", - python_repo, - ], - check=True, - ) - - build_path = f"{TREE_SITTER_FOLDER}/build/python.so" - if not os.path.exists(build_path): - os.makedirs(os.path.dirname(build_path), exist_ok=True) - Language.build_library(build_path, [python_repo]) - - def _load_language(self) -> Language: - return Language(f"{TREE_SITTER_FOLDER}/build/python.so", "python") - def chunk( self, file_content: str, @@ -265,8 +240,7 @@ def chunk( max_chunk_size: int = 512 * 3, ) -> Tuple[List[str], List[Dict[str, Any]], List[str]]: if is_python: - parser = Parser() - parser.set_language(self.language) + parser = Parser(PY_LANGUAGE) tree = parser.parse(file_content.encode("utf-8")) source_code_bytes = file_content.encode("utf-8") diff --git a/python/composio/tools/local/codeanalysis/constants.py b/python/composio/tools/local/codeanalysis/constants.py index aca3a082935..56a3176239c 100644 --- a/python/composio/tools/local/codeanalysis/constants.py +++ b/python/composio/tools/local/codeanalysis/constants.py @@ -5,5 +5,4 @@ CODE_MAP_CACHE = os.path.join(Path.home(), ".composio/tmp") FQDN_FILE = "fqdn_cache.json" DEEPLAKE_FOLDER = "deeplake" -TREE_SITTER_FOLDER = os.path.join(CODE_MAP_CACHE, "tree_sitter_cache") EMBEDDER = "sentence-transformers/all-mpnet-base-v2" diff --git a/python/composio/tools/local/codeanalysis/tool.py b/python/composio/tools/local/codeanalysis/tool.py index 0c3defaf328..732ae18a302 100644 --- a/python/composio/tools/local/codeanalysis/tool.py +++ b/python/composio/tools/local/codeanalysis/tool.py @@ -15,10 +15,10 @@ class CodeAnalysisTool(LocalTool, autoload=True): """Code index tool.""" requires = [ - "tree_sitter==0.21.3", + "tree_sitter>=0.22.0", "deeplake>3.9,<4", "sentence-transformers", - "tree_sitter_languages", + "tree_sitter_python>=0.22.0", "git+https://github.com/DataDog/jedi.git@92d0c807b0dcd115b1ffd0a4ed21e44db127c2fb#egg=jedi", "PyJWT", # deeplake/client/client.py:41 ] diff --git a/python/composio/tools/local/codeanalysis/tree_sitter_related.py b/python/composio/tools/local/codeanalysis/tree_sitter_related.py index 7478bfb8880..d8ad05e5b02 100644 --- a/python/composio/tools/local/codeanalysis/tree_sitter_related.py +++ b/python/composio/tools/local/codeanalysis/tree_sitter_related.py @@ -1,8 +1,11 @@ import re from typing import Dict, List, Tuple -from tree_sitter import Node, Parser, Tree -from tree_sitter_languages import get_parser +import tree_sitter_python as tspython +from tree_sitter import Language, Node, Parser, Tree + + +PY_LANGUAGE = Language(tspython.language()) class SpanRelated: @@ -125,7 +128,7 @@ def fetch_nodes_of_type(file_path: str, types_allowed: List[str]) -> List[Dict]: Returns: List[Dict]: A list of dictionaries containing node details. 
""" - parser = get_parser("python") + parser = Parser(PY_LANGUAGE) with open(file_path, "r", encoding="utf-8") as file: test_code_str = file.read() @@ -262,7 +265,7 @@ def fetch_entity_artifacts(entity_body: str, entity_type: str) -> Dict[str, str] if entity_type not in ["class", "function"]: raise ValueError("Invalid entity_type. Must be 'class' or 'function'.") - parser = get_parser("python") + parser = Parser(PY_LANGUAGE) tree = fetch_tree(parser, entity_body) entity_node = tree.root_node diff --git a/python/composio/tools/toolset.py b/python/composio/tools/toolset.py index 3f2fc3fa29f..61637a6b33d 100644 --- a/python/composio/tools/toolset.py +++ b/python/composio/tools/toolset.py @@ -79,6 +79,8 @@ ParamType = t.TypeVar("ParamType") ProcessorType = te.Literal["pre", "post", "schema"] +_IS_CI: t.Optional[bool] = None + class IntegrationParams(te.TypedDict): @@ -109,6 +111,13 @@ def _check_agentops() -> bool: return agentops.get_api_key() is not None +def _is_ci(): + global _IS_CI + if _IS_CI is None: + _IS_CI = os.environ.get("CI") == "true" + return _IS_CI + + def _record_action_if_available(func: t.Callable[P, T]) -> t.Callable[P, T]: @wraps(func) def wrapper(self, *args, **kwargs): @@ -145,9 +154,11 @@ class ComposioToolSet(WithLogger): # pylint: disable=too-many-public-methods def __init_subclass__( cls, + *args: t.Any, runtime: t.Optional[str] = None, description_char_limit: t.Optional[int] = None, action_name_char_limit: t.Optional[int] = None, + **kwargs: t.Any, ) -> None: if runtime is None: warnings.warn( @@ -161,6 +172,13 @@ def __init_subclass__( ) cls._description_char_limit = description_char_limit or 1024 cls._action_name_char_limit = action_name_char_limit + if len(args) > 0 or len(kwargs) > 0: + error = ( + f"Composio toolset subclass initializer got extra {args=} and {kwargs=}" + ) + if _is_ci(): + raise RuntimeError(error) + warnings.warn(error) def __init__( self, diff --git a/python/plugins/autogen/composio_autogen/toolset.py b/python/plugins/autogen/composio_autogen/toolset.py index 5a1d76d98f7..e1bb0930406 100644 --- a/python/plugins/autogen/composio_autogen/toolset.py +++ b/python/plugins/autogen/composio_autogen/toolset.py @@ -16,6 +16,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="autogen", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for autogen framework. diff --git a/python/plugins/camel/composio_camel/toolset.py b/python/plugins/camel/composio_camel/toolset.py index 219166b568d..086b32c06fb 100644 --- a/python/plugins/camel/composio_camel/toolset.py +++ b/python/plugins/camel/composio_camel/toolset.py @@ -23,6 +23,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="camel", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for OpenAI framework. diff --git a/python/plugins/claude/composio_claude/toolset.py b/python/plugins/claude/composio_claude/toolset.py index 7038901f835..10acfd14daa 100644 --- a/python/plugins/claude/composio_claude/toolset.py +++ b/python/plugins/claude/composio_claude/toolset.py @@ -27,6 +27,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="claude", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for Anthropic Claude platform. 
diff --git a/python/plugins/crew_ai/composio_crewai/toolset.py b/python/plugins/crew_ai/composio_crewai/toolset.py index 0873c07e567..c3aa18bb9a1 100644 --- a/python/plugins/crew_ai/composio_crewai/toolset.py +++ b/python/plugins/crew_ai/composio_crewai/toolset.py @@ -12,6 +12,7 @@ class ComposioToolSet( # type: ignore[no-redef] Base, runtime="crewai", description_char_limit=1024, + action_name_char_limit=64, ): pass @@ -33,6 +34,7 @@ class ComposioToolSet( # type: ignore[no-redef] BaseComposioToolSet, runtime="crewai", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for CrewiAI framework. diff --git a/python/plugins/google/composio_google/toolset.py b/python/plugins/google/composio_google/toolset.py index 6045e66cced..f6e4b4fe6b3 100644 --- a/python/plugins/google/composio_google/toolset.py +++ b/python/plugins/google/composio_google/toolset.py @@ -24,6 +24,7 @@ class ComposioToolset( BaseComposioToolSet, runtime="google_ai", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for Google AI Python Gemini framework. diff --git a/python/plugins/griptape/composio_griptape/toolset.py b/python/plugins/griptape/composio_griptape/toolset.py index fe002b9bfae..6faa5fb11e4 100644 --- a/python/plugins/griptape/composio_griptape/toolset.py +++ b/python/plugins/griptape/composio_griptape/toolset.py @@ -19,6 +19,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="griptape", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset wrapper for Griptape framework. diff --git a/python/plugins/julep/composio_julep/toolset.py b/python/plugins/julep/composio_julep/toolset.py index a8756db14a3..3706a08cb92 100644 --- a/python/plugins/julep/composio_julep/toolset.py +++ b/python/plugins/julep/composio_julep/toolset.py @@ -13,6 +13,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="julep", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset wrapper for Julep framework. diff --git a/python/plugins/langgraph/composio_langgraph/toolset.py b/python/plugins/langgraph/composio_langgraph/toolset.py index f17b530cfbe..43b1ce22dd3 100644 --- a/python/plugins/langgraph/composio_langgraph/toolset.py +++ b/python/plugins/langgraph/composio_langgraph/toolset.py @@ -5,6 +5,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="langgraph", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for LangGraph framework. diff --git a/python/plugins/llamaindex/composio_llamaindex/toolset.py b/python/plugins/llamaindex/composio_llamaindex/toolset.py index c51c0a96caa..b9295f0a551 100644 --- a/python/plugins/llamaindex/composio_llamaindex/toolset.py +++ b/python/plugins/llamaindex/composio_llamaindex/toolset.py @@ -16,6 +16,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="llamaindex", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for LlamaIndex framework. diff --git a/python/plugins/lyzr/composio_lyzr/toolset.py b/python/plugins/lyzr/composio_lyzr/toolset.py index 8ca18cf3418..cf79a58f041 100644 --- a/python/plugins/lyzr/composio_lyzr/toolset.py +++ b/python/plugins/lyzr/composio_lyzr/toolset.py @@ -22,6 +22,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="lyzr", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for Lyzr framework. 
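
The plugin hunks in this diff each add `action_name_char_limit=64` as a class-definition keyword, and the toolset.py change earlier captures any unexpected `*args`/`**kwargs` in `__init_subclass__`, raising in CI (via the memoized CI check) and only warning otherwise. A rough, trimmed illustration of that flow; names are simplified and this is not the full Composio implementation:

```python
# Simplified sketch of the subclass-keyword flow added in composio/tools/toolset.py.
import os
import typing as t
import warnings

_IS_CI: t.Optional[bool] = None

def _is_ci() -> bool:
    # Memoize the environment check so it runs at most once per process.
    global _IS_CI
    if _IS_CI is None:
        _IS_CI = os.environ.get("CI") == "true"
    return _IS_CI

class BaseToolSet:
    def __init_subclass__(
        cls,
        *args: t.Any,
        runtime: t.Optional[str] = None,
        description_char_limit: t.Optional[int] = None,
        action_name_char_limit: t.Optional[int] = None,
        **kwargs: t.Any,
    ) -> None:
        cls._runtime = runtime
        cls._description_char_limit = description_char_limit or 1024
        cls._action_name_char_limit = action_name_char_limit
        if args or kwargs:
            error = f"toolset subclass initializer got extra {args=} and {kwargs=}"
            if _is_ci():
                raise RuntimeError(error)  # fail loudly in CI
            warnings.warn(error)  # only warn for end users

# Each plugin now declares its limits at class-definition time, e.g.:
class ExampleToolSet(BaseToolSet, runtime="example",
                     description_char_limit=1024, action_name_char_limit=64):
    pass
```

A misspelled keyword such as `action_char_limit=64` would land in `kwargs` and trip the guard, which is the failure mode the CI-only `RuntimeError` is meant to catch.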
diff --git a/python/plugins/openai/composio_openai/toolset.py b/python/plugins/openai/composio_openai/toolset.py index 6af50469b5a..abb1a6fc768 100644 --- a/python/plugins/openai/composio_openai/toolset.py +++ b/python/plugins/openai/composio_openai/toolset.py @@ -27,6 +27,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="openai", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for OpenAI framework. diff --git a/python/plugins/phidata/composio_phidata/toolset.py b/python/plugins/phidata/composio_phidata/toolset.py index c1d53637f88..e3931c872f7 100644 --- a/python/plugins/phidata/composio_phidata/toolset.py +++ b/python/plugins/phidata/composio_phidata/toolset.py @@ -19,6 +19,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="phidata", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for Phidata framework. diff --git a/python/plugins/praisonai/composio_praisonai/toolset.py b/python/plugins/praisonai/composio_praisonai/toolset.py index 724c4613e7c..5562c7ddc31 100644 --- a/python/plugins/praisonai/composio_praisonai/toolset.py +++ b/python/plugins/praisonai/composio_praisonai/toolset.py @@ -21,6 +21,7 @@ class ComposioToolSet( BaseComposioToolSet, runtime="praisonai", description_char_limit=1024, + action_name_char_limit=64, ): """ Composio toolset for PraisonAI framework. diff --git a/python/setup.py b/python/setup.py index cddc2e3e128..cd4160979cd 100644 --- a/python/setup.py +++ b/python/setup.py @@ -69,8 +69,6 @@ def scan_for_package_data( ] tools_requirements = [ - "tree_sitter_languages", - "tree_sitter==0.21.3", "pygments", "pathspec", "diskcache", diff --git a/python/tox.ini b/python/tox.ini index 16107460eab..3edce8485e2 100644 --- a/python/tox.ini +++ b/python/tox.ini @@ -91,7 +91,7 @@ deps = codecov==2.1.13 pytest-codecov==0.5.1 typing_extensions>=4.10.0 - tree_sitter==0.21.3# codeanalysis + tree_sitter>=0.22.0 # codeanalysis python-dotenv==1.0.1 ; composio_langgraph==0.5.13 langgraph==0.2.16 @@ -100,7 +100,7 @@ deps = git+https://github.com/DataDog/jedi.git@92d0c807b0dcd115b1ffd0a4ed21e44db127c2fb#egg=jedi # codeanalysis libcst # codeanalysis sentence_transformers # codeanalysis - tree_sitter_languages # codeanalysis + tree_sitter_python>=0.22.0 # codeanalysis PyJWT # deeplake/client/client.py:41 e2b>=0.17.2a37 # E2B Workspace e2b-code-interpreter # E2B workspace
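
With the setup.py and tox.ini pins above swapped to `tree_sitter>=0.22.0` plus `tree_sitter_python>=0.22.0`, the kind of node collection tree_sitter_related.py performs can run entirely off the wheel-shipped grammar. A hedged sketch in that spirit; the walk and the returned fields are illustrative and not the module's exact output shape:

```python
# Illustrative node collection against the Parser(PY_LANGUAGE) setup from this diff.
import tree_sitter_python as tspython
from tree_sitter import Language, Node, Parser

PY_LANGUAGE = Language(tspython.language())

def nodes_of_type(source: bytes, types_allowed: set) -> list:
    """Collect basic details for every node whose type is in types_allowed."""
    parser = Parser(PY_LANGUAGE)
    tree = parser.parse(source)
    found = []
    stack: list = [tree.root_node]
    while stack:
        node: Node = stack.pop()
        if node.type in types_allowed:
            found.append(
                {
                    "type": node.type,
                    "start_line": node.start_point[0],
                    "end_line": node.end_point[0],
                    "text": node.text.decode("utf-8"),
                }
            )
        stack.extend(node.children)
    return found

print(nodes_of_type(b"def f():\n    pass\n", {"function_definition"}))
```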