diff --git a/Cargo.lock b/Cargo.lock index 1a5a9f8f..c48ec2a0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4281,6 +4281,7 @@ dependencies = [ "itertools 0.13.0", "mockall", "num_cpus", + "regex", "serde", "serde_json", "strum", diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 845df4dc..ef2713b6 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -29,6 +29,10 @@ doc-scrape-examples = true name = "index-codebase" path = "index_codebase.rs" +[[example]] +name = "index-codebase-reduced-context" +path = "index_codebase_reduced_context.rs" + [[example]] doc-scrape-examples = true name = "fastembed" diff --git a/examples/index_codebase_reduced_context.rs b/examples/index_codebase_reduced_context.rs new file mode 100644 index 00000000..05ceb1af --- /dev/null +++ b/examples/index_codebase_reduced_context.rs @@ -0,0 +1,75 @@ +//! # [Swiftide] Indexing the Swiftide itself example with reduced context size +//! +//! This example demonstrates how to index the Swiftide codebase itself, optimizing for a smaller context size. +//! Note that for it to work correctly you need to have OPENAI_API_KEY set, redis and qdrant +//! running. +//! +//! The pipeline will: +//! - Load all `.rs` files from the current directory +//! - Skip any nodes previously processed; hashes are based on the path and chunk (not the +//! metadata!) +//! - Generate an outline of the symbols defined in each file to be used as context in a later step and store it in the metadata +//! - Chunk the code into pieces of 10 to 2048 bytes +//! - For each chunk, generate a condensed subset of the symbols outline tailored for that specific chunk and store that in the metadata +//! - Run metadata QA on each chunk; generating questions and answers and adding metadata +//! - Embed the chunks in batches of 10, Metadata is embedded by default +//! - Store the nodes in Qdrant +//! +//! Note that metadata is copied over to smaller chunks when chunking. When making LLM requests +//! 
with lots of small chunks, consider the rate limits of the API. +//! +//! [Swiftide]: https://github.com/bosun-ai/swiftide +//! [examples]: https://github.com/bosun-ai/swiftide/blob/master/examples + +use swiftide::indexing; +use swiftide::indexing::loaders::FileLoader; +use swiftide::indexing::transformers::{ChunkCode, Embed, MetadataQACode}; +use swiftide::integrations::{self, qdrant::Qdrant, redis::Redis}; + +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt::init(); + + let openai_client = integrations::openai::OpenAI::builder() + .default_embed_model("text-embedding-3-small") + .default_prompt_model("gpt-3.5-turbo") + .build()?; + + let redis_url = std::env::var("REDIS_URL") + .as_deref() + .unwrap_or("redis://localhost:6379") + .to_owned(); + + let chunk_size = 2048; + + indexing::Pipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"])) + .filter_cached(Redis::try_from_url( + redis_url, + "swiftide-examples-codebase-reduced-context", + )?) + .then( + indexing::transformers::OutlineCodeTreeSitter::try_for_language( + "rust", + Some(chunk_size), + )?, + ) + .then(MetadataQACode::new(openai_client.clone())) + .then_chunk(ChunkCode::try_for_language_and_chunk_size( + "rust", + 10..chunk_size, + )?) 
+ .then(indexing::transformers::CompressCodeOutline::new( + openai_client.clone(), + )) + .then_in_batch(10, Embed::new(openai_client.clone())) + .then_store_with( + Qdrant::builder() + .batch_size(50) + .vector_size(1536) + .collection_name("swiftide-examples-codebase-reduced-context") + .build()?, + ) + .run() + .await?; + Ok(()) +} diff --git a/swiftide-core/src/node.rs b/swiftide-core/src/node.rs index e567e3bc..64aebc4e 100644 --- a/swiftide-core/src/node.rs +++ b/swiftide-core/src/node.rs @@ -48,6 +48,10 @@ pub struct Node { pub metadata: Metadata, /// Mode of embedding data Chunk and Metadata pub embed_mode: EmbedMode, + /// Size of the input this node was originally derived from in bytes + pub original_size: usize, + /// Offset of the chunk relative to the start of the input this node was originally derived from in bytes + pub offset: usize, } impl Debug for Node { @@ -80,8 +84,11 @@ impl Node { /// /// The other fields are set to their default values. pub fn new(chunk: impl Into) -> Node { + let chunk = chunk.into(); + let original_size = chunk.len(); Node { - chunk: chunk.into(), + chunk, + original_size, ..Default::default() } } diff --git a/swiftide-indexing/Cargo.toml b/swiftide-indexing/Cargo.toml index a75d21f7..b6cf41a2 100644 --- a/swiftide-indexing/Cargo.toml +++ b/swiftide-indexing/Cargo.toml @@ -29,6 +29,7 @@ strum = { workspace = true } strum_macros = { workspace = true } indoc = { workspace = true } +regex = "1.10.5" ignore = "0.4" text-splitter = { version = "0.14", features = ["markdown"] } @@ -42,8 +43,8 @@ test-case = { workspace = true } [features] # TODO: Should not depend on integrations, transformers that use them should be in integrations instead and re-exported from root for convencience tree-sitter = [ - "swiftide-integrations?/tree-sitter", - "dep:swiftide-integrations", + "swiftide-integrations?/tree-sitter", + "dep:swiftide-integrations", ] [lints] diff --git a/swiftide-indexing/src/loaders/file_loader.rs 
b/swiftide-indexing/src/loaders/file_loader.rs index 991cea65..d3b3b462 100644 --- a/swiftide-indexing/src/loaders/file_loader.rs +++ b/swiftide-indexing/src/loaders/file_loader.rs @@ -60,9 +60,11 @@ impl FileLoader { .map(|entry| { tracing::debug!("Reading file: {:?}", entry); let content = std::fs::read_to_string(&entry).unwrap(); + let original_size = content.len(); Node { path: entry, chunk: content, + original_size, ..Default::default() } }) @@ -99,9 +101,11 @@ impl Loader for FileLoader { tracing::debug!("Reading file: {:?}", entry); let content = std::fs::read_to_string(entry.path()).context("Failed to read file")?; + let original_size = content.len(); Ok(Node { path: entry.path().into(), chunk: content, + original_size, ..Default::default() }) }); diff --git a/swiftide-indexing/src/transformers/chunk_code.rs b/swiftide-indexing/src/transformers/chunk_code.rs index a22b5f97..9e567d97 100644 --- a/swiftide-indexing/src/transformers/chunk_code.rs +++ b/swiftide-indexing/src/transformers/chunk_code.rs @@ -90,11 +90,17 @@ impl ChunkerTransformer for ChunkCode { let split_result = self.chunker.split(&node.chunk); if let Ok(split) = split_result { + let mut offset = 0; + IndexingStream::iter(split.into_iter().map(move |chunk| { - Ok(Node { + let chunk_size = chunk.len(); + let mut node = Node { chunk, ..node.clone() - }) + }; + node.offset = offset; + offset += chunk_size; + Ok(node) })) } else { // Send the error downstream diff --git a/swiftide-indexing/src/transformers/compress_code_outline.rs b/swiftide-indexing/src/transformers/compress_code_outline.rs new file mode 100644 index 00000000..bc0e17d6 --- /dev/null +++ b/swiftide-indexing/src/transformers/compress_code_outline.rs @@ -0,0 +1,171 @@ +//! `CompressCodeOutline` is a transformer that reduces the size of the outline of the parent file of a chunk to make it more relevant to the chunk. 
+use derive_builder::Builder; +use std::sync::Arc; + +use anyhow::Result; +use async_trait::async_trait; +use swiftide_core::{indexing::Node, prompt::PromptTemplate, SimplePrompt, Transformer}; + +/// `CompressCodeOutline` rewrites the "Outline" metadata field of a chunk to +/// condense it and make it more relevant to the chunk in question. It is useful as a +/// step after chunking a file that has had an outline generated for it with `OutlineCodeTreeSitter`. +#[derive(Debug, Clone, Builder)] +#[builder(setter(into, strip_option))] +pub struct CompressCodeOutline { + #[builder(setter(custom))] + client: Arc, + #[builder(default = "default_prompt()")] + prompt_template: PromptTemplate, + #[builder(default)] + concurrency: Option, +} + +fn extract_markdown_codeblock(text: String) -> String { + let re = regex::Regex::new(r"(?sm)```\w*\n(.*?)```").unwrap(); + let captures = re.captures(text.as_str()); + captures + .map(|c| c.get(1).unwrap().as_str().to_string()) + .unwrap_or(text) +} + +impl CompressCodeOutline { + pub fn builder() -> CompressCodeOutlineBuilder { + CompressCodeOutlineBuilder::default() + } + + pub fn from_client(client: impl SimplePrompt + 'static) -> CompressCodeOutlineBuilder { + CompressCodeOutlineBuilder::default() + .client(client) + .to_owned() + } + /// Creates a new instance of `CompressCodeOutline`. + /// + /// # Arguments + /// + /// * `client` - An implementation of the `SimplePrompt` trait used to compress the outline. + /// + /// # Returns + /// + /// A new instance of `CompressCodeOutline` with the default prompt and `concurrency` unset. + pub fn new(client: impl SimplePrompt + 'static) -> Self { + Self { + client: Arc::new(client), + prompt_template: default_prompt(), + concurrency: None, + } + } + + #[must_use] + pub fn with_concurrency(mut self, concurrency: usize) -> Self { + self.concurrency = Some(concurrency); + self + } +} + +/// Returns the default prompt template for compressing a code outline. 
+/// +/// This template includes placeholders for the file outline and the code chunk. +/// +/// # Returns +/// +/// The default prompt template, loaded from `prompts/compress_code_outline.prompt.md`. +fn default_prompt() -> PromptTemplate { + include_str!("prompts/compress_code_outline.prompt.md").into() +} + +impl CompressCodeOutlineBuilder { + pub fn client(&mut self, client: impl SimplePrompt + 'static) -> &mut Self { + self.client = Some(Arc::new(client)); + self + } +} + +#[async_trait] +impl Transformer for CompressCodeOutline { + /// Asynchronously transforms a `Node` by reducing the size of the outline to make it more relevant to the chunk. + /// + /// This method uses the `SimplePrompt` client to compress the outline of the `Node` and updates the `Node` with the compressed outline. + /// + /// # Arguments + /// + /// * `node` - The `Node` to be transformed. + /// + /// # Returns + /// + /// A result containing the transformed `Node` or an error if the transformation fails. + /// + /// # Errors + /// + /// This function will return an error if the `SimplePrompt` client fails to generate a response. 
+ #[tracing::instrument(skip_all, name = "transformers.compress_code_outline")] + async fn transform_node(&self, mut node: Node) -> Result { + let maybe_outline = node.metadata.get("Outline"); + + let Some(outline) = maybe_outline else { + return Ok(node); + }; + + let prompt = self + .prompt_template + .to_prompt() + .with_context_value("outline", outline.as_str()) + .with_context_value("code", node.chunk.as_str()); + + let response = extract_markdown_codeblock(self.client.prompt(prompt).await?); + + node.metadata.insert("Outline".to_string(), response); + + Ok(node) + } + + fn concurrency(&self) -> Option { + self.concurrency + } +} + +#[cfg(test)] +mod test { + use swiftide_core::MockSimplePrompt; + + use super::*; + + #[test_log::test(tokio::test)] + async fn test_compress_code_template() { + let template = default_prompt(); + + let outline = "Relevant Outline"; + let code = "Code using outline"; + + let prompt = template + .to_prompt() + .with_context_value("outline", outline) + .with_context_value("code", code); + + insta::assert_snapshot!(prompt.render().await.unwrap()); + } + + #[tokio::test] + async fn test_compress_code_outline() { + let mut client = MockSimplePrompt::new(); + + client + .expect_prompt() + .returning(|_| Ok("RelevantOutline".to_string())); + + let transformer = CompressCodeOutline::builder() + .client(client) + .build() + .unwrap(); + let mut node = Node::new("Some text"); + node.offset = 0; + node.original_size = 100; + + node.metadata + .insert("Outline".to_string(), "Some outline".to_string()); + + let result = transformer.transform_node(node).await.unwrap(); + + assert_eq!(result.chunk, "Some text"); + assert_eq!(result.metadata.get("Outline").unwrap(), "RelevantOutline"); + } +} diff --git a/swiftide-indexing/src/transformers/metadata_qa_code.rs b/swiftide-indexing/src/transformers/metadata_qa_code.rs index 7d01dbdd..51dfb1b7 100644 --- a/swiftide-indexing/src/transformers/metadata_qa_code.rs +++ 
b/swiftide-indexing/src/transformers/metadata_qa_code.rs @@ -93,12 +93,16 @@ impl Transformer for MetadataQACode { /// This function will return an error if the `SimplePrompt` client fails to generate a response. #[tracing::instrument(skip_all, name = "transformers.metadata_qa_code")] async fn transform_node(&self, mut node: Node) -> Result { - let prompt = self + let mut prompt = self .prompt_template .to_prompt() .with_node(&node) .with_context_value("questions", self.num_questions); + if let Some(outline) = node.metadata.get("Outline") { + prompt = prompt.with_context_value("outline", outline.as_str()); + } + let response = self.client.prompt(prompt).await?; node.metadata.insert(NAME, response); @@ -128,6 +132,18 @@ mod test { insta::assert_snapshot!(prompt.render().await.unwrap()); } + #[tokio::test] + async fn test_template_with_outline() { + let template = default_prompt(); + + let prompt = template + .to_prompt() + .with_node(&Node::new("test")) + .with_context_value("questions", 5) + .with_context_value("outline", "Test outline"); + insta::assert_snapshot!(prompt.render().await.unwrap()); + } + #[tokio::test] async fn test_metadata_qacode() { let mut client = MockSimplePrompt::new(); diff --git a/swiftide-indexing/src/transformers/mod.rs b/swiftide-indexing/src/transformers/mod.rs index 680071bc..6e4f1065 100644 --- a/swiftide-indexing/src/transformers/mod.rs +++ b/swiftide-indexing/src/transformers/mod.rs @@ -9,6 +9,9 @@ #[cfg(feature = "tree-sitter")] pub mod chunk_code; +#[cfg(feature = "tree-sitter")] +pub mod outline_code_tree_sitter; + #[cfg(feature = "tree-sitter")] pub mod metadata_refs_defs_code; @@ -19,6 +22,7 @@ pub use chunk_code::ChunkCode; pub use metadata_refs_defs_code::MetadataRefsDefsCode; pub mod chunk_markdown; +pub mod compress_code_outline; pub mod embed; pub mod metadata_keywords; pub mod metadata_qa_code; @@ -26,7 +30,11 @@ pub mod metadata_qa_text; pub mod metadata_summary; pub mod metadata_title; +#[cfg(feature = "tree-sitter")] 
+pub use outline_code_tree_sitter::OutlineCodeTreeSitter; + pub use chunk_markdown::ChunkMarkdown; +pub use compress_code_outline::CompressCodeOutline; pub use embed::Embed; pub use metadata_keywords::MetadataKeywords; pub use metadata_qa_code::MetadataQACode; diff --git a/swiftide-indexing/src/transformers/outline_code_tree_sitter.rs b/swiftide-indexing/src/transformers/outline_code_tree_sitter.rs new file mode 100644 index 00000000..2ac92901 --- /dev/null +++ b/swiftide-indexing/src/transformers/outline_code_tree_sitter.rs @@ -0,0 +1,75 @@ +//! Add the outline of the code in the given file to the metadata of a node, using tree-sitter. +use anyhow::Result; +use async_trait::async_trait; +use derive_builder::Builder; + +use swiftide_core::indexing::Node; +use swiftide_core::Transformer; + +use swiftide_integrations::treesitter::{CodeOutliner, SupportedLanguages}; + +pub const NAME: &str = "Outline"; + +/// `OutlineCodeTreeSitter` adds an "Outline" field to the metadata of a node that contains +/// a summary of the code in the node. It uses the tree-sitter parser to parse the code and +/// remove any information that is less relevant for tasks that consider the file as a whole. +#[derive(Debug, Clone, Builder)] +#[builder(pattern = "owned", setter(into, strip_option))] +pub struct OutlineCodeTreeSitter { + outliner: CodeOutliner, + minimum_file_size: Option, +} + +impl OutlineCodeTreeSitter { + pub fn builder() -> OutlineCodeTreeSitterBuilder { + OutlineCodeTreeSitterBuilder::default() + } + + /// Tries to create an `OutlineCodeTreeSitter` instance for a given programming language. + /// + /// # Parameters + /// - `lang`: The programming language to be used to parse the code. It should implement `TryInto<SupportedLanguages>`. + /// + /// # Returns + /// - `Result<Self>`: Returns an instance of `OutlineCodeTreeSitter` if successful, otherwise returns an error. + /// + /// # Errors + /// - Returns an error if the language is not supported or if the `CodeOutliner` fails to build. 
+ pub fn try_for_language( + lang: impl TryInto, + minimum_file_size: Option, + ) -> Result { + Ok(Self { + outliner: CodeOutliner::builder().try_language(lang)?.build()?, + minimum_file_size, + }) + } +} + +#[async_trait] +impl Transformer for OutlineCodeTreeSitter { + /// Adds context to the metadata of a `Node` containing code in the "Outline" field. + /// + /// It uses the `CodeOutliner` to generate the context. + /// + /// # Parameters + /// - `node`: The `Node` containing the code of which the context is to be generated. + /// + /// # Returns + /// - `Node`: The same `Node` instances, with the metadata updated to include the generated context. + /// + /// # Errors + /// - If the code outlining fails, an error is sent downstream. + #[tracing::instrument(skip_all, name = "transformers.outline_code_tree_sitter")] + async fn transform_node(&self, mut node: Node) -> Result { + if let Some(minimum_file_size) = self.minimum_file_size { + if node.chunk.len() < minimum_file_size { + return Ok(node); + } + } + + let outline_result = self.outliner.outline(&node.chunk)?; + node.metadata.insert(NAME, outline_result); + Ok(node) + } +} diff --git a/swiftide-indexing/src/transformers/prompts/compress_code_outline.prompt.md b/swiftide-indexing/src/transformers/prompts/compress_code_outline.prompt.md new file mode 100644 index 00000000..708c7719 --- /dev/null +++ b/swiftide-indexing/src/transformers/prompts/compress_code_outline.prompt.md @@ -0,0 +1,19 @@ +# Filtering Code Outline +Your task is to filter the given file outline to the code chunk provided. The goal is to provide a context that is still contains the lines needed for understanding the code in the chunk whilst leaving out any irrelevant information. 
+ +## Constraints + * Only use lines from the provided context, do not add any additional information + * Ensure that the selection you make is the most appropriate for the code chunk + * Make sure you include any definitions or imports that are used in the code chunk + * You do not need to repeat the code chunk in your response, it will be appended directly after your response. + * Do not use lines that are present in the code chunk + +## Code +``` +{{ code }} +``` + +## Outline +``` +{{ outline }} +``` diff --git a/swiftide-indexing/src/transformers/prompts/metadata_qa_code.prompt.md b/swiftide-indexing/src/transformers/prompts/metadata_qa_code.prompt.md index 1c5861e6..e05b607b 100644 --- a/swiftide-indexing/src/transformers/prompts/metadata_qa_code.prompt.md +++ b/swiftide-indexing/src/transformers/prompts/metadata_qa_code.prompt.md @@ -26,7 +26,13 @@ A1: It transforms strings into integers. Q2: What other internal parts does the code use? A2: A hasher to hash the strings. ``` +{% if outline %} +## Outline of the parent file +``` +{{ outline }} +``` +{% endif %} # Code ``` diff --git a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_keywords__test__template.snap b/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_keywords__test__template.snap deleted file mode 100644 index a3e2eb2a..00000000 --- a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_keywords__test__template.snap +++ /dev/null @@ -1,28 +0,0 @@ ---- -source: swiftide/src/transformers/metadata_keywords.rs -expression: prompt.render().await.unwrap() ---- -# Task - -Your task is to generate a descriptive, concise keywords for the given text - -# Constraints - -- Only respond in the example format -- Respond with a keywords that are representative of the text -- Only include keywords that are literally included in the text -- Respond with a comma-separated list of keywords - -# Example - -Respond in the following 
example format and do not include anything else: - -``` -, -``` - -# Text - -``` -test -``` diff --git a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_qa_text__test__template.snap b/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_qa_text__test__template.snap deleted file mode 100644 index ff18e13a..00000000 --- a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_qa_text__test__template.snap +++ /dev/null @@ -1,36 +0,0 @@ ---- -source: swiftide/src/transformers/metadata_qa_text.rs -expression: prompt.render().await.unwrap() ---- -# Task - -Your task is to generate questions and answers for the given text. - -Given that somebody else might ask questions about the text, consider things like: - -- What does this text do? -- What other internal parts does the text use? -- Does this text have any dependencies? -- What are some potential use cases for this text? -- ... and so on - -# Constraints - -- Generate at most 5 questions and answers. -- Only respond in the example format -- Only respond with questions and answers that can be derived from the text. - -# Example - -Respond in the following example format and do not include anything else: - -``` -Q1: What is the capital of France? -A1: Paris. 
-``` - -# text - -``` -test -``` diff --git a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_summary__test__template.snap b/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_summary__test__template.snap deleted file mode 100644 index 73fc7fe4..00000000 --- a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_summary__test__template.snap +++ /dev/null @@ -1,27 +0,0 @@ ---- -source: swiftide/src/transformers/metadata_summary.rs -expression: prompt.render().await.unwrap() ---- -# Task - -Your task is to generate a descriptive, concise summary for the given text - -# Constraints - -- Only respond in the example format -- Respond with a summary that is accurate and descriptive without fluff -- Only include information that is included in the text - -# Example - -Respond in the following example format and do not include anything else: - -``` - -``` - -# Text - -``` -test -``` diff --git a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_title__test__template.snap b/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_title__test__template.snap deleted file mode 100644 index 9ac11648..00000000 --- a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_title__test__template.snap +++ /dev/null @@ -1,26 +0,0 @@ ---- -source: swiftide/src/transformers/metadata_title.rs -expression: prompt.render().await.unwrap() ---- -# Task - -Your task is to generate a descriptive, concise title for the given text - -# Constraints - -- Only respond in the example format -- Respond with a title that is accurate and descriptive without fluff - -# Example - -Respond in the following example format and do not include anything else: - -``` - -``` - -# Text - -``` -test -``` diff --git a/swiftide-indexing/src/transformers/snapshots/swiftide_indexing__transformers__compress_code_outline__test__compress_code_template.snap 
b/swiftide-indexing/src/transformers/snapshots/swiftide_indexing__transformers__compress_code_outline__test__compress_code_template.snap new file mode 100644 index 00000000..b71a9352 --- /dev/null +++ b/swiftide-indexing/src/transformers/snapshots/swiftide_indexing__transformers__compress_code_outline__test__compress_code_template.snap @@ -0,0 +1,23 @@ +--- +source: swiftide-indexing/src/transformers/compress_code_outline.rs +expression: prompt.render().await.unwrap() +--- +# Filtering Code Outline +Your task is to filter the given file outline to the code chunk provided. The goal is to provide a context that is still contains the lines needed for understanding the code in the chunk whilst leaving out any irrelevant information. + +## Constraints + * Only use lines from the provided context, do not add any additional information + * Ensure that the selection you make is the most appropriate for the code chunk + * Make sure you include any definitions or imports that are used in the code chunk + * You do not need to repeat the code chunk in your response, it will be appended directly after your response. 
+ * Do not use lines that are present in the code chunk + +## Code +``` +Code using outline +``` + +## Outline +``` +Relevant Outline +``` diff --git a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_qa_code__test__template.snap b/swiftide-indexing/src/transformers/snapshots/swiftide_indexing__transformers__metadata_qa_code__test__template_with_outline.snap similarity index 87% rename from swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_qa_code__test__template.snap rename to swiftide-indexing/src/transformers/snapshots/swiftide_indexing__transformers__metadata_qa_code__test__template_with_outline.snap index d5bd1e0e..c1d66ca1 100644 --- a/swiftide-indexing/src/transformers/snapshots/swiftide__transformers__metadata_qa_code__test__template.snap +++ b/swiftide-indexing/src/transformers/snapshots/swiftide_indexing__transformers__metadata_qa_code__test__template_with_outline.snap @@ -1,5 +1,5 @@ --- -source: swiftide/src/transformers/metadata_qa_code.rs +source: swiftide-indexing/src/transformers/metadata_qa_code.rs expression: prompt.render().await.unwrap() --- # Task @@ -31,6 +31,12 @@ Q2: What other internal parts does the code use? A2: A hasher to hash the strings. 
``` + +## Outline of the parent file +``` +Test outline +``` + # Code ``` diff --git a/swiftide-integrations/src/qdrant/indexing_node.rs b/swiftide-integrations/src/qdrant/indexing_node.rs index 4205f446..ad05d0d0 100644 --- a/swiftide-integrations/src/qdrant/indexing_node.rs +++ b/swiftide-integrations/src/qdrant/indexing_node.rs @@ -105,6 +105,8 @@ mod tests { #[test_case( Node { id: Some(1), path: "/path".into(), chunk: "data".into(), vectors: Some(HashMap::from([(EmbeddedField::Chunk, vec![1.0])])), + original_size: 4, + offset: 0, metadata: Metadata::from([("m1", "mv1")]), embed_mode: swiftide_core::indexing::EmbedMode::SingleWithMetadata }, @@ -112,7 +114,7 @@ mod tests { PointStruct { id: Some(PointId::from(6_516_159_902_038_153_111)), payload: HashMap::from([ ("content".into(), Value::from("data")), ("path".into(), Value::from("/path")), - ("m1".into(), Value::from("mv1"))]), + ("m1".into(), Value::from("mv1"))]), vectors: Some(Vectors { vectors_options: Some(VectorsOptions::Vector(Vector { data: vec![1.0], ..Default::default()} )) }) }; "Node with single vector creates struct with unnamed vector" @@ -124,13 +126,15 @@ mod tests { (EmbeddedField::Metadata("m1".into()), vec![2.0]) ])), metadata: Metadata::from([("m1", "mv1")]), - embed_mode: swiftide_core::indexing::EmbedMode::PerField + embed_mode: swiftide_core::indexing::EmbedMode::PerField, + original_size: 4, + offset: 0 }, HashSet::from([EmbeddedField::Chunk, EmbeddedField::Metadata("m1".into())]), PointStruct { id: Some(PointId::from(6_516_159_902_038_153_111)), payload: HashMap::from([ ("content".into(), Value::from("data")), ("path".into(), Value::from("/path")), - ("m1".into(), Value::from("mv1"))]), + ("m1".into(), Value::from("mv1"))]), vectors: Some(Vectors { vectors_options: Some(VectorsOptions::Vectors(NamedVectors { vectors: HashMap::from([ ("Chunk".into(), qdrant_client::qdrant::Vector { data: vec![1.0], ..Default::default() @@ -151,14 +155,16 @@ mod tests { 
(EmbeddedField::Metadata("m2".into()), vec![2.0]) ])), metadata: Metadata::from([("m1", "mv1"), ("m2", "mv2")]), - embed_mode: swiftide_core::indexing::EmbedMode::Both + embed_mode: swiftide_core::indexing::EmbedMode::Both, + original_size: 4, + offset: 0, }, HashSet::from([EmbeddedField::Combined]), PointStruct { id: Some(PointId::from(6_516_159_902_038_153_111)), payload: HashMap::from([ ("content".into(), Value::from("data")), ("path".into(), Value::from("/path")), - ("m1".into(), Value::from("mv1")), - ("m2".into(), Value::from("mv2"))]), + ("m1".into(), Value::from("mv1")), + ("m2".into(), Value::from("mv2"))]), vectors: Some(Vectors { vectors_options: Some(VectorsOptions::Vectors(NamedVectors { vectors: HashMap::from([ ("Combined".into(), qdrant_client::qdrant::Vector { data: vec![1.0], ..Default::default() diff --git a/swiftide-integrations/src/scraping/loader.rs b/swiftide-integrations/src/scraping/loader.rs index c1af1357..dd4adeb5 100644 --- a/swiftide-integrations/src/scraping/loader.rs +++ b/swiftide-integrations/src/scraping/loader.rs @@ -51,8 +51,11 @@ impl Loader for ScrapingLoader { let _recv_thread = tokio::spawn(async move { while let Ok(res) = spider_rx.recv().await { + let html = res.get_html(); + let original_size = html.len(); let node = Node { - chunk: res.get_html(), + chunk: html, + original_size, // TODO: Probably not the best way to represent this // and will fail. Can we add more metadata too? path: res.get_url().into(), diff --git a/swiftide-integrations/src/treesitter/mod.rs b/swiftide-integrations/src/treesitter/mod.rs index 4733d76b..ef69116f 100644 --- a/swiftide-integrations/src/treesitter/mod.rs +++ b/swiftide-integrations/src/treesitter/mod.rs @@ -1,9 +1,11 @@ //! 
Chunking code with tree-sitter and various tools mod code_tree; +mod outliner; mod queries; mod splitter; mod supported_languages; pub use code_tree::{CodeParser, CodeTree, ReferencesAndDefinitions}; +pub use outliner::{CodeOutliner, CodeOutlinerBuilder}; pub use splitter::{ChunkSize, CodeSplitter, CodeSplitterBuilder}; pub use supported_languages::SupportedLanguages; diff --git a/swiftide-integrations/src/treesitter/outliner.rs b/swiftide-integrations/src/treesitter/outliner.rs new file mode 100644 index 00000000..c63fc3d3 --- /dev/null +++ b/swiftide-integrations/src/treesitter/outliner.rs @@ -0,0 +1,301 @@ +use anyhow::{Context as _, Result}; +use tree_sitter::{Node, Parser, TreeCursor}; + +use derive_builder::Builder; + +use super::supported_languages::SupportedLanguages; + +#[derive(Debug, Builder, Clone)] +/// Generates a summary of a code file. +/// +/// It does so by parsing the code file and removing function bodies, leaving only the function signatures and +/// other top-level declarations along with any comments. +/// +/// The resulting summary can be used as a context when considering subsets of the code file, or for determining +/// relevance of the code file to a given task. +/// +#[builder(setter(into), build_fn(error = "anyhow::Error"))] +pub struct CodeOutliner { + #[builder(setter(custom))] + language: SupportedLanguages, +} + +impl CodeOutlinerBuilder { + /// Attempts to set the language for the `CodeOutliner`. + /// + /// # Arguments + /// + /// * `language` - A value that can be converted into `SupportedLanguages`. + /// + /// # Returns + /// + /// * `Result<Self>` - The builder instance with the language set, or an error if the language is not supported. + /// + /// # Errors + /// * If the language is not supported, an error is returned. 
+ pub fn try_language(mut self, language: impl TryInto<SupportedLanguages>) -> Result<Self> { + self.language = Some( + language + .try_into() + .ok() + .context("Treesitter language not supported")?, + ); + Ok(self) + } +} + +impl CodeOutliner { + /// Creates a new `CodeOutliner` with the specified language + /// + /// # Arguments + /// + /// * `language` - The programming language for which the code will be outlined. + /// + /// # Returns + /// + /// * `Self` - A new instance of `CodeOutliner`. + pub fn new(language: SupportedLanguages) -> Self { + Self { language } + } + + /// Creates a new builder for `CodeOutliner`. + /// + /// # Returns + /// + /// * `CodeOutlinerBuilder` - A new builder instance for `CodeOutliner`. + pub fn builder() -> CodeOutlinerBuilder { + CodeOutlinerBuilder::default() + } + + /// Outlines a code file. + /// + /// # Arguments + /// + /// * `code` - The source code to be outlined. + /// + /// # Returns + /// + /// * `Result<String>` - A result containing the outline as a string, or an error if the code could not be parsed. + /// + /// # Errors + /// * If the code could not be parsed or contains invalid syntax, an error is returned. 
+ pub fn outline(&self, code: &str) -> Result<String> { + let mut parser = Parser::new(); + parser.set_language(&self.language.into())?; + let tree = parser.parse(code, None).context("No nodes found")?; + let root_node = tree.root_node(); + + if root_node.has_error() { + anyhow::bail!("Root node has invalid syntax"); + } + + let mut cursor = root_node.walk(); + let mut summary = String::with_capacity(code.len()); + let mut last_end = 0; + self.outline_node(&mut cursor, code, &mut summary, &mut last_end); + Ok(summary) + } + + fn is_unneeded_node(&self, node: Node) -> bool { + match self.language { + SupportedLanguages::Rust => matches!(node.kind(), "block"), + SupportedLanguages::Typescript | SupportedLanguages::Javascript => { + matches!(node.kind(), "statement_block") + } + SupportedLanguages::Python => match node.kind() { + "block" => { + let parent = node.parent().expect("Python block node has no parent"); + parent.kind() == "function_definition" + } + _ => false, + }, + SupportedLanguages::Ruby => match node.kind() { + "body_statement" => { + let parent = node + .parent() + .expect("Ruby body_statement node has no parent"); + parent.kind() == "method" + } + _ => false, + }, + } + } + + /// Outlines the syntax node under the cursor and its following siblings, appending the result to `summary`. + /// + /// # Arguments + /// + /// * `cursor` - A tree cursor positioned on the first node to outline. + /// * `source` - The source code as a string. + /// * `summary` - The outline built so far; kept source text is appended to it. + /// * `last_end` - The byte offset in `source` up to which text has already been copied or skipped. + /// + /// Nothing is returned: the outline accumulates in `summary` and `last_end` is advanced as nodes are visited. + /// Unneeded nodes (see `is_unneeded_node`) are skipped without descending into them. 
+ fn outline_node( + &self, + cursor: &mut TreeCursor, + source: &str, + summary: &mut String, + last_end: &mut usize, + ) { + let node = cursor.node(); + // If the node is not needed in the summary, skip it and go to the next sibling + if self.is_unneeded_node(node) { + summary.push_str(&source[*last_end..node.start_byte()]); + *last_end = node.end_byte(); + if cursor.goto_next_sibling() { + self.outline_node(cursor, source, summary, last_end); + } + return; + } + + let mut next_cursor = cursor.clone(); + + // If the node is a non-leaf, recursively outline its children + if next_cursor.goto_first_child() { + self.outline_node(&mut next_cursor, source, summary, last_end); + // If the node is a leaf, add the text to the summary + } else { + summary.push_str(&source[*last_end..node.end_byte()]); + *last_end = node.end_byte(); + } + + if cursor.goto_next_sibling() { + self.outline_node(cursor, source, summary, last_end); + } else { + // Done with this node + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // Test every supported language. + // We should strip away all code blocks and leave only imports, comments, function signatures, + // class, interface and structure definitions and definitions of constants, variables and other members. 
+ #[test] + fn test_outline_rust() { + let code = r#" +use anyhow::{Context as _, Result}; +// This is a comment +fn main(a: usize, b: usize) -> usize { + println!("Hello, world!"); +} + +pub struct Bla { + a: usize +} + +impl Bla { + fn ok(&mut self) { + self.a = 1; + } +}"#; + let outliner = CodeOutliner::new(SupportedLanguages::Rust); + let summary = outliner.outline(code).unwrap(); + assert_eq!( + summary, + "\nuse anyhow::{Context as _, Result};\n// This is a comment\nfn main(a: usize, b: usize) -> usize \n\npub struct Bla {\n a: usize\n}\n\nimpl Bla {\n fn ok(&mut self) \n}" + ); + } + + #[test] + fn test_outline_typescript() { + let code = r#" +import { Context as _, Result } from 'anyhow'; +// This is a comment +function main(a: number, b: number): number { + console.log("Hello, world!"); +} + +export class Bla { + a: number; +} + +export interface Bla { + ok(): void; +}"#; + let outliner = CodeOutliner::new(SupportedLanguages::Typescript); + let summary = outliner.outline(code).unwrap(); + assert_eq!( + summary, + "\nimport { Context as _, Result } from 'anyhow';\n// This is a comment\nfunction main(a: number, b: number): number \n\nexport class Bla {\n a: number;\n}\n\nexport interface Bla {\n ok(): void;\n}" + ); + } + + #[test] + fn test_outline_python() { + let code = r#" +import sys +# This is a comment +def main(a: int, b: int) -> int: + print("Hello, world!") + +class Bla: + def __init__(self): + self.a = 1 + + def ok(self): + self.a = 1 +"#; + let outliner = CodeOutliner::new(SupportedLanguages::Python); + let summary = outliner.outline(code).unwrap(); + assert_eq!( + summary, + "\nimport sys\n# This is a comment\ndef main(a: int, b: int) -> int:\n \n\nclass Bla:\n def __init__(self):\n \n\n def ok(self):\n " + ); + } + + #[test] + fn test_outline_ruby() { + let code = r#" +require 'anyhow' +# This is a comment +def main(a, b) + puts "Hello, world!" 
+end + +class Bla + def ok + @a = 1 + end +end +"#; + let outliner = CodeOutliner::new(SupportedLanguages::Ruby); + let summary = outliner.outline(code).unwrap(); + assert_eq!( + summary, + "\nrequire 'anyhow'\n# This is a comment\ndef main(a, b)\n \nend\n\nclass Bla\n def ok\n \n end\nend" + ); + } + + #[test] + fn test_outline_javascript() { + let code = r#" +import { Context as _, Result } from 'anyhow'; +// This is a comment +function main(a, b) { + console.log("Hello, world!"); +} + +class Bla { + constructor() { + this.a = 1; + } + + ok() { + this.a = 1; + } +} +"#; + let outliner = CodeOutliner::new(SupportedLanguages::Javascript); + let summary = outliner.outline(code).unwrap(); + assert_eq!( + summary, + "\nimport { Context as _, Result } from 'anyhow';\n// This is a comment\nfunction main(a, b) \n\nclass Bla {\n constructor() \n\n ok() \n}" + ); + } +}