From d2a9ea1e7afa6f192bf9c32bbb54d9bb6e46472e Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Sun, 7 Jul 2024 19:19:08 +0200 Subject: [PATCH] chore: enable clippy pedantic (#132) --- .github/workflows/test.yml | 4 +- CHANGELOG.md | 31 ++++--- swiftide/Cargo.toml | 12 ++- swiftide/src/indexing/node.rs | 2 +- swiftide/src/indexing/pipeline.rs | 41 +++++++-- swiftide/src/ingestion/ingestion_stream.rs | 85 +++++++++++++++++++ swiftide/src/integrations/aws_bedrock/mod.rs | 11 +-- .../integrations/aws_bedrock/models/mod.rs | 4 +- swiftide/src/integrations/fastembed/mod.rs | 11 ++- swiftide/src/integrations/openai/mod.rs | 10 +-- .../src/integrations/openai/simple_prompt.rs | 2 +- swiftide/src/integrations/qdrant/mod.rs | 17 +++- swiftide/src/integrations/redis/mod.rs | 4 + swiftide/src/integrations/redis/persist.rs | 4 +- .../scraping/html_to_markdown_transformer.rs | 4 +- .../src/integrations/treesitter/splitter.rs | 28 +++--- swiftide/src/loaders/file_loader.rs | 26 +++--- swiftide/src/persist/memory_storage.rs | 8 +- swiftide/src/transformers/chunk_code.rs | 4 +- swiftide/src/transformers/chunk_markdown.rs | 7 +- swiftide/src/transformers/embed.rs | 3 +- .../src/transformers/metadata_keywords.rs | 9 +- swiftide/src/transformers/metadata_qa_code.rs | 5 +- swiftide/src/transformers/metadata_qa_text.rs | 7 +- swiftide/src/transformers/metadata_summary.rs | 7 +- swiftide/src/transformers/metadata_title.rs | 7 +- swiftide/tests/indexing_pipeline.rs | 6 +- 27 files changed, 261 insertions(+), 98 deletions(-) create mode 100644 swiftide/src/ingestion/ingestion_stream.rs diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4d108c9c..24a010bc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -58,6 +58,6 @@ jobs: - uses: dtolnay/rust-toolchain@stable with: components: clippy,rustfmt + - name: Cache Cargo dependencies + uses: Swatinem/rust-cache@v2 - uses: r7kamura/rust-problem-matchers@v1 - - name: "Clippy" - run: cargo clippy 
--all-targets --all-features diff --git a/CHANGELOG.md b/CHANGELOG.md index 311e50c5..ded56008 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,16 +7,8 @@ All notable changes to this project will be documented in this file. ### Bug Fixes -- [46b3cbc](https://github.com/bosun-ai/swiftide/commit/46b3cbc114d522efd1afab05ac33b46fcfbd9159) *(changelog)* Exclude bots from detailed lines - - [9334934](https://github.com/bosun-ai/swiftide/commit/9334934e4af92b35dbc61e1f92aa90abac29ca12) *(chunkcode)* Use correct chunksizes by @timonv in [#122](https://github.com/bosun-ai/swiftide/pull/122) -- [af0775e](https://github.com/bosun-ai/swiftide/commit/af0775ec4b635318d4a2fed8df1783fdd028983e) *(ci)* Minimal components - -- [9445777](https://github.com/bosun-ai/swiftide/commit/9445777e6cd462874b426ff698447c3e8f0fd5f4) *(ci)* Remove cache - -- [dd70537](https://github.com/bosun-ai/swiftide/commit/dd7053748801c44fa644b770121b56f5d60ee390) *(ci)* Job cleanup and separate cache for coverage - - [dba29a0](https://github.com/bosun-ai/swiftide/commit/dba29a07fa68589151536b5ba197a69ff339ad01) *(ci)* Ensure clippy runs with all features - [3b98334](https://github.com/bosun-ai/swiftide/commit/3b98334b2bf78cfe9c957bfa1dd3cd7c939b6c39) *(deps)* Update rust crate serde_json to v1.0.120 by @renovate[bot] in [#115](https://github.com/bosun-ai/swiftide/pull/115) @@ -33,7 +25,7 @@ All notable changes to this project will be documented in this file. --------- ```` -- [c73377f](https://github.com/bosun-ai/swiftide/commit/c73377fb695412eaa329ed937731074288088097) *(uncategorized)* Clippy +- [b498074](https://github.com/bosun-ai/swiftide/commit/b4980746b55073ce870bc897aef6721d10883acd) *(uncategorized)* Clippy ### Documentation @@ -49,7 +41,26 @@ All notable changes to this project will be documented in this file. 
- [bd72c6a](https://github.com/bosun-ai/swiftide/commit/bd72c6a62228deed722bbc22bdcd389843cde453) *(ci)* Coverage using llvm-cov -- [ad77a5f](https://github.com/bosun-ai/swiftide/commit/ad77a5faea79708de5dfee3dc0ef7ff170eebf01) *(uncategorized)* Properly configure typos +- [51c114c](https://github.com/bosun-ai/swiftide/commit/51c114ceb06db840c4952d3d0f694bfbf266681c) *(uncategorized)* Various tooling & community improvements by @timonv in [#131](https://github.com/bosun-ai/swiftide/pull/131) + + ````text + - **fix(ci): ensure clippy runs with all features** + - **chore(ci): coverage using llvm-cov** + - **chore: drastically improve changelog generation** + - **chore(ci): add sanity checks for pull requests** + - **chore(ci): split jobs and add typos** + ```` + +- [84dd65d](https://github.com/bosun-ai/swiftide/commit/84dd65dc6c0ff4595f27ed061a4f4c0a2dae7202) *(uncategorized)* Rename all mentions of ingest to index by @timonv in [#130](https://github.com/bosun-ai/swiftide/pull/130) [**breaking**] + + ````text + Swiftide is not an ingestion pipeline (loading data), but an indexing + pipeline (prepping for search). + + There is now a temporary, deprecated re-export to match the previous api. 
+ ```` + +- [d7d318e](https://github.com/bosun-ai/swiftide/commit/d7d318e60d42a1fce58c08e296c0aeac2674b32b) *(uncategorized)* Enable clippy pedantic - [88429f9](https://github.com/bosun-ai/swiftide/commit/88429f9730c43e44d5707c3d1615f8509a3f2a24) *(uncategorized)* Drastically improve changelog generation diff --git a/swiftide/Cargo.toml b/swiftide/Cargo.toml index 9d222fbc..50cb2f77 100644 --- a/swiftide/Cargo.toml +++ b/swiftide/Cargo.toml @@ -7,7 +7,7 @@ readme = "../README.md" keywords = ["llm", "rag", "ai", "data", "openai"] description = "Blazing fast, streaming document and code indexation" categories = ["asynchronous"] -licence-file = "../LICENSE" +repository = "https://github.com/bosun-ai/swiftide-rs" [dependencies] anyhow = { version = "1.0.86", features = ["backtrace"] } @@ -94,8 +94,18 @@ mockall = "0.12.1" temp-dir = "0.1.13" wiremock = "0.6.0" +[lints.rust] +unsafe_code = "forbid" + [lints.clippy] +cargo = { level = "warn", priority = -1 } +pedantic = { level = "warn", priority = -1 } blocks_in_conditions = "allow" +must_use_candidate = "allow" +module_name_repetitions = "allow" +missing_fields_in_debug = "allow" +# Should be fixed asap +multiple_crate_versions = "allow" [package.metadata.docs.rs] all-features = true diff --git a/swiftide/src/indexing/node.rs b/swiftide/src/indexing/node.rs index bc3eaf73..9bfd21f6 100644 --- a/swiftide/src/indexing/node.rs +++ b/swiftide/src/indexing/node.rs @@ -88,7 +88,7 @@ impl Node { let metadata = self .metadata .iter() - .map(|(k, v)| format!("{}: {}", k, v)) + .map(|(k, v)| format!("{k}: {v}")) .collect::>() .join("\n"); diff --git a/swiftide/src/indexing/pipeline.rs b/swiftide/src/indexing/pipeline.rs index a29fc2cf..f633d94b 100644 --- a/swiftide/src/indexing/pipeline.rs +++ b/swiftide/src/indexing/pipeline.rs @@ -29,7 +29,7 @@ impl Default for Pipeline { fn default() -> Self { Self { stream: IndexingStream::empty(), - storage: Default::default(), + storage: Vec::default(), 
concurrency: num_cpus::get(), } } @@ -78,6 +78,7 @@ impl Pipeline { /// # Returns /// /// An instance of `Pipeline` with the updated concurrency level. + #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = concurrency; self @@ -92,6 +93,7 @@ impl Pipeline { /// # Returns /// /// An instance of `Pipeline` with the updated stream that filters out cached nodes. + #[must_use] pub fn filter_cached(mut self, cache: impl NodeCache + 'static) -> Self { let cache = Arc::new(cache); self.stream = self @@ -101,13 +103,13 @@ impl Pipeline { let span = tracing::trace_span!("filter_cached", node_cache = ?cache, node = ?node ); async move { - if !cache.get(&node).await { + if cache.get(&node).await { + tracing::debug!("Node in cache, skipping"); + Ok(None) + } else { cache.set(&node).await; tracing::debug!("Node not in cache, passing through"); Ok(Some(node)) - } else { - tracing::debug!("Node in cache, skipping"); - Ok(None) } } .instrument(span) @@ -126,6 +128,7 @@ impl Pipeline { /// # Returns /// /// An instance of `Pipeline` with the updated stream that applies the transformer to each node. + #[must_use] pub fn then(mut self, transformer: impl Transformer + 'static) -> Self { let concurrency = transformer.concurrency().unwrap_or(self.concurrency); let transformer = Arc::new(transformer); @@ -154,6 +157,7 @@ impl Pipeline { /// # Returns /// /// An instance of `Pipeline` with the updated stream that applies the batch transformer to each batch of nodes. + #[must_use] pub fn then_in_batch( mut self, batch_size: usize, @@ -186,6 +190,7 @@ impl Pipeline { /// # Returns /// /// An instance of `Pipeline` with the updated stream that applies the chunker transformer to each node. 
+ #[must_use] pub fn then_chunk(mut self, chunker: impl ChunkerTransformer + 'static) -> Self { let chunker = Arc::new(chunker); let concurrency = chunker.concurrency().unwrap_or(self.concurrency); @@ -214,6 +219,12 @@ impl Pipeline { /// # Returns /// /// An instance of `Pipeline` with the configured storage backend. + /// + /// # Panics + /// + /// Panics if batch size turns out to be not set and batch storage is still invoked. + /// Pipeline only invokes batch storing if the batch size is set, so should be alright. + #[must_use] pub fn then_store_with(mut self, storage: impl Persist + 'static) -> Self { let storage = Arc::new(storage); self.storage.push(storage.clone()); @@ -259,6 +270,11 @@ impl Pipeline { /// if sending fails. /// /// They can either be run concurrently, alternated between or merged back together. + /// + /// # Panics + /// + /// Panics if the receiving pipelines buffers are full or unavailable. + #[must_use] pub fn split_by

(self, predicate: P) -> (Self, Self) where P: Fn(&Result) -> bool + Send + Sync + 'static, @@ -282,13 +298,13 @@ impl Pipeline { left_tx .send(item) .await - .expect("Failed to send to left stream") + .expect("Failed to send to left stream"); } else { tracing::debug!(?item, "Sending to right stream"); right_tx .send(item) .await - .expect("Failed to send to right stream") + .expect("Failed to send to right stream"); } } }) @@ -316,6 +332,7 @@ impl Pipeline { /// This is useful for merging two streams that have been split using the `split_by` method. /// /// The full stream can then be processed using the `run` method. + #[must_use] pub fn merge(self, other: Self) -> Self { let stream = tokio_stream::StreamExt::merge(self.stream, other.stream); @@ -327,7 +344,8 @@ impl Pipeline { /// Throttles the stream of nodes, limiting the rate to 1 per duration. /// - /// Useful for rate limiting the indexing pipeline. Uses tokio_stream::StreamExt::throttle internally which has a granualarity of 1ms. + /// Useful for rate limiting the indexing pipeline. Uses `tokio_stream::StreamExt::throttle` internally which has a granularity of 1ms. + #[must_use] pub fn throttle(mut self, duration: impl Into) -> Self { self.stream = tokio_stream::StreamExt::throttle(self.stream, duration.into()) .boxed() @@ -339,6 +357,7 @@ impl Pipeline { // // This method filters out errors encountered by the pipeline, preventing them from bubbling up and terminating the stream. // Note that errors are not logged. + #[must_use] pub fn filter_errors(mut self) -> Self { self.stream = self .stream @@ -358,6 +377,7 @@ impl Pipeline { /// This allows you to skip specific errors or nodes, or do ad hoc inspection. /// /// If the closure returns true, the result is kept, otherwise it is skipped. + #[must_use] pub fn filter(mut self, filter: F) -> Self where F: Fn(&Result) -> bool + Send + Sync + 'static, @@ -377,6 +397,7 @@ impl Pipeline { /// Logs all results processed by the pipeline. 
/// /// This method logs all results processed by the pipeline at the `DEBUG` level. + #[must_use] pub fn log_all(self) -> Self { self.log_errors().log_nodes() } @@ -384,6 +405,7 @@ impl Pipeline { /// Logs all errors encountered by the pipeline. /// /// This method logs all errors encountered by the pipeline at the `ERROR` level. + #[must_use] pub fn log_errors(mut self) -> Self { self.stream = self .stream @@ -396,6 +418,7 @@ impl Pipeline { /// Logs all nodes processed by the pipeline. /// /// This method logs all nodes processed by the pipeline at the `DEBUG` level. + #[must_use] pub fn log_nodes(mut self) -> Self { self.stream = self .stream @@ -493,7 +516,7 @@ mod tests { let mut nodes = vec![]; for i in 0..3 { let mut node = node.clone(); - node.chunk = format!("transformed_chunk_{}", i); + node.chunk = format!("transformed_chunk_{i}"); nodes.push(Ok(node)); } nodes.into() diff --git a/swiftide/src/ingestion/ingestion_stream.rs b/swiftide/src/ingestion/ingestion_stream.rs new file mode 100644 index 00000000..c67ffdfd --- /dev/null +++ b/swiftide/src/ingestion/ingestion_stream.rs @@ -0,0 +1,85 @@ +#![allow(clippy::from_over_into)] +//! This module defines the `IngestionStream` type, which is used for handling asynchronous streams of `IngestionNode` items in the ingestion pipeline. + +use anyhow::Result; +use futures_util::stream::{self, Stream}; +use pin_project_lite::pin_project; +use std::pin::Pin; +use tokio::sync::mpsc::Receiver; + +use super::IngestionNode; + +pub use futures_util::{StreamExt, TryStreamExt}; + +// We need to inform the compiler that `inner` is pinned as well +pin_project! { + /// An asynchronous stream of `IngestionNode` items. + /// + /// Wraps an internal stream of `Result` items. + /// + /// Streams, iterators and vectors of `Result` can be converted into an `IngestionStream`. 
+ pub struct IngestionStream { + #[pin] + pub(crate) inner: Pin> + Send>>, + } +} + +impl Stream for IngestionStream { + type Item = Result; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let this = self.project(); + this.inner.poll_next(cx) + } +} + +impl Into for Vec> { + fn into(self) -> IngestionStream { + IngestionStream::iter(self) + } +} + +impl Into for Result> { + fn into(self) -> IngestionStream { + match self { + Ok(nodes) => IngestionStream::iter(nodes.into_iter().map(Ok)), + Err(err) => IngestionStream::iter(vec![Err(err)]), + } + } +} + +impl Into for Pin> + Send>> { + fn into(self) -> IngestionStream { + IngestionStream { inner: self } + } +} + +impl Into for Receiver> { + fn into(self) -> IngestionStream { + IngestionStream { + inner: tokio_stream::wrappers::ReceiverStream::new(self).boxed(), + } + } +} + +impl IngestionStream { + pub fn empty() -> Self { + IngestionStream { + inner: stream::empty().boxed(), + } + } + + // NOTE: Can we really guarantee that the iterator will outlive the stream? + pub fn iter(iter: I) -> Self + where + I: IntoIterator> + Send + 'static, + ::IntoIter: Send, + { + IngestionStream { + inner: stream::iter(iter).boxed(), + } + } +} diff --git a/swiftide/src/integrations/aws_bedrock/mod.rs b/swiftide/src/integrations/aws_bedrock/mod.rs index 8e4305da..9e0c4f2e 100644 --- a/swiftide/src/integrations/aws_bedrock/mod.rs +++ b/swiftide/src/integrations/aws_bedrock/mod.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use anyhow::Result; use async_trait::async_trait; -use aws_sdk_bedrockruntime::{primitives::Blob, Client}; +use aws_sdk_bedrockruntime::{error::SdkError, primitives::Blob, Client}; use derive_builder::Builder; use serde::Serialize; use tokio::runtime::Handle; @@ -18,7 +18,7 @@ mod simple_prompt; /// An integration with the AWS Bedrock service. /// -/// Can be used as SimplePrompt. +/// Can be used as `SimplePrompt`. 
/// /// To use Bedrock, you need to have a model id and access to the service. /// By default, the aws sdk will be configured from the environment. @@ -59,7 +59,7 @@ impl BedrockPrompt for Client { .model_id(model_id) .send() .await - .map_err(|e| e.into_service_error())?; + .map_err(SdkError::into_service_error)?; Ok(response.body.into_inner()) } @@ -81,12 +81,12 @@ impl AwsBedrock { AwsBedrockBuilder::default() } - /// Build a new AwsBedrock instance with the Titan model family + /// Build a new `AwsBedrock` instance with the Titan model family pub fn build_titan_family(model_id: impl Into) -> AwsBedrockBuilder { Self::builder().titan().model_id(model_id).to_owned() } - /// Build a new AwsBedrock instance with the Anthropic model family + /// Build a new `AwsBedrock` instance with the Anthropic model family pub fn build_anthropic_family(model_id: impl Into) -> AwsBedrockBuilder { Self::builder().anthropic().model_id(model_id).to_owned() } @@ -104,6 +104,7 @@ impl AwsBedrockBuilder { self } + #[allow(clippy::unused_self)] fn default_config(&self) -> aws_config::SdkConfig { tokio::task::block_in_place(|| { Handle::current().block_on(async { aws_config::from_env().load().await }) diff --git a/swiftide/src/integrations/aws_bedrock/models/mod.rs b/swiftide/src/integrations/aws_bedrock/models/mod.rs index d32c9b7f..811c4831 100644 --- a/swiftide/src/integrations/aws_bedrock/models/mod.rs +++ b/swiftide/src/integrations/aws_bedrock/models/mod.rs @@ -75,9 +75,9 @@ impl ModelFamily { if response.results.is_empty() { return Err(anyhow::anyhow!("No results returned")); - } else { - Ok(response.results.swap_remove(0).output_text) } + + Ok(response.results.swap_remove(0).output_text) } } } diff --git a/swiftide/src/integrations/fastembed/mod.rs b/swiftide/src/integrations/fastembed/mod.rs index 8f64d7ce..83549969 100644 --- a/swiftide/src/integrations/fastembed/mod.rs +++ b/swiftide/src/integrations/fastembed/mod.rs @@ -1,4 +1,4 @@ -//! 
FastEmbed integration for text embedding. +//! `FastEmbed` integration for text embedding. use anyhow::Result; use async_trait::async_trait; @@ -7,14 +7,14 @@ use fastembed::TextEmbedding; use crate::{EmbeddingModel, Embeddings}; -/// A wrapper around the FastEmbed library for text embedding. +/// A wrapper around the `FastEmbed` library for text embedding. /// /// Supports a variety of fast text embedding models. The default is the `Flag Embedding` model /// with a dimension size of 384. /// /// See the [FastEmbed documentation](https://docs.rs/fastembed) for more information on usage. /// -/// FastEmbed can be customized by setting the embedding model via the builder. The batch size can +/// `FastEmbed` can be customized by setting the embedding model via the builder. The batch size can /// also be set and is recommended. Batch size should match the batch size in the indexing /// pipeline. /// @@ -35,6 +35,11 @@ pub struct FastEmbed { } impl FastEmbed { + /// Tries to build a default `FastEmbed` with `Flag Embedding`. + /// + /// # Errors + /// + /// Errors if the build fails pub fn try_default() -> Result { Self::builder().build() } diff --git a/swiftide/src/integrations/openai/mod.rs b/swiftide/src/integrations/openai/mod.rs index bd55f666..f3773e69 100644 --- a/swiftide/src/integrations/openai/mod.rs +++ b/swiftide/src/integrations/openai/mod.rs @@ -1,4 +1,4 @@ -//! This module provides integration with OpenAI's API, enabling the use of language models and embeddings within the Swiftide project. +//! This module provides integration with `OpenAI`'s API, enabling the use of language models and embeddings within the Swiftide project. //! It includes the `OpenAI` struct for managing API clients and default options for embedding and prompt models. //! The module is conditionally compiled based on the "openai" feature flag. 
@@ -8,12 +8,12 @@ use std::sync::Arc; mod embed; mod simple_prompt; -/// The `OpenAI` struct encapsulates an OpenAI client and default options for embedding and prompt models. +/// The `OpenAI` struct encapsulates an `OpenAI` client and default options for embedding and prompt models. /// It uses the `Builder` pattern for flexible and customizable instantiation. #[derive(Debug, Builder, Clone)] #[builder(setter(into, strip_option))] pub struct OpenAI { - /// The OpenAI client, wrapped in an `Arc` for thread-safe reference counting. + /// The `OpenAI` client, wrapped in an `Arc` for thread-safe reference counting. /// Defaults to a new instance of `async_openai::Client`. #[builder(default = "Arc::new(async_openai::Client::new())", setter(custom))] client: Arc>, @@ -50,10 +50,10 @@ impl OpenAI { } impl OpenAIBuilder { - /// Sets the OpenAI client for the `OpenAI` instance. + /// Sets the `OpenAI` client for the `OpenAI` instance. /// /// # Parameters - /// - `client`: The OpenAI client to set. + /// - `client`: The `OpenAI` client to set. /// /// # Returns /// A mutable reference to the `OpenAIBuilder`. diff --git a/swiftide/src/integrations/openai/simple_prompt.rs b/swiftide/src/integrations/openai/simple_prompt.rs index 23c98ad4..8281e526 100644 --- a/swiftide/src/integrations/openai/simple_prompt.rs +++ b/swiftide/src/integrations/openai/simple_prompt.rs @@ -1,5 +1,5 @@ //! This module provides an implementation of the `SimplePrompt` trait for the `OpenAI` struct. -//! It defines an asynchronous function to interact with the OpenAI API, allowing prompt processing +//! It defines an asynchronous function to interact with the `OpenAI` API, allowing prompt processing //! and generating responses as part of the Swiftide system. 
use crate::SimplePrompt; use async_openai::types::{ChatCompletionRequestUserMessageArgs, CreateChatCompletionRequestArgs}; diff --git a/swiftide/src/integrations/qdrant/mod.rs b/swiftide/src/integrations/qdrant/mod.rs index 719c738b..9487f049 100644 --- a/swiftide/src/integrations/qdrant/mod.rs +++ b/swiftide/src/integrations/qdrant/mod.rs @@ -29,9 +29,10 @@ const DEFAULT_QDRANT_URL: &str = "http://localhost:6334"; pub struct Qdrant { /// The Qdrant client used to interact with the Qdrant vector database. /// - /// By default the client will be build from QDRANT_URL and option QDRANT_API_KEY. - /// It will fall back to `http://localhost:6334` if QDRANT_URL is not set. + /// By default the client will be build from `QDRANT_URL` and option `QDRANT_API_KEY`. + /// It will fall back to `http://localhost:6334` if `QDRANT_URL` is not set. #[builder(setter(into), default = "self.default_client()?")] + #[allow(clippy::missing_fields_in_debug)] client: Arc, /// The name of the collection to be used in Qdrant. Defaults to "swiftide". #[builder(default = "DEFAULT_COLLECTION_NAME.to_string()")] @@ -50,7 +51,7 @@ impl Qdrant { QdrantBuilder::default() } - /// Tries to create a `QdrantBuilder` from a given URL. Will use the api key in QDRANT_API_KEY if present. + /// Tries to create a `QdrantBuilder` from a given URL. Will use the api key in `QDRANT_API_KEY` if present. /// /// Returns /// @@ -61,6 +62,10 @@ impl Qdrant { /// # Returns /// /// A `Result` containing the `QdrantBuilder` if successful, or an error otherwise. + /// + /// # Errors + /// + /// Errors if client fails build pub fn try_from_url(url: impl AsRef) -> Result { Ok(QdrantBuilder::default().client( qdrant_client::Qdrant::from_url(url.as_ref()) @@ -77,6 +82,10 @@ impl Qdrant { /// # Returns /// /// A `Result` indicating success or failure. 
+ /// + /// # Errors + /// + /// Errors if the collection check or creation fails pub async fn create_index_if_not_exists(&self) -> Result<()> { tracing::info!("Checking if collection {} exists", self.collection_name); if self.client.collection_exists(&self.collection_name).await? { @@ -96,6 +105,7 @@ impl Qdrant { } impl QdrantBuilder { + #[allow(clippy::unused_self)] fn default_client(&self) -> Result> { let client = qdrant_client::Qdrant::from_url( &std::env::var("QDRANT_URL").unwrap_or(DEFAULT_QDRANT_URL.to_string()), @@ -108,6 +118,7 @@ impl QdrantBuilder { } } +#[allow(clippy::missing_fields_in_debug)] impl std::fmt::Debug for Qdrant { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Qdrant") diff --git a/swiftide/src/integrations/redis/mod.rs b/swiftide/src/integrations/redis/mod.rs index ade7e0c7..eb0792ea 100644 --- a/swiftide/src/integrations/redis/mod.rs +++ b/swiftide/src/integrations/redis/mod.rs @@ -77,6 +77,9 @@ impl Redis { }) } + /// # Errors + /// + /// Returns an error if the Redis client cannot be opened pub fn try_build_from_url(url: impl AsRef) -> Result { Ok(RedisBuilder::default() .client(redis::Client::open(url.as_ref()).context("Failed to open redis client")?)) @@ -189,6 +192,7 @@ impl Redis { } // Redis CM does not implement debug +#[allow(clippy::missing_fields_in_debug)] impl std::fmt::Debug for Redis { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Redis") diff --git a/swiftide/src/integrations/redis/persist.rs b/swiftide/src/integrations/redis/persist.rs index 92bd8231..9e706a24 100644 --- a/swiftide/src/integrations/redis/persist.rs +++ b/swiftide/src/integrations/redis/persist.rs @@ -153,7 +153,7 @@ mod tests { for node in streamed_nodes { let stored_node = serde_json::from_str(&redis.get_node(&node).await.unwrap().unwrap()); - assert_eq!(node, stored_node.unwrap()) + assert_eq!(node, stored_node.unwrap()); } } @@ -180,6 +180,6 @@ mod tests { assert_eq!( 
redis.persist_key_for_node(&node).unwrap(), "test".to_string() - ) + ); } } diff --git a/swiftide/src/integrations/scraping/html_to_markdown_transformer.rs b/swiftide/src/integrations/scraping/html_to_markdown_transformer.rs index 65d9fd99..6f888b02 100644 --- a/swiftide/src/integrations/scraping/html_to_markdown_transformer.rs +++ b/swiftide/src/integrations/scraping/html_to_markdown_transformer.rs @@ -70,7 +70,7 @@ mod test { async fn test_html_to_markdown() { let node = Node::new("

Hello, World!

"); let transformer = HtmlToMarkdownTransformer::default(); - let transformed = transformer.transform_node(node).await.unwrap(); - assert_eq!(transformed.chunk, "# Hello, World!"); + let transformed_node = transformer.transform_node(node).await.unwrap(); + assert_eq!(transformed_node.chunk, "# Hello, World!"); } } diff --git a/swiftide/src/integrations/treesitter/splitter.rs b/swiftide/src/integrations/treesitter/splitter.rs index ee5f0ad2..8b6cc8a2 100644 --- a/swiftide/src/integrations/treesitter/splitter.rs +++ b/swiftide/src/integrations/treesitter/splitter.rs @@ -32,6 +32,10 @@ impl CodeSplitterBuilder { /// # Returns /// /// * `Result` - The builder instance with the language set, or an error if the language is not supported. + /// + /// # Errors + /// + /// Errors if language is not supported pub fn try_language(mut self, language: impl TryInto) -> Result { self.language = Some( language @@ -83,7 +87,7 @@ impl CodeSplitter { /// * `Self` - A new instance of `CodeSplitter`. pub fn new(language: SupportedLanguages) -> Self { Self { - chunk_size: Default::default(), + chunk_size: ChunkSize::default(), language, } } @@ -152,7 +156,7 @@ impl CodeSplitter { } if !current_chunk.is_empty() && current_chunk.len() > self.min_bytes() { - new_chunks.push(current_chunk) + new_chunks.push(current_chunk); } new_chunks @@ -167,6 +171,10 @@ impl CodeSplitter { /// # Returns /// /// * `Result>` - A result containing a vector of code chunks as strings, or an error if the code could not be parsed. 
+ /// + /// # Errors + /// + /// Returns an error if the node cannot be found or fails to parse pub fn split(&self, code: &str) -> Result> { let mut parser = Parser::new(); parser.set_language(&self.language.into())?; @@ -175,9 +183,9 @@ impl CodeSplitter { if root_node.has_error() { anyhow::bail!("Root node has invalid syntax"); - } else { - Ok(self.chunk_node(root_node, code, 0, None)) } + + Ok(self.chunk_node(root_node, code, 0, None)) } /// Returns the maximum number of bytes allowed in a chunk. @@ -264,7 +272,7 @@ mod test { assert!(chunks.iter().all(|chunk| chunk.len() <= 50)); assert!(chunks .windows(2) - .all(|pair| pair.iter().map(|chunk| chunk.len()).sum::() >= 50)); + .all(|pair| pair.iter().map(String::len).sum::() >= 50)); assert_eq!( chunks, @@ -272,7 +280,7 @@ mod test { "fn main() {\n println!(\"Hello, World!\");", "\n println!(\"Goodbye, World!\");\n}", ] - ) + ); } #[test] @@ -313,7 +321,7 @@ mod test { "fn main() {\n println!(\"Hello, World!\");", "\n println!(\"Goodbye, World!\");\n}", ] - ) + ); } #[test] @@ -335,7 +343,7 @@ mod test { assert!(chunks.iter().all(|chunk| chunk.len() <= 50)); assert!(chunks .windows(2) - .all(|pair| pair.iter().map(|chunk| chunk.len()).sum::() > 50)); + .all(|pair| pair.iter().map(String::len).sum::() > 50)); assert!(chunks.iter().all(|chunk| chunk.len() >= 20)); assert_eq!( @@ -344,7 +352,7 @@ mod test { "fn main() {\n println!(\"Hello, World!\");", "\n println!(\"Goodbye, World!\");\n}" ] - ) + ); } #[test] @@ -383,7 +391,7 @@ mod test { assert!(chunks.iter().all(|chunk| chunk.len() <= max)); let chunk_pairs_that_are_smaller_than_max = chunks .windows(2) - .filter(|pair| pair.iter().map(|chunk| chunk.len()).sum::() < max); + .filter(|pair| pair.iter().map(String::len).sum::() < max); assert!( chunk_pairs_that_are_smaller_than_max.clone().count() == 0, "max: {}, {} + {}, {:?}", diff --git a/swiftide/src/loaders/file_loader.rs b/swiftide/src/loaders/file_loader.rs index 242a413e..f70f3c1c 100644 --- 
a/swiftide/src/loaders/file_loader.rs +++ b/swiftide/src/loaders/file_loader.rs @@ -32,6 +32,7 @@ impl FileLoader { /// /// # Returns /// The `FileLoader` instance with the added extensions. + #[must_use] pub fn with_extensions(mut self, extensions: &[impl AsRef]) -> Self { self.extensions = Some( self.extensions @@ -52,10 +53,10 @@ impl FileLoader { /// This method will panic if it fails to read a file's content. pub fn list_nodes(&self) -> Vec { ignore::Walk::new(&self.path) - .filter_map(|entry| entry.ok()) - .filter(|entry| entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)) + .filter_map(Result::ok) + .filter(|entry| entry.file_type().is_some_and(|ft| ft.is_file())) .filter(move |entry| self.file_has_extension(entry.path())) - .map(|entry| entry.into_path()) + .map(ignore::DirEntry::into_path) .map(|entry| { tracing::debug!("Reading file: {:?}", entry); let content = std::fs::read_to_string(&entry).unwrap(); @@ -72,15 +73,12 @@ impl FileLoader { // If no extensions are specified, this function will return true. // If the file has no extension, this function will return false. fn file_has_extension(&self, path: &Path) -> bool { - self.extensions - .as_ref() - .map(|exts| { - let Some(ext) = path.extension() else { - return false; - }; - exts.iter().any(|e| e == ext.to_string_lossy().as_ref()) - }) - .unwrap_or(true) + self.extensions.as_ref().map_or(true, |exts| { + let Some(ext) = path.extension() else { + return false; + }; + exts.iter().any(|e| e == ext.to_string_lossy().as_ref()) + }) } } @@ -94,8 +92,8 @@ impl Loader for FileLoader { /// This method will return an error if it fails to read a file's content. 
fn into_stream(self) -> IndexingStream { let files = ignore::Walk::new(&self.path) - .filter_map(|entry| entry.ok()) - .filter(|entry| entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)) + .filter_map(Result::ok) + .filter(|entry| entry.file_type().is_some_and(|ft| ft.is_file())) .filter(move |entry| self.file_has_extension(entry.path())) .map(|entry| { tracing::debug!("Reading file: {:?}", entry); diff --git a/swiftide/src/persist/memory_storage.rs b/swiftide/src/persist/memory_storage.rs index 43fdfcdd..7e522cc9 100644 --- a/swiftide/src/persist/memory_storage.rs +++ b/swiftide/src/persist/memory_storage.rs @@ -132,10 +132,10 @@ mod test { .batch_store(vec![node1.clone(), node2.clone()]) .await; - let nodes: Vec = stream.try_collect().await.unwrap(); + let result: Vec = stream.try_collect().await.unwrap(); - assert_eq!(nodes.len(), 2); - assert_eq!(nodes[0], node1); - assert_eq!(nodes[1], node2); + assert_eq!(result.len(), 2); + assert_eq!(result[0], node1); + assert_eq!(result[1], node2); } } diff --git a/swiftide/src/transformers/chunk_code.rs b/swiftide/src/transformers/chunk_code.rs index f6c0bf5b..32d65231 100644 --- a/swiftide/src/transformers/chunk_code.rs +++ b/swiftide/src/transformers/chunk_code.rs @@ -61,12 +61,12 @@ impl ChunkCode { chunker: CodeSplitter::builder() .try_language(lang)? .chunk_size(chunk_size) - .build() - .expect("Failed to build code splitter"), + .build()?, concurrency: None, }) } + #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = Some(concurrency); self diff --git a/swiftide/src/transformers/chunk_markdown.rs b/swiftide/src/transformers/chunk_markdown.rs index 982ead5d..1e69244b 100644 --- a/swiftide/src/transformers/chunk_markdown.rs +++ b/swiftide/src/transformers/chunk_markdown.rs @@ -53,13 +53,14 @@ impl ChunkMarkdown { } /// Set the number of concurrent chunks to process. 
+ #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = Some(concurrency); self } fn min_size(&self) -> usize { - self.range.as_ref().map(|r| r.start).unwrap_or(0) + self.range.as_ref().map_or(0, |r| r.start) } } @@ -98,7 +99,7 @@ mod test { use super::*; use futures_util::stream::TryStreamExt; - const MARKDOWN: &str = r#" + const MARKDOWN: &str = r" # Hello, world! This is a test markdown document. @@ -110,7 +111,7 @@ mod test { ## Section 2 This is another paragraph. - "#; + "; #[tokio::test] async fn test_transforming_with_max_characters_and_trimming() { diff --git a/swiftide/src/transformers/embed.rs b/swiftide/src/transformers/embed.rs index 34ac06e6..16cfa232 100644 --- a/swiftide/src/transformers/embed.rs +++ b/swiftide/src/transformers/embed.rs @@ -41,6 +41,7 @@ impl Embed { } } + #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = Some(concurrency); self @@ -65,7 +66,7 @@ impl BatchableTransformer for Embed { #[tracing::instrument(skip_all, name = "transformers.embed")] async fn batch_transform(&self, nodes: Vec) -> IndexingStream { // TODO: We should drop chunks that go over the token limit of the EmbedModel - let chunks_to_embed: Vec = nodes.iter().map(|n| n.as_embeddable()).collect(); + let chunks_to_embed: Vec = nodes.iter().map(Node::as_embeddable).collect(); self.embed_model .embed(chunks_to_embed) diff --git a/swiftide/src/transformers/metadata_keywords.rs b/swiftide/src/transformers/metadata_keywords.rs index 5bcb0f11..7cc45cb1 100644 --- a/swiftide/src/transformers/metadata_keywords.rs +++ b/swiftide/src/transformers/metadata_keywords.rs @@ -9,8 +9,8 @@ use indoc::indoc; /// This module defines the `MetadataKeywords` struct and its associated methods, /// which are used for generating metadata in the form of keywords -/// for a given text. It interacts with a client (e.g., OpenAI) to generate -/// the keywords based on the text chunk in an `Node`. 
+/// for a given text. It interacts with a client (e.g., `OpenAI`) to generate +/// the keywords based on the text chunk in a `Node`. /// `MetadataKeywords` is responsible for generating keywords /// for a given text chunk. It uses a templated prompt to interact with a client @@ -51,6 +51,7 @@ impl MetadataKeywords { } } + #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = Some(concurrency); self @@ -63,7 +64,7 @@ impl MetadataKeywords { /// /// A string containing the default prompt template. fn default_prompt() -> String { - indoc! {r#" + indoc! {r" # Task Your task is to generate a descriptive, concise keywords for the given text @@ -86,7 +87,7 @@ fn default_prompt() -> String { {text} ``` - "#} + "} .to_string() } diff --git a/swiftide/src/transformers/metadata_qa_code.rs b/swiftide/src/transformers/metadata_qa_code.rs index c912769e..642cae1f 100644 --- a/swiftide/src/transformers/metadata_qa_code.rs +++ b/swiftide/src/transformers/metadata_qa_code.rs @@ -49,6 +49,7 @@ impl MetadataQACode { } } + #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = Some(concurrency); self @@ -63,7 +64,7 @@ impl MetadataQACode { /// /// A string representing the default prompt template. fn default_prompt() -> String { - indoc! {r#" + indoc! {r" # Task Your task is to generate questions and answers for the given code. @@ -95,7 +96,7 @@ fn default_prompt() -> String { {code} ``` - "#} + "} .to_string() } diff --git a/swiftide/src/transformers/metadata_qa_text.rs b/swiftide/src/transformers/metadata_qa_text.rs index e5cfa36d..6e8c9d46 100644 --- a/swiftide/src/transformers/metadata_qa_text.rs +++ b/swiftide/src/transformers/metadata_qa_text.rs @@ -9,7 +9,7 @@ use indoc::indoc; /// This module defines the `MetadataQAText` struct and its associated methods, /// which are used for generating metadata in the form of questions and answers -/// from a given text. 
It interacts with a client (e.g., OpenAI) to generate +/// from a given text. It interacts with a client (e.g., `OpenAI`) to generate /// these questions and answers based on the text chunk in an `Node`. /// `MetadataQAText` is responsible for generating questions and answers @@ -54,6 +54,7 @@ impl MetadataQAText { } } + #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = Some(concurrency); self @@ -66,7 +67,7 @@ impl MetadataQAText { /// /// A string containing the default prompt template. fn default_prompt() -> String { - indoc! {r#" + indoc! {r" # Task Your task is to generate questions and answers for the given text. @@ -96,7 +97,7 @@ fn default_prompt() -> String { {text} ``` - "#} + "} .to_string() } diff --git a/swiftide/src/transformers/metadata_summary.rs b/swiftide/src/transformers/metadata_summary.rs index 858bec6f..dfca8920 100644 --- a/swiftide/src/transformers/metadata_summary.rs +++ b/swiftide/src/transformers/metadata_summary.rs @@ -9,7 +9,7 @@ use indoc::indoc; /// This module defines the `MetadataSummary` struct and its associated methods, /// which are used for generating metadata in the form of a summary -/// for a given text. It interacts with a client (e.g., OpenAI) to generate +/// for a given text. It interacts with a client (e.g., `OpenAI`) to generate /// the summary based on the text chunk in an `Node`. /// `MetadataSummary` is responsible for generating a summary @@ -51,6 +51,7 @@ impl MetadataSummary { } } + #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = Some(concurrency); self @@ -63,7 +64,7 @@ impl MetadataSummary { /// /// A string containing the default prompt template. fn default_prompt() -> String { - indoc! {r#" + indoc! 
{r" # Task Your task is to generate a descriptive, concise summary for the given text @@ -85,7 +86,7 @@ fn default_prompt() -> String { {text} ``` - "#} + "} .to_string() } diff --git a/swiftide/src/transformers/metadata_title.rs b/swiftide/src/transformers/metadata_title.rs index 6dd16b31..d94a2849 100644 --- a/swiftide/src/transformers/metadata_title.rs +++ b/swiftide/src/transformers/metadata_title.rs @@ -9,7 +9,7 @@ use indoc::indoc; /// This module defines the `MetadataTitle` struct and its associated methods, /// which are used for generating metadata in the form of a title -/// for a given text. It interacts with a client (e.g., OpenAI) to generate +/// for a given text. It interacts with a client (e.g., `OpenAI`) to generate /// these questions and answers based on the text chunk in an `Node`. /// `MetadataTitle` is responsible for generating a title @@ -51,6 +51,7 @@ impl MetadataTitle { } } + #[must_use] pub fn with_concurrency(mut self, concurrency: usize) -> Self { self.concurrency = Some(concurrency); self @@ -63,7 +64,7 @@ impl MetadataTitle { /// /// A string containing the default prompt template. fn default_prompt() -> String { - indoc! {r#" + indoc! 
{r" # Task Your task is to generate a descriptive, concise title for the given text @@ -84,7 +85,7 @@ fn default_prompt() -> String { {text} ``` - "#} + "} .to_string() } diff --git a/swiftide/tests/indexing_pipeline.rs b/swiftide/tests/indexing_pipeline.rs index 30962fae..972e7610 100644 --- a/swiftide/tests/indexing_pipeline.rs +++ b/swiftide/tests/indexing_pipeline.rs @@ -39,7 +39,7 @@ async fn test_indexing_pipeline() { .respond_with(ResponseTemplate::new(200).set_body_json(json!({ "id": "chatcmpl-123", "object": "chat.completion", - "created": 1677652288, + "created": 1_677_652_288, "model": "gpt-3.5-turbo-0125", "system_fingerprint": "fp_44709d6fcb", "choices": [{ @@ -130,7 +130,7 @@ async fn test_indexing_pipeline() { // Coverage CI runs in container, just accept the double qdrant and use the service instead let qdrant_url = std::env::var("QDRANT_URL").unwrap_or(qdrant_url); - println!("Qdrant URL: {}", qdrant_url); + println!("Qdrant URL: {qdrant_url}"); let result = Pipeline::from_loader(FileLoader::new(tempdir.path()).with_extensions(&["rs"])) .then_chunk(transformers::ChunkCode::try_for_language("rust").unwrap()) @@ -169,7 +169,7 @@ async fn test_indexing_pipeline() { }) .collect::>() .join("\n---\n"); - println!("{}", received_requests); + println!("{received_requests}"); }; result.expect("Indexing pipeline failed");