Skip to content

Commit

Permalink
update examples for literate docs (#1880)
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz authored Feb 17, 2023
1 parent 111f25a commit bf1449b
Show file tree
Hide file tree
Showing 10 changed files with 512 additions and 104 deletions.
352 changes: 271 additions & 81 deletions examples/aggregation.rs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/custom_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ fn main() -> tantivy::Result<()> {
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);

// here we want to get a hit on the 'ken' in Frankenstein
// here we want to search for `broom` and use `StatsCollector` on the hits.
let query = query_parser.parse_query("broom")?;
if let Some(stats) =
searcher.search(&query, &StatsCollector::with_field("price".to_string()))?
Expand Down
4 changes: 2 additions & 2 deletions examples/custom_tokenizer.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// # Defining a tokenizer pipeline
//
// In this example, we'll see how to define a tokenizer pipeline
// by aligning a bunch of `TokenFilter`.
// In this example, we'll see how to define a tokenizer
// by creating a custom `NgramTokenizer`.
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
Expand Down
8 changes: 6 additions & 2 deletions examples/date_time_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ fn main() -> tantivy::Result<()> {
.set_stored()
.set_fast()
.set_precision(tantivy::DatePrecision::Seconds);
// Add the `occurred_at` date field to the schema
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);
let schema = schema_builder.build();
Expand All @@ -22,6 +23,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());

let mut index_writer = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T12:53:50.53Z",
Expand All @@ -41,14 +43,16 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?;
let searcher = reader.searcher();

// # Default fields: event_type
// # Search
let query_parser = QueryParser::for_index(&index, vec![event_type]);
{
let query = query_parser.parse_query("event:comment")?;
// Simple exact search on the date
let query = query_parser.parse_query("occurred_at:\"2022-06-22T12:53:50.53Z\"")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
assert_eq!(count_docs.len(), 1);
}
{
// Range query on the date field
let query = query_parser
.parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
Expand Down
11 changes: 11 additions & 0 deletions examples/faceted_search_with_tweaked_score.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
// # Faceted Search With Tweak Score
//
// This example covers the faceted search functionalities of
// tantivy.
//
// We will:
// - define a text field "name" in our schema
// - define a facet field "ingredient" in our schema

use std::collections::HashSet;

use tantivy::collector::TopDocs;
Expand Down Expand Up @@ -55,6 +64,7 @@ fn main() -> tantivy::Result<()> {
.collect(),
);
let top_docs_by_custom_score =
// Call TopDocs with a custom tweak score
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
let facet_dict = ingredient_reader.facet_dict();
Expand All @@ -65,6 +75,7 @@ fn main() -> tantivy::Result<()> {
.collect();

move |doc: DocId, original_score: Score| {
// Update the original score with a tweaked score
let missing_ingredients = ingredient_reader
.facet_ords(doc)
.filter(|ord| !query_ords.contains(ord))
Expand Down
170 changes: 170 additions & 0 deletions examples/fuzzy_search.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// # Fuzzy Search Example
//
// This example covers the fuzzy search functionalities of
// tantivy.
//
// We will :
// - define our schema
// - create an index in a directory
// - index a few documents into our index
// - search for documents approximately matching a term with `FuzzyTermQuery`
// - retrieve the matching documents' original content.

use std::collections::HashSet;

// ---
// Importing tantivy...
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{FuzzyTermQuery, QueryParser};
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, ReloadPolicy, Score, SegmentReader};
use tempfile::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example
    let index_path = TempDir::new()?;

    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // First we need to define a schema ...
    let mut schema_builder = Schema::builder();

    // Our first field is title.
    // We want full-text search for it, and we also want
    // to be able to retrieve the document after the search.
    //
    // `TEXT | STORED` is some syntactic sugar to describe
    // that.
    //
    // `TEXT` means the field should be tokenized and indexed,
    // along with its term frequency and term positions.
    //
    // `STORED` means that the field will also be saved
    // in a compressed, row-oriented key-value store.
    // This store is useful for reconstructing the
    // documents that were selected during the search phase.
    let title = schema_builder.add_text_field("title", TEXT | STORED);

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's create a brand new index.
    //
    // This will actually just save a meta.json
    // with our schema in the directory.
    let index = Index::create_in_dir(&index_path, schema.clone())?;

    // To insert a document we will need an index writer.
    // There must be only one writer at a time.
    // This single `IndexWriter` is already
    // multithreaded.
    //
    // Here we give tantivy a budget of `50MB`.
    // Using a bigger memory_arena for the indexer may increase
    // throughput, but 50 MB is already plenty.
    let mut index_writer = index.writer(50_000_000)?;

    // Let's index our documents!
    // We first need a handle on the title field.

    // ### Adding documents
    //
    index_writer.add_document(doc!(
        title => "The Name of the Wind",
    ))?;
    index_writer.add_document(doc!(
        title => "The Diary of Muadib",
    ))?;
    index_writer.add_document(doc!(
        title => "A Dairy Cow",
    ))?;
    index_writer.add_document(doc!(
        title => "The Diary of a Young Girl",
    ))?;

    // ### Committing
    //
    // At this point our documents are not searchable.
    //
    //
    // We need to call `.commit()` explicitly to force the
    // `index_writer` to finish processing the documents in the queue,
    // flush the current index to the disk, and advertise
    // the existence of new documents.
    //
    // This call is blocking.
    index_writer.commit()?;

    // If `.commit()` returns correctly, then all of the
    // documents that have been added are guaranteed to be
    // persistently indexed.
    //
    // In the scenario of a crash or a power failure,
    // tantivy behaves as if it has rolled back to its last
    // commit.

    // # Searching
    //
    // ### Searcher
    //
    // A reader is required first in order to search an index.
    // It acts as a `Searcher` pool that reloads itself,
    // depending on a `ReloadPolicy`.
    //
    // For a search server you will typically create one reader for the entire lifetime of your
    // program, and acquire a new searcher for every single request.
    //
    // In the code below, we rely on the 'ON_COMMIT' policy: the reader
    // will reload the index automatically after each commit.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    // We now need to acquire a searcher.
    //
    // A searcher points to a snapshotted, immutable version of the index.
    //
    // Some search experience might require more than
    // one query. Using the same searcher ensures that all of these queries will run on the
    // same version of the index.
    //
    // Acquiring a `searcher` is very cheap.
    //
    // You should acquire a searcher every time you start processing a request
    // and release it right after your query is finished.
    let searcher = reader.searcher();

    // ### FuzzyTermQuery
    //
    // A `FuzzyTermQuery` matches documents containing a term within a
    // given edit distance of the provided term. Here we search for
    // `Diary` with a maximum distance of 2; the third argument makes a
    // transposition count as a single edit, so `Dairy` matches as well.
    {
        let term = Term::from_field_text(title, "Diary");
        let query = FuzzyTermQuery::new(term, 2, true);

        let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(5), Count))?;
        assert_eq!(count, 3);
        assert_eq!(top_docs.len(), 3);
        for (score, doc_address) in top_docs {
            let retrieved_doc = searcher.doc(doc_address)?;
            // Note that the score is not lower for the fuzzy hit.
            // There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
            println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
            // score 1.0 doc {"title":["The Diary of Muadib"]}
            //
            // score 1.0 doc {"title":["The Diary of a Young Girl"]}
            //
            // score 1.0 doc {"title":["A Dairy Cow"]}
        }
    }

    Ok(())
}
File renamed without changes.
File renamed without changes.
60 changes: 47 additions & 13 deletions examples/ip_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ use tantivy::Index;

fn main() -> tantivy::Result<()> {
// # Defining the schema
// We set the IP field as `INDEXED`, so it can be searched
// `FAST` will create a fast field. The fast field will be used to execute search queries.
// `FAST` is not a requirement for range queries, it can also be executed on the inverted index
// which is created by `INDEXED`.
let mut schema_builder = Schema::builder();
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
let ip = schema_builder.add_ip_addr_field("ip", STORED | INDEXED | FAST);
Expand All @@ -19,51 +23,81 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());

let mut index_writer = index.writer(50_000_000)?;

// ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6.
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.33",
"event_type": "login"
}"#,
"ip": "192.168.0.33",
"event_type": "login"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.80",
"event_type": "checkout"
}"#,
"ip": "192.168.0.80",
"event_type": "checkout"
}"#,
)?;
index_writer.add_document(doc)?;
// ### IPv6
// Adding a document that contains an IPv6 address.
let doc = schema.parse_document(
r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"
}"#,
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"
}"#,
)?;

index_writer.add_document(doc)?;
// Commit will create a segment containing our documents.
index_writer.commit()?;

let reader = index.reader()?;
let searcher = reader.searcher();

// # Search
// Range queries on IPv4. Since we created a fast field, the fast field will be used to execute
// the search.
// ### Range Queries
let query_parser = QueryParser::for_index(&index, vec![event_type, ip]);
{
let query = query_parser.parse_query("ip:[192.168.0.0 TO 192.168.0.100]")?;
// Inclusive range queries
let query = query_parser.parse_query("ip:[192.168.0.80 TO 192.168.0.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
assert_eq!(count_docs.len(), 2);
assert_eq!(count_docs.len(), 1);
}
{
let query = query_parser.parse_query("ip:[192.168.1.0 TO 192.168.1.100]")?;
// Range query with an exclusive lower bound (`{` excludes 192.168.0.80)
let query = query_parser.parse_query("ip:{192.168.0.80 TO 192.168.1.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 0);
}
{
// Find docs with IP addresses smaller than or equal to 192.168.1.100
let query = query_parser.parse_query("ip:[* TO 192.168.1.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2);
}
{
// Find docs with IP addresses smaller than 192.168.1.100
let query = query_parser.parse_query("ip:[* TO 192.168.1.100}")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2);
}

// ### Exact Queries
// Exact search on IPv4.
{
let query = query_parser.parse_query("ip:192.168.0.80")?;
let count_docs = searcher.search(&*query, &Count)?;
assert_eq!(count_docs, 1);
}
// Exact search on IPv6.
// IpV6 addresses need to be quoted because they contain `:`
{
// IpV6 needs to be quoted because it contains `:`
let query = query_parser.parse_query("ip:\"2001:0db8:85a3:0000:0000:8a2e:0370:7334\"")?;
let count_docs = searcher.search(&*query, &Count)?;
assert_eq!(count_docs, 1);
Expand Down
9 changes: 4 additions & 5 deletions examples/warmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ use tantivy::{

type ProductId = u64;

/// Price
type Price = u32;

pub trait PriceFetcher: Send + Sync + 'static {
Expand Down Expand Up @@ -90,10 +89,10 @@ impl Warmer for DynamicPriceColumn {
}
}

/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
/// This map represents a map (ProductId -> Price)
///
/// In practise, it could be fetching things from an external service, like a SQL table.
// For the sake of this example, the table is just an editable HashMap behind a RwLock.
// This map represents a map (ProductId -> Price)
//
// In practice, it could be fetching things from an external service, like a SQL table.
#[derive(Default, Clone)]
pub struct ExternalPriceTable {
prices: Arc<RwLock<HashMap<ProductId, Price>>>,
Expand Down

0 comments on commit bf1449b

Please sign in to comment.