Skip to content

Commit

Permalink
update examples for literate docs (#1880)
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz authored Feb 17, 2023
1 parent 111f25a commit bf1449b
Show file tree
Hide file tree
Showing 10 changed files with 512 additions and 104 deletions.
352 changes: 271 additions & 81 deletions examples/aggregation.rs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/custom_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ fn main() -> tantivy::Result<()> {
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);

// here we want to get a hit on the 'ken' in Frankenstein
// here we want to search for `broom` and use `StatsCollector` on the hits.
let query = query_parser.parse_query("broom")?;
if let Some(stats) =
searcher.search(&query, &StatsCollector::with_field("price".to_string()))?
Expand Down
4 changes: 2 additions & 2 deletions examples/custom_tokenizer.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// # Defining a tokenizer pipeline
//
// In this example, we'll see how to define a tokenizer pipeline
// by aligning a bunch of `TokenFilter`.
// In this example, we'll see how to define a tokenizer
// by creating a custom `NgramTokenizer`.
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
Expand Down
8 changes: 6 additions & 2 deletions examples/date_time_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ fn main() -> tantivy::Result<()> {
.set_stored()
.set_fast()
.set_precision(tantivy::DatePrecision::Seconds);
// Add the `occurred_at` date field to the schema
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);
let schema = schema_builder.build();
Expand All @@ -22,6 +23,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());

let mut index_writer = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T12:53:50.53Z",
Expand All @@ -41,14 +43,16 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?;
let searcher = reader.searcher();

// # Default fields: event_type
// # Search
let query_parser = QueryParser::for_index(&index, vec![event_type]);
{
let query = query_parser.parse_query("event:comment")?;
// Simple exact search on the date
let query = query_parser.parse_query("occurred_at:\"2022-06-22T12:53:50.53Z\"")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
assert_eq!(count_docs.len(), 1);
}
{
// Range query on the date field
let query = query_parser
.parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
Expand Down
11 changes: 11 additions & 0 deletions examples/faceted_search_with_tweaked_score.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
// # Faceted Search With Tweak Score
//
// This example covers the faceted search functionalities of
// tantivy.
//
// We will:
// - define a text field "name" in our schema
// - define a facet field "ingredient" in our schema

use std::collections::HashSet;

use tantivy::collector::TopDocs;
Expand Down Expand Up @@ -55,6 +64,7 @@ fn main() -> tantivy::Result<()> {
.collect(),
);
let top_docs_by_custom_score =
// Call TopDocs with a custom tweak score
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
let facet_dict = ingredient_reader.facet_dict();
Expand All @@ -65,6 +75,7 @@ fn main() -> tantivy::Result<()> {
.collect();

move |doc: DocId, original_score: Score| {
// Update the original score with a tweaked score
let missing_ingredients = ingredient_reader
.facet_ords(doc)
.filter(|ord| !query_ords.contains(ord))
Expand Down
170 changes: 170 additions & 0 deletions examples/fuzzy_search.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// # Fuzzy Search Example
//
// This example covers the fuzzy search functionalities of
// tantivy.
//
// We will :
// - define our schema
// - create an index in a directory
// - index a few documents into our index
// - search for documents approximately matching a term with `FuzzyTermQuery`
// - retrieve the matching documents' original content.

use std::collections::HashSet;

// ---
// Importing tantivy...
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{FuzzyTermQuery, QueryParser};
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, ReloadPolicy, Score, SegmentReader};
use tempfile::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example
    let index_path = TempDir::new()?;

    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // First we need to define a schema ...
    let mut schema_builder = Schema::builder();

    // Our first field is title.
    // We want full-text search for it, and we also want
    // to be able to retrieve the document after the search.
    //
    // `TEXT | STORED` is some syntactic sugar to describe
    // that.
    //
    // `TEXT` means the field should be tokenized and indexed,
    // along with its term frequency and term positions.
    //
    // `STORED` means that the field will also be saved
    // in a compressed, row-oriented key-value store.
    // This store is useful for reconstructing the
    // documents that were selected during the search phase.
    let title = schema_builder.add_text_field("title", TEXT | STORED);

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's create a brand new index.
    //
    // This will actually just save a meta.json
    // with our schema in the directory.
    let index = Index::create_in_dir(&index_path, schema.clone())?;

    // To insert a document we will need an index writer.
    // There must be only one writer at a time.
    // This single `IndexWriter` is already
    // multithreaded.
    //
    // Here we give tantivy a budget of `50MB`.
    // Using a bigger memory_arena for the indexer may increase
    // throughput, but 50 MB is already plenty.
    let mut index_writer = index.writer(50_000_000)?;

    // Let's index our documents!
    // We first need a handle on the title field.

    // ### Adding documents
    //
    index_writer.add_document(doc!(
        title => "The Name of the Wind",
    ))?;
    index_writer.add_document(doc!(
        title => "The Diary of Muadib",
    ))?;
    index_writer.add_document(doc!(
        title => "A Dairy Cow",
    ))?;
    index_writer.add_document(doc!(
        title => "The Diary of a Young Girl",
    ))?;

    // ### Committing
    //
    // At this point our documents are not searchable.
    //
    //
    // We need to call `.commit()` explicitly to force the
    // `index_writer` to finish processing the documents in the queue,
    // flush the current index to the disk, and advertise
    // the existence of new documents.
    //
    // This call is blocking.
    index_writer.commit()?;

    // If `.commit()` returns correctly, then all of the
    // documents that have been added are guaranteed to be
    // persistently indexed.
    //
    // In the scenario of a crash or a power failure,
    // tantivy behaves as if it has rolled back to its last
    // commit.

    // # Searching
    //
    // ### Searcher
    //
    // A reader is required first in order to search an index.
    // It acts as a `Searcher` pool that reloads itself,
    // depending on a `ReloadPolicy`.
    //
    // For a search server you will typically create one reader for the entire lifetime of your
    // program, and acquire a new searcher for every single request.
    //
    // In the code below, we rely on the 'ON_COMMIT' policy: the reader
    // will reload the index automatically after each commit.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    // We now need to acquire a searcher.
    //
    // A searcher points to a snapshotted, immutable version of the index.
    //
    // Some search experience might require more than
    // one query. Using the same searcher ensures that all of these queries will run on the
    // same version of the index.
    //
    // Acquiring a `searcher` is very cheap.
    //
    // You should acquire a searcher every time you start processing a request
    // and release it right after your query is finished.
    let searcher = reader.searcher();

    // ### FuzzyTermQuery
    //
    // A `FuzzyTermQuery` matches documents containing a term within a
    // given edit distance of the provided term. Here we search for
    // `Diary` with a maximum distance of 2; the third argument makes a
    // transposition count as a single edit, so `Dairy` matches as well.
    {
        let term = Term::from_field_text(title, "Diary");
        let query = FuzzyTermQuery::new(term, 2, true);

        let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(5), Count))?;
        assert_eq!(count, 3);
        assert_eq!(top_docs.len(), 3);
        for (score, doc_address) in top_docs {
            let retrieved_doc = searcher.doc(doc_address)?;
            // Note that the score is not lower for the fuzzy hit.
            // There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
            println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
            // score 1.0 doc {"title":["The Diary of Muadib"]}
            //
            // score 1.0 doc {"title":["The Diary of a Young Girl"]}
            //
            // score 1.0 doc {"title":["A Dairy Cow"]}
        }
    }

    Ok(())
}
File renamed without changes.
File renamed without changes.
60 changes: 47 additions & 13 deletions examples/ip_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ use tantivy::Index;

fn main() -> tantivy::Result<()> {
// # Defining the schema
// We set the IP field as `INDEXED`, so it can be searched
// `FAST` will create a fast field. The fast field will be used to execute search queries.
// `FAST` is not a requirement for range queries, it can also be executed on the inverted index
// which is created by `INDEXED`.
let mut schema_builder = Schema::builder();
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
let ip = schema_builder.add_ip_addr_field("ip", STORED | INDEXED | FAST);
Expand All @@ -19,51 +23,81 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());

let mut index_writer = index.writer(50_000_000)?;

// ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6.
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.33",
"event_type": "login"
}"#,
"ip": "192.168.0.33",
"event_type": "login"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.80",
"event_type": "checkout"
}"#,
"ip": "192.168.0.80",
"event_type": "checkout"
}"#,
)?;
index_writer.add_document(doc)?;
// ### IPv6
// Adding a document that contains an IPv6 address.
let doc = schema.parse_document(
r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"
}"#,
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"
}"#,
)?;

index_writer.add_document(doc)?;
// Commit will create a segment containing our documents.
index_writer.commit()?;

let reader = index.reader()?;
let searcher = reader.searcher();

// # Search
// Range queries on IPv4. Since we created a fast field, the fast field will be used to execute
// the search.
// ### Range Queries
let query_parser = QueryParser::for_index(&index, vec![event_type, ip]);
{
let query = query_parser.parse_query("ip:[192.168.0.0 TO 192.168.0.100]")?;
// Inclusive range queries
let query = query_parser.parse_query("ip:[192.168.0.80 TO 192.168.0.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
assert_eq!(count_docs.len(), 2);
assert_eq!(count_docs.len(), 1);
}
{
let query = query_parser.parse_query("ip:[192.168.1.0 TO 192.168.1.100]")?;
// Range query with an exclusive lower bound (`{` excludes 192.168.0.80)
let query = query_parser.parse_query("ip:{192.168.0.80 TO 192.168.1.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 0);
}
{
// Find docs with IP addresses smaller than or equal to 192.168.1.100
let query = query_parser.parse_query("ip:[* TO 192.168.1.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2);
}
{
// Find docs with IP addresses smaller than 192.168.1.100
let query = query_parser.parse_query("ip:[* TO 192.168.1.100}")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2);
}

// ### Exact Queries
// Exact search on IPv4.
{
let query = query_parser.parse_query("ip:192.168.0.80")?;
let count_docs = searcher.search(&*query, &Count)?;
assert_eq!(count_docs, 1);
}
// Exact search on IPv6.
// IpV6 addresses need to be quoted because they contain `:`
{
// IpV6 needs to be quoted because it contains `:`
let query = query_parser.parse_query("ip:\"2001:0db8:85a3:0000:0000:8a2e:0370:7334\"")?;
let count_docs = searcher.search(&*query, &Count)?;
assert_eq!(count_docs, 1);
Expand Down
9 changes: 4 additions & 5 deletions examples/warmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ use tantivy::{

type ProductId = u64;

/// Price
type Price = u32;

pub trait PriceFetcher: Send + Sync + 'static {
Expand Down Expand Up @@ -90,10 +89,10 @@ impl Warmer for DynamicPriceColumn {
}
}

/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
/// This map represents a map (ProductId -> Price)
///
/// In practise, it could be fetching things from an external service, like a SQL table.
// For the sake of this example, the table is just an editable HashMap behind a RwLock.
// This map represents a map (ProductId -> Price)
//
// In practice, it could be fetching things from an external service, like a SQL table.
#[derive(Default, Clone)]
pub struct ExternalPriceTable {
prices: Arc<RwLock<HashMap<ProductId, Price>>>,
Expand Down

0 comments on commit bf1449b

Please sign in to comment.