From 09c8a3eabd2f24a7cff21e186ba2e484c918cf7b Mon Sep 17 00:00:00 2001
From: aoiasd <45024769+aoiasd@users.noreply.github.com>
Date: Tue, 26 Nov 2024 16:56:29 +0800
Subject: [PATCH] enhance: add examples for text match (#2383)

Signed-off-by: aoiasd
---
 examples/orm/hello_text_match.py | 169 +++++++++++++++++++++++++++++++
 examples/text_match.py           | 149 +++++++++++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100644 examples/orm/hello_text_match.py
 create mode 100644 examples/text_match.py

diff --git a/examples/orm/hello_text_match.py b/examples/orm/hello_text_match.py
new file mode 100644
index 000000000..b716cdf96
--- /dev/null
+++ b/examples/orm/hello_text_match.py
@@ -0,0 +1,169 @@
+# hello_text_match.py demonstrates how to insert raw data only into Milvus and perform
+# document retrieval based on specific terms by text match expression.
+# 1. connect to Milvus
+# 2. create collection
+# 3. insert data
+# 4. create index
+# 5. query and filtered search with text match
+# 6. delete entities by text match
+# 7. drop collection
+import time
+import numpy as np
+
+from pymilvus import (
+    connections,
+    utility,
+    FieldSchema, CollectionSchema, Function, DataType, FunctionType,
+    Collection,
+)
+
+fmt = "\n=== {:30} ===\n"
+search_latency_fmt = "search latency = {:.4f}s"
+dim = 8
+
+#################################################################################
+# 1. connect to Milvus
+# Add a new connection alias `default` for Milvus server in `localhost:19530`
+print(fmt.format("start connecting to Milvus"))
+connections.connect("default", host="localhost", port="19530")
+
+has = utility.has_collection("hello_text_match")
+print(f"Does collection hello_text_match exist in Milvus: {has}")
+
+#################################################################################
+# 2. create collection
+# We're going to create a collection with 3 explicit fields.
+# +-+------------+------------+----------------------+------------------------------+
+# | | field name | field type | other attributes     |      field description       |
+# +-+------------+------------+----------------------+------------------------------+
+# |1|    "id"    |   INT64    | is_primary=True      |       "primary field"        |
+# | |            |            | auto_id=True         |                              |
+# +-+------------+------------+----------------------+------------------------------+
+# |2| "document" |  VarChar   | enable_analyzer=True |     "raw text document"      |
+# | |            |            | enable_match=True    |                              |
+# +-+------------+------------+----------------------+------------------------------+
+# |3|"embeddings"| FloatVector| dim=8                |  "float vector with dim 8"   |
+# +-+------------+------------+----------------------+------------------------------+
+fields = [
+    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+    # set analyzer params in document field for more situations
+    # default as analyzer_params = {"type": "standard"}
+    FieldSchema(name="document", dtype=DataType.VARCHAR, max_length=1000, enable_analyzer=True, enable_match=True),
+    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
+]
+
+
+schema = CollectionSchema(fields, "hello_text_match demo")
+
+print(fmt.format("Create collection `hello_text_match`"))
+hello_text_match = Collection("hello_text_match", schema, consistency_level="Strong")
+
+################################################################################
+# 3. insert data
+# We are going to insert 6 rows of data into `hello_text_match`
+# Data to be inserted must be organized in fields.
+#
+# The insert() method returns:
+# - either automatically generated primary keys by Milvus if auto_id=True in the schema;
+# - or the existing primary key field from the entities if auto_id=False in the schema.
+
+print(fmt.format("Start inserting entities"))
+
+rng = np.random.default_rng(seed=19530)
+num_entities = 6
+keywords = ["milvus", "match", "search", "query", "analyzer", "tokenizer"]
+
+entities = [
+    [f"This is a test document {i + hello_text_match.num_entities} with keywords: {keywords[i]}" for i in range(num_entities)],
+    rng.random((num_entities, dim), np.float32)
+]
+
+insert_result = hello_text_match.insert(entities)
+ids = insert_result.primary_keys
+
+hello_text_match.flush()
+print(f"Number of entities in Milvus: {hello_text_match.num_entities}")  # check the num_entities
+
+################################################################################
+# 4. create index
+# We are going to create a vector index for hello_text_match collection
+print(fmt.format("Start Creating index AUTOINDEX"))
+index = {
+    "index_type": "AUTOINDEX",
+    "metric_type": "IP",
+}
+
+hello_text_match.create_index("embeddings", index)
+################################################################################
+# 5. query and scalar filtering search with text match
+# After data were inserted into Milvus and indexed, you can perform:
+# - query with text match expression
+# - search data with text match filter
+
+# Before conducting a search or a query, you need to load the data in `hello_text_match` into memory.
+print(fmt.format("Start loading"))
+hello_text_match.load()
+
+# -----------------------------------------------------------------------------
+# query based text match with single keyword
+expr = f"TEXT_MATCH(document, '{keywords[0]}')"
+print(fmt.format(f"Start querying with `{expr}`"))
+
+start_time = time.time()
+result = hello_text_match.query(expr=expr, output_fields=["document"])
+end_time = time.time()
+
+print(f"query result:\n-{result[0]}")
+print(search_latency_fmt.format(end_time - start_time))
+
+# query based text match with multiple keywords
+expr = f"TEXT_MATCH(document, '{keywords[0]} {keywords[1]} {keywords[2]}')"
+print(fmt.format(f"Start querying with `{expr}`"))
+
+start_time = time.time()
+result = hello_text_match.query(expr=expr, output_fields=["document"])
+end_time = time.time()
+
+print(f"query result:\n-{result[0]}")
+print(search_latency_fmt.format(end_time - start_time))
+
+# -----------------------------------------------------------------------------
+# scalar filtering search with text match
+search_params = {
+    "metric_type": "IP",
+    "params": {},
+}
+expr = f"TEXT_MATCH(document, '{keywords[0]} {keywords[1]} {keywords[2]}')"
+print(fmt.format(f"Start filtered searching with `{expr}`"))
+
+start_time = time.time()
+vector_to_search = rng.random((1, dim), np.float32)
+result = hello_text_match.search(vector_to_search, "embeddings", search_params, limit=3, expr=expr, output_fields=["document"])
+end_time = time.time()
+
+for hits in result:
+    for hit in hits:
+        print(f"\thit: {hit}, document field: {hit.entity.get('document')}")
+print(search_latency_fmt.format(end_time - start_time))
+
+###############################################################################
+# 6. delete entities by text match
+# You can also delete entities with a TEXT_MATCH boolean expression.
+
+expr = f"TEXT_MATCH(document, '{keywords[4]}')"
+print(fmt.format(f"Start deleting with expr `{expr}`"))
+
+result = hello_text_match.query(expr=expr, output_fields=["document"])
+print(f"query before delete by expr=`{expr}` -> result: \n- {result[0]}\n")
+
+hello_text_match.delete(expr)
+
+result = hello_text_match.query(expr=expr, output_fields=["document"])
+print(f"query after delete by expr=`{expr}` -> result: {result}\n")
+
+
+###############################################################################
+# 7. drop collection
+# Finally, drop the hello_text_match collection
+print(fmt.format("Drop collection `hello_text_match`"))
+utility.drop_collection("hello_text_match")
diff --git a/examples/text_match.py b/examples/text_match.py
new file mode 100644
index 000000000..c3819a06d
--- /dev/null
+++ b/examples/text_match.py
@@ -0,0 +1,149 @@
+# text_match.py demonstrates how to insert raw data only into Milvus and perform
+# document retrieval based on specific terms by text match expression.
+# 1. connect to Milvus
+# 2. create collection
+# 3. insert data
+# 4. query and filtered search with text match
+# 5. delete entities by text match
+# 6. drop collection
+import time
+import numpy as np
+
+from pymilvus import (
+    MilvusClient,
+    Function,
+    FunctionType,
+    DataType,
+)
+
+fmt = "\n=== {:30} ===\n"
+collection_name = "text_match"
+dim = 8
+
+#################################################################################
+# 1. connect to Milvus
+# Create a MilvusClient for the Milvus server at `localhost:19530`
+print(fmt.format("start connecting to Milvus"))
+milvus_client = MilvusClient("http://localhost:19530")
+
+has_collection = milvus_client.has_collection(collection_name, timeout=5)
+print(f"Does collection {collection_name} exist in Milvus: {has_collection}")
+if has_collection:
+    milvus_client.drop_collection(collection_name)
+
+#################################################################################
+# 2. create collection
+# We're going to create a collection with 3 explicit fields.
+# +-+------------+------------+----------------------+------------------------------+
+# | | field name | field type | other attributes     |      field description       |
+# +-+------------+------------+----------------------+------------------------------+
+# |1|    "id"    |   INT64    | is_primary=True      |       "primary field"        |
+# | |            |            | auto_id=False        |                              |
+# +-+------------+------------+----------------------+------------------------------+
+# |2| "document" |  VarChar   | enable_analyzer=True |     "raw text document"      |
+# | |            |            | enable_match=True    |                              |
+# +-+------------+------------+----------------------+------------------------------+
+# |3|"embeddings"| FloatVector| dim=8                |  "float vector with dim 8"   |
+# +-+------------+------------+----------------------+------------------------------+
+
+schema = milvus_client.create_schema()
+schema.add_field("id", DataType.INT64, is_primary=True, auto_id=False)
+# set analyzer params in document field for more situations
+# default as analyzer_params = {"type": "standard"}
+schema.add_field("document", DataType.VARCHAR, max_length=1000, enable_analyzer=True, enable_match=True)
+schema.add_field("embeddings", DataType.FLOAT_VECTOR, dim=dim)
+
+print(fmt.format(f"Create collection `{collection_name}`"))
+
+index_params = milvus_client.prepare_index_params()
+index_params.add_index(
+    "embeddings",
+    index_type="AUTOINDEX",
+    metric_type="IP"
+)
+
+milvus_client.create_collection(collection_name, schema=schema, index_params=index_params, consistency_level="Strong")
+
+################################################################################
+# 3. insert data
+# We are going to insert 6 rows of data into `text_match`
+# Data to be inserted is organized as a list of row dicts, one key per field.
+#
+# The insert() method returns a result whose `insert_count` entry reports how many
+# rows were written. Because auto_id=False in the schema, the primary keys are the
+# `id` values supplied in the entities.
+
+print(fmt.format("Start inserting entities"))
+
+rng = np.random.default_rng(seed=19530)
+num_entities = 6
+keywords = ["milvus", "match", "search", "query", "analyzer", "tokenizer"]
+embeddings = rng.random((num_entities, dim), np.float32)
+
+entities = [{
+        "id": i,
+        "document": f"This is a test document {i} with keywords: {keywords[i]}",
+        "embeddings": embeddings[i]
+    } for i in range(num_entities)
+]
+
+insert_result = milvus_client.insert(collection_name, entities)
+print(f"Number of insert entities in Milvus: {insert_result['insert_count']}")  # check the num_entities
+milvus_client.flush(collection_name)
+
+# ###############################################################################
+# 4. query and scalar filtering search with text match
+# After data were inserted into Milvus and indexed, you can perform:
+# - query with text match expression
+# - search data with text match filter
+
+# -----------------------------------------------------------------------------
+# query based text match with single keyword filter
+filter_expr = f"TEXT_MATCH(document, '{keywords[0]}')"
+print(fmt.format(f"Start querying with `{filter_expr}`"))
+
+result = milvus_client.query(collection_name, filter=filter_expr, output_fields=["document"])
+print(f"query result:\n-{result}")
+
+# query based text match with multiple keywords
+filter_expr = f"TEXT_MATCH(document, '{keywords[0]} {keywords[1]} {keywords[2]}')"
+print(fmt.format(f"Start querying with `{filter_expr}`"))
+
+result = milvus_client.query(collection_name, filter=filter_expr, output_fields=["document"])
+print(f"query result:\n-{result}")
+
+# -----------------------------------------------------------------------------
+# scalar filtering search with text match
+search_params = {
+    "metric_type": "IP",
+    "params": {},
+}
+filter_expr = f"TEXT_MATCH(document, '{keywords[0]} {keywords[1]} {keywords[2]}')"
+print(fmt.format(f"Start filtered searching with `{filter_expr}`"))
+
+vector_to_search = rng.random((1, dim), np.float32)
+result = milvus_client.search(collection_name, vector_to_search, filter=filter_expr, anns_field="embeddings", search_params=search_params, limit=3, output_fields=["document"])
+
+print(result)
+
+###############################################################################
+# 5. delete entities by text match filter
+# You can also delete entities with a TEXT_MATCH filter expression.
+
+filter_expr = f"TEXT_MATCH(document, '{keywords[4]}')"
+print(fmt.format(f"Start deleting with expr `{filter_expr}`"))
+
+result = milvus_client.query(collection_name, filter=filter_expr, output_fields=["document"])
+print(f"query before delete by expr=`{filter_expr}` -> result: \n- {result}\n")
+
+milvus_client.delete(collection_name, filter=filter_expr)
+
+result = milvus_client.query(collection_name, filter=filter_expr, output_fields=["document"])
+print(f"query after delete by expr=`{filter_expr}` -> result: {result}\n")
+
+
+###############################################################################
+# 6. drop collection
+# Finally, drop the `text_match` collection
+print(fmt.format(f"Drop collection `{collection_name}`"))
+milvus_client.drop_collection(collection_name)