From aa734012526b2c2ffc149c97adfc224f365ea747 Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Tue, 29 Oct 2024 17:37:46 +0800 Subject: [PATCH 1/4] test: add testcases contain growing segments Signed-off-by: zhuwenxing --- .../testcases/test_full_text_search.py | 195 +++++++++++++++++- tests/python_client/testcases/test_query.py | 133 ++++++++++++ 2 files changed, 327 insertions(+), 1 deletion(-) diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index d42295de68f82..3cc6bafcc6426 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ b/tests/python_client/testcases/test_full_text_search.py @@ -2503,6 +2503,199 @@ def test_full_text_search_with_jieba_tokenizer( assert len( overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}" + + @pytest.mark.tags(CaseLabel.L0) + @pytest.mark.parametrize("nq", [2]) + @pytest.mark.parametrize("empty_percent", [0.5]) + @pytest.mark.parametrize("enable_partition_key", [True]) + @pytest.mark.parametrize("enable_inverted_index", [True]) + @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) + @pytest.mark.parametrize("expr", ["id_range"]) + @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("offset", [0]) + def test_full_text_search_for_growing_segment( + self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq + ): + """ + target: test full text search + method: 1. enable full text search and insert data with varchar + 2. search with text + 3. verify the result + expected: full text search successfully and result is correct + """ + tokenizer_params = { + "tokenizer": tokenizer, + } + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="word", + dtype=DataType.VARCHAR, + max_length=65535, + enable_tokenizer=True, + tokenizer_params=tokenizer_params, + is_partition_key=enable_partition_key, + ), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_tokenizer=True, + tokenizer_params=tokenizer_params, + ), + FieldSchema( + name="paragraph", + dtype=DataType.VARCHAR, + max_length=65535, + enable_tokenizer=True, + tokenizer_params=tokenizer_params, + ), + FieldSchema( + name="text", + dtype=DataType.VARCHAR, + max_length=65535, + enable_tokenizer=True, + enable_match=True, + tokenizer_params=tokenizer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR), + ] + schema = CollectionSchema(fields=fields, description="test collection") + bm25_function = Function( + name="text_bm25_emb", + function_type=FunctionType.BM25, + input_field_names=["text"], + output_field_names=["text_sparse_emb"], + params={}, + ) + schema.add_function(bm25_function) + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + if tokenizer == "jieba": + language = "zh" + fake = fake_zh + else: + language = "en" + + data = [ + { + "id": i, + "word": fake.word().lower() if random.random() >= empty_percent else "", + "sentence": fake.sentence().lower() if random.random() >= empty_percent else "", + "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "", + "text": fake.text().lower() if random.random() >= empty_percent else "", 
+ "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") + texts = df["text"].to_list() + word_freq = cf.analyze_documents(texts, language=language) + most_freq_word = word_freq.most_common(10) + tokens = [item[0] for item in most_freq_word] + if len(tokens) == 0: + log.info(f"empty tokens, add a dummy token") + tokens = ["dummy"] + collection_w.create_index( + "emb", + {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}}, + ) + collection_w.create_index( + "text_sparse_emb", + { + "index_type": index_type, + "metric_type": "BM25", + "params": { + "bm25_k1": 1.5, + "bm25_b": 0.75, + } + } + ) + if enable_inverted_index: + collection_w.create_index("text", {"index_type": "INVERTED"}) + collection_w.load() + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + limit = 100 + search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)] + if expr == "text_match": + filter = f"TextMatch(text, '{tokens[0]}')" + res, _ = collection_w.query( + expr=filter, + ) + elif expr == "id_range": + filter = f"id < {data_size // 2}" + else: + filter = "" + res, _ = collection_w.query( + expr=filter, + limit=limit, + ) + candidates_num = len(res) + log.info(f"search data: {search_data}") + # use offset = 0 to get all the results + full_res_list, _ = collection_w.search( + data=search_data, + anns_field="text_sparse_emb", + expr=filter, + param={}, + limit=limit + offset, + offset=0, + output_fields=["id", "text", "text_sparse_emb"]) + full_res_id_list = [] + for i in range(nq): + res = full_res_list[i] + tmp = [] + for r in res: + tmp.append(r.id) + full_res_id_list.append(tmp) + + res_list, _ = collection_w.search( + data=search_data, + anns_field="text_sparse_emb", + expr=filter, + param={}, + limit=limit, + offset=offset, + output_fields=["id", "text", "text_sparse_emb"]) + + # verify correctness + for i in range(nq): + assert 0 < len(res_list[i]) <= min(limit, candidates_num) + search_text = search_data[i] + log.info(f"res: {res_list[i]}") + res = res_list[i] + for j in range(len(res)): + r = res[j] + _id = r.id + # get the first id of the result in which position is larger than offset + if j == 0: + first_id = _id + p = full_res_id_list[i].index(first_id) + assert 1.2 * offset >= p >= offset * 0.8 + result_text = r.text + # verify search result satisfies the filter + if expr == "text_match": + assert tokens[0] in result_text + if expr == "id_range": + assert _id < data_size // 2 + # verify search result has overlap with search text + overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language) + log.info(f"overlap {overlap}") + assert len( + overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}" + @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("nq", [2]) @pytest.mark.parametrize("empty_percent", [0]) @@ -2669,7 +2862,7 @@ def test_full_text_search_with_range_search( assert low <= tmp_distance <= high @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("nq", [1]) + @pytest.mark.parametrize("nq", [2]) @pytest.mark.parametrize("empty_percent", [0]) @pytest.mark.parametrize("enable_partition_key", [True]) @pytest.mark.parametrize("enable_inverted_index", [True]) diff --git 
a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py index e6b3335a724aa..0b1e9ae518c63 100644 --- a/tests/python_client/testcases/test_query.py +++ b/tests/python_client/testcases/test_query.py @@ -4713,6 +4713,139 @@ def test_query_text_match_zh_normal( + + @pytest.mark.tags(CaseLabel.L0) + @pytest.mark.parametrize("enable_partition_key", [True]) + @pytest.mark.parametrize("enable_inverted_index", [True]) + @pytest.mark.parametrize("tokenizer", ["jieba", "standard"]) + @pytest.mark.xfail(reason="unstable case, issue: https://github.com/milvus-io/milvus/issues/36962") + def test_query_text_match_with_growing_segment( + self, tokenizer, enable_inverted_index, enable_partition_key + ): + """ + target: test text match on growing segments + method: 1. enable text match and insert data with varchar + 2. get the most common words and query with text match + 3. verify the result + expected: text match successfully and result is correct + """ + tokenizer_params = { + "tokenizer": tokenizer, + } + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="word", + dtype=DataType.VARCHAR, + max_length=65535, + enable_tokenizer=True, + enable_match=True, + is_partition_key=enable_partition_key, + tokenizer_params=tokenizer_params, + ), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_tokenizer=True, + enable_match=True, + tokenizer_params=tokenizer_params, + ), + FieldSchema( + name="paragraph", + dtype=DataType.VARCHAR, + max_length=65535, + enable_tokenizer=True, + enable_match=True, + tokenizer_params=tokenizer_params, + ), + FieldSchema( + name="text", + dtype=DataType.VARCHAR, + max_length=65535, + enable_tokenizer=True, + enable_match=True, + tokenizer_params=tokenizer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 3000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + if tokenizer == "jieba": + language = "zh" + fake = fake_zh + else: + language = "en" + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, + ) + if enable_inverted_index: + collection_w.create_index("word", {"index_type": "INVERTED"}) + collection_w.load() + # generate growing segment + data = [ + { + "id": i, + "word": fake.word().lower(), + "sentence": fake.sentence().lower(), + "paragraph": fake.paragraph().lower(), + "text": fake.text().lower(), + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + # analyze the corpus + text_fields = ["word", "sentence", "paragraph", "text"] + wf_map = {} + for field in text_fields: + wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) + # query single field for one token + for field in text_fields: + token = wf_map[field].most_common()[0][0] + expr = f"TextMatch({field}, '{token}')" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + assert len(res) > 0 + log.info(f"res len {len(res)}") + for r in res: + assert token in r[field] + # verify inverted index + if enable_inverted_index: + if field == 
"word": + expr = f"{field} == '{token}'" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(res)}") + for r in res: + assert r[field] == token + # query single field for multi-word + for field in text_fields: + # match top 10 most common words + top_10_tokens = [] + for word, count in wf_map[field].most_common(10): + top_10_tokens.append(word) + string_of_top_10_words = " ".join(top_10_tokens) + expr = f"TextMatch({field}, '{string_of_top_10_words}')" + log.info(f"expr {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(res)}") + for r in res: + assert any([token in r[field] for token in top_10_tokens]) + @pytest.mark.skip("unimplemented") @pytest.mark.tags(CaseLabel.L0) def test_query_text_match_custom_analyzer(self): From 7dbbe42afbec02c2684f49fd9fedebf8c042a7f8 Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Fri, 8 Nov 2024 14:30:09 +0800 Subject: [PATCH 2/4] test: fix tokenizer Signed-off-by: zhuwenxing --- tests/python_client/testcases/test_full_text_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index 3cc6bafcc6426..4f0b8ae5a36f9 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ b/tests/python_client/testcases/test_full_text_search.py @@ -2511,7 +2511,7 @@ def test_full_text_search_with_jieba_tokenizer( @pytest.mark.parametrize("enable_inverted_index", [True]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) @pytest.mark.parametrize("expr", ["id_range"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.parametrize("offset", [0]) def test_full_text_search_for_growing_segment( self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq From bdcccf23a1f0779f937f6108d6f4a87c516bdda7 Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Mon, 11 Nov 2024 10:21:52 +0800 Subject: [PATCH 3/4] test: fix search iter not support nq > 1 Signed-off-by: zhuwenxing --- tests/python_client/testcases/test_full_text_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index 4f0b8ae5a36f9..8c8f42b41a3e5 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ b/tests/python_client/testcases/test_full_text_search.py @@ -2862,7 +2862,7 @@ def test_full_text_search_with_range_search( assert low <= tmp_distance <= high @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("nq", [2]) + @pytest.mark.parametrize("nq", [1]) @pytest.mark.parametrize("empty_percent", [0]) @pytest.mark.parametrize("enable_partition_key", [True]) @pytest.mark.parametrize("enable_inverted_index", [True]) From e43ea82300cdc477eb4e85984b14e242143e7e1b Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Mon, 11 Nov 2024 14:05:39 +0800 Subject: [PATCH 4/4] test: update bm25 param key Signed-off-by: zhuwenxing --- .../testcases/test_full_text_search.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index 8c8f42b41a3e5..c6961f74bc2b6 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ 
b/tests/python_client/testcases/test_full_text_search.py @@ -2523,7 +2523,7 @@ def test_full_text_search_for_growing_segment( 3. verify the result expected: full text search successfully and result is correct """ - tokenizer_params = { + analyzer_params = { "tokenizer": tokenizer, } dim = 128 @@ -2533,31 +2533,31 @@ def test_full_text_search_for_growing_segment( name="word", dtype=DataType.VARCHAR, max_length=65535, - enable_tokenizer=True, - tokenizer_params=tokenizer_params, + enable_analyzer=True, + analyzer_params=analyzer_params, is_partition_key=enable_partition_key, ), FieldSchema( name="sentence", dtype=DataType.VARCHAR, max_length=65535, - enable_tokenizer=True, - tokenizer_params=tokenizer_params, + enable_analyzer=True, + analyzer_params=analyzer_params, ), FieldSchema( name="paragraph", dtype=DataType.VARCHAR, max_length=65535, - enable_tokenizer=True, - tokenizer_params=tokenizer_params, + enable_analyzer=True, + analyzer_params=analyzer_params, ), FieldSchema( name="text", dtype=DataType.VARCHAR, max_length=65535, - enable_tokenizer=True, + enable_analyzer=True, enable_match=True, - tokenizer_params=tokenizer_params, + analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
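For reference (not part of the patches above): the sketch below shows, in a minimal standalone form, the schema and index setup these tests exercise, written against the renamed pymilvus parameters (enable_analyzer / analyzer_params) that PATCH 4/4 switches the suite to. The connection address, collection name, dimension, sample rows, and the "standard" analyzer choice are illustrative assumptions, not values taken from the test code.

import random

from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    Function,
    FunctionType,
    connections,
)

# Assumed local Milvus instance; adjust host/port as needed.
connections.connect(host="localhost", port="19530")

dim = 8
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,                       # renamed from enable_tokenizer
        enable_match=True,                          # required for TextMatch filtering
        analyzer_params={"tokenizer": "standard"},  # renamed from tokenizer_params
    ),
    FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
    FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="full-text-search sketch")
# BM25 function: Milvus derives text_sparse_emb from the analyzed text field,
# so rows are inserted without that field.
schema.add_function(
    Function(
        name="text_bm25_emb",
        function_type=FunctionType.BM25,
        input_field_names=["text"],
        output_field_names=["text_sparse_emb"],
        params={},
    )
)

collection = Collection(name="fts_sketch", schema=schema)
collection.create_index(
    "emb",
    {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection.create_index(
    "text_sparse_emb",
    {
        "index_type": "SPARSE_INVERTED_INDEX",
        "metric_type": "BM25",
        "params": {"bm25_k1": 1.5, "bm25_b": 0.75},
    },
)
collection.load()

# Inserting after load() keeps the rows in growing segments, which is the
# scenario the new test cases target.
collection.insert(
    [
        {"id": i, "text": f"sample document number {i}", "emb": [random.random() for _ in range(dim)]}
        for i in range(100)
    ]
)

# BM25 full-text search accepts raw query strings; Milvus analyzes and scores them.
res = collection.search(
    data=["sample document"],
    anns_field="text_sparse_emb",
    param={},
    limit=5,
    output_fields=["id", "text"],
)
print(res)

Loading before inserting mirrors the growing-segment scenario in the tests: rows inserted after load() remain in growing segments until they are flushed and sealed, so both BM25 search and TextMatch filtering are exercised against unsealed data.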