test: Add nullable test cases for bulk writer #37572

Merged 1 commit on Nov 12, 2024
80 changes: 43 additions & 37 deletions tests/python_client/testcases/test_bulk_insert.py
@@ -1477,7 +1478,8 @@ def test_bulk_insert_sparse_vector_with_json(self, auto_id, dim, entities, enabl
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("sparse_format", ["doc", "coo"])
def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
"""
collection schema 1: [pk, int64, float64, string, float_vector]
data file: vectors.npy and uid.npy,
@@ -1489,14 +1490,14 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
@@ -1528,14 +1529,14 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
df.array_int_field: [1, 2],
df.array_float_field: [1.0, 2.0],
df.array_string_field: ["string1", "string2"],
df.array_bool_field: [True, False],
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
df.float_vec_field: cf.gen_vectors(1, dim)[0],
df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
@@ -1606,13 +1607,17 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
@pytest.mark.parametrize("dim", [128]) # 128
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, nullable):
"""
"""
if nullable is True:
pytest.skip("bulk writer does not support numpy files for fields (int_scalar) that contain None values")

self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
@@ -1646,7 +1651,7 @@ def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, ena
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
@@ -1720,20 +1725,21 @@ def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, ena
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("sparse_format", ["doc", "coo"])
def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
"""
"""
self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
@@ -1765,14 +1771,14 @@ def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, e
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
df.array_int_field: [1, 2],
df.array_float_field: [1.0, 2.0],
df.array_string_field: ["string1", "string2"],
df.array_bool_field: [True, False],
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
df.float_vec_field: cf.gen_vectors(1, dim)[0],
df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
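
Every nullable scalar and array field in the rows above uses the same inline conditional, value if not (nullable and random.random() < 0.5) else None, so roughly half of the values become None when nullable is True. A minimal sketch of that pattern with a hypothetical maybe_null helper (the helper is illustrative and not part of this PR):

    import random

    def maybe_null(value, nullable, null_ratio=0.5):
        # When the field is nullable, replace roughly null_ratio of the values
        # with None; otherwise return the value unchanged.
        return None if nullable and random.random() < null_ratio else value

    nullable = True
    row = {
        "int_field": maybe_null(1, nullable),
        "float_field": maybe_null(1.0, nullable),
        "array_int_field": maybe_null([1, 2], nullable),
    }
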
135 changes: 135 additions & 0 deletions tests/python_client/testcases/test_search.py
@@ -4809,6 +4809,73 @@ def test_binary_indexed_over_max_dim(self, dim):
check_task=CheckTasks.err_res,
check_items={"err_code": 999, "err_msg": f"invalid dimension: {dim}."})

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue #37547")
def test_search_verify_expr_cache(self, is_flush):
"""
target: test search case to test expr cache
method: 1. create a collection with a float datatype field
2. search with expr "float == 0"
3. drop this collection
4. create a collection with the same collection name and the same field name, but change the type of the
float field to varchar
5. search with expr "float == 0" again
expected: 1. search successfully with limit(topK) for the first collection
2. report error for the second collection with the same name
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush)[0:5]
collection_name = collection_w.name
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 3. search with expr "float == 0"
search_exp = f"{ct.default_float_field_name} == 0"
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"output_fields": output_fields})
# 4. drop collection
collection_w.drop()
# 5. create the same collection name with same field name but varchar field type
int64_field = cf.gen_int64_field(is_primary=True)
string_field = cf.gen_string_field(ct.default_float_field_name)
json_field = cf.gen_json_field()
float_vector_field = cf.gen_float_vec_field()
fields = [int64_field, string_field, json_field, float_vector_field]
schema = cf.gen_collection_schema(fields)
collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
int64_values = pd.Series(data=[i for i in range(default_nb)])
string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
json_values = [{"number": i, "string": str(i), "bool": bool(i),
"list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
float_vec_values = cf.gen_vectors(default_nb, default_dim)
df = pd.DataFrame({
ct.default_int64_field_name: int64_values,
ct.default_float_field_name: string_values,
ct.default_json_field_name: json_values,
ct.default_float_vec_field_name: float_vec_values
})
collection_w.insert(df)
collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
collection_w.load()
collection_w.flush()
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.err_res,
check_items={"err_code": 1100,
"err_msg": "failed to create query plan: cannot parse expression: float == 0, "
"error: comparisons between VarChar and Int64 are not supported: "
"invalid parameter"})


class TestSearchBase(TestcaseBase):
@pytest.fixture(
@@ -13279,6 +13346,74 @@ def test_search_none_data_partial_load(self, is_flush, enable_dynamic_field, nul
"limit": default_limit,
"output_fields": output_fields})

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue #37547")
def test_search_none_data_expr_cache(self, is_flush):
"""
target: test search case with none data to test expr cache
method: 1. create collection with double datatype as nullable field
2. search with expr "nullableFid == 0"
3. drop this collection
4. create collection with same collection name and same field name but modify the type of nullable field
as varchar datatype
5. search with expr "nullableFid == 0" again
expected: 1. search successfully with limit(topK) for the first collection
2. report error for the second collection with the same name
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush,
nullable_fields={ct.default_float_field_name: 0.5})[0:5]
collection_name = collection_w.name
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 3. search with expr "nullableFid == 0"
search_exp = f"{ct.default_float_field_name} == 0"
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"output_fields": output_fields})
# 4. drop collection
collection_w.drop()
# 5. create the same collection name with same field name but varchar field type
int64_field = cf.gen_int64_field(is_primary=True)
string_field = cf.gen_string_field(ct.default_float_field_name, nullable=True)
json_field = cf.gen_json_field()
float_vector_field = cf.gen_float_vec_field()
fields = [int64_field, string_field, json_field, float_vector_field]
schema = cf.gen_collection_schema(fields)
collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
int64_values = pd.Series(data=[i for i in range(default_nb)])
string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
json_values = [{"number": i, "string": str(i), "bool": bool(i),
"list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
float_vec_values = cf.gen_vectors(default_nb, default_dim)
df = pd.DataFrame({
ct.default_int64_field_name: int64_values,
ct.default_float_field_name: None,
ct.default_json_field_name: json_values,
ct.default_float_vec_field_name: float_vec_values
})
collection_w.insert(df)
collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
collection_w.load()
collection_w.flush()
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.err_res,
check_items={"err_code": 1100,
"err_msg": "failed to create query plan: cannot parse expression: float == 0, "
"error: comparisons between VarChar and Int64 are not supported: "
"invalid parameter"})


class TestSearchWithTextMatchFilter(TestcaseBase):
"""
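
Both new cases in test_search.py exercise the expr-cache scenario from issue #37547: a collection is searched with the expression "float == 0", dropped, and then recreated under the same name with the field "float" redefined as a varchar, after which the same expression must raise a type-mismatch error instead of reusing a stale cached plan. A rough standalone sketch of that sequence against the pymilvus ORM API (the collection name, dimension, host/port, and the elided insert/index/search steps are assumptions for illustration, not code from this PR):

    from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
                          connections)

    connections.connect(host="localhost", port="19530")  # assumes a local Milvus
    name = "expr_cache_demo"  # hypothetical collection name
    dim = 8

    # 1. First collection: the "float" field is a FLOAT scalar.
    schema_v1 = CollectionSchema([
        FieldSchema("int64", DataType.INT64, is_primary=True),
        FieldSchema("float", DataType.FLOAT),
        FieldSchema("float_vector", DataType.FLOAT_VECTOR, dim=dim),
    ])
    old_collection = Collection(name, schema_v1)
    # ... insert, build an index, load; searching with expr 'float == 0' succeeds ...
    old_collection.drop()

    # 2. Same collection name and field name, but "float" is now a VARCHAR.
    schema_v2 = CollectionSchema([
        FieldSchema("int64", DataType.INT64, is_primary=True),
        FieldSchema("float", DataType.VARCHAR, max_length=100),
        FieldSchema("float_vector", DataType.FLOAT_VECTOR, dim=dim),
    ])
    new_collection = Collection(name, schema_v2)
    # ... insert, build an index, load; searching with expr 'float == 0' is now
    # expected to fail with a VarChar-vs-Int64 comparison error rather than
    # reuse a plan cached for the dropped collection.
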