From 9a7b70799c836a1d9073aff12527eb7f93c12e54 Mon Sep 17 00:00:00 2001 From: JaySon Date: Wed, 25 Sep 2024 20:29:16 +0800 Subject: [PATCH] ddl: Adapt with the latest vector index definition (#9471) ref pingcap/tiflash#9032 ddl: Adapt with the latest vector index def --- dbms/src/TiDB/Schema/TiDB.cpp | 45 +++++++++++++++---- .../TiDB/Schema/tests/gtest_table_info.cpp | 43 +++++++++++++++++- 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/dbms/src/TiDB/Schema/TiDB.cpp b/dbms/src/TiDB/Schema/TiDB.cpp index 471c944b2f0..e67168d1633 100644 --- a/dbms/src/TiDB/Schema/TiDB.cpp +++ b/dbms/src/TiDB/Schema/TiDB.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -104,18 +105,45 @@ using DB::Exception; using DB::Field; using DB::SchemaNameMapper; -VectorIndexDefinitionPtr parseVectorIndexFromJSON(const Poco::JSON::Object::Ptr & json) +// The IndexType defined in TiDB +// https://github.com/pingcap/tidb/blob/a5e07a2ed360f29216c912775ce482f536f4102b/pkg/parser/model/model.go#L193-L219 +enum class IndexType +{ + INVALID = 0, + BTREE = 1, + HASH = 2, + RTREE = 3, + HYPO = 4, + HNSW = 5, +}; + +VectorIndexDefinitionPtr parseVectorIndexFromJSON(IndexType index_type, const Poco::JSON::Object::Ptr & json) { assert(json); // not nullptr tipb::VectorIndexKind kind = tipb::VectorIndexKind::INVALID_INDEX_KIND; - auto kind_field = json->getValue("kind"); - RUNTIME_CHECK_MSG(tipb::VectorIndexKind_Parse(kind_field, &kind), "invalid kind of vector index, {}", kind_field); - RUNTIME_CHECK(kind != tipb::VectorIndexKind::INVALID_INDEX_KIND); + if (unlikely(json->has("kind"))) + { + // TODO(vector-index): remove this deadcode + auto kind_field = json->getValue("kind"); + RUNTIME_CHECK_MSG( + tipb::VectorIndexKind_Parse(kind_field, &kind), + "invalid kind of vector index, {}", + kind_field); + RUNTIME_CHECK(kind != tipb::VectorIndexKind::INVALID_INDEX_KIND); + } + else + { + RUNTIME_CHECK_MSG( + index_type == IndexType::HNSW, + "Invalid index_type for vector index, {}({})", + magic_enum::enum_name(index_type), + fmt::underlying(index_type)); + kind = tipb::VectorIndexKind::HNSW; + } auto dimension = json->getValue("dimension"); - RUNTIME_CHECK(dimension > 0); - RUNTIME_CHECK(dimension <= TiDB::MAX_VECTOR_DIMENSION); // Just a protection + RUNTIME_CHECK(dimension > 0 && dimension <= TiDB::MAX_VECTOR_DIMENSION, dimension); // Just a protection tipb::VectorDistanceMetric distance_metric = tipb::VectorDistanceMetric::INVALID_DISTANCE_METRIC; auto distance_metric_field = json->getValue("distance_metric"); @@ -509,9 +537,10 @@ try } state = static_cast(json->getValue("state")); + // TODO(vector-index): remove this deadcode if (auto vector_index_json = json->getObject("vector_index"); vector_index_json) { - vector_index = parseVectorIndexFromJSON(vector_index_json); + vector_index = parseVectorIndexFromJSON(IndexType::HNSW, vector_index_json); } } catch (const Poco::Exception & e) @@ -902,7 +931,7 @@ try if (auto vector_index_json = json->getObject("vector_index"); vector_index_json) { - vector_index = parseVectorIndexFromJSON(vector_index_json); + vector_index = parseVectorIndexFromJSON(static_cast(index_type), vector_index_json); } } catch (const Poco::Exception & e) diff --git a/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp b/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp index f5233e5d7b6..919a2e5596b 100644 --- a/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp +++ b/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -19,11 +20,13 @@ #include #include #include +#include #include #include #include #include #include +#include using TableInfo = TiDB::TableInfo; @@ -213,13 +216,51 @@ try ASSERT_EQ(col1.name, "v"); ASSERT_EQ(col1.tp, TiDB::TP::TypeTiDBVectorFloat32); ASSERT_EQ(col1.id, 2); - }}}; + }, + }, + // VectorIndex defined without "kind" field + ParseCase{ + R"json({"Lock":null,"ShardRowIDBits":0,"auto_id_cache":0,"auto_inc_id":0,"auto_rand_id":0,"auto_random_bits":0,"auto_random_range_bits":0,"cache_table_status":0,"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"change_state_info":null,"comment":"","default":null,"default_bit":null,"default_is_expr":false,"dependences":null,"generated_expr_string":"","generated_stored":false,"hidden":false,"id":1,"name":{"L":"a","O":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"state":5,"type":{"Array":false,"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"ElemsIsBinaryLit":null,"Flag":4099,"Flen":11,"Tp":3},"version":2},{"change_state_info":null,"comment":"","default":null,"default_bit":null,"default_is_expr":false,"dependences":null,"generated_expr_string":"","generated_stored":false,"hidden":false,"id":2,"name":{"L":"vec","O":"vec"},"offset":1,"origin_default":null,"origin_default_bit":null,"state":5,"type":{"Array":false,"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"ElemsIsBinaryLit":null,"Flag":128,"Flen":3,"Tp":225},"version":2}],"comment":"","common_handle_version":0,"compression":"","constraint_info":null,"exchange_partition_info":null,"fk_info":null,"id":104, + "index_info":[{"backfill_state":0,"comment":"","id":1,"idx_cols":[{"length":-1,"name":{"L":"vec","O":"vec"},"offset":1}],"idx_name":{"L":"v","O":"v"},"index_type":5,"is_global":false,"is_invisible":false,"is_primary":false,"is_unique":false,"mv_index":false,"state":3,"tbl_name":{"L":"","O":""},"vector_index":{"dimension":3,"distance_metric":"COSINE"}}], + "is_columnar":false,"is_common_handle":false,"max_col_id":2,"max_cst_id":0,"max_fk_id":0,"max_idx_id":1,"max_shard_row_id_bits":0,"name":{"L":"t","O":"t"},"partition":null,"pk_is_handle":true,"revision":5,"sequence":null,"state":5,"stats_options":null,"temp_table_type":0,"update_timestamp":452784611061923843,"version":5,"view":null})json", + [](const TableInfo & table_info) { + ASSERT_EQ(table_info.index_infos.size(), 1); + auto idx0 = table_info.index_infos[0]; + ASSERT_EQ(idx0.id, 1); + ASSERT_EQ(idx0.idx_name, "v"); + ASSERT_EQ(idx0.idx_cols.size(), 1); + ASSERT_EQ(idx0.idx_cols[0].name, "vec"); + ASSERT_EQ(idx0.idx_cols[0].offset, 1); + ASSERT_NE(idx0.vector_index, nullptr); + ASSERT_EQ(idx0.index_type, 5); // HNSW + ASSERT_EQ(idx0.vector_index->kind, tipb::VectorIndexKind::HNSW); + ASSERT_EQ(idx0.vector_index->dimension, 3); + ASSERT_EQ(idx0.vector_index->distance_metric, tipb::VectorDistanceMetric::COSINE); + }, + }, + }; for (const auto & c : cases) { TableInfo table_info(c.table_info_json, NullspaceID); c.check(table_info); } + + Strings failure_case = { + // Suppose invalid index_type (index_type=4) for vector index is set, should throw exception + R"json({"Lock":null,"ShardRowIDBits":0,"auto_id_cache":0,"auto_inc_id":0,"auto_rand_id":0,"auto_random_bits":0,"auto_random_range_bits":0,"cache_table_status":0,"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"change_state_info":null,"comment":"","default":null,"default_bit":null,"default_is_expr":false,"dependences":null,"generated_expr_string":"","generated_stored":false,"hidden":false,"id":1,"name":{"L":"a","O":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"state":5,"type":{"Array":false,"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"ElemsIsBinaryLit":null,"Flag":4099,"Flen":11,"Tp":3},"version":2},{"change_state_info":null,"comment":"","default":null,"default_bit":null,"default_is_expr":false,"dependences":null,"generated_expr_string":"","generated_stored":false,"hidden":false,"id":2,"name":{"L":"vec","O":"vec"},"offset":1,"origin_default":null,"origin_default_bit":null,"state":5,"type":{"Array":false,"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"ElemsIsBinaryLit":null,"Flag":128,"Flen":3,"Tp":225},"version":2}],"comment":"","common_handle_version":0,"compression":"","constraint_info":null,"exchange_partition_info":null,"fk_info":null,"id":104, + "index_info":[{"backfill_state":0,"comment":"","id":1,"idx_cols":[{"length":-1,"name":{"L":"vec","O":"vec"},"offset":1}],"idx_name":{"L":"v","O":"v"},"index_type":4,"is_global":false,"is_invisible":false,"is_primary":false,"is_unique":false,"mv_index":false,"state":3,"tbl_name":{"L":"","O":""},"vector_index":{"dimension":3,"distance_metric":"COSINE"}}], + "is_columnar":false,"is_common_handle":false,"max_col_id":2,"max_cst_id":0,"max_fk_id":0,"max_idx_id":1,"max_shard_row_id_bits":0,"name":{"L":"t","O":"t"},"partition":null,"pk_is_handle":true,"revision":5,"sequence":null,"state":5,"stats_options":null,"temp_table_type":0,"update_timestamp":452784611061923843,"version":5,"view":null})json", + // Suppose we add new algorithm type for vector index. Parsing unknown algorithm (index_type=99) should throw exception + R"json({"Lock":null,"ShardRowIDBits":0,"auto_id_cache":0,"auto_inc_id":0,"auto_rand_id":0,"auto_random_bits":0,"auto_random_range_bits":0,"cache_table_status":0,"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"change_state_info":null,"comment":"","default":null,"default_bit":null,"default_is_expr":false,"dependences":null,"generated_expr_string":"","generated_stored":false,"hidden":false,"id":1,"name":{"L":"a","O":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"state":5,"type":{"Array":false,"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"ElemsIsBinaryLit":null,"Flag":4099,"Flen":11,"Tp":3},"version":2},{"change_state_info":null,"comment":"","default":null,"default_bit":null,"default_is_expr":false,"dependences":null,"generated_expr_string":"","generated_stored":false,"hidden":false,"id":2,"name":{"L":"vec","O":"vec"},"offset":1,"origin_default":null,"origin_default_bit":null,"state":5,"type":{"Array":false,"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"ElemsIsBinaryLit":null,"Flag":128,"Flen":3,"Tp":225},"version":2}],"comment":"","common_handle_version":0,"compression":"","constraint_info":null,"exchange_partition_info":null,"fk_info":null,"id":104, + "index_info":[{"backfill_state":0,"comment":"","id":1,"idx_cols":[{"length":-1,"name":{"L":"vec","O":"vec"},"offset":1}],"idx_name":{"L":"v","O":"v"},"index_type":99,"is_global":false,"is_invisible":false,"is_primary":false,"is_unique":false,"mv_index":false,"state":3,"tbl_name":{"L":"","O":""},"vector_index":{"dimension":3,"distance_metric":"COSINE"}}], + "is_columnar":false,"is_common_handle":false,"max_col_id":2,"max_cst_id":0,"max_fk_id":0,"max_idx_id":1,"max_shard_row_id_bits":0,"name":{"L":"t","O":"t"},"partition":null,"pk_is_handle":true,"revision":5,"sequence":null,"state":5,"stats_options":null,"temp_table_type":0,"update_timestamp":452784611061923843,"version":5,"view":null})json", + }; + + for (const auto & c : failure_case) + { + ASSERT_THROW({ TableInfo table_info(c, NullspaceID); }, DB::Exception) << c; + } } CATCH