From 9c8a58884b3f61afd17a4c1a132dbd78495f3037 Mon Sep 17 00:00:00 2001 From: yanweiqi <592838129@qq.com> Date: Mon, 13 Jun 2022 15:46:32 +0800 Subject: [PATCH] *: remove CatBoost (#5131) ref pingcap/tiflash#2019 --- dbms/src/Storages/StorageCatBoostPool.cpp | 287 ------------------ dbms/src/Storages/StorageCatBoostPool.h | 100 ------ .../TableFunctionCatBoostPool.cpp | 68 ----- .../TableFunctionCatBoostPool.h | 35 --- .../TableFunctions/registerTableFunctions.cpp | 3 +- 5 files changed, 1 insertion(+), 492 deletions(-) delete mode 100644 dbms/src/Storages/StorageCatBoostPool.cpp delete mode 100644 dbms/src/Storages/StorageCatBoostPool.h delete mode 100644 dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp delete mode 100644 dbms/src/TableFunctions/TableFunctionCatBoostPool.h diff --git a/dbms/src/Storages/StorageCatBoostPool.cpp b/dbms/src/Storages/StorageCatBoostPool.cpp deleted file mode 100644 index 317cac21d52..00000000000 --- a/dbms/src/Storages/StorageCatBoostPool.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_OPEN_FILE; - extern const int CANNOT_PARSE_TEXT; - extern const int DATABASE_ACCESS_DENIED; -} - -namespace -{ -class CatBoostDatasetBlockInputStream : public IProfilingBlockInputStream -{ -public: - - CatBoostDatasetBlockInputStream(const std::string & file_name, const std::string & format_name, - const Block & sample_block, const Context & context, size_t max_block_size) - : file_name(file_name), format_name(format_name) - { - read_buf = std::make_unique(file_name); - reader = FormatFactory().getInput(format_name, *read_buf, sample_block, context, max_block_size); - } - - String getName() const override - { - return "CatBoostDataset"; - } - - Block readImpl() override - { - return reader->read(); - } - - void readPrefixImpl() override - { - reader->readPrefix(); - } - - void readSuffixImpl() override - { - reader->readSuffix(); - } - - Block getHeader() const override { return sample_block; }; - -private: - Block sample_block; - std::unique_ptr read_buf; - BlockInputStreamPtr reader; - std::string file_name; - std::string format_name; -}; - -} - -static boost::filesystem::path canonicalPath(std::string && path) -{ - return boost::filesystem::canonical(boost::filesystem::path(path)); -} - -static std::string resolvePath(const boost::filesystem::path & base_path, std::string && path) -{ - boost::filesystem::path resolved_path(path); - if (!resolved_path.is_absolute()) - return (base_path / resolved_path).string(); - return resolved_path.string(); -} - -static void checkCreationIsAllowed(const String & base_path, const String & path) -{ - if (base_path != path.substr(0, base_path.size())) - throw Exception( - "Using file descriptor or user specified path as source of storage isn't allowed for server daemons", - ErrorCodes::DATABASE_ACCESS_DENIED); -} - - -StorageCatBoostPool::StorageCatBoostPool(const Context & context, - String column_description_file_name_, - String data_description_file_name_) - : column_description_file_name(std::move(column_description_file_name_)), - data_description_file_name(std::move(data_description_file_name_)) -{ - auto base_path = canonicalPath(context.getPath()); - column_description_file_name = resolvePath(base_path, std::move(column_description_file_name)); - data_description_file_name = resolvePath(base_path, std::move(data_description_file_name)); - if (context.getApplicationType() == Context::ApplicationType::SERVER) - { - const auto & base_path_str = base_path.string(); - checkCreationIsAllowed(base_path_str, column_description_file_name); - checkCreationIsAllowed(base_path_str, data_description_file_name); - } - - parseColumnDescription(); - createSampleBlockAndColumns(); -} - -std::string StorageCatBoostPool::getColumnTypesString(const ColumnTypesMap & columnTypesMap) -{ - std::string types_string; - bool first = true; - for (const auto & value : columnTypesMap) - { - if (!first) - types_string.append(", "); - - first = false; - types_string += value.first; - } - - return types_string; -} - -void StorageCatBoostPool::checkDatasetDescription() -{ - std::ifstream in(data_description_file_name); - if (!in.good()) - throw Exception("Cannot open file: " + data_description_file_name, ErrorCodes::CANNOT_OPEN_FILE); - - std::string line; - if (!std::getline(in, line)) - throw Exception("File is empty: " + data_description_file_name, ErrorCodes::CANNOT_PARSE_TEXT); - - size_t columns_count = 1; - for (char sym : line) - if (sym == '\t') - ++columns_count; - - columns_description.resize(columns_count); -} - -void StorageCatBoostPool::parseColumnDescription() -{ - /// NOTE: simple parsing - /// TODO: use ReadBufferFromFile - - checkDatasetDescription(); - - std::ifstream in(column_description_file_name); - if (!in.good()) - throw Exception("Cannot open file: " + column_description_file_name, ErrorCodes::CANNOT_OPEN_FILE); - - std::string line; - size_t line_num = 0; - auto column_types_map = getColumnTypesMap(); - auto column_types_string = getColumnTypesString(column_types_map); - - /// Enumerate default names for columns as Auxiliary, Auxiliary1, Auxiliary2, ... - std::map columns_per_type_count; - - while (std::getline(in, line)) - { - ++line_num; - std::string str_line_num = std::to_string(line_num); - - if (line.empty()) - continue; - - std::istringstream iss(line); - std::vector tokens; - std::string token; - while (std::getline(iss, token, '\t')) - tokens.push_back(token); - - if (tokens.size() != 2 && tokens.size() != 3) - throw Exception("Cannot parse column description at line " + str_line_num + " '" + line + "' " - + ": expected 2 or 3 columns, got " + std::to_string(tokens.size()), - ErrorCodes::CANNOT_PARSE_TEXT); - - std::string str_id = tokens[0]; - std::string col_type = tokens[1]; - std::string col_alias = tokens.size() > 2 ? tokens[2] : ""; - - size_t num_id; - try - { - num_id = std::stoull(str_id); - } - catch (std::exception & e) - { - throw Exception("Cannot parse column index at row " + str_line_num + ": " + e.what(), - ErrorCodes::CANNOT_PARSE_TEXT); - } - - if (num_id >= columns_description.size()) - throw Exception("Invalid index at row " + str_line_num + ": " + str_id - + ", expected in range [0, " + std::to_string(columns_description.size()) + ")", - ErrorCodes::CANNOT_PARSE_TEXT); - - if (column_types_map.count(col_type) == 0) - throw Exception("Invalid column type: " + col_type + ", expected: " + column_types_string, - ErrorCodes::CANNOT_PARSE_TEXT); - - auto type = column_types_map[col_type]; - - std::string col_name; - - bool is_feature_column = type == DatasetColumnType::Num || type == DatasetColumnType::Categ; - auto & col_number = columns_per_type_count[type]; - /// If column is not feature skip '0' after the name (to use 'Target' instead of 'Target0'). - col_name = col_type + (is_feature_column || col_number ? std::to_string(col_number) : ""); - ++col_number; - - columns_description[num_id] = ColumnDescription(col_name, col_alias, type); - } -} - -void StorageCatBoostPool::createSampleBlockAndColumns() -{ - ColumnsDescription columns; - NamesAndTypesList cat_columns; - NamesAndTypesList num_columns; - sample_block.clear(); - for (auto & desc : columns_description) - { - DataTypePtr type; - if (desc.column_type == DatasetColumnType::Categ - || desc.column_type == DatasetColumnType::Auxiliary - || desc.column_type == DatasetColumnType::DocId) - type = std::make_shared(); - else - type = std::make_shared(); - - if (desc.column_type == DatasetColumnType::Categ) - cat_columns.emplace_back(desc.column_name, type); - else if (desc.column_type == DatasetColumnType::Num) - num_columns.emplace_back(desc.column_name, type); - else - columns.materialized.emplace_back(desc.column_name, type); - - if (!desc.alias.empty()) - { - auto alias = std::make_shared(desc.column_name); - columns.defaults[desc.alias] = {ColumnDefaultKind::Alias, alias}; - columns.aliases.emplace_back(desc.alias, type); - } - - sample_block.insert(ColumnWithTypeAndName(type, desc.column_name)); - } - columns.ordinary.insert(columns.ordinary.end(), num_columns.begin(), num_columns.end()); - columns.ordinary.insert(columns.ordinary.end(), cat_columns.begin(), cat_columns.end()); - - setColumns(columns); -} - -BlockInputStreams StorageCatBoostPool::read(const Names & column_names, - const SelectQueryInfo & /*query_info*/, - const Context & context, - QueryProcessingStage::Enum & /*processed_stage*/, - size_t max_block_size, - unsigned /*threads*/) -{ - auto stream = std::make_shared( - data_description_file_name, "TSV", sample_block, context, max_block_size); - - auto filter_stream = std::make_shared(stream, column_names, false); - return { filter_stream }; -} - -} diff --git a/dbms/src/Storages/StorageCatBoostPool.h b/dbms/src/Storages/StorageCatBoostPool.h deleted file mode 100644 index 0f4f7c2cede..00000000000 --- a/dbms/src/Storages/StorageCatBoostPool.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include - -namespace DB -{ -class StorageCatBoostPool : public ext::SharedPtrHelper - , public IStorage -{ -public: - std::string getName() const override { return "CatBoostPool"; } - - std::string getTableName() const override { return table_name; } - - BlockInputStreams read(const Names & column_names, - const SelectQueryInfo & query_info, - const Context & context, - QueryProcessingStage::Enum & processed_stage, - size_t max_block_size, - unsigned threads) override; - -private: - String table_name; - - String column_description_file_name; - String data_description_file_name; - Block sample_block; - - enum class DatasetColumnType - { - Target, - Num, - Categ, - Auxiliary, - DocId, - Weight, - Baseline - }; - - using ColumnTypesMap = std::map; - - ColumnTypesMap getColumnTypesMap() const - { - return { - {"Target", DatasetColumnType::Target}, - {"Num", DatasetColumnType::Num}, - {"Categ", DatasetColumnType::Categ}, - {"Auxiliary", DatasetColumnType::Auxiliary}, - {"DocId", DatasetColumnType::DocId}, - {"Weight", DatasetColumnType::Weight}, - {"Baseline", DatasetColumnType::Baseline}, - }; - }; - - std::string getColumnTypesString(const ColumnTypesMap & columnTypesMap); - - struct ColumnDescription - { - std::string column_name; - std::string alias; - DatasetColumnType column_type; - - ColumnDescription() - : column_type(DatasetColumnType::Num) - {} - ColumnDescription(std::string column_name, std::string alias, DatasetColumnType column_type) - : column_name(std::move(column_name)) - , alias(std::move(alias)) - , column_type(column_type) - {} - }; - - std::vector columns_description; - - void checkDatasetDescription(); - void parseColumnDescription(); - void createSampleBlockAndColumns(); - -protected: - StorageCatBoostPool(const Context & context, String column_description_file_name, String data_description_file_name); -}; - -} // namespace DB diff --git a/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp b/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp deleted file mode 100644 index ab5cd7e5849..00000000000 --- a/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ -extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -extern const int BAD_ARGUMENTS; -} // namespace ErrorCodes - - -StoragePtr TableFunctionCatBoostPool::executeImpl(const ASTPtr & ast_function, const Context & context) const -{ - ASTs & args_func = typeid_cast(*ast_function).children; - - std::string err = "Table function '" + getName() + "' requires 2 parameters: " - + "column descriptions file, dataset description file"; - - if (args_func.size() != 1) - throw Exception(err, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - ASTs & args = typeid_cast(*args_func.at(0)).children; - - if (args.size() != 2) - throw Exception(err, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - auto getStringLiteral = [](const IAST & node, const char * description) { - auto lit = typeid_cast(&node); - if (!lit) - throw Exception(description + String(" must be string literal (in single quotes)."), ErrorCodes::BAD_ARGUMENTS); - - if (lit->value.getType() != Field::Types::String) - throw Exception(description + String(" must be string literal (in single quotes)."), ErrorCodes::BAD_ARGUMENTS); - - return safeGet(lit->value); - }; - String column_descriptions_file = getStringLiteral(*args[0], "Column descriptions file"); - String dataset_description_file = getStringLiteral(*args[1], "Dataset description file"); - - return StorageCatBoostPool::create(context, column_descriptions_file, dataset_description_file); -} - -void registerTableFunctionCatBoostPool(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - -} // namespace DB diff --git a/dbms/src/TableFunctions/TableFunctionCatBoostPool.h b/dbms/src/TableFunctions/TableFunctionCatBoostPool.h deleted file mode 100644 index 0b5b32dfffe..00000000000 --- a/dbms/src/TableFunctions/TableFunctionCatBoostPool.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - - -namespace DB -{ -/* catboostPool('column_descriptions_file', 'dataset_description_file') - * Create storage from CatBoost dataset. - */ -class TableFunctionCatBoostPool : public ITableFunction -{ -public: - static constexpr auto name = "catBoostPool"; - std::string getName() const override { return name; } - -private: - StoragePtr executeImpl(const ASTPtr & ast_function, const Context & context) const override; -}; - -} // namespace DB diff --git a/dbms/src/TableFunctions/registerTableFunctions.cpp b/dbms/src/TableFunctions/registerTableFunctions.cpp index bfe219eec62..8449077cc96 100644 --- a/dbms/src/TableFunctions/registerTableFunctions.cpp +++ b/dbms/src/TableFunctions/registerTableFunctions.cpp @@ -21,14 +21,13 @@ namespace DB { void registerTableFunctionMerge(TableFunctionFactory & factory); void registerTableFunctionNumbers(TableFunctionFactory & factory); -void registerTableFunctionCatBoostPool(TableFunctionFactory & factory); + void registerTableFunctions() { auto & factory = TableFunctionFactory::instance(); registerTableFunctionMerge(factory); registerTableFunctionNumbers(factory); - registerTableFunctionCatBoostPool(factory); } } // namespace DB