diff --git a/src/generic_search.cpp b/src/generic_search.cpp index 47a9d09fa8..f7647b8f95 100644 --- a/src/generic_search.cpp +++ b/src/generic_search.cpp @@ -24,21 +24,39 @@ * *******************************************************************************/ -#include #include +#include #include #include +#include namespace miopen { namespace solver { -MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_TUNING_ITERATIONS_MAX) - std::size_t GetTuningIterationsMax() { return Value(MIOPEN_DEBUG_TUNING_ITERATIONS_MAX{}, std::numeric_limits::max()); } +std::chrono::milliseconds GetTuningTimeMax() +{ + static const auto fallback = + std::chrono::duration_cast(std::chrono::hours{2}); + static const auto res = + std::chrono::milliseconds{Value(MIOPEN_TUNING_TIME_MS_MAX{}, fallback.count())}; + return res; +} + +std::size_t GetTuningThreadsMax() +{ +#if MIOPEN_USE_COMGR + const auto def_max = 1; // COMGR is not parallelizable +#else + const int def_max = std::thread::hardware_concurrency() / 2; +#endif + return Value(MIOPEN_COMPILE_PARALLEL_LEVEL{}, def_max); +} + } // namespace solver } // namespace miopen diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index 6f8539b271..531911ada7 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -45,12 +47,11 @@ #include #include #include +#include namespace miopen { namespace solver { -MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_COMPILE_ONLY) - /// This STL-like container together with corresponding iterator provide access /// to a set of all available performance configs for the given problem config. /// @@ -294,6 +295,50 @@ GetAllSolutions(const Solver s, const Context& context_, const Problem& problem) } std::size_t GetTuningIterationsMax(); +std::chrono::milliseconds GetTuningTimeMax(); // returns the max allowed time in milliseconds +std::size_t GetTuningThreadsMax(); + +template +void CompileAgent(size_t thread_index, + size_t total_threads, + const Solver& s, + const Context& context, + const Problem& problem, + std::vector& data, + ThreadSafeQueue>& comp_queue) +{ + const auto start_time = + std::chrono::time_point_cast(std::chrono::steady_clock::now()); + const auto data_size = data.size(); + const auto time_budget = GetTuningTimeMax(); + const auto& profile_h = context.GetStream(); + // start the counter + for(auto idx = thread_index; idx < data_size; idx += total_threads) + { + // Check if we are out of time + const auto current_time = std::chrono::time_point_cast( + std::chrono::steady_clock::now()); + if(current_time - start_time > time_budget) + { + MIOPEN_LOG_I2("Thread: " << thread_index << " Done, exhausted time budget"); + auto tmp = std::make_tuple({}, {}, true); + comp_queue.push(std::move(tmp)); + break; + } + auto& current_config = data.at(idx); + ConvSolution current_solution = s.GetSolution(context, problem, current_config); + for(const auto& kernel : current_solution.construction_params) + { + if(profile_h.HasProgram(kernel.kernel_file, kernel.comp_options)) + continue; + std::ignore = profile_h.LoadProgram(kernel.kernel_file, kernel.comp_options, false, ""); + } + auto tup = std::make_tuple( + std::move(current_config), std::move(current_solution), false); + comp_queue.push(std::move(tup)); + } + MIOPEN_LOG_I2("Thread: " << thread_index << " Done, completed tuning"); +} template auto GenericSearch(const Solver s, @@ -323,10 +368,16 @@ auto GenericSearch(const Solver s, auto& profile_h = context.GetStream(); AutoEnableProfiling enableProfiling{profile_h}; - auto all_configs = GetAllConfigs(s, context, problem); - const std::size_t n_runs_total = - std::min(static_cast(std::distance(all_configs.begin(), all_configs.end())), - GetTuningIterationsMax()); + auto tmp_all_configs = GetAllConfigs(s, context, problem); + // For random access + std::vector all_configs; + std::copy(tmp_all_configs.begin(), tmp_all_configs.end(), std::back_inserter(all_configs)); + // shuffle the configs + std::random_device rd{}; + auto rng = std::default_random_engine{rd()}; + std::shuffle(all_configs.begin(), all_configs.end(), rng); + const std::size_t n_runs_total = std::min(all_configs.size(), GetTuningIterationsMax()); + all_configs.resize(n_runs_total); bool is_passed = false; // left false only if all iterations failed. float best_time = std::numeric_limits::max(); @@ -335,45 +386,56 @@ auto GenericSearch(const Solver s, HeartBeat heartbeat; heartbeat.Start(); - if(!miopen::IsCacheDisabled()) // Otherwise precompilation is useless. + const auto total_threads = GetTuningThreadsMax(); + + ThreadSafeQueue> solution_queue; + std::vector compile_agents; + compile_agents.reserve(total_threads); + for(auto idx = 0; idx < total_threads; ++idx) { - std::vector kernels; - size_t n_current = 0; - for(const auto& current_config : all_configs) - { - if(n_current >= n_runs_total) - break; - ConvSolution current_solution = s.GetSolution(context, problem, current_config); - for(auto&& kernel : current_solution.construction_params) - { - if(profile_h.HasProgram(kernel.kernel_file, kernel.comp_options)) - continue; - kernels.push_back(kernel); - } - ++n_current; - } - std::ignore = PrecompileKernels(profile_h, kernels); + compile_agents.emplace_back(CompileAgent, + idx, + total_threads, + std::cref(s), + std::cref(context), + std::cref(problem), + std::ref(all_configs), + std::ref(solution_queue)); } if(!IsEnabled(MIOPEN_DEBUG_COMPILE_ONLY{})) { - size_t n_current = 0; - for(const auto& current_config : all_configs) + size_t n_current = 0; + auto threads_remaining = total_threads; + while(true) { if(n_current >= n_runs_total) break; + MIOPEN_LOG_I2("Waiting for item in queue"); + const auto kinder = solution_queue.pop(); + auto current_config = std::get<0>(kinder); + auto current_solution = std::get<1>(kinder); + + if(std::get<2>(kinder)) + { + threads_remaining--; + if(threads_remaining == 0) + break; + else + { + continue; + } + } float elapsed_time = 0.0f; int ret = 0; MIOPEN_LOG_I2('#' << n_current << '/' << n_failed << '/' << n_runs_total << ' ' << current_config); - ConvSolution current_solution; Invoker invoker; try { - current_solution = s.GetSolution(context, problem, current_config); if(default_solution.workspace_sz != current_solution.workspace_sz) { ret = -2; @@ -472,6 +534,9 @@ auto GenericSearch(const Solver s, "Running kernels on GPU is disabled. Search skipped"); } + for(auto& agent : compile_agents) + agent.join(); + MIOPEN_LOG_W("Done: " << n_runs_total << '/' << n_failed << '/' << n_runs_total << ", best #" << n_best << ' ' << best_time << ' ' << best_config); diff --git a/src/include/miopen/generic_search_controls.hpp b/src/include/miopen/generic_search_controls.hpp new file mode 100644 index 0000000000..37d14d0639 --- /dev/null +++ b/src/include/miopen/generic_search_controls.hpp @@ -0,0 +1,39 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once +#include + +namespace miopen { +namespace solver { + +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_TUNING_ITERATIONS_MAX) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_TUNING_TIME_MS_MAX) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_COMPILE_PARALLEL_LEVEL) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_COMPILE_ONLY) + +} // namespace solver +} // namespace miopen diff --git a/src/include/miopen/mt_queue.hpp b/src/include/miopen/mt_queue.hpp new file mode 100644 index 0000000000..0b6ffe801a --- /dev/null +++ b/src/include/miopen/mt_queue.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include +#include + +template +class ThreadSafeQueue +{ + std::mutex mutex; + std::condition_variable cond_var; + std::queue queue; + +public: + void push(T&& item) + { + + { + std::lock_guard lock(mutex); + queue.push(item); + } + + cond_var.notify_one(); + } + T pop() + { + std::unique_lock lock(mutex); + cond_var.wait(lock, [&] { return !queue.empty(); }); + T ret = queue.front(); + queue.pop(); + return ret; + } +}; diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index c2a7154685..626f5098d8 100644 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -58,6 +58,7 @@ class path; namespace miopen { MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_DISABLE_SQL_WAL) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_PERFDB_OVERRIDE) constexpr bool InMemDb = MIOPEN_EMBED_DB; #if MIOPEN_ENABLE_SQLITE_BACKOFF @@ -438,6 +439,30 @@ class SQLitePerfDb : public SQLiteBase { if(dbInvalid) return boost::none; + + const auto pdb_ovr = miopen::GetStringEnv(MIOPEN_DEBUG_PERFDB_OVERRIDE{}); + if(pdb_ovr != nullptr) + { + MIOPEN_LOG_I2("overriding tuning params with: " << pdb_ovr); + DbRecord ovr_rec; + const auto solv_vals = SplitDelim(pdb_ovr, ':'); + bool success = true; + for(const auto& solv_val : solv_vals) + { + const auto vals = SplitDelim(solv_val, ';'); + if(vals.size() != 2) + { + MIOPEN_LOG_W("Invalid value for MIOPEN_DEBUG_PERFDB_OVERRIDE. Format: " + ";:;params"); + success = false; + break; + } + MIOPEN_LOG_I2("Inserting Overriding PDB entry: " << vals[0] << ";" << vals[1]); + ovr_rec.SetValues(vals.at(0), vals.at(1)); + } + if(success) + return {ovr_rec}; + } std::string clause; std::vector values; std::tie(clause, values) = problem_config.WhereClause(); diff --git a/src/solver.cpp b/src/solver.cpp index fe2d7bc4b5..da4dd5eadc 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -45,8 +45,6 @@ namespace miopen { namespace solver { -MIOPEN_DECLARE_ENV_VAR(MIOPEN_COMPILE_PARALLEL_LEVEL) - std::ostream& operator<<(std::ostream& os, const KernelInfo& k) { os << k.kernel_file << ", " << k.kernel_name << " g_wk={ "; @@ -65,7 +63,8 @@ std::vector PrecompileKernels(const Handle& h, const std::vector +#include +#include +#include + +#include + +static std::atomic num_prod{}; + +static const auto total_producers = std::thread::hardware_concurrency(); +const auto data_len = 100; + +template +using data_t = std::vector; + +template +void producer(int thread_idx, data_t& common_data, ThreadSafeQueue& comp_queue) +{ + for(auto idx = thread_idx; idx < data_len; idx += total_producers) + { + comp_queue.push(std::move(common_data.at(idx))); + num_prod++; + std::this_thread::sleep_for(std::chrono::milliseconds(rand() % 100)); + } +} + +TEST(UtilMultiThreadQueue, Basic) +{ + ThreadSafeQueue comp_queue; + int num_cons = 0; + data_t common_data; + for(auto idx = 0; idx < data_len; ++idx) + common_data.emplace_back(idx); + + std::vector producers; + for(int idx = 0; idx < total_producers; idx++) + { + producers.emplace_back(producer, idx, std::ref(common_data), std::ref(comp_queue)); + } + + for(auto idx = 0; idx < data_len; ++idx) + { + auto res = comp_queue.pop(); + std::cerr << res << std::endl; + num_cons++; + } + + for(auto& prod : producers) + prod.join(); + + std::cout << "Stage 2" << std::endl; + for(const auto& tmp : common_data) + std::cout << tmp << std::endl; + EXPECT_EQ(num_prod, num_cons); +}