From 509a6ab9cd6915343bc0811f9a2a3f41b4150e35 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 7 Feb 2023 18:06:57 +0000 Subject: [PATCH 01/10] add MultiThreaded Queue --- src/include/miopen/generic_search.hpp | 35 ++++++++++-- src/include/miopen/mt_queue.hpp | 59 +++++++++++++++++++ src/include/miopen/sqlite_db.hpp | 17 ++++++ test/gtest/mt_queue.cpp | 82 +++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 4 deletions(-) create mode 100644 src/include/miopen/mt_queue.hpp create mode 100644 test/gtest/mt_queue.cpp diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index bc5bb8aa7c..4b52d7053b 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -45,6 +45,7 @@ #include #include #include +#include namespace miopen { namespace solver { @@ -323,10 +324,15 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam auto& profile_h = context.GetStream(); AutoEnableProfiling enableProfiling{profile_h}; - auto all_configs = GetAllConfigs(s, context); + auto tmp_all_configs = GetAllConfigs(s, context); const std::size_t n_runs_total = - std::min(static_cast(std::distance(all_configs.begin(), all_configs.end())), + std::min(static_cast(std::distance(tmp_all_configs.begin(), tmp_all_configs.end())), GetTuningIterationsMax()); + std::vector> all_configs; + for(const auto& config : tmp_all_configs) + { + all_configs.push_back(std::make_pair(config, false)); + } bool is_passed = false; // left false only if all iterations failed. float best_time = std::numeric_limits::max(); @@ -339,10 +345,12 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam { std::vector kernels; size_t n_current = 0; - for(const auto& current_config : all_configs) + for(const auto& kinder : all_configs) { if(n_current >= n_runs_total) break; + + const auto& current_config = kinder.first; ConvSolution current_solution = s.GetSolution(context, current_config); for(auto&& kernel : current_solution.construction_params) { @@ -355,13 +363,32 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam std::ignore = PrecompileKernels(profile_h, kernels); } + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> distrib(0, n_runs_total-1); + + if(!IsEnabled(MIOPEN_DEBUG_COMPILE_ONLY{})) { size_t n_current = 0; - for(const auto& current_config : all_configs) + // for(const auto& kinder: all_configs) + while(true) { if(n_current >= n_runs_total) break; + const auto idx = distrib(gen); + MIOPEN_LOG_I2("Got random index: " << idx); + auto& kinder = all_configs[idx]; + + if(kinder.second) + { + MIOPEN_LOG_I2("Skipping tested entry"); + continue; // This point has already been tested + } + else + kinder.second = true; + + const auto& current_config = kinder.first; float elapsed_time = 0.0f; int ret = 0; diff --git a/src/include/miopen/mt_queue.hpp b/src/include/miopen/mt_queue.hpp new file mode 100644 index 0000000000..584f759dab --- /dev/null +++ b/src/include/miopen/mt_queue.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include +#include + +template +class ThreadSafeQueue { + std::mutex mutex; + std::condition_variable cond_var; + std::queue queue; + +public: + void push(T&& item) { + { + std::lock_guard lock(mutex); + queue.push(item); + } + + cond_var.notify_one(); + } + + T& front() { + std::unique_lock lock(mutex); + cond_var.wait(lock, [&]{ return !queue.empty(); }); + return queue.front(); + } + + void pop() { + std::lock_guard lock(mutex); + queue.pop(); + } +}; diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index c2a7154685..e673fb2615 100644 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -58,6 +58,7 @@ class path; namespace miopen { MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_DISABLE_SQL_WAL) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_PERFDB_OVERRIDE) constexpr bool InMemDb = MIOPEN_EMBED_DB; #if MIOPEN_ENABLE_SQLITE_BACKOFF @@ -438,6 +439,22 @@ class SQLitePerfDb : public SQLiteBase { if(dbInvalid) return boost::none; + + const auto pdb_ovr = miopen::GetStringEnv(MIOPEN_DEBUG_PERFDB_OVERRIDE{}); + if(pdb_ovr != nullptr) + { + MIOPEN_LOG_I2("overriding tuning params with: " << pdb_ovr); + DbRecord ovr_rec; + const auto solv_vals = SplitDelim(pdb_ovr, ':'); + for(const auto& solv_val : solv_vals) + { + const auto vals = SplitDelim(solv_val, ';'); + assert(vals.size() == 2); + MIOPEN_LOG_I2("Inserting Overriding PDB entry: " << vals[0] << ";" << vals[1]); + ovr_rec.SetValues(vals.at(0), vals.at(1)); + } + return {ovr_rec}; + } std::string clause; std::vector values; std::tie(clause, values) = problem_config.WhereClause(); diff --git a/test/gtest/mt_queue.cpp b/test/gtest/mt_queue.cpp new file mode 100644 index 0000000000..83d60d8ca9 --- /dev/null +++ b/test/gtest/mt_queue.cpp @@ -0,0 +1,82 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include + +#include + +static ThreadSafeQueue> comp_queue; +static std::atomic num_prod{}; +static std::atomic num_cons{}; + +static const auto total_producers = 3; //std::thread::hardware_concurrency(); +const auto data_len = 100; +using data_t = std::vector>; + +void producer(int thread_idx, data_t& common_data) +{ + for(auto idx = thread_idx; idx < data_len; idx += total_producers) + { + comp_queue.push(std::move(common_data.at(idx))); + num_prod++; + std::this_thread::sleep_for(std::chrono::milliseconds(rand() % 100)); + } +} + +void consumer() +{ + for(auto idx = 0; idx < data_len; idx++) + { + std::cerr << *(comp_queue.front()) << std::endl; + num_cons++; + comp_queue.pop(); + } +} + +TEST(UtilMultiThreadQueue, Basic) +{ + data_t common_data; + for(auto idx = 0; idx < data_len; ++idx) + common_data.emplace_back(std::make_shared(idx)); + + std::thread cons(consumer); + std::vector producers; + for(int idx = 0;idx < total_producers; idx++) + { + producers.emplace_back(producer, idx, std::ref(common_data)); + } + + for(auto& prod: producers) + prod.join(); + + cons.join(); + std::cout << "Stage 2" << std::endl; + for(const auto& tmp: common_data) + std::cout << *tmp << std::endl; + EXPECT_EQ(num_prod, num_cons); +} From ba2ad1b918437090f6a80830b52fd81150189629 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Thu, 16 Feb 2023 22:04:48 +0000 Subject: [PATCH 02/10] finishing touches --- src/generic_search.cpp | 20 ++++ src/include/miopen/generic_search.hpp | 130 +++++++++++++++++--------- test/gtest/mt_queue.cpp | 41 ++++---- 3 files changed, 128 insertions(+), 63 deletions(-) diff --git a/src/generic_search.cpp b/src/generic_search.cpp index 47a9d09fa8..1b51ad6202 100644 --- a/src/generic_search.cpp +++ b/src/generic_search.cpp @@ -29,16 +29,36 @@ #include #include +#include namespace miopen { namespace solver { MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_TUNING_ITERATIONS_MAX) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_TUNING_TIME_MS_MAX) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_COMPILE_PARALLEL_LEVEL) std::size_t GetTuningIterationsMax() { return Value(MIOPEN_DEBUG_TUNING_ITERATIONS_MAX{}, std::numeric_limits::max()); } +std::chrono::milliseconds GetTuningTimeMax() +{ + const auto fallback = std::chrono::duration_cast(std::chrono::hours{2}); + static const auto res = std::chrono::milliseconds{Value(MIOPEN_TUNING_TIME_MS_MAX{}, fallback.count() )}; + return res; +} + +std::size_t GetTuningThreadsMax() +{ +#if MIOPEN_USE_COMGR + const auto def_max = 1; // COMGR is not parallelizable +#else + const auto def_max = 20; +#endif + return Value(MIOPEN_COMPILE_PARALLEL_LEVEL{}, def_max); +} + } // namespace solver } // namespace miopen diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index 4b52d7053b..5219a4bf91 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -299,6 +300,49 @@ auto GenericSearch(const Solver s, } std::size_t GetTuningIterationsMax(); +std::chrono::milliseconds GetTuningTimeMax(); // returns the max allowed time in milliseconds +std::size_t GetTuningThreadsMax(); + +template +void CompileAgent( + size_t thread_index, + size_t total_threads, + const Solver& s, + const Context& context_, + std::vector& data, + ThreadSafeQueue>& comp_queue) +{ + const auto start_time = std::chrono::time_point_cast(std::chrono::system_clock::now()); + const auto data_size = data.size(); + const auto time_budget = GetTuningTimeMax(); + auto context = context_; // Not sure if context is thread safe + context.is_for_generic_search = true; + const auto& profile_h = context.GetStream(); + // start the counter + for(auto idx = thread_index; idx < data_size; idx += total_threads) + { + // Check if we are out of time + const auto current_time = std::chrono::time_point_cast(std::chrono::system_clock::now()); + if(current_time - start_time > time_budget) + { + MIOPEN_LOG_I2("Thread: " << thread_index << " Done"); + auto tmp = std::make_tuple({}, {}, true); + comp_queue.push(std::move(tmp)); + break; + } + auto& current_config = data.at(idx); + ConvSolution current_solution = s.GetSolution(context, current_config); + for(const auto& kernel : current_solution.construction_params) + { + if(profile_h.HasProgram(kernel.kernel_file, kernel.comp_options)) + continue; + std::ignore = profile_h.LoadProgram(kernel.kernel_file, kernel.comp_options, false, ""); + } + auto tup = std::make_tuple(std::move(current_config), std::move(current_solution), false); + comp_queue.push(std::move(tup)); + } + return; +} template auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParams& invoke_ctx_) @@ -325,14 +369,21 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam AutoEnableProfiling enableProfiling{profile_h}; auto tmp_all_configs = GetAllConfigs(s, context); - const std::size_t n_runs_total = - std::min(static_cast(std::distance(tmp_all_configs.begin(), tmp_all_configs.end())), - GetTuningIterationsMax()); - std::vector> all_configs; - for(const auto& config : tmp_all_configs) + // For random access + std::vector all_configs; + for(auto& kinder : tmp_all_configs) { - all_configs.push_back(std::make_pair(config, false)); + all_configs.push_back(kinder); } + // shuffle the configs + auto rd = std::random_device{}; + auto rng = std::default_random_engine{rd()}; + std::shuffle(all_configs.begin(), all_configs.end(), rng); + const std::size_t n_runs_total = + std::min(static_cast(std::distance(all_configs.begin(), all_configs.end())), + GetTuningIterationsMax()); + + all_configs = std::vector(all_configs.begin(), all_configs.begin() + n_runs_total); bool is_passed = false; // left false only if all iterations failed. float best_time = std::numeric_limits::max(); @@ -341,66 +392,55 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam HeartBeat heartbeat; heartbeat.Start(); - if(!miopen::IsCacheDisabled()) // Otherwise precompilation is useless. + const auto total_threads = GetTuningThreadsMax(); + ThreadSafeQueue> solution_queue; + std::vector compile_agents; + for(auto idx = 0; idx < total_threads; ++idx) { - std::vector kernels; - size_t n_current = 0; - for(const auto& kinder : all_configs) - { - if(n_current >= n_runs_total) - break; - - const auto& current_config = kinder.first; - ConvSolution current_solution = s.GetSolution(context, current_config); - for(auto&& kernel : current_solution.construction_params) - { - if(profile_h.HasProgram(kernel.kernel_file, kernel.comp_options)) - continue; - kernels.push_back(kernel); - } - ++n_current; - } - std::ignore = PrecompileKernels(profile_h, kernels); + compile_agents.emplace_back(CompileAgent, + idx, + total_threads, + std::cref(s), + std::cref(context), + std::ref(all_configs), + std::ref(solution_queue)); } - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> distrib(0, n_runs_total-1); - if(!IsEnabled(MIOPEN_DEBUG_COMPILE_ONLY{})) { size_t n_current = 0; - // for(const auto& kinder: all_configs) + auto threads_remaining = total_threads; while(true) { if(n_current >= n_runs_total) break; - const auto idx = distrib(gen); - MIOPEN_LOG_I2("Got random index: " << idx); - auto& kinder = all_configs[idx]; - - if(kinder.second) + MIOPEN_LOG_I2("Waiting for item in queue"); + const auto kinder = solution_queue.front(); + auto current_config = std::get<0>(kinder); + auto current_solution = std::get<1>(kinder); + + if(std::get<2>(kinder)) { - MIOPEN_LOG_I2("Skipping tested entry"); - continue; // This point has already been tested + threads_remaining--; + if(threads_remaining == 0) + break; + else + { + continue; + } } - else - kinder.second = true; - - const auto& current_config = kinder.first; + float elapsed_time = 0.0f; int ret = 0; MIOPEN_LOG_I2('#' << n_current << '/' << n_failed << '/' << n_runs_total << ' ' << current_config); - ConvSolution current_solution; Invoker invoker; try { - current_solution = s.GetSolution(context, current_config); if(default_solution.workspace_sz != current_solution.workspace_sz) { ret = -2; @@ -491,6 +531,7 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam n_runs_total, current_config); ++n_current; + solution_queue.pop(); } } else @@ -499,6 +540,9 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam "Running kernels on GPU is disabled. Search skipped"); } + for(auto& agent: compile_agents) + agent.join(); + MIOPEN_LOG_W("Done: " << n_runs_total << '/' << n_failed << '/' << n_runs_total << ", best #" << n_best << ' ' << best_time << ' ' << best_config); diff --git a/test/gtest/mt_queue.cpp b/test/gtest/mt_queue.cpp index 83d60d8ca9..5d51b505b0 100644 --- a/test/gtest/mt_queue.cpp +++ b/test/gtest/mt_queue.cpp @@ -30,16 +30,18 @@ #include -static ThreadSafeQueue> comp_queue; static std::atomic num_prod{}; -static std::atomic num_cons{}; -static const auto total_producers = 3; //std::thread::hardware_concurrency(); +static const auto total_producers = std::thread::hardware_concurrency(); const auto data_len = 100; -using data_t = std::vector>; -void producer(int thread_idx, data_t& common_data) +template +using data_t = std::vector; + +template +void producer(int thread_idx, data_t& common_data, ThreadSafeQueue& comp_queue) { + return; for(auto idx = thread_idx; idx < data_len; idx += total_producers) { comp_queue.push(std::move(common_data.at(idx))); @@ -48,35 +50,34 @@ void producer(int thread_idx, data_t& common_data) } } -void consumer() -{ - for(auto idx = 0; idx < data_len; idx++) - { - std::cerr << *(comp_queue.front()) << std::endl; - num_cons++; - comp_queue.pop(); - } -} + TEST(UtilMultiThreadQueue, Basic) { - data_t common_data; + ThreadSafeQueue comp_queue; + int num_cons = 0; + data_t common_data; for(auto idx = 0; idx < data_len; ++idx) - common_data.emplace_back(std::make_shared(idx)); + common_data.emplace_back(idx); - std::thread cons(consumer); std::vector producers; for(int idx = 0;idx < total_producers; idx++) { - producers.emplace_back(producer, idx, std::ref(common_data)); + producers.emplace_back(producer, idx, std::ref(common_data), std::ref(comp_queue)); + } + + for(auto idx = 0; idx < data_len; ++idx) + { + std::cerr << comp_queue.front() << std::endl; + num_cons++; + comp_queue.pop(); } for(auto& prod: producers) prod.join(); - cons.join(); std::cout << "Stage 2" << std::endl; for(const auto& tmp: common_data) - std::cout << *tmp << std::endl; + std::cout << tmp << std::endl; EXPECT_EQ(num_prod, num_cons); } From 203bc025fe1e5f13974849503fd0805a28071729 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Thu, 16 Feb 2023 22:06:50 +0000 Subject: [PATCH 03/10] clang-format --- fin | 1 - src/generic_search.cpp | 8 ++-- src/include/miopen/generic_search.hpp | 67 ++++++++++++++------------- src/include/miopen/mt_queue.hpp | 14 ++++-- src/include/miopen/sqlite_db.hpp | 4 +- test/gtest/mt_queue.cpp | 54 +++++++++++---------- 6 files changed, 76 insertions(+), 72 deletions(-) delete mode 160000 fin diff --git a/fin b/fin deleted file mode 160000 index 43e6bd7b78..0000000000 --- a/fin +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 43e6bd7b785e6b2ba9fd280c7ca8d1b4ba49c9a3 diff --git a/src/generic_search.cpp b/src/generic_search.cpp index 1b51ad6202..8234a6bab4 100644 --- a/src/generic_search.cpp +++ b/src/generic_search.cpp @@ -45,8 +45,10 @@ std::size_t GetTuningIterationsMax() std::chrono::milliseconds GetTuningTimeMax() { - const auto fallback = std::chrono::duration_cast(std::chrono::hours{2}); - static const auto res = std::chrono::milliseconds{Value(MIOPEN_TUNING_TIME_MS_MAX{}, fallback.count() )}; + const auto fallback = + std::chrono::duration_cast(std::chrono::hours{2}); + static const auto res = + std::chrono::milliseconds{Value(MIOPEN_TUNING_TIME_MS_MAX{}, fallback.count())}; return res; } @@ -54,7 +56,7 @@ std::size_t GetTuningThreadsMax() { #if MIOPEN_USE_COMGR const auto def_max = 1; // COMGR is not parallelizable -#else +#else const auto def_max = 20; #endif return Value(MIOPEN_COMPILE_PARALLEL_LEVEL{}, def_max); diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index 5219a4bf91..b23898cd8c 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -304,33 +304,34 @@ std::chrono::milliseconds GetTuningTimeMax(); // returns the max allowed time in std::size_t GetTuningThreadsMax(); template -void CompileAgent( - size_t thread_index, - size_t total_threads, - const Solver& s, - const Context& context_, - std::vector& data, - ThreadSafeQueue>& comp_queue) +void CompileAgent(size_t thread_index, + size_t total_threads, + const Solver& s, + const Context& context_, + std::vector& data, + ThreadSafeQueue>& comp_queue) { - const auto start_time = std::chrono::time_point_cast(std::chrono::system_clock::now()); - const auto data_size = data.size(); - const auto time_budget = GetTuningTimeMax(); - auto context = context_; // Not sure if context is thread safe + const auto start_time = + std::chrono::time_point_cast(std::chrono::system_clock::now()); + const auto data_size = data.size(); + const auto time_budget = GetTuningTimeMax(); + auto context = context_; // Not sure if context is thread safe context.is_for_generic_search = true; - const auto& profile_h = context.GetStream(); + const auto& profile_h = context.GetStream(); // start the counter for(auto idx = thread_index; idx < data_size; idx += total_threads) { // Check if we are out of time - const auto current_time = std::chrono::time_point_cast(std::chrono::system_clock::now()); + const auto current_time = std::chrono::time_point_cast( + std::chrono::system_clock::now()); if(current_time - start_time > time_budget) { MIOPEN_LOG_I2("Thread: " << thread_index << " Done"); auto tmp = std::make_tuple({}, {}, true); comp_queue.push(std::move(tmp)); break; - } - auto& current_config = data.at(idx); + } + auto& current_config = data.at(idx); ConvSolution current_solution = s.GetSolution(context, current_config); for(const auto& kernel : current_solution.construction_params) { @@ -338,7 +339,8 @@ void CompileAgent( continue; std::ignore = profile_h.LoadProgram(kernel.kernel_file, kernel.comp_options, false, ""); } - auto tup = std::make_tuple(std::move(current_config), std::move(current_solution), false); + auto tup = std::make_tuple( + std::move(current_config), std::move(current_solution), false); comp_queue.push(std::move(tup)); } return; @@ -376,14 +378,15 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam all_configs.push_back(kinder); } // shuffle the configs - auto rd = std::random_device{}; + auto rd = std::random_device{}; auto rng = std::default_random_engine{rd()}; std::shuffle(all_configs.begin(), all_configs.end(), rng); const std::size_t n_runs_total = std::min(static_cast(std::distance(all_configs.begin(), all_configs.end())), GetTuningIterationsMax()); - all_configs = std::vector(all_configs.begin(), all_configs.begin() + n_runs_total); + all_configs = + std::vector(all_configs.begin(), all_configs.begin() + n_runs_total); bool is_passed = false; // left false only if all iterations failed. float best_time = std::numeric_limits::max(); @@ -392,34 +395,33 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam HeartBeat heartbeat; heartbeat.Start(); - const auto total_threads = GetTuningThreadsMax(); + const auto total_threads = GetTuningThreadsMax(); ThreadSafeQueue> solution_queue; std::vector compile_agents; for(auto idx = 0; idx < total_threads; ++idx) { compile_agents.emplace_back(CompileAgent, - idx, - total_threads, - std::cref(s), - std::cref(context), - std::ref(all_configs), - std::ref(solution_queue)); + idx, + total_threads, + std::cref(s), + std::cref(context), + std::ref(all_configs), + std::ref(solution_queue)); } - if(!IsEnabled(MIOPEN_DEBUG_COMPILE_ONLY{})) { - size_t n_current = 0; + size_t n_current = 0; auto threads_remaining = total_threads; while(true) { if(n_current >= n_runs_total) break; MIOPEN_LOG_I2("Waiting for item in queue"); - const auto kinder = solution_queue.front(); - auto current_config = std::get<0>(kinder); - auto current_solution = std::get<1>(kinder); - + const auto kinder = solution_queue.front(); + auto current_config = std::get<0>(kinder); + auto current_solution = std::get<1>(kinder); + if(std::get<2>(kinder)) { threads_remaining--; @@ -430,7 +432,6 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam continue; } } - float elapsed_time = 0.0f; int ret = 0; @@ -540,7 +541,7 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam "Running kernels on GPU is disabled. Search skipped"); } - for(auto& agent: compile_agents) + for(auto& agent : compile_agents) agent.join(); MIOPEN_LOG_W("Done: " << n_runs_total << '/' << n_failed << '/' << n_runs_total << ", best #" diff --git a/src/include/miopen/mt_queue.hpp b/src/include/miopen/mt_queue.hpp index 584f759dab..58c92e8358 100644 --- a/src/include/miopen/mt_queue.hpp +++ b/src/include/miopen/mt_queue.hpp @@ -31,13 +31,15 @@ #include template -class ThreadSafeQueue { +class ThreadSafeQueue +{ std::mutex mutex; std::condition_variable cond_var; std::queue queue; public: - void push(T&& item) { + void push(T&& item) + { { std::lock_guard lock(mutex); queue.push(item); @@ -46,13 +48,15 @@ class ThreadSafeQueue { cond_var.notify_one(); } - T& front() { + T& front() + { std::unique_lock lock(mutex); - cond_var.wait(lock, [&]{ return !queue.empty(); }); + cond_var.wait(lock, [&] { return !queue.empty(); }); return queue.front(); } - void pop() { + void pop() + { std::lock_guard lock(mutex); queue.pop(); } diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index e673fb2615..76e9ef3eee 100644 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -439,13 +439,13 @@ class SQLitePerfDb : public SQLiteBase { if(dbInvalid) return boost::none; - + const auto pdb_ovr = miopen::GetStringEnv(MIOPEN_DEBUG_PERFDB_OVERRIDE{}); if(pdb_ovr != nullptr) { MIOPEN_LOG_I2("overriding tuning params with: " << pdb_ovr); DbRecord ovr_rec; - const auto solv_vals = SplitDelim(pdb_ovr, ':'); + const auto solv_vals = SplitDelim(pdb_ovr, ':'); for(const auto& solv_val : solv_vals) { const auto vals = SplitDelim(solv_val, ';'); diff --git a/test/gtest/mt_queue.cpp b/test/gtest/mt_queue.cpp index 5d51b505b0..593d6cd213 100644 --- a/test/gtest/mt_queue.cpp +++ b/test/gtest/mt_queue.cpp @@ -33,15 +33,15 @@ static std::atomic num_prod{}; static const auto total_producers = std::thread::hardware_concurrency(); -const auto data_len = 100; +const auto data_len = 100; -template +template using data_t = std::vector; -template +template void producer(int thread_idx, data_t& common_data, ThreadSafeQueue& comp_queue) { - return; + return; for(auto idx = thread_idx; idx < data_len; idx += total_producers) { comp_queue.push(std::move(common_data.at(idx))); @@ -50,34 +50,32 @@ void producer(int thread_idx, data_t& common_data, ThreadSafeQueue& comp_q } } - - TEST(UtilMultiThreadQueue, Basic) { - ThreadSafeQueue comp_queue; - int num_cons = 0; - data_t common_data; - for(auto idx = 0; idx < data_len; ++idx) - common_data.emplace_back(idx); + ThreadSafeQueue comp_queue; + int num_cons = 0; + data_t common_data; + for(auto idx = 0; idx < data_len; ++idx) + common_data.emplace_back(idx); - std::vector producers; - for(int idx = 0;idx < total_producers; idx++) - { - producers.emplace_back(producer, idx, std::ref(common_data), std::ref(comp_queue)); - } + std::vector producers; + for(int idx = 0; idx < total_producers; idx++) + { + producers.emplace_back(producer, idx, std::ref(common_data), std::ref(comp_queue)); + } - for(auto idx = 0; idx < data_len; ++idx) - { - std::cerr << comp_queue.front() << std::endl; - num_cons++; - comp_queue.pop(); - } + for(auto idx = 0; idx < data_len; ++idx) + { + std::cerr << comp_queue.front() << std::endl; + num_cons++; + comp_queue.pop(); + } - for(auto& prod: producers) - prod.join(); + for(auto& prod : producers) + prod.join(); - std::cout << "Stage 2" << std::endl; - for(const auto& tmp: common_data) - std::cout << tmp << std::endl; - EXPECT_EQ(num_prod, num_cons); + std::cout << "Stage 2" << std::endl; + for(const auto& tmp : common_data) + std::cout << tmp << std::endl; + EXPECT_EQ(num_prod, num_cons); } From a562ac21a086186aec95d46a4a097f8714910cfb Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Wed, 22 Feb 2023 18:50:57 +0000 Subject: [PATCH 04/10] prealloc vector, remove redundant return --- src/include/miopen/generic_search.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index b23898cd8c..6e5de49ceb 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -343,7 +343,6 @@ void CompileAgent(size_t thread_index, std::move(current_config), std::move(current_solution), false); comp_queue.push(std::move(tup)); } - return; } template @@ -398,6 +397,7 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam const auto total_threads = GetTuningThreadsMax(); ThreadSafeQueue> solution_queue; std::vector compile_agents; + compile_agents.reserve(total_threads); for(auto idx = 0; idx < total_threads; ++idx) { compile_agents.emplace_back(CompileAgent, From 3abd75aa441d1354192537a1095b02b1f5f2df37 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Wed, 22 Feb 2023 20:47:30 +0000 Subject: [PATCH 05/10] remove return from test, clean std vector logic --- src/include/miopen/generic_search.hpp | 9 ++------- test/gtest/mt_queue.cpp | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index 6e5de49ceb..666b234101 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -372,10 +372,7 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam auto tmp_all_configs = GetAllConfigs(s, context); // For random access std::vector all_configs; - for(auto& kinder : tmp_all_configs) - { - all_configs.push_back(kinder); - } + std::copy(tmp_all_configs.begin(), tmp_all_configs.end(), std::back_inserter(all_configs)); // shuffle the configs auto rd = std::random_device{}; auto rng = std::default_random_engine{rd()}; @@ -383,9 +380,7 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam const std::size_t n_runs_total = std::min(static_cast(std::distance(all_configs.begin(), all_configs.end())), GetTuningIterationsMax()); - - all_configs = - std::vector(all_configs.begin(), all_configs.begin() + n_runs_total); + all_configs.resize(n_runs_total); bool is_passed = false; // left false only if all iterations failed. float best_time = std::numeric_limits::max(); diff --git a/test/gtest/mt_queue.cpp b/test/gtest/mt_queue.cpp index 593d6cd213..ca970db974 100644 --- a/test/gtest/mt_queue.cpp +++ b/test/gtest/mt_queue.cpp @@ -41,7 +41,6 @@ using data_t = std::vector; template void producer(int thread_idx, data_t& common_data, ThreadSafeQueue& comp_queue) { - return; for(auto idx = thread_idx; idx < data_len; idx += total_producers) { comp_queue.push(std::move(common_data.at(idx))); From 87d0b51b743acded45a48d9a15223a94f25ae4f1 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Wed, 22 Feb 2023 22:21:49 +0000 Subject: [PATCH 06/10] random_device deleted ctor --- src/include/miopen/generic_search.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index 666b234101..aa796a0d5c 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -374,7 +374,7 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam std::vector all_configs; std::copy(tmp_all_configs.begin(), tmp_all_configs.end(), std::back_inserter(all_configs)); // shuffle the configs - auto rd = std::random_device{}; + std::random_device rd{}; auto rng = std::default_random_engine{rd()}; std::shuffle(all_configs.begin(), all_configs.end(), rng); const std::size_t n_runs_total = From 51c3564e3e1e1a06b9740f2baa26d50a6537095c Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 24 Feb 2023 18:40:33 +0000 Subject: [PATCH 07/10] address reviews --- src/generic_search.cpp | 4 ++-- src/include/miopen/generic_search.hpp | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/generic_search.cpp b/src/generic_search.cpp index 8234a6bab4..d66a6c0820 100644 --- a/src/generic_search.cpp +++ b/src/generic_search.cpp @@ -45,7 +45,7 @@ std::size_t GetTuningIterationsMax() std::chrono::milliseconds GetTuningTimeMax() { - const auto fallback = + static const auto fallback = std::chrono::duration_cast(std::chrono::hours{2}); static const auto res = std::chrono::milliseconds{Value(MIOPEN_TUNING_TIME_MS_MAX{}, fallback.count())}; @@ -57,7 +57,7 @@ std::size_t GetTuningThreadsMax() #if MIOPEN_USE_COMGR const auto def_max = 1; // COMGR is not parallelizable #else - const auto def_max = 20; + const int def_max = std::thread::hardware_concurrency() / 2; #endif return Value(MIOPEN_COMPILE_PARALLEL_LEVEL{}, def_max); } diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index aa796a0d5c..1542aa4e32 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -312,7 +312,7 @@ void CompileAgent(size_t thread_index, ThreadSafeQueue>& comp_queue) { const auto start_time = - std::chrono::time_point_cast(std::chrono::system_clock::now()); + std::chrono::time_point_cast(std::chrono::steady_clock::now()); const auto data_size = data.size(); const auto time_budget = GetTuningTimeMax(); auto context = context_; // Not sure if context is thread safe @@ -323,7 +323,7 @@ void CompileAgent(size_t thread_index, { // Check if we are out of time const auto current_time = std::chrono::time_point_cast( - std::chrono::system_clock::now()); + std::chrono::steady_clock::now()); if(current_time - start_time > time_budget) { MIOPEN_LOG_I2("Thread: " << thread_index << " Done"); @@ -377,9 +377,7 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam std::random_device rd{}; auto rng = std::default_random_engine{rd()}; std::shuffle(all_configs.begin(), all_configs.end(), rng); - const std::size_t n_runs_total = - std::min(static_cast(std::distance(all_configs.begin(), all_configs.end())), - GetTuningIterationsMax()); + const std::size_t n_runs_total = std::min(all_configs.size(), GetTuningIterationsMax()); all_configs.resize(n_runs_total); bool is_passed = false; // left false only if all iterations failed. From 371120c43d16cd4df8e677669e6b1bbe98f5cb07 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 28 Feb 2023 19:31:48 +0000 Subject: [PATCH 08/10] add env var validation --- src/include/miopen/sqlite_db.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index 76e9ef3eee..626f5098d8 100644 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -446,14 +446,22 @@ class SQLitePerfDb : public SQLiteBase MIOPEN_LOG_I2("overriding tuning params with: " << pdb_ovr); DbRecord ovr_rec; const auto solv_vals = SplitDelim(pdb_ovr, ':'); + bool success = true; for(const auto& solv_val : solv_vals) { const auto vals = SplitDelim(solv_val, ';'); - assert(vals.size() == 2); + if(vals.size() != 2) + { + MIOPEN_LOG_W("Invalid value for MIOPEN_DEBUG_PERFDB_OVERRIDE. Format: " + ";:;params"); + success = false; + break; + } MIOPEN_LOG_I2("Inserting Overriding PDB entry: " << vals[0] << ";" << vals[1]); ovr_rec.SetValues(vals.at(0), vals.at(1)); } - return {ovr_rec}; + if(success) + return {ovr_rec}; } std::string clause; std::vector values; From 12aff85f162aa1cf7929ab877626866620b448e9 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Mon, 6 Mar 2023 19:44:54 +0000 Subject: [PATCH 09/10] move env vars to header, add logging, update mt_queue --- src/generic_search.cpp | 6 +-- src/include/miopen/generic_search.hpp | 19 ++++----- .../miopen/generic_search_controls.hpp | 39 +++++++++++++++++++ src/include/miopen/mt_queue.hpp | 9 +++-- src/solver.cpp | 5 +-- test/gtest/mt_queue.cpp | 4 +- 6 files changed, 58 insertions(+), 24 deletions(-) create mode 100644 src/include/miopen/generic_search_controls.hpp diff --git a/src/generic_search.cpp b/src/generic_search.cpp index d66a6c0820..f7647b8f95 100644 --- a/src/generic_search.cpp +++ b/src/generic_search.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include #include +#include #include #include @@ -34,10 +34,6 @@ namespace miopen { namespace solver { -MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_TUNING_ITERATIONS_MAX) -MIOPEN_DECLARE_ENV_VAR(MIOPEN_TUNING_TIME_MS_MAX) -MIOPEN_DECLARE_ENV_VAR(MIOPEN_COMPILE_PARALLEL_LEVEL) - std::size_t GetTuningIterationsMax() { return Value(MIOPEN_DEBUG_TUNING_ITERATIONS_MAX{}, std::numeric_limits::max()); diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index 1542aa4e32..dd24c639df 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -51,8 +52,6 @@ namespace miopen { namespace solver { -MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_COMPILE_ONLY) - /// This STL-like container together with corresponding iterator provide access /// to a set of all available performance configs for the given problem config. /// @@ -307,17 +306,15 @@ template void CompileAgent(size_t thread_index, size_t total_threads, const Solver& s, - const Context& context_, + const Context& context, std::vector& data, ThreadSafeQueue>& comp_queue) { const auto start_time = std::chrono::time_point_cast(std::chrono::steady_clock::now()); - const auto data_size = data.size(); - const auto time_budget = GetTuningTimeMax(); - auto context = context_; // Not sure if context is thread safe - context.is_for_generic_search = true; - const auto& profile_h = context.GetStream(); + const auto data_size = data.size(); + const auto time_budget = GetTuningTimeMax(); + const auto& profile_h = context.GetStream(); // start the counter for(auto idx = thread_index; idx < data_size; idx += total_threads) { @@ -326,7 +323,7 @@ void CompileAgent(size_t thread_index, std::chrono::steady_clock::now()); if(current_time - start_time > time_budget) { - MIOPEN_LOG_I2("Thread: " << thread_index << " Done"); + MIOPEN_LOG_I2("Thread: " << thread_index << " Done, exhausted time budget"); auto tmp = std::make_tuple({}, {}, true); comp_queue.push(std::move(tmp)); break; @@ -343,6 +340,7 @@ void CompileAgent(size_t thread_index, std::move(current_config), std::move(current_solution), false); comp_queue.push(std::move(tup)); } + MIOPEN_LOG_I2("Thread: " << thread_index << " Done, completed tuning"); } template @@ -411,7 +409,7 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam if(n_current >= n_runs_total) break; MIOPEN_LOG_I2("Waiting for item in queue"); - const auto kinder = solution_queue.front(); + const auto kinder = solution_queue.pop(); auto current_config = std::get<0>(kinder); auto current_solution = std::get<1>(kinder); @@ -525,7 +523,6 @@ auto GenericSearch(const Solver s, const Context& context_, const AnyInvokeParam n_runs_total, current_config); ++n_current; - solution_queue.pop(); } } else diff --git a/src/include/miopen/generic_search_controls.hpp b/src/include/miopen/generic_search_controls.hpp new file mode 100644 index 0000000000..37d14d0639 --- /dev/null +++ b/src/include/miopen/generic_search_controls.hpp @@ -0,0 +1,39 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once +#include + +namespace miopen { +namespace solver { + +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_TUNING_ITERATIONS_MAX) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_TUNING_TIME_MS_MAX) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_COMPILE_PARALLEL_LEVEL) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_COMPILE_ONLY) + +} // namespace solver +} // namespace miopen diff --git a/src/include/miopen/mt_queue.hpp b/src/include/miopen/mt_queue.hpp index 58c92e8358..12a066cf0f 100644 --- a/src/include/miopen/mt_queue.hpp +++ b/src/include/miopen/mt_queue.hpp @@ -40,6 +40,7 @@ class ThreadSafeQueue public: void push(T&& item) { + { std::lock_guard lock(mutex); queue.push(item); @@ -47,17 +48,19 @@ class ThreadSafeQueue cond_var.notify_one(); } - +#if 0 T& front() { std::unique_lock lock(mutex); cond_var.wait(lock, [&] { return !queue.empty(); }); return queue.front(); } - - void pop() +#endif + T pop() { std::lock_guard lock(mutex); + T ret = queue.front(); queue.pop(); + return ret; } }; diff --git a/src/solver.cpp b/src/solver.cpp index fe2d7bc4b5..da4dd5eadc 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -45,8 +45,6 @@ namespace miopen { namespace solver { -MIOPEN_DECLARE_ENV_VAR(MIOPEN_COMPILE_PARALLEL_LEVEL) - std::ostream& operator<<(std::ostream& os, const KernelInfo& k) { os << k.kernel_file << ", " << k.kernel_name << " g_wk={ "; @@ -65,7 +63,8 @@ std::vector PrecompileKernels(const Handle& h, const std::vector Date: Wed, 8 Mar 2023 19:19:42 +0000 Subject: [PATCH 10/10] fix lock type in mt queue --- src/include/miopen/mt_queue.hpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/include/miopen/mt_queue.hpp b/src/include/miopen/mt_queue.hpp index 12a066cf0f..0b6ffe801a 100644 --- a/src/include/miopen/mt_queue.hpp +++ b/src/include/miopen/mt_queue.hpp @@ -48,17 +48,10 @@ class ThreadSafeQueue cond_var.notify_one(); } -#if 0 - T& front() + T pop() { std::unique_lock lock(mutex); cond_var.wait(lock, [&] { return !queue.empty(); }); - return queue.front(); - } -#endif - T pop() - { - std::lock_guard lock(mutex); T ret = queue.front(); queue.pop(); return ret;