From a2e76009dafd71f5959ba5ddfb5b1a8b98b84633 Mon Sep 17 00:00:00 2001 From: "Kelly (KT) Thompson" Date: Wed, 8 Sep 2021 17:31:36 -0600 Subject: [PATCH 1/2] Improve robustness of tstOMP by limiting max-num-threads. --- src/c4/test/tstOMP.cc | 55 +++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/c4/test/tstOMP.cc b/src/c4/test/tstOMP.cc index 76c275bd2f..c79d6950cb 100644 --- a/src/c4/test/tstOMP.cc +++ b/src/c4/test/tstOMP.cc @@ -4,8 +4,7 @@ * \author Kelly Thompson * \date Tue Jun 6 15:03:08 2006 * \brief Demonstrate basic OMP threads under MPI. - * \note Copyright (C) 2016-2020 Triad National Security, LLC. - * All rights reserved. */ + * \note Copyright (C) 2011-2021 Triad National Security, LLC., All rights reserved. */ //------------------------------------------------------------------------------------------------// #include "c4/ParallelUnitTest.hh" @@ -79,17 +78,14 @@ bool topology_report() { void topo_report(rtt_dsxx::UnitTest &ut, bool &one_mpi_rank_per_node) { // Determine if MPI ranks are on unique machine nodes: // - // If there are multiple MPI ranks per machine node, then don't use OMP - // because OMP can't restrict its threads to running only on an MPI rank's - // cores. The OMP threads will be distributed over the whole machine node. - // For example, we might choose to use 4 MPI ranks on a machine node with 16 - // cores. Ideally, we could allow each MPI rank to use 4 OMP threads for a - // maximum of 4x4=16 OMP threads on the 16 core node. However, because OMP - // doesn't know about the MPI ranks sharing the 16 cores, the even - // distribution of OMP threads is not guaranteed. + // If there are multiple MPI ranks per machine node, then don't use OMP because OMP can't restrict + // its threads to running only on an MPI rank's cores. The OMP threads will be distributed over + // the whole machine node. For example, we might choose to use 4 MPI ranks on a machine node with + // 16 cores. Ideally, we could allow each MPI rank to use 4 OMP threads for a maximum of 4x4=16 + // OMP threads on the 16 core node. However, because OMP doesn't know about the MPI ranks sharing + // the 16 cores, the even distribution of OMP threads is not guaranteed. // - // So - if we have more than one MPI rank per machine node, then turn off OMP - // threads. + // So - if we have more than one MPI rank per machine node, then turn off OMP threads. one_mpi_rank_per_node = topology_report(); std::string procname = rtt_c4::get_processor_name(); @@ -101,9 +97,13 @@ void topo_report(rtt_dsxx::UnitTest &ut, bool &one_mpi_rank_per_node) { int num_dynamic_threads = omp_get_dynamic(); int tid(-1); - int nthreads(-1), maxthreads(-1); + int nthreads(-1); + int maxthreads(-1); maxthreads = omp_get_max_threads(); + // This is just a unit test. Limit the parallelism. + if (maxthreads > 16) + omp_set_num_threads(16); #pragma omp parallel private(tid) { @@ -172,6 +172,12 @@ void sample_sum(rtt_dsxx::UnitTest &ut, bool const omrpn) { #ifdef OPENMP_FOUND { + // This is just a unit test. Limit the parallelism. + int maxthreads(-1); + maxthreads = omp_get_max_threads(); + if (maxthreads > 16) + omp_set_num_threads(16); + // More than 1 MPI rank per node --> turn off OMP. if (!omrpn) omp_set_num_threads(1); @@ -231,9 +237,8 @@ void sample_sum(rtt_dsxx::UnitTest &ut, bool const omrpn) { << std::endl; } - // [2015-11-17 KT] The accumulate test no longer provides enough work - // to offset the overhead of OpenMP, especially for the optimized - // build. Turn this test off... + // [2015-11-17 KT] The accumulate test no longer provides enough work to offset the overhead of + // OpenMP, especially for the optimized build. Turn this test off... // if( omrpn && nthreads > 4 ) // { @@ -251,12 +256,9 @@ void sample_sum(rtt_dsxx::UnitTest &ut, bool const omrpn) { } //------------------------------------------------------------------------------------------------// -// This is a simple demonstration problem for OMP. Nothing really to check -// for PASS/FAIL. +// This is a simple demonstration problem for OMP. Nothing really to check for PASS/FAIL. int MandelbrotCalculate(std::complex c, int maxiter) { - // iterates z = z*z + c until |z| >= 2 or maxiter is reached, returns the - // number of iterations - + // iterates z = z*z + c until |z| >= 2 or maxiter is reached, returns the number of iterations std::complex z = c; int n = 0; for (; n < maxiter; ++n) { @@ -277,16 +279,23 @@ void MandelbrotDriver(rtt_dsxx::UnitTest &ut) { const complex center(-0.7, 0.0); const complex span(2.7, -(4 / 3.0) * 2.7 * height / width); const complex begin = center - span / 2.0; - // const complex end = center+span/2.0; const int maxiter = 100000; // Use OMP threads Timer t; - ostringstream image1, image2; + ostringstream image1; + ostringstream image2; t.start(); int nthreads(-1); #ifdef OPENMP_FOUND + + // This is just a unit test. Limit the parallelism. + int maxthreads(-1); + maxthreads = omp_get_max_threads(); + if (maxthreads > 16) + omp_set_num_threads(16); + #pragma omp parallel { if (node() == 0 && omp_get_thread_num() == 0) { From 93a82f0acc3f3e14ae5f5862dde9b994725cfb02 Mon Sep 17 00:00:00 2001 From: "Kelly (KT) Thompson" Date: Wed, 8 Sep 2021 17:41:45 -0600 Subject: [PATCH 2/2] Also reduce parallelism in tstatomics. --- src/ds++/test/tstatomics.cc | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/ds++/test/tstatomics.cc b/src/ds++/test/tstatomics.cc index 047e071778..0a8ab392fc 100644 --- a/src/ds++/test/tstatomics.cc +++ b/src/ds++/test/tstatomics.cc @@ -3,8 +3,7 @@ * \file ds++/test/tstatomics.cc * \author Tim Kelley * \date Thursday, Sept. 6, 2018, 10:51 am - * \note Copyright (C) 2018-2020 Triad National Security, LLC. - * All rights reserved. */ + * \note Copyright (C) 2018-2021 Triad National Security, LLC., All rights reserved. */ //------------------------------------------------------------------------------------------------// #include "ds++/Release.hh" @@ -17,10 +16,9 @@ using rtt_dsxx::UnitTest; //------------------------------------------------------------------------------------------------// -/* Hammer an atomic from each thread. Each iteration, the thread adds - * (tid * iteration) to the counter. The atomic ensures that everyone sees - * a consistent view of the counter: no thread overwrites the contribution - * from any other thread. +/* Hammer an atomic from each thread. Each iteration, the thread adds (tid * iteration) to the + * counter. The atomic ensures that everyone sees a consistent view of the counter: no thread + * overwrites the contribution from any other thread. */ void thread_action(std::atomic &d, size_t N, size_t tid) { auto const did = static_cast(tid); @@ -87,14 +85,12 @@ void test_fetch_add_atomic_1e6(UnitTest &ut) { } // test_fetch_add_atomic // --------------------- non-atomic version -------------------------- -// This should give the wrong answer nearly every time on any respectable -// thread implementation. +// This should give the wrong answer nearly every time on any respectable thread implementation. //------------------------------------------------------------------------------------------------// -/* Similarly, hammer a POD from each thread. Each iteration, the thread adds - * (tid * iteration) to the counter. Since the threads are contending, we expect - * to have a race condition where two threads read the same value from d and - * one of the thread's write (+=) overwrites the other's. +/* Similarly, hammer a POD from each thread. Each iteration, the thread adds (tid * iteration) to + * the counter. Since the threads are contending, we expect to have a race condition where two + * threads read the same value from d and one of the thread's write (+=) overwrites the other's. */ void thread_action_pod(double &d, size_t N, size_t tid) { auto const did = static_cast(tid); @@ -155,8 +151,8 @@ void test_fetch_add_not_atomic(UnitTest & /*ut*/) { // fetch_sub tests -/* Same as thread_action above, except uses fetch_sub. Total sum is just the - * negative of the preceding test. +/* Same as thread_action above, except uses fetch_sub. Total sum is just the negative of the + * preceding test. */ void thread_action_sub(std::atomic &d, size_t N, size_t tid) { auto const did = static_cast(tid); @@ -210,14 +206,14 @@ void fetch_sub_atomic_core(UnitTest &ut, size_t const n_threads, size_t const n_ } // fetch_add_atomic_core void test_fetch_sub_atomic(UnitTest &ut) { - size_t const n_threads(19); + size_t const n_threads(8); size_t const n_iterations(10001); fetch_sub_atomic_core(ut, n_threads, n_iterations); return; } // test_fetch_add_atomic void test_fetch_sub_atomic_1e6(UnitTest &ut) { - size_t const n_threads(19); + size_t const n_threads(8); size_t const n_iterations(1000001); fetch_sub_atomic_core(ut, n_threads, n_iterations); return;