From fe73586808c33d62de7071494be024691664c07d Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Sun, 24 Jun 2018 21:33:57 -0400 Subject: [PATCH 1/7] Add ADAPT module Add comments in the ADAPT module Signed-off-by: Xi Luo Signed-off-by: George Bosilca --- ompi/communicator/comm_init.c | 4 +- ompi/communicator/communicator.h | 4 + ompi/mca/coll/adapt/Makefile.am | 49 + ompi/mca/coll/adapt/coll_adapt.h | 98 ++ ompi/mca/coll/adapt/coll_adapt_algorithms.h | 95 ++ ompi/mca/coll/adapt/coll_adapt_bcast.c | 26 + ompi/mca/coll/adapt/coll_adapt_component.c | 154 ++++ ompi/mca/coll/adapt/coll_adapt_context.c | 71 ++ ompi/mca/coll/adapt/coll_adapt_context.h | 132 +++ ompi/mca/coll/adapt/coll_adapt_ibcast.c | 694 +++++++++++++++ ompi/mca/coll/adapt/coll_adapt_inbuf.c | 24 + ompi/mca/coll/adapt/coll_adapt_inbuf.h | 26 + ompi/mca/coll/adapt/coll_adapt_ireduce.c | 935 ++++++++++++++++++++ ompi/mca/coll/adapt/coll_adapt_item.c | 23 + ompi/mca/coll/adapt/coll_adapt_item.h | 25 + ompi/mca/coll/adapt/coll_adapt_module.c | 162 ++++ ompi/mca/coll/adapt/coll_adapt_reduce.c | 29 + ompi/mca/coll/base/coll_base_functions.h | 4 + ompi/request/request.h | 32 +- 19 files changed, 2579 insertions(+), 8 deletions(-) create mode 100644 ompi/mca/coll/adapt/Makefile.am create mode 100644 ompi/mca/coll/adapt/coll_adapt.h create mode 100644 ompi/mca/coll/adapt/coll_adapt_algorithms.h create mode 100644 ompi/mca/coll/adapt/coll_adapt_bcast.c create mode 100644 ompi/mca/coll/adapt/coll_adapt_component.c create mode 100644 ompi/mca/coll/adapt/coll_adapt_context.c create mode 100644 ompi/mca/coll/adapt/coll_adapt_context.h create mode 100644 ompi/mca/coll/adapt/coll_adapt_ibcast.c create mode 100644 ompi/mca/coll/adapt/coll_adapt_inbuf.c create mode 100644 ompi/mca/coll/adapt/coll_adapt_inbuf.h create mode 100644 ompi/mca/coll/adapt/coll_adapt_ireduce.c create mode 100644 ompi/mca/coll/adapt/coll_adapt_item.c create mode 100644 ompi/mca/coll/adapt/coll_adapt_item.h create mode 100644 
ompi/mca/coll/adapt/coll_adapt_module.c create mode 100644 ompi/mca/coll/adapt/coll_adapt_reduce.c diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index bcac9170452..64dc9faf39c 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -382,7 +382,9 @@ static void ompi_comm_construct(ompi_communicator_t* comm) comm->c_pml_comm = NULL; comm->c_topo = NULL; comm->c_coll = NULL; - + comm->c_ibcast_tag = 0; + comm->c_ireduce_tag = 0; + /* A keyhash will be created if/when an attribute is cached on this communicator */ comm->c_keyhash = NULL; diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 87a148dfd72..be7e7acea4e 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -187,6 +187,10 @@ struct ompi_communicator_t { /* Collectives module interface and data */ mca_coll_base_comm_coll_t *c_coll; + + /* Non-blocking collective tag */ + _Atomic int32_t c_ibcast_tag; + _Atomic int32_t c_ireduce_tag; }; typedef struct ompi_communicator_t ompi_communicator_t; diff --git a/ompi/mca/coll/adapt/Makefile.am b/ompi/mca/coll/adapt/Makefile.am new file mode 100644 index 00000000000..157304e3118 --- /dev/null +++ b/ompi/mca/coll/adapt/Makefile.am @@ -0,0 +1,49 @@ +# +# Copyright (c) 2014 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +sources = \ + coll_adapt_component.c \ + coll_adapt_module.c \ + coll_adapt_bcast.c \ + coll_adapt_ibcast.c \ + coll_adapt_reduce.c \ + coll_adapt_ireduce.c \ + coll_adapt.h \ + coll_adapt_algorithms.h \ + coll_adapt_context.h \ + coll_adapt_context.c \ + coll_adapt_inbuf.c \ + coll_adapt_inbuf.h \ + coll_adapt_item.c \ + coll_adapt_item.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +component_noinst = +component_install = +if MCA_BUILD_ompi_coll_adapt_DSO +component_install += mca_coll_adapt.la +else +component_noinst += libmca_coll_adapt.la +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_adapt_la_SOURCES = $(sources) +mca_coll_adapt_la_LDFLAGS = -module -avoid-version +mca_coll_adapt_la_LIBADD = + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_adapt_la_SOURCES =$(sources) +libmca_coll_adapt_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/adapt/coll_adapt.h b/ompi/mca/coll/adapt/coll_adapt.h new file mode 100644 index 00000000000..0eaca96e5e7 --- /dev/null +++ b/ompi/mca/coll/adapt/coll_adapt.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#ifndef MCA_COLL_ADAPT_EXPORT_H +#define MCA_COLL_ADAPT_EXPORT_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "opal/mca/mca.h" +#include "opal/datatype/opal_convertor.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/coll_base_topo.h" + +BEGIN_C_DECLS typedef struct mca_coll_adapt_module_t mca_coll_adapt_module_t; + +/* + * Structure to hold the adapt coll component. First it holds the + * base coll component, and then holds a bunch of + * adapt-coll-component-specific stuff (e.g., current MCA param + * values). 
+ */ +typedef struct mca_coll_adapt_component_t { + /* Base coll component */ + mca_coll_base_component_2_0_0_t super; + + /* MCA parameter: Priority of this component */ + int adapt_priority; + + /* MCA parameter: Output verbose level */ + int adapt_output; + + /* MCA parameter: Maximum number of segment in context free list */ + int adapt_context_free_list_max; + + /* MCA parameter: Minimum number of segment in context free list */ + int adapt_context_free_list_min; + + /* MCA parameter: Increasment number of segment in context free list */ + int adapt_context_free_list_inc; + + /* Bcast MCA parameter */ + int adapt_ibcast_algorithm; + size_t adapt_ibcast_segment_size; + int adapt_ibcast_max_send_requests; + int adapt_ibcast_max_recv_requests; + /* Bcast free list */ + opal_free_list_t *adapt_ibcast_context_free_list; + _Atomic int32_t adapt_ibcast_context_free_list_enabled; + + /* Reduce MCA parameter */ + int adapt_ireduce_algorithm; + size_t adapt_ireduce_segment_size; + int adapt_ireduce_max_send_requests; + int adapt_ireduce_max_recv_requests; + int adapt_inbuf_free_list_min; + int adapt_inbuf_free_list_max; + int adapt_inbuf_free_list_inc; + + /* Reduce free list */ + opal_free_list_t *adapt_ireduce_context_free_list; + _Atomic int32_t adapt_ireduce_context_free_list_enabled; + +} mca_coll_adapt_component_t; + +/* Coll adapt module per communicator*/ +struct mca_coll_adapt_module_t { + /* Base module */ + mca_coll_base_module_t super; + + /* Whether this module has been lazily initialized or not yet */ + bool enabled; + /* Pointer to mca_coll_adapt_component */ + mca_coll_adapt_component_t *adapt_component; +}; +OBJ_CLASS_DECLARATION(mca_coll_adapt_module_t); + +/* Global component instance */ +OMPI_MODULE_DECLSPEC extern mca_coll_adapt_component_t mca_coll_adapt_component; + +/* ADAPT module functions */ +int mca_coll_adapt_init_query(bool enable_progress_threads, bool enable_mpi_threads); + +mca_coll_base_module_t *mca_coll_adapt_comm_query(struct 
ompi_communicator_t *comm, int *priority); + +/* Free ADAPT quest */ +int adapt_request_free(ompi_request_t ** request); + +#endif /* MCA_COLL_ADAPT_EXPORT_H */ diff --git a/ompi/mca/coll/adapt/coll_adapt_algorithms.h b/ompi/mca/coll/adapt/coll_adapt_algorithms.h new file mode 100644 index 00000000000..8b7b7cebd4f --- /dev/null +++ b/ompi/mca/coll/adapt/coll_adapt_algorithms.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/coll_base_topo.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include + +typedef struct mca_coll_adapt_algorithm_index_s { + int algorithm_index; + uintptr_t algorithm_fn_ptr; +} mca_coll_adapt_algorithm_index_t; + +/* Bcast */ +int mca_coll_adapt_ibcast_init(void); +int mca_coll_adapt_ibcast_fini(void); +int mca_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int mca_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module); +int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, ompi_coll_tree_t * tree, + size_t seg_size, int ibcast_tag); +int mca_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t * module, + int ibcast_tag); +int mca_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, + int root, struct ompi_communicator_t *comm, + 
ompi_request_t ** request, + mca_coll_base_module_t * module, int ibcast_tag); +int mca_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ibcast_tag); +int mca_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t * module, + int ibcast_tag); +int mca_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ibcast_tag); +int mca_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ibcast_tag); + + +/* Reduce */ +int mca_coll_adapt_ireduce_init(void); +int mca_coll_adapt_ireduce_fini(void); +int mca_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); +int mca_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t * module); +int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, ompi_coll_tree_t * tree, + size_t seg_size, int ireduce_tag); +int mca_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + 
mca_coll_base_module_t * module, int ireduce_tag); +int mca_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag); +int mca_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag); +int mca_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag); +int mca_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag); +int mca_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag); diff --git a/ompi/mca/coll/adapt/coll_adapt_bcast.c b/ompi/mca/coll/adapt/coll_adapt_bcast.c new file mode 100644 index 00000000000..4348f2dc3b5 --- /dev/null +++ b/ompi/mca/coll/adapt/coll_adapt_bcast.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2014-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "coll_adapt.h"
+#include "coll_adapt_algorithms.h"
+
+/*
+ * Blocking broadcast implemented on top of the non-blocking ADAPT ibcast:
+ * start the ibcast and wait for the request it hands back.
+ *
+ * Returns MPI_SUCCESS immediately for count == 0, otherwise the error code
+ * reported by mca_coll_adapt_ibcast().
+ */
+int mca_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatype, int root,
+                         struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
+{
+    /* Start from NULL so a failed ibcast that never produced a request can
+     * be told apart from one that did */
+    ompi_request_t *request = NULL;
+    int err;
+
+    if (count == 0) {
+        return MPI_SUCCESS;
+    }
+
+    err = mca_coll_adapt_ibcast(buff, count, datatype, root, comm, &request, module);
+    if (MPI_SUCCESS != err && NULL == request) {
+        /* Nothing was started: waiting would dereference an invalid request */
+        return err;
+    }
+    ompi_request_wait(&request, MPI_STATUS_IGNORE);
+    return err;
+}
diff --git a/ompi/mca/coll/adapt/coll_adapt_component.c b/ompi/mca/coll/adapt/coll_adapt_component.c
new file mode 100644
index 00000000000..6079c4d92ea
--- /dev/null
+++ b/ompi/mca/coll/adapt/coll_adapt_component.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2014-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "opal/util/show_help.h"
+#include "ompi/constants.h"
+#include "ompi/mca/coll/coll.h"
+#include "coll_adapt.h"
+#include "coll_adapt_algorithms.h"
+
+/*
+ * Public string showing the coll ompi_adapt component version number
+ */
+const char *mca_coll_adapt_component_version_string =
+    "Open MPI ADAPT collective MCA component version " OMPI_VERSION;
+
+/*
+ * Local functions
+ */
+static int adapt_open(void);
+static int adapt_close(void);
+static int adapt_register(void);
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+
+mca_coll_adapt_component_t mca_coll_adapt_component = {
+
+    /* First, fill in the super */
+
+    {
+     /* First, the mca_component_t struct containing meta
+        information about the component itself */
+
+     {
+      MCA_COLL_BASE_VERSION_2_0_0,
+
+      /* Component name and version */
+      "adapt",
+      OMPI_MAJOR_VERSION,
+      OMPI_MINOR_VERSION,
+      OMPI_RELEASE_VERSION,
+
+      /* Component functions */
+      adapt_open,               /* open */
+      adapt_close,
+      NULL,                     /* query */
+      adapt_register},
+     {
+      /* The component is not checkpoint ready */
+      MCA_BASE_METADATA_PARAM_NONE},
+
+     /* Initialization / querying functions */
+     mca_coll_adapt_init_query,
+     mca_coll_adapt_comm_query,
+     },
+
+    /* adapt-component specific information */
+
+    /* (default) priority */
+    0,
+
+    /* (default) verbose level */
+    0,
+
+    /* default values for non-MCA parameters */
+    /* Not specifying values here gives us all 0's */
+};
+
+/* Open the component */
+static int adapt_open(void)
+{
+    return OMPI_SUCCESS;
+}
+
+
+/* Shut down the component: release the ibcast and ireduce resources */
+static int adapt_close(void)
+{
+    mca_coll_adapt_ibcast_fini();
+    mca_coll_adapt_ireduce_fini();
+
+    return OMPI_SUCCESS;
+}
+
+/* Placeholder for MCA variable validation; currently accepts everything */
+static int adapt_verify_mca_variables(void)
+{
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Register MCA params
+ *
+ * Sets the component defaults, registers each variable with the MCA
+ * variable system, opens the component output stream and then lets the
+ * ibcast / ireduce parts register their own parameters.
+ */
+static int adapt_register(void)
+{
+    mca_base_component_t *c = &mca_coll_adapt_component.super.collm_version;
+    mca_coll_adapt_component_t *cs = &mca_coll_adapt_component;
+    /* Static storage: the MCA variable system keeps the storage pointer
+     * after registration, so it must outlive this function */
+    static int adapt_verbose = 0;
+
+    /* If we want to be selected (i.e., all procs on one node), then
+       we should have a high priority */
+    cs->adapt_priority = 0;
+    (void) mca_base_component_var_register(c, "priority", "Priority of the adapt coll component",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_priority);
+
+    (void) mca_base_component_var_register(c, "verbose",
+                                           "Verbose level",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY, &adapt_verbose);
+    cs->adapt_output = opal_output_open(NULL);
+    opal_output_set_verbosity(cs->adapt_output, adapt_verbose);
+
+    /* The variable name, its description and its storage must all refer to
+     * the same bound (min with min, max with max) */
+    cs->adapt_context_free_list_min = 10;
+    (void) mca_base_component_var_register(c, "context_free_list_min",
+                                           "Minimum number of segments in context free list",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &cs->adapt_context_free_list_min);
+
+    cs->adapt_context_free_list_max = 10000;
+    (void) mca_base_component_var_register(c, "context_free_list_max",
+                                           "Maximum number of segments in context free list",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &cs->adapt_context_free_list_max);
+
+    cs->adapt_context_free_list_inc = 10;
+    (void) mca_base_component_var_register(c, "context_free_list_inc",
+                                           "Increasement number of segments in context free list",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &cs->adapt_context_free_list_inc);
+    mca_coll_adapt_ibcast_init();
+    mca_coll_adapt_ireduce_init();
+
+    return adapt_verify_mca_variables();
+}
diff --git a/ompi/mca/coll/adapt/coll_adapt_context.c b/ompi/mca/coll/adapt/coll_adapt_context.c
new file mode 100644
index 00000000000..978739df9ab
--- /dev/null
+++ b/ompi/mca/coll/adapt/coll_adapt_context.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi/mca/coll/coll.h" +#include "coll_adapt_context.h" +/* Constructor/destructor hooks of the per-segment bcast context; both bodies are empty */ +static void mca_coll_adapt_bcast_context_constructor(mca_coll_adapt_bcast_context_t * bcast_context) +{ +} + +static void mca_coll_adapt_bcast_context_destructor(mca_coll_adapt_bcast_context_t * bcast_context) +{ + /* empty body: nothing is released here */ +} +/* Constructor/destructor hooks of the constant (shared) bcast context; both bodies are empty */ +static void +mca_coll_adapt_constant_bcast_context_constructor(mca_coll_adapt_constant_bcast_context_t * con) +{ +} + +static void mca_coll_adapt_constant_bcast_context_destructor(mca_coll_adapt_constant_bcast_context_t + * con) +{ +} + +/* Class instances for the two bcast context types, wired to the hooks above */ +OBJ_CLASS_INSTANCE(mca_coll_adapt_bcast_context_t, opal_free_list_item_t, + mca_coll_adapt_bcast_context_constructor, + mca_coll_adapt_bcast_context_destructor); + +OBJ_CLASS_INSTANCE(mca_coll_adapt_constant_bcast_context_t, opal_object_t, + mca_coll_adapt_constant_bcast_context_constructor, + mca_coll_adapt_constant_bcast_context_destructor); +/* Constructor/destructor hooks of the per-segment reduce context; both bodies are empty */ +static void mca_coll_adapt_reduce_context_constructor(mca_coll_adapt_reduce_context_t * + reduce_context) +{ +} + +static void mca_coll_adapt_reduce_context_destructor(mca_coll_adapt_reduce_context_t * + reduce_context) +{ + /* empty body: nothing is released here */ +} +/* Constructor/destructor hooks of the constant (shared) reduce context; both bodies are empty */ +static void +mca_coll_adapt_constant_reduce_context_constructor(mca_coll_adapt_constant_reduce_context_t * con) +{ +} + +static void +mca_coll_adapt_constant_reduce_context_destructor(mca_coll_adapt_constant_reduce_context_t * con) +{ +} + +/* Class instances for the two reduce context types, wired to the hooks above */ +OBJ_CLASS_INSTANCE(mca_coll_adapt_reduce_context_t, opal_free_list_item_t, + mca_coll_adapt_reduce_context_constructor, + mca_coll_adapt_reduce_context_destructor); + +OBJ_CLASS_INSTANCE(mca_coll_adapt_constant_reduce_context_t, opal_object_t, + mca_coll_adapt_constant_reduce_context_constructor, + mca_coll_adapt_constant_reduce_context_destructor); diff --git a/ompi/mca/coll/adapt/coll_adapt_context.h b/ompi/mca/coll/adapt/coll_adapt_context.h new file mode 100644 index 00000000000..917e3d48861 --- /dev/null +++ b/ompi/mca/coll/adapt/coll_adapt_context.h @@ 
-0,0 +1,132 @@ +/* + * Copyright (c) 2014-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi/mca/coll/coll.h" +#include "opal/class/opal_free_list.h" +#include "opal/class/opal_list.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/op/op.h" +#include "ompi/mca/coll/base/coll_base_topo.h" +#include "coll_adapt_inbuf.h" + +/* Constant bcast context: state shared by every per-segment bcast context of one ibcast (reached through their con field) */ +struct mca_coll_adapt_constant_bcast_context_s { + opal_object_t super; + int root; + size_t count; + size_t seg_count; + ompi_datatype_t *datatype; + ompi_communicator_t *comm; + int real_seg_size; + int num_segs; + ompi_request_t *request; + opal_mutex_t *mutex; + int *recv_array; /* frag ids, stored in the order the segments were received */ + int *send_array; /* per child: index into recv_array of the next segment to send to it */ + /* Length of the fragment array, which is the number of received segments */ + int num_recv_segs; + /* Number of segments whose receive callback has completed */ + int num_recv_fini; + /* Store the number of sent segments */ + int num_sent_segs; + ompi_coll_tree_t *tree; + int ibcast_tag; +}; + +typedef struct mca_coll_adapt_constant_bcast_context_s mca_coll_adapt_constant_bcast_context_t; + +OBJ_CLASS_DECLARATION(mca_coll_adapt_constant_bcast_context_t); + + +/* Bcast context of each segment */ +typedef struct mca_coll_adapt_bcast_context_s mca_coll_adapt_bcast_context_t; + +typedef int (*mca_coll_adapt_bcast_cuda_callback_fn_t) (mca_coll_adapt_bcast_context_t * context); + +struct mca_coll_adapt_bcast_context_s { + opal_free_list_item_t super; + char *buff; + int frag_id; + int child_id; + int peer; + mca_coll_adapt_constant_bcast_context_t *con; +}; + +OBJ_CLASS_DECLARATION(mca_coll_adapt_bcast_context_t); + +/* Constant reduce context: state shared by the per-segment reduce contexts (reached through their con field) */ +struct mca_coll_adapt_constant_reduce_context_s { + opal_object_t super; + size_t count; + size_t seg_count; + ompi_datatype_t *datatype; + 
ompi_communicator_t *comm; + size_t real_seg_size; + /* Increment of each segment */ + int segment_increment; + int num_segs; + ompi_request_t *request; + int rank; + /* Length of the fragment array, which is the number of received segments */ + int32_t num_recv_segs; + /* Number of sent segments */ + int32_t num_sent_segs; + /* Next segment that needs to be received, for every child */ + _Atomic int32_t *next_recv_segs; + /* Mutex to protect recv_list */ + opal_mutex_t *mutex_recv_list; + /* Mutex to protect num_recv_segs */ + opal_mutex_t *mutex_num_recv_segs; + /* Mutex to protect num_sent */ + opal_mutex_t *mutex_num_sent; + /* Mutex to protect each segment while the reduce op is applied */ + opal_mutex_t **mutex_op_list; + /* Reduce operation */ + ompi_op_t *op; + ompi_coll_tree_t *tree; + /* Accumulate buff */ + char **accumbuf; + opal_free_list_t *inbuf_list; + /* A list to store the segments which have been received but not yet sent */ + opal_list_t *recv_list; + ptrdiff_t lower_bound; + /* How many sends are posted but not finished */ + _Atomic int32_t ongoing_send; + char *sbuf; + char *rbuf; + int root; + /* The distance between the address of inbuf->buff and the address of inbuf */ + int distance; + int ireduce_tag; +}; + +typedef struct mca_coll_adapt_constant_reduce_context_s mca_coll_adapt_constant_reduce_context_t; + +OBJ_CLASS_DECLARATION(mca_coll_adapt_constant_reduce_context_t); + +/* Reduce context of each segment */ +typedef struct mca_coll_adapt_reduce_context_s mca_coll_adapt_reduce_context_t; + +typedef int (*mca_coll_adapt_reduce_cuda_callback_fn_t) (mca_coll_adapt_reduce_context_t * context); + +struct mca_coll_adapt_reduce_context_s { + opal_free_list_item_t super; + char *buff; + int frag_id; + int child_id; + int peer; + mca_coll_adapt_constant_reduce_context_t *con; + /* Store the incoming segment */ + mca_coll_adapt_inbuf_t *inbuf; +}; + +OBJ_CLASS_DECLARATION(mca_coll_adapt_reduce_context_t); diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c 
b/ompi/mca/coll/adapt/coll_adapt_ibcast.c
new file mode 100644
index 00000000000..3582bafcb62
--- /dev/null
+++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) 2014-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include "ompi/mca/pml/pml.h"
+#include "coll_adapt.h"
+#include "coll_adapt_algorithms.h"
+#include "coll_adapt_context.h"
+#include "ompi/mca/coll/base/coll_tags.h"
+#include "ompi/mca/coll/base/coll_base_functions.h"
+#include "opal/util/bit_ops.h"
+#include "opal/sys/atomic.h"
+#include "ompi/mca/pml/ob1/pml_ob1.h"
+
+
+/* Common signature of the tree-specific ibcast entry points listed below */
+typedef int (*mca_coll_adapt_ibcast_fn_t) (void *buff,
+                                           int count,
+                                           struct ompi_datatype_t * datatype,
+                                           int root,
+                                           struct ompi_communicator_t * comm,
+                                           ompi_request_t ** request,
+                                           mca_coll_base_module_t * module, int ibcast_tag);
+
+/* Maps the value of the "bcast_algorithm" MCA parameter to its implementation */
+static mca_coll_adapt_algorithm_index_t mca_coll_adapt_ibcast_algorithm_index[] = {
+    {1, (uintptr_t) mca_coll_adapt_ibcast_binomial},
+    {2, (uintptr_t) mca_coll_adapt_ibcast_in_order_binomial},
+    {3, (uintptr_t) mca_coll_adapt_ibcast_binary},
+    {4, (uintptr_t) mca_coll_adapt_ibcast_pipeline},
+    {5, (uintptr_t) mca_coll_adapt_ibcast_chain},
+    {6, (uintptr_t) mca_coll_adapt_ibcast_linear},
+};
+
+/*
+ * Set up MCA parameters of MPI_Bcast and MPI_IBcast
+ *
+ * Each parameter gets its default first and is then registered, so the
+ * registered storage already holds the default. Also resets the (lazily
+ * created) context free list. Always returns OMPI_SUCCESS.
+ */
+int mca_coll_adapt_ibcast_init(void)
+{
+    mca_base_component_t *c = &mca_coll_adapt_component.super.collm_version;
+
+    mca_coll_adapt_component.adapt_ibcast_algorithm = 1;
+    mca_base_component_var_register(c, "bcast_algorithm",
+                                    "Algorithm of broadcast, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_ibcast_algorithm);
+
+    /* adapt_ibcast_segment_size is a size_t: the registered type has to
+     * match the storage width, so it must not be registered as an INT */
+    mca_coll_adapt_component.adapt_ibcast_segment_size = 0;
+    mca_base_component_var_register(c, "bcast_segment_size",
+                                    "Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
+                                    MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_ibcast_segment_size);
+
+    mca_coll_adapt_component.adapt_ibcast_max_send_requests = 2;
+    mca_base_component_var_register(c, "bcast_max_send_requests",
+                                    "Maximum number of send requests",
+                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_ibcast_max_send_requests);
+
+    mca_coll_adapt_component.adapt_ibcast_max_recv_requests = 3;
+    mca_base_component_var_register(c, "bcast_max_recv_requests",
+                                    "Maximum number of receive requests",
+                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_ibcast_max_recv_requests);
+
+    mca_coll_adapt_component.adapt_ibcast_context_free_list = NULL;
+    mca_coll_adapt_component.adapt_ibcast_context_free_list_enabled = 0;
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Release the free list created in mca_coll_adapt_ibcast_generic
+ *
+ * Safe to call when the list was never created. Always returns OMPI_SUCCESS.
+ */
+int mca_coll_adapt_ibcast_fini(void)
+{
+    if (NULL != mca_coll_adapt_component.adapt_ibcast_context_free_list) {
+        OBJ_RELEASE(mca_coll_adapt_component.adapt_ibcast_context_free_list);
+        mca_coll_adapt_component.adapt_ibcast_context_free_list = NULL;
+        mca_coll_adapt_component.adapt_ibcast_context_free_list_enabled = 0;
+        OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ibcast fini\n"));
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Finish a ibcast request
+ */
+static int ibcast_request_fini(mca_coll_adapt_bcast_context_t * context)
+{
+    ompi_request_t *temp_req = context->con->request;
+    if (context->con->tree->tree_nextsize != 0) {
+        free(context->con->send_array);
+    }
+    if (context->con->num_segs != 0) {
+        free(context->con->recv_array);
+
} + OBJ_RELEASE(context->con->mutex); + OBJ_RELEASE(context->con); + OBJ_RELEASE(context->con); + opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list, + (opal_free_list_item_t *) context); + ompi_request_complete(temp_req, 1); + + return OMPI_SUCCESS; +} + +/* + * Callback function of isend + */ +static int send_cb(ompi_request_t * req) +{ + mca_coll_adapt_bcast_context_t *context = + (mca_coll_adapt_bcast_context_t *) req->req_complete_cb_data; + + int err; + + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: Send(cb): segment %d to %d at buff %p root %d\n", + ompi_comm_rank(context->con->comm), context->frag_id, + context->peer, (void *) context->buff, context->con->root)); + + OPAL_THREAD_LOCK(context->con->mutex); + int sent_id = context->con->send_array[context->child_id]; + /* If the current process has fragments in recv_array can be sent */ + if (sent_id < context->con->num_recv_segs) { + ompi_request_t *send_req; + int new_id = context->con->recv_array[sent_id]; + mca_coll_adapt_bcast_context_t *send_context = + (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. 
+ adapt_ibcast_context_free_list); + send_context->buff = + context->buff + (new_id - context->frag_id) * context->con->real_seg_size; + send_context->frag_id = new_id; + send_context->child_id = context->child_id; + send_context->peer = context->peer; + send_context->con = context->con; + OBJ_RETAIN(context->con); + int send_count = send_context->con->seg_count; + if (new_id == (send_context->con->num_segs - 1)) { + send_count = send_context->con->count - new_id * send_context->con->seg_count; + } + ++(send_context->con->send_array[send_context->child_id]); + char *send_buff = send_context->buff; + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: Send(start in send cb): segment %d to %d at buff %p send_count %d tag %d\n", + ompi_comm_rank(send_context->con->comm), send_context->frag_id, + send_context->peer, (void *) send_context->buff, send_count, + (send_context->con->ibcast_tag << 16) + new_id)); + err = + MCA_PML_CALL(isend + (send_buff, send_count, send_context->con->datatype, send_context->peer, + (send_context->con->ibcast_tag << 16) + new_id, + MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); + if (MPI_SUCCESS != err) { + OPAL_THREAD_UNLOCK(context->con->mutex); + return err; + } + /* Invoke send call back */ + OPAL_THREAD_UNLOCK(context->con->mutex); + ompi_request_set_callback(send_req, send_cb, send_context); + OPAL_THREAD_LOCK(context->con->mutex); + } + + int num_sent = ++(context->con->num_sent_segs); + int num_recv_fini_t = context->con->num_recv_fini; + int rank = ompi_comm_rank(context->con->comm); + opal_mutex_t *mutex_temp = context->con->mutex; + /* Check whether signal the condition */ + if ((rank == context->con->root + && num_sent == context->con->tree->tree_nextsize * context->con->num_segs) + || (context->con->tree->tree_nextsize > 0 && rank != context->con->root + && num_sent == context->con->tree->tree_nextsize * context->con->num_segs + && num_recv_fini_t == context->con->num_segs) || 
(context->con->tree->tree_nextsize == 0
+                                                         && num_recv_fini_t ==
+                                                         context->con->num_segs)) {
+        OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n",
+                             ompi_comm_rank(context->con->comm)));
+        OPAL_THREAD_UNLOCK(mutex_temp);
+        ibcast_request_fini(context);
+    } else {
+        OBJ_RELEASE(context->con);
+        opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
+                              (opal_free_list_item_t *) context);
+        OPAL_THREAD_UNLOCK(mutex_temp);
+    }
+    req->req_free(&req);
+    /* Call back function return 1, which means successful */
+    return 1;
+}
+
+/*
+ * Callback function of irecv.
+ *
+ * req->req_complete_cb_data carries the per-segment bcast context of the
+ * receive that just completed.  Under con->mutex the callback:
+ *   1. records the arrived segment id in con->recv_array,
+ *   2. posts the next irecv when there are still segments left to request,
+ *   3. forwards the arrived segment to every child whose send_array entry
+ *      shows it is waiting for exactly this arrival,
+ *   4. either finishes the whole ibcast request or returns the context to
+ *      the free list.
+ *
+ * Returns 1 on success, or the PML error code if an irecv/isend could not be
+ * posted (con->mutex is released before returning in that case).
+ */
+static int recv_cb(ompi_request_t * req)
+{
+    /* Get necessary info from request */
+    mca_coll_adapt_bcast_context_t *context =
+        (mca_coll_adapt_bcast_context_t *) req->req_complete_cb_data;
+
+    int err, i;
+    OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                         "[%d]: Recv(cb): segment %d from %d at buff %p root %d\n",
+                         ompi_comm_rank(context->con->comm), context->frag_id,
+                         context->peer, (void *) context->buff, context->con->root));
+
+    /* Store the frag_id to seg array */
+    OPAL_THREAD_LOCK(context->con->mutex);
+    int num_recv_segs_t = ++(context->con->num_recv_segs);
+    context->con->recv_array[num_recv_segs_t - 1] = context->frag_id;
+
+    int new_id = num_recv_segs_t + mca_coll_adapt_component.adapt_ibcast_max_recv_requests - 1;
+    /* Receive new segment */
+    if (new_id < context->con->num_segs) {
+        ompi_request_t *recv_req;
+        /* Get new context item from free list */
+        mca_coll_adapt_bcast_context_t *recv_context =
+            (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component.
+                                                                   adapt_ibcast_context_free_list);
+        /* The new segment's buffer is located relative to the one that just completed */
+        recv_context->buff =
+            context->buff + (new_id - context->frag_id) * context->con->real_seg_size;
+        recv_context->frag_id = new_id;
+        recv_context->child_id = context->child_id;
+        recv_context->peer = context->peer;
+        recv_context->con = context->con;
+        OBJ_RETAIN(context->con);
+        int recv_count = recv_context->con->seg_count;
+        /* The last segment may be shorter than the others */
+        if (new_id == (recv_context->con->num_segs - 1)) {
+            recv_count = recv_context->con->count - new_id * recv_context->con->seg_count;
+        }
+        char *recv_buff = recv_context->buff;
+        /* Trace the segment being posted (new_id), matching the tag printed below */
+        OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                             "[%d]: Recv(start in recv cb): segment %d from %d at buff %p recv_count %d tag %d\n",
+                             ompi_comm_rank(context->con->comm), recv_context->frag_id,
+                             recv_context->peer, (void *) recv_buff, recv_count,
+                             (recv_context->con->ibcast_tag << 16) + recv_context->frag_id));
+        err =
+            MCA_PML_CALL(irecv
+                         (recv_buff, recv_count, recv_context->con->datatype, recv_context->peer,
+                          (recv_context->con->ibcast_tag << 16) + recv_context->frag_id,
+                          recv_context->con->comm, &recv_req));
+        /* Same failure handling as the isend path below: drop the lock and report */
+        if (MPI_SUCCESS != err) {
+            OPAL_THREAD_UNLOCK(context->con->mutex);
+            return err;
+        }
+
+        /* Invoke receive call back; the lock is dropped around the registration */
+        OPAL_THREAD_UNLOCK(context->con->mutex);
+        ompi_request_set_callback(recv_req, recv_cb, recv_context);
+        OPAL_THREAD_LOCK(context->con->mutex);
+    }
+
+    /* Send segment to its children */
+    for (i = 0; i < context->con->tree->tree_nextsize; i++) {
+        /* If the current process can send the segment now, which means the only segment need to be sent is the just arrived one */
+        if (num_recv_segs_t - 1 == context->con->send_array[i]) {
+            ompi_request_t *send_req;
+            int send_count = context->con->seg_count;
+            if (context->frag_id == (context->con->num_segs - 1)) {
+                send_count = context->con->count - context->frag_id * context->con->seg_count;
+            }
+
+            mca_coll_adapt_bcast_context_t *send_context =
+                (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component.
+                                                                       adapt_ibcast_context_free_list);
+            send_context->buff = context->buff;
+            send_context->frag_id = context->frag_id;
+            send_context->child_id = i;
+            send_context->peer = context->con->tree->tree_next[i];
+            send_context->con = context->con;
+            OBJ_RETAIN(context->con);
+            ++(send_context->con->send_array[i]);
+            char *send_buff = send_context->buff;
+            OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                                 "[%d]: Send(start in recv cb): segment %d to %d at buff %p send_count %d tag %d\n",
+                                 ompi_comm_rank(send_context->con->comm), send_context->frag_id,
+                                 send_context->peer, (void *) send_context->buff, send_count,
+                                 (send_context->con->ibcast_tag << 16) + send_context->frag_id));
+            err =
+                MCA_PML_CALL(isend
+                             (send_buff, send_count, send_context->con->datatype,
+                              send_context->peer,
+                              (send_context->con->ibcast_tag << 16) + send_context->frag_id,
+                              MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req));
+            if (MPI_SUCCESS != err) {
+                OPAL_THREAD_UNLOCK(context->con->mutex);
+                return err;
+            }
+            /* Invoke send call back */
+            OPAL_THREAD_UNLOCK(context->con->mutex);
+            ompi_request_set_callback(send_req, send_cb, send_context);
+            OPAL_THREAD_LOCK(context->con->mutex);
+        }
+    }
+
+    int num_sent = context->con->num_sent_segs;
+    int num_recv_fini_t = ++(context->con->num_recv_fini);
+    int rank = ompi_comm_rank(context->con->comm);
+    /* Keep the mutex pointer locally: context->con must not be touched after it is released */
+    opal_mutex_t *mutex_temp = context->con->mutex;
+
+    /* If this process is leaf and has received all the segments */
+    if ((rank == context->con->root
+         && num_sent == context->con->tree->tree_nextsize * context->con->num_segs)
+        || (context->con->tree->tree_nextsize > 0 && rank != context->con->root
+            && num_sent == context->con->tree->tree_nextsize * context->con->num_segs
+            && num_recv_fini_t == context->con->num_segs) || (context->con->tree->tree_nextsize == 0
+                                                             && num_recv_fini_t ==
+                                                             context->con->num_segs)) {
+        OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n",
+                             ompi_comm_rank(context->con->comm)));
+        OPAL_THREAD_UNLOCK(mutex_temp);
+        ibcast_request_fini(context);
+    } else {
+        OBJ_RELEASE(context->con);
+        opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
+                              (opal_free_list_item_t *) context);
+        OPAL_THREAD_UNLOCK(mutex_temp);
+    }
+    req->req_free(&req);
+    return 1;
+}
+
+int mca_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatype, int root,
+                          struct ompi_communicator_t *comm, ompi_request_t ** request,
+                          mca_coll_base_module_t * module)
+{
+    if (count == 0) {
+        ompi_request_t *temp_request;
+        temp_request = OBJ_NEW(ompi_request_t);
+        OMPI_REQUEST_INIT(temp_request, false);
+        temp_request->req_type = 0;
+        temp_request->req_free = adapt_request_free;
+        temp_request->req_status.MPI_SOURCE = 0;
+        temp_request->req_status.MPI_TAG = 0;
+        temp_request->req_status.MPI_ERROR = 0;
+        temp_request->req_status._cancelled = 0;
+        temp_request->req_status._ucount = 0;
+        ompi_request_complete(temp_request, 1);
+        *request = temp_request;
+        return MPI_SUCCESS;
+    } else {
+        int rank = ompi_comm_rank(comm);
+        if (rank == root) {
+            OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output,
+                                 "ibcast root %d, algorithm %d, coll_adapt_ibcast_segment_size %zu, coll_adapt_ibcast_max_send_requests %d, coll_adapt_ibcast_max_recv_requests %d\n",
+                                 root, mca_coll_adapt_component.adapt_ibcast_algorithm,
+                                 mca_coll_adapt_component.adapt_ibcast_segment_size,
+                                 mca_coll_adapt_component.adapt_ibcast_max_send_requests,
+                                 mca_coll_adapt_component.adapt_ibcast_max_recv_requests));
+        }
+        int ibcast_tag = opal_atomic_add_fetch_32(&(comm->c_ibcast_tag), 1);
+        ibcast_tag = ibcast_tag % 4096;
+        mca_coll_adapt_ibcast_fn_t bcast_func =
+            (mca_coll_adapt_ibcast_fn_t)
+            mca_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm].
+            algorithm_fn_ptr;
+        return bcast_func(buff, count, datatype, root, comm, request, module, ibcast_tag);
+    }
+}
+
+/*
+ * Algorithm-specific ibcast entry points.  Each one builds the communication
+ * tree for its topology and hands it, together with the configured segment
+ * size, to mca_coll_adapt_ibcast_generic.
+ */
+
+/* Binomial tree rooted at root. */
+int mca_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype,
+                                   int root, struct ompi_communicator_t *comm,
+                                   ompi_request_t ** request, mca_coll_base_module_t * module,
+                                   int ibcast_tag)
+{
+    ompi_coll_tree_t *topo = ompi_coll_base_topo_build_bmtree(comm, root);
+
+    return mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module,
+                                         topo,
+                                         mca_coll_adapt_component.adapt_ibcast_segment_size,
+                                         ibcast_tag);
+}
+
+/* In-order binomial tree rooted at root. */
+int mca_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype,
+                                            int root, struct ompi_communicator_t *comm,
+                                            ompi_request_t ** request,
+                                            mca_coll_base_module_t * module, int ibcast_tag)
+{
+    ompi_coll_tree_t *topo = ompi_coll_base_topo_build_in_order_bmtree(comm, root);
+
+    return mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module,
+                                         topo,
+                                         mca_coll_adapt_component.adapt_ibcast_segment_size,
+                                         ibcast_tag);
+}
+
+
+/* Tree with fanout 2 rooted at root. */
+int mca_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root,
+                                 struct ompi_communicator_t *comm, ompi_request_t ** request,
+                                 mca_coll_base_module_t * module, int ibcast_tag)
+{
+    ompi_coll_tree_t *topo = ompi_coll_base_topo_build_tree(2, comm, root);
+
+    return mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module,
+                                         topo,
+                                         mca_coll_adapt_component.adapt_ibcast_segment_size,
+                                         ibcast_tag);
+}
+
+int mca_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype,
+                                   int root, struct ompi_communicator_t *comm,
+                                   ompi_request_t ** request, mca_coll_base_module_t * module,
+                                   int ibcast_tag)
+{
+    ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(1, comm, root);
+    int err =
+        mca_coll_adapt_ibcast_generic(buff, count,
datatype, root, comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size, + ibcast_tag); + return err; +} + + +int mca_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ibcast_tag) +{ + ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(4, comm, root); + int err = + mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size, + ibcast_tag); + return err; +} + +int mca_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ibcast_tag) +{ + int fanout = ompi_comm_size(comm) - 1; + ompi_coll_tree_t *tree; + if (fanout < 1) { + tree = ompi_coll_base_topo_build_chain(1, comm, root); + } else if (fanout <= MAXTREEFANOUT) { + tree = ompi_coll_base_topo_build_tree(ompi_comm_size(comm) - 1, comm, root); + } else { + tree = ompi_coll_base_topo_build_tree(MAXTREEFANOUT, comm, root); + } + int err = + mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size, + ibcast_tag); + return err; +} + + +int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, ompi_coll_tree_t * tree, + size_t seg_size, int ibcast_tag) +{ + /* Tempory variables for iteration */ + int i, j; + /* Rank of this process */ + int rank; + /* Record return value */ + int err; + /* The min of num_segs and SEND_NUM or RECV_NUM, in case the num_segs is less than SEND_NUM or RECV_NUM */ + int min; + + /* Number of datatype in a segment */ + int seg_count = count; + /* Size of a datatype */ + size_t 
type_size; + /* Real size of a segment */ + size_t real_seg_size; + ptrdiff_t extent, lb; + /* Number of segments */ + int num_segs; + + /* The request passed outside */ + ompi_request_t *temp_request = NULL; + opal_mutex_t *mutex; + /* Store the segments which are received */ + int *recv_array = NULL; + /* Record how many isends have been issued for every child */ + int *send_array = NULL; + + /* Set up free list */ + if (0 == mca_coll_adapt_component.adapt_ibcast_context_free_list_enabled) { + int32_t context_free_list_enabled = + opal_atomic_add_fetch_32(& + (mca_coll_adapt_component. + adapt_ibcast_context_free_list_enabled), 1); + if (1 == context_free_list_enabled) { + mca_coll_adapt_component.adapt_ibcast_context_free_list = OBJ_NEW(opal_free_list_t); + opal_free_list_init(mca_coll_adapt_component.adapt_ibcast_context_free_list, + sizeof(mca_coll_adapt_bcast_context_t), + opal_cache_line_size, + OBJ_CLASS(mca_coll_adapt_bcast_context_t), + 0, opal_cache_line_size, + mca_coll_adapt_component.adapt_context_free_list_min, + mca_coll_adapt_component.adapt_context_free_list_max, + mca_coll_adapt_component.adapt_context_free_list_inc, + NULL, 0, NULL, NULL, NULL); + } + } + + /* Set up request */ + temp_request = OBJ_NEW(ompi_request_t); + OMPI_REQUEST_INIT(temp_request, false); + temp_request->req_state = OMPI_REQUEST_ACTIVE; + temp_request->req_type = 0; + temp_request->req_free = adapt_request_free; + temp_request->req_status.MPI_SOURCE = 0; + temp_request->req_status.MPI_TAG = 0; + temp_request->req_status.MPI_ERROR = 0; + temp_request->req_status._cancelled = 0; + temp_request->req_status._ucount = 0; + *request = temp_request; + + /* Set up mutex */ + mutex = OBJ_NEW(opal_mutex_t); + + rank = ompi_comm_rank(comm); + + /* Determine number of elements sent per operation */ + ompi_datatype_type_size(datatype, &type_size); + COLL_BASE_COMPUTED_SEGCOUNT(seg_size, type_size, seg_count); + + ompi_datatype_get_extent(datatype, &lb, &extent); + num_segs = (count + 
seg_count - 1) / seg_count; + real_seg_size = (ptrdiff_t) seg_count *extent; + + /* Set memory for recv_array and send_array, created on heap becasue they are needed to be accessed by other functions (callback functions) */ + if (num_segs != 0) { + recv_array = (int *) malloc(sizeof(int) * num_segs); + } + if (tree->tree_nextsize != 0) { + send_array = (int *) malloc(sizeof(int) * tree->tree_nextsize); + } + + /* Set constant context for send and recv call back */ + mca_coll_adapt_constant_bcast_context_t *con = OBJ_NEW(mca_coll_adapt_constant_bcast_context_t); + con->root = root; + con->count = count; + con->seg_count = seg_count; + con->datatype = datatype; + con->comm = comm; + con->real_seg_size = real_seg_size; + con->num_segs = num_segs; + con->recv_array = recv_array; + con->num_recv_segs = 0; + con->num_recv_fini = 0; + con->send_array = send_array; + con->num_sent_segs = 0; + con->mutex = mutex; + con->request = temp_request; + con->tree = tree; + con->ibcast_tag = ibcast_tag; + + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: Ibcast, root %d, tag %d\n", rank, root, + ibcast_tag)); + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: con->mutex = %p, num_children = %d, num_segs = %d, real_seg_size = %d, seg_count = %d, tree_adreess = %p\n", + rank, (void *) con->mutex, tree->tree_nextsize, num_segs, + (int) real_seg_size, seg_count, (void *) con->tree)); + + OPAL_THREAD_LOCK(mutex); + + /* If the current process is root, it sends segment to every children */ + if (rank == root) { + /* Handle the situation when num_segs < SEND_NUM */ + if (num_segs <= mca_coll_adapt_component.adapt_ibcast_max_send_requests) { + min = num_segs; + } else { + min = mca_coll_adapt_component.adapt_ibcast_max_send_requests; + } + + /* Set recv_array, root has already had all the segments */ + for (i = 0; i < num_segs; i++) { + recv_array[i] = i; + } + con->num_recv_segs = num_segs; + /* Set send_array, will send 
adapt_ibcast_max_send_requests segments */ + for (i = 0; i < tree->tree_nextsize; i++) { + send_array[i] = mca_coll_adapt_component.adapt_ibcast_max_send_requests; + } + + ompi_request_t *send_req; + /* Number of datatypes in each send */ + int send_count = seg_count; + for (i = 0; i < min; i++) { + if (i == (num_segs - 1)) { + send_count = count - i * seg_count; + } + for (j = 0; j < tree->tree_nextsize; j++) { + mca_coll_adapt_bcast_context_t *context = + (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. + adapt_ibcast_context_free_list); + context->buff = (char *) buff + i * real_seg_size; + context->frag_id = i; + /* The id of peer in in children_list */ + context->child_id = j; + /* Actural rank of the peer */ + context->peer = tree->tree_next[j]; + context->con = con; + OBJ_RETAIN(con); + + char *send_buff = context->buff; + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: Send(start in main): segment %d to %d at buff %p send_count %d tag %d\n", + rank, context->frag_id, context->peer, + (void *) send_buff, send_count, (ibcast_tag << 16) + i)); + err = + MCA_PML_CALL(isend + (send_buff, send_count, datatype, context->peer, + (ibcast_tag << 16) + i, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, + &send_req)); + if (MPI_SUCCESS != err) { + return err; + } + /* Invoke send call back */ + OPAL_THREAD_UNLOCK(mutex); + ompi_request_set_callback(send_req, send_cb, context); + OPAL_THREAD_LOCK(mutex); + } + } + + } + + /* If the current process is not root, it receives data from parent in the tree. 
*/ + else { + /* Handle the situation when num_segs < RECV_NUM */ + if (num_segs <= mca_coll_adapt_component.adapt_ibcast_max_recv_requests) { + min = num_segs; + } else { + min = mca_coll_adapt_component.adapt_ibcast_max_recv_requests; + } + + /* Set recv_array, recv_array is empty */ + for (i = 0; i < num_segs; i++) { + recv_array[i] = 0; + } + /* Set send_array to empty */ + for (i = 0; i < tree->tree_nextsize; i++) { + send_array[i] = 0; + } + + /* Create a recv request */ + ompi_request_t *recv_req; + + /* Recevice some segments from its parent */ + int recv_count = seg_count; + for (i = 0; i < min; i++) { + if (i == (num_segs - 1)) { + recv_count = count - i * seg_count; + } + mca_coll_adapt_bcast_context_t *context = + (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. + adapt_ibcast_context_free_list); + context->buff = (char *) buff + i * real_seg_size; + context->frag_id = i; + context->peer = tree->tree_prev; + context->con = con; + OBJ_RETAIN(con); + char *recv_buff = context->buff; + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: Recv(start in main): segment %d from %d at buff %p recv_count %d tag %d\n", + ompi_comm_rank(context->con->comm), context->frag_id, + context->peer, (void *) recv_buff, recv_count, + (ibcast_tag << 16) + i)); + err = + MCA_PML_CALL(irecv + (recv_buff, recv_count, datatype, context->peer, + (ibcast_tag << 16) + i, comm, &recv_req)); + if (MPI_SUCCESS != err) { + return err; + } + /* Invoke receive call back */ + OPAL_THREAD_UNLOCK(mutex); + ompi_request_set_callback(recv_req, recv_cb, context); + OPAL_THREAD_LOCK(mutex); + } + + } + + OPAL_THREAD_UNLOCK(mutex); + + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: End of Ibcast\n", rank)); + + return MPI_SUCCESS; +} \ No newline at end of file diff --git a/ompi/mca/coll/adapt/coll_adapt_inbuf.c b/ompi/mca/coll/adapt/coll_adapt_inbuf.c new file mode 100644 index 00000000000..79162966624 --- /dev/null 
+++ b/ompi/mca/coll/adapt/coll_adapt_inbuf.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2014-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "coll_adapt.h"
+#include "coll_adapt_inbuf.h"
+
+static void mca_coll_adapt_inbuf_constructor(mca_coll_adapt_inbuf_t * inbuf)
+{   /* Intentionally empty: nothing is initialized here. */
+}
+
+static void mca_coll_adapt_inbuf_destructor(mca_coll_adapt_inbuf_t * inbuf)
+{   /* Intentionally empty: nothing is released here. */
+}
+
+OBJ_CLASS_INSTANCE(mca_coll_adapt_inbuf_t, opal_free_list_item_t, mca_coll_adapt_inbuf_constructor,
+                   mca_coll_adapt_inbuf_destructor);    /* class derived from opal_free_list_item_t */
diff --git a/ompi/mca/coll/adapt/coll_adapt_inbuf.h b/ompi/mca/coll/adapt/coll_adapt_inbuf.h
new file mode 100644
index 00000000000..1d450e59ff7
--- /dev/null
+++ b/ompi/mca/coll/adapt/coll_adapt_inbuf.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2014-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_COLL_ADAPT_INBUF_H
+#define MCA_COLL_ADAPT_INBUF_H
+
+#include "opal/class/opal_free_list.h"
+
+struct mca_coll_adapt_inbuf_s {     /* free-list item used as a receive buffer */
+    opal_free_list_item_t super;    /* base class; must stay the first member */
+    char buff[1];                   /* NOTE(review): presumably the first byte of a larger payload sized by the free list - confirm */
+};
+
+typedef struct mca_coll_adapt_inbuf_s mca_coll_adapt_inbuf_t;
+
+OBJ_CLASS_DECLARATION(mca_coll_adapt_inbuf_t);
+
+#endif /* MCA_COLL_ADAPT_INBUF_H */
diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c
new file mode 100644
index 00000000000..d99bb87f998
--- /dev/null
+++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c
@@ -0,0 +1,935 @@
+/*
+ * Copyright (c) 2014-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include "ompi/communicator/communicator.h"
+#include "coll_adapt.h"
+#include "coll_adapt_algorithms.h"
+#include "coll_adapt_context.h"
+#include "coll_adapt_item.h"
+#include "ompi/constants.h"
+#include "ompi/mca/coll/coll.h"
+#include "ompi/mca/coll/base/coll_tags.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/coll/base/coll_base_functions.h"
+#include "ompi/mca/coll/base/coll_base_topo.h"
+
+/* MPI_Reduce and MPI_Ireduce in the ADAPT module only work for commutative operations */
+
+/* Common signature of the algorithm-specific ireduce entry points listed below */
+typedef int (*mca_coll_adapt_ireduce_fn_t) (const void *sbuf,
+                                            void *rbuf,
+                                            int count,
+                                            struct ompi_datatype_t * datatype,
+                                            struct ompi_op_t * op,
+                                            int root,
+                                            struct ompi_communicator_t * comm,
+                                            ompi_request_t ** request,
+                                            mca_coll_base_module_t * module, int ireduce_tag);
+
+/*
+ * Available ireduce algorithms, stored as (id, function pointer) pairs.
+ * NOTE(review): ids start at 1 while array positions start at 0; the lookup
+ * code is not visible from here - confirm it does not index by the raw id.
+ */
+static mca_coll_adapt_algorithm_index_t mca_coll_adapt_ireduce_algorithm_index[] = {
+    {1, (uintptr_t) mca_coll_adapt_ireduce_binomial},
+    {2, (uintptr_t) mca_coll_adapt_ireduce_in_order_binomial},
+    {3, (uintptr_t) mca_coll_adapt_ireduce_binary},
+    {4, (uintptr_t) mca_coll_adapt_ireduce_pipeline},
+    {5, (uintptr_t) mca_coll_adapt_ireduce_chain},
+    {6, (uintptr_t) mca_coll_adapt_ireduce_linear},
+};
+
+/*
+ * Set up MCA parameters of MPI_Reduce and MPI_Ireduce.
+ *
+ * Assigns the default of every tunable, registers it as a read-only MCA
+ * variable of the adapt component, and resets the ireduce context free list
+ * bookkeeping.  Always returns OMPI_SUCCESS.
+ */
+int mca_coll_adapt_ireduce_init(void)
+{
+    mca_base_component_t *c = &mca_coll_adapt_component.super.collm_version;
+
+    mca_coll_adapt_component.adapt_ireduce_algorithm = 1;
+    mca_base_component_var_register(c, "reduce_algorithm",
+                                    "Algorithm of reduce, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_ireduce_algorithm);
+
+    mca_coll_adapt_component.adapt_ireduce_segment_size = 163740;
+    mca_base_component_var_register(c, "reduce_segment_size",
+                                    "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
+                                    MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_ireduce_segment_size);
+
+    mca_coll_adapt_component.adapt_ireduce_max_send_requests = 2;
+    mca_base_component_var_register(c, "reduce_max_send_requests",
+                                    "Maximum number of send requests",
+                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_ireduce_max_send_requests);
+
+    mca_coll_adapt_component.adapt_ireduce_max_recv_requests = 3;
+    mca_base_component_var_register(c, "reduce_max_recv_requests",
+                                    "Maximum number of receive requests",
+                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_ireduce_max_recv_requests);
+
+    mca_coll_adapt_component.adapt_inbuf_free_list_min = 10;
+    mca_base_component_var_register(c, "inbuf_free_list_min",
+                                    "Minimum number of segment in inbuf free list",
+                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_inbuf_free_list_min);
+
+    mca_coll_adapt_component.adapt_inbuf_free_list_max = 10000;
+    mca_base_component_var_register(c, "inbuf_free_list_max",
+                                    "Maximum number of segment in inbuf free list",
+                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_inbuf_free_list_max);
+
+
+    /* The help text used to repeat the "Maximum ..." description of inbuf_free_list_max */
+    mca_coll_adapt_component.adapt_inbuf_free_list_inc = 10;
+    mca_base_component_var_register(c, "inbuf_free_list_inc",
+                                    "Number of segments added each time the inbuf free list grows",
+                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                    OPAL_INFO_LVL_5,
+                                    MCA_BASE_VAR_SCOPE_READONLY,
+                                    &mca_coll_adapt_component.adapt_inbuf_free_list_inc);
+
+    mca_coll_adapt_component.adapt_ireduce_context_free_list = NULL;
+    mca_coll_adapt_component.adapt_ireduce_context_free_list_enabled = 0;
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Release the free list created in mca_coll_adapt_ireduce_generic.
+ * Does nothing when the list was never created; always returns OMPI_SUCCESS.
+ */
+int mca_coll_adapt_ireduce_fini(void)
+{
+    if (NULL != mca_coll_adapt_component.adapt_ireduce_context_free_list) {
+        OBJ_RELEASE(mca_coll_adapt_component.adapt_ireduce_context_free_list);
+        mca_coll_adapt_component.adapt_ireduce_context_free_list = NULL;
+        mca_coll_adapt_component.adapt_ireduce_context_free_list_enabled = 0;
+        OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ireduce fini\n"));
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Functions to access list
+ */
+
+/*
+ * Remove and return the first item of list whose count equals num_children.
+ * Returns NULL when the list is empty or no item matches.  The caller owns
+ * the returned item.
+ */
+static mca_coll_adapt_item_t *get_next_ready_item(opal_list_t * list, int num_children)
+{
+    mca_coll_adapt_item_t *item;
+    if (opal_list_is_empty(list)) {
+        return NULL;
+    }
+    for (item = (mca_coll_adapt_item_t *) opal_list_get_first(list);
+         item != (mca_coll_adapt_item_t *) opal_list_get_end(list);
+         item = (mca_coll_adapt_item_t *) ((opal_list_item_t *) item)->opal_list_next) {
+        if (item->count == num_children) {
+            opal_list_remove_item(list, (opal_list_item_t *) item);
+            return item;
+        }
+    }
+    return NULL;
+}
+
+/*
+ * Count one more occurrence of segment id in list.
+ * Returns 1 when an existing item was incremented, 2 when a new item with
+ * count 1 was appended.
+ */
+static int add_to_list(opal_list_t * list, int id)
+{
+    mca_coll_adapt_item_t *item;
+    int ret = 0;
+    for (item = (mca_coll_adapt_item_t *) opal_list_get_first(list);
+         item != (mca_coll_adapt_item_t *) opal_list_get_end(list);
+         item = (mca_coll_adapt_item_t *) ((opal_list_item_t *) item)->opal_list_next) {
+        if (item->id == id) {
+            (item->count)++;
+            ret = 1;
+            break;
+        }
+    }
+    if (ret == 0) {
+        item = OBJ_NEW(mca_coll_adapt_item_t);
+        item->id = id;
+        item->count = 1;
+        opal_list_append(list, (opal_list_item_t *) item);
+        ret = 2;
+    }
+    OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "add_to_list_return %d\n",
+                         ret));
+    return ret;
+}
+
+/*
+ * Get the inbuf address: step back distance bytes from buf and reinterpret
+ * the result as the enclosing inbuf item.
+ */
+static mca_coll_adapt_inbuf_t *to_inbuf(char *buf, int distance)
+{
+    return (mca_coll_adapt_inbuf_t *) (buf - distance);
+}
+
+/*
+ * Finish a ireduce request
+ */
+static int
ireduce_request_fini(mca_coll_adapt_reduce_context_t * context)
+{
+    /* Return the allocated resources */
+    int i;
+    /* Remember the request: context->con is released before it is completed */
+    ompi_request_t *temp_req = context->con->request;
+    if (context->con->accumbuf != NULL) {
+        /* On ranks other than the root every accumbuf entry is handed back to the inbuf list */
+        if (context->con->rank != context->con->root) {
+            for (i = 0; i < context->con->num_segs; i++) {
+                OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                                     "[%d]: Return accumbuf %d %p\n",
+                                     ompi_comm_rank(context->con->comm), i,
+                                     (void *) to_inbuf(context->con->accumbuf[i],
+                                                       context->con->distance)));
+                opal_free_list_return(context->con->inbuf_list,
+                                      (opal_free_list_item_t *) to_inbuf(context->con->accumbuf[i],
+                                                                         context->con->distance));
+            }
+        }
+        free(context->con->accumbuf);
+    }
+    OBJ_RELEASE(context->con->recv_list);
+    for (i = 0; i < context->con->num_segs; i++) {
+        OBJ_RELEASE(context->con->mutex_op_list[i]);
+    }
+    free(context->con->mutex_op_list);
+    OBJ_RELEASE(context->con->mutex_num_recv_segs);
+    OBJ_RELEASE(context->con->mutex_recv_list);
+    OBJ_RELEASE(context->con->mutex_num_sent);
+    if (context->con->tree->tree_nextsize > 0) {
+        OBJ_RELEASE(context->con->inbuf_list);
+        free(context->con->next_recv_segs);
+    }
+    /* NOTE(review): two releases - presumably this context's reference plus the creation reference; confirm */
+    OBJ_RELEASE(context->con);
+    OBJ_RELEASE(context->con);
+    OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "return context_list\n"));
+    opal_free_list_return(mca_coll_adapt_component.adapt_ireduce_context_free_list,
+                          (opal_free_list_item_t *) context);
+    /* Complete the request */
+    ompi_request_complete(temp_req, 1);
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Callback function of isend.
+ *
+ * req->req_complete_cb_data carries the reduce context of the send that just
+ * completed.  The callback lowers con->ongoing_send, starts the send of the
+ * next ready segment from con->recv_list (if any), then counts the finished
+ * send; a non-root rank that has sent all segments finishes the request,
+ * otherwise the context goes back to the free list.
+ *
+ * Returns 1 on success, or the PML error code if the follow-up isend could
+ * not be posted.
+ */
+static int send_cb(ompi_request_t * req)
+{
+    mca_coll_adapt_reduce_context_t *context =
+        (mca_coll_adapt_reduce_context_t *) req->req_complete_cb_data;
+    OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                         "[%d]: ireduce_send_cb, peer %d, seg_id %d\n", context->con->rank,
+                         context->peer, context->frag_id));
+    int err;
+
+    opal_atomic_sub_fetch_32(&(context->con->ongoing_send), 1);
+
+    /* Send a new segment */
+    OPAL_THREAD_LOCK(context->con->mutex_recv_list);
+    mca_coll_adapt_item_t *item =
+        get_next_ready_item(context->con->recv_list, context->con->tree->tree_nextsize);
+    OPAL_THREAD_UNLOCK(context->con->mutex_recv_list);
+
+    if (item != NULL) {
+        /* Get new context item from free list */
+        mca_coll_adapt_reduce_context_t *send_context =
+            (mca_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component.
+                                                                    adapt_ireduce_context_free_list);
+        if (context->con->tree->tree_nextsize > 0) {
+            /* Ranks with children send the accumulated segment */
+            send_context->buff = context->con->accumbuf[item->id];
+
+        } else {
+            /* Leaves locate the segment relative to the buffer that was just sent */
+            send_context->buff =
+                context->buff + (item->id - context->frag_id) * context->con->segment_increment;
+        }
+        send_context->frag_id = item->id;
+        send_context->peer = context->peer;
+        send_context->con = context->con;
+        OBJ_RETAIN(context->con);
+
+        opal_atomic_add_fetch_32(&(context->con->ongoing_send), 1);
+
+        int send_count = send_context->con->seg_count;
+        /* The last segment may be shorter than the others */
+        if (item->id == (send_context->con->num_segs - 1)) {
+            send_count = send_context->con->count - item->id * send_context->con->seg_count;
+        }
+
+        OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                             "[%d]: In send_cb, create isend to seg %d, peer %d, tag %d\n",
+                             send_context->con->rank, send_context->frag_id, send_context->peer,
+                             (send_context->con->ireduce_tag << 16) + send_context->frag_id));
+
+        ompi_request_t *send_req;
+        err =
+            MCA_PML_CALL(isend
+                         (send_context->buff, send_count, send_context->con->datatype,
+                          send_context->peer,
+                          (context->con->ireduce_tag << 16) + send_context->frag_id,
+                          MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req));
+
+        /* Release the item before the error check so it is not leaked on failure;
+         * it is no longer referenced past this point */
+        OBJ_RELEASE(item);
+        if (MPI_SUCCESS != err) {
+            return err;
+        }
+
+        /* Invoke send call back */
+        ompi_request_set_callback(send_req, send_cb, send_context);
+    }
+
+    OPAL_THREAD_LOCK(context->con->mutex_num_sent);
+    int32_t num_sent = ++(context->con->num_sent_segs);
+    OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                         "[%d]: In send_cb, root = %d, num_sent = %d, num_segs = %d\n",
+                         context->con->rank, context->con->tree->tree_root, num_sent,
+                         context->con->num_segs));
+    /* Check whether signal the condition, non root and sent all the segments */
+    if (context->con->tree->tree_root != context->con->rank && num_sent == context->con->num_segs) {
+        OPAL_THREAD_UNLOCK(context->con->mutex_num_sent);
+        ireduce_request_fini(context);
+    } else {
+        OPAL_THREAD_UNLOCK(context->con->mutex_num_sent);
+        OBJ_RELEASE(context->con);
+        OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "return context_list\n"));
+        opal_free_list_return(mca_coll_adapt_component.adapt_ireduce_context_free_list,
+                              (opal_free_list_item_t *) context);
+    }
+    /* Call back function return 1, which means successful */
+    req->req_free(&req);
+    return 1;
+}
+
+/*
+ * Callback function of irecv
+ */
+static int recv_cb(ompi_request_t * req)
+{
+    mca_coll_adapt_reduce_context_t *context =
+        (mca_coll_adapt_reduce_context_t *) req->req_complete_cb_data;
+    OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                         "[%d]: ireduce_recv_cb, peer %d, seg_id %d\n", context->con->rank,
+                         context->peer, context->frag_id));
+
+    int err;
+    int32_t new_id =
+        opal_atomic_add_fetch_32(&(context->con->next_recv_segs[context->child_id]), 1);
+
+    /* Receive new segment */
+    if (new_id < context->con->num_segs) {
+        char *temp_recv_buf = NULL;
+        mca_coll_adapt_inbuf_t *inbuf = NULL;
+        /* Set inbuf, if it it first child, recv on rbuf, else recv on inbuf */
+        if (context->child_id == 0 && context->con->sbuf != MPI_IN_PLACE
+            && context->con->root == context->con->rank) {
+            temp_recv_buf =
+                (char *) context->con->rbuf +
+                (ptrdiff_t) new_id *(ptrdiff_t) context->con->segment_increment;
+        } else {
+            OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
+                                 "[%d]: In recv_cb, alloc inbuf\n", context->con->rank));
+            inbuf = (mca_coll_adapt_inbuf_t *) opal_free_list_wait(context->con->inbuf_list);
+            temp_recv_buf = inbuf->buff
- context->con->lower_bound; + } + /* Get new context item from free list */ + mca_coll_adapt_reduce_context_t *recv_context = + (mca_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. + adapt_ireduce_context_free_list); + recv_context->buff = temp_recv_buf; + recv_context->frag_id = new_id; + recv_context->child_id = context->child_id; + recv_context->peer = context->peer; + recv_context->con = context->con; + OBJ_RETAIN(context->con); + recv_context->inbuf = inbuf; + int recv_count = recv_context->con->seg_count; + if (new_id == (recv_context->con->num_segs - 1)) { + recv_count = recv_context->con->count - new_id * recv_context->con->seg_count; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: In recv_cb, create irecv for seg %d, peer %d, inbuf %p, tag %d\n", + context->con->rank, recv_context->frag_id, recv_context->peer, + (void *) inbuf, + (recv_context->con->ireduce_tag << 16) + recv_context->frag_id)); + ompi_request_t *recv_req; + err = + MCA_PML_CALL(irecv + (temp_recv_buf, recv_count, recv_context->con->datatype, + recv_context->peer, + (recv_context->con->ireduce_tag << 16) + recv_context->frag_id, + recv_context->con->comm, &recv_req)); + if (MPI_SUCCESS != err) { + return err; + } + /* Invoke recvive call back */ + ompi_request_set_callback(recv_req, recv_cb, recv_context); + } + + /* Do the op */ + int op_count = context->con->seg_count; + if (context->frag_id == (context->con->num_segs - 1)) { + op_count = context->con->count - context->frag_id * context->con->seg_count; + } + + int keep_inbuf = 0; + OPAL_THREAD_LOCK(context->con->mutex_op_list[context->frag_id]); + if (context->con->accumbuf[context->frag_id] == NULL) { + if (context->inbuf == NULL) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: set accumbuf to rbuf\n", context->con->rank)); + context->con->accumbuf[context->frag_id] = context->buff; + } else { + keep_inbuf = 1; + OPAL_OUTPUT_VERBOSE((30, 
mca_coll_adapt_component.adapt_output, + "[%d]: set accumbuf to inbuf\n", context->con->rank)); + context->con->accumbuf[context->frag_id] = + context->inbuf->buff - context->con->lower_bound; + } + /* Op sbuf and accmbuf to accumbuf */ + ompi_op_reduce(context->con->op, + context->con->sbuf + + (ptrdiff_t) context->frag_id * (ptrdiff_t) context->con->segment_increment, + context->con->accumbuf[context->frag_id], op_count, context->con->datatype); + + } else { + if (context->inbuf == NULL) { + /* Op rbuf and accumbuf to rbuf */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: op rbuf and accumbuf to rbuf\n", context->con->rank)); + ompi_op_reduce(context->con->op, context->con->accumbuf[context->frag_id], + context->buff, op_count, context->con->datatype); + /* Free old accumbuf */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: free old accumbuf %p\n", context->con->rank, + (void *) to_inbuf(context->con->accumbuf[context->frag_id], + context->con->distance))); + opal_free_list_return(context->con->inbuf_list, + (opal_free_list_item_t *) to_inbuf(context->con-> + accumbuf[context->frag_id], + context->con->distance)); + /* Set accumbut to rbuf */ + context->con->accumbuf[context->frag_id] = context->buff; + } else { + /* Op inbuf and accmbuf to accumbuf */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: op inbuf and accmbuf to accumbuf\n", context->con->rank)); + ompi_op_reduce(context->con->op, context->inbuf->buff - context->con->lower_bound, + context->con->accumbuf[context->frag_id], op_count, + context->con->datatype); + } + } + + OPAL_THREAD_UNLOCK(context->con->mutex_op_list[context->frag_id]); + + /* Set recv list */ + if (context->con->rank != context->con->tree->tree_root) { + OPAL_THREAD_LOCK(context->con->mutex_recv_list); + add_to_list(context->con->recv_list, context->frag_id); + OPAL_THREAD_UNLOCK(context->con->mutex_recv_list); + } + + /* Send to parent */ + if 
(context->con->rank != context->con->tree->tree_root + && context->con->ongoing_send < mca_coll_adapt_component.adapt_ireduce_max_send_requests) { + OPAL_THREAD_LOCK(context->con->mutex_recv_list); + mca_coll_adapt_item_t *item = + get_next_ready_item(context->con->recv_list, context->con->tree->tree_nextsize); + OPAL_THREAD_UNLOCK(context->con->mutex_recv_list); + + if (item != NULL) { + /* Gt new context item from free list */ + mca_coll_adapt_reduce_context_t *send_context = + (mca_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. + adapt_ireduce_context_free_list); + send_context->buff = context->con->accumbuf[context->frag_id]; + send_context->frag_id = item->id; + send_context->peer = context->con->tree->tree_prev; + send_context->con = context->con; + OBJ_RETAIN(context->con); + opal_atomic_add_fetch_32(&(context->con->ongoing_send), 1); + + int send_count = send_context->con->seg_count; + if (item->id == (send_context->con->num_segs - 1)) { + send_count = send_context->con->count - item->id * send_context->con->seg_count; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: In recv_cb, create isend to seg %d, peer %d, tag %d\n", + send_context->con->rank, send_context->frag_id, send_context->peer, + (send_context->con->ireduce_tag << 16) + send_context->frag_id)); + + ompi_request_t *send_req; + err = + MCA_PML_CALL(isend + (send_context->buff, send_count, send_context->con->datatype, + send_context->peer, + (send_context->con->ireduce_tag << 16) + send_context->frag_id, + MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); + if (MPI_SUCCESS != err) { + return err; + } + OBJ_RELEASE(item); + + /* Invoke send call back */ + ompi_request_set_callback(send_req, send_cb, send_context); + } + } + + OPAL_THREAD_LOCK(context->con->mutex_num_recv_segs); + int num_recv_segs_t = ++(context->con->num_recv_segs); + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: In recv_cb, 
tree = %p, root = %d, num_recv = %d, num_segs = %d, num_child = %d\n", + context->con->rank, (void *) context->con->tree, + context->con->tree->tree_root, num_recv_segs_t, context->con->num_segs, + context->con->tree->tree_nextsize)); + /* If this is root and has received all the segments */ + if (context->con->tree->tree_root == context->con->rank + && num_recv_segs_t == context->con->num_segs * context->con->tree->tree_nextsize) { + OPAL_THREAD_UNLOCK(context->con->mutex_num_recv_segs); + if (!keep_inbuf && context->inbuf != NULL) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: root free context inbuf %p", context->con->rank, + (void *) context->inbuf)); + opal_free_list_return(context->con->inbuf_list, + (opal_free_list_item_t *) context->inbuf); + } + ireduce_request_fini(context); + } else { + OPAL_THREAD_UNLOCK(context->con->mutex_num_recv_segs); + if (!keep_inbuf && context->inbuf != NULL) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: free context inbuf %p", context->con->rank, + (void *) context->inbuf)); + opal_free_list_return(context->con->inbuf_list, + (opal_free_list_item_t *) context->inbuf); + } + OBJ_RELEASE(context->con); + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: return context_list", + context->con->rank)); + opal_free_list_return(mca_coll_adapt_component.adapt_ireduce_context_free_list, + (opal_free_list_item_t *) context); + } + req->req_free(&req); + return 1; +} + +int mca_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t * module) +{ + if (count == 0) { + return MPI_SUCCESS; + } else { + int rank = ompi_comm_rank(comm); + if (rank == root) { + OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, + "ireduce root %d, algorithm %d, coll_adapt_ireduce_segment_size %zu, 
coll_adapt_ireduce_max_send_requests %d, coll_adapt_ireduce_max_recv_requests %d\n", + root, mca_coll_adapt_component.adapt_ireduce_algorithm, + mca_coll_adapt_component.adapt_ireduce_segment_size, + mca_coll_adapt_component.adapt_ireduce_max_send_requests, + mca_coll_adapt_component.adapt_ireduce_max_recv_requests)); + } + /* Get ireduce tag */ + int ireduce_tag = opal_atomic_add_fetch_32(&(comm->c_ireduce_tag), 1); + ireduce_tag = (ireduce_tag % 4096) + 4096; + fflush(stdout); + mca_coll_adapt_ireduce_fn_t reduce_func = + (mca_coll_adapt_ireduce_fn_t) + mca_coll_adapt_ireduce_algorithm_index[mca_coll_adapt_component. + adapt_ireduce_algorithm].algorithm_fn_ptr; + return reduce_func(sbuf, rbuf, count, dtype, op, root, comm, request, module, ireduce_tag); + } +} + +/* + * Ireduce functions with different algorithms + */ +int mca_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag) +{ + ompi_coll_tree_t *tree = ompi_coll_base_topo_build_bmtree(comm, root); + int err = + mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + tree, mca_coll_adapt_component.adapt_ireduce_segment_size, + ireduce_tag); + return err; +} + +int mca_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag) +{ + ompi_coll_tree_t *tree = ompi_coll_base_topo_build_in_order_bmtree(comm, root); + int err = + mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + tree, mca_coll_adapt_component.adapt_ireduce_segment_size, + ireduce_tag); + return err; +} + +int mca_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, + 
struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag) +{ + ompi_coll_tree_t *tree = ompi_coll_base_topo_build_tree(2, comm, root); + int err = + mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + tree, mca_coll_adapt_component.adapt_ireduce_segment_size, + ireduce_tag); + return err; +} + +int mca_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag) +{ + ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(1, comm, root); + int err = + mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + tree, mca_coll_adapt_component.adapt_ireduce_segment_size, + ireduce_tag); + return err; +} + + +int mca_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag) +{ + ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(4, comm, root); + int err = + mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + tree, mca_coll_adapt_component.adapt_ireduce_segment_size, + ireduce_tag); + return err; +} + +int mca_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, int ireduce_tag) +{ + int fanout = ompi_comm_size(comm) - 1; + ompi_coll_tree_t *tree; + if (fanout < 1) { + tree = ompi_coll_base_topo_build_chain(1, comm, root); + } else if (fanout <= MAXTREEFANOUT) { + tree = 
ompi_coll_base_topo_build_tree(ompi_comm_size(comm) - 1, comm, root); + } else { + tree = ompi_coll_base_topo_build_tree(MAXTREEFANOUT, comm, root); + } + int err = + mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + tree, mca_coll_adapt_component.adapt_ireduce_segment_size, + ireduce_tag); + return err; +} + + +int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, ompi_coll_tree_t * tree, + size_t seg_size, int ireduce_tag) +{ + + ptrdiff_t extent, lower_bound, segment_increment; + ptrdiff_t true_lower_bound, true_extent, real_seg_size; + size_t typelng; + int seg_count = count, num_segs, rank, recv_count, send_count, i, j, err, min, distance = 0; + int32_t seg_index; + _Atomic int *next_recv_segs = NULL; + /* Used to store the accumuate result, pointer to every segment */ + char **accumbuf = NULL; + /* A free list contains all recv data */ + opal_free_list_t *inbuf_list; + opal_mutex_t *mutex_recv_list; + opal_mutex_t *mutex_num_recv_segs; + opal_mutex_t *mutex_num_sent; + opal_mutex_t **mutex_op_list; + /* A list to store the segments need to be sent */ + opal_list_t *recv_list; + + /* Determine number of segments and number of elements sent per operation */ + rank = ompi_comm_rank(comm); + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); + num_segs = (count + seg_count - 1) / seg_count; + segment_increment = (ptrdiff_t) seg_count *extent; + ompi_datatype_get_true_extent(dtype, &true_lower_bound, &true_extent); + real_seg_size = true_extent + (ptrdiff_t) (seg_count - 1) * extent; + + /* Set up free list */ + if (0 == mca_coll_adapt_component.adapt_ireduce_context_free_list_enabled) { + int32_t context_free_list_enabled = + 
opal_atomic_add_fetch_32(& + (mca_coll_adapt_component. + adapt_ireduce_context_free_list_enabled), 1); + if (1 == context_free_list_enabled) { + mca_coll_adapt_component.adapt_ireduce_context_free_list = OBJ_NEW(opal_free_list_t); + opal_free_list_init(mca_coll_adapt_component.adapt_ireduce_context_free_list, + sizeof(mca_coll_adapt_reduce_context_t), + opal_cache_line_size, + OBJ_CLASS(mca_coll_adapt_reduce_context_t), + 0, opal_cache_line_size, + mca_coll_adapt_component.adapt_context_free_list_min, + mca_coll_adapt_component.adapt_context_free_list_max, + mca_coll_adapt_component.adapt_context_free_list_inc, + NULL, 0, NULL, NULL, NULL); + } + } + + /* If the current process is not leaf */ + if (tree->tree_nextsize > 0) { + inbuf_list = OBJ_NEW(opal_free_list_t); + opal_free_list_init(inbuf_list, + sizeof(mca_coll_adapt_inbuf_t) + real_seg_size, + opal_cache_line_size, + OBJ_CLASS(mca_coll_adapt_inbuf_t), + 0, opal_cache_line_size, + mca_coll_adapt_component.adapt_inbuf_free_list_min, + mca_coll_adapt_component.adapt_inbuf_free_list_max, + mca_coll_adapt_component.adapt_inbuf_free_list_inc, + NULL, 0, NULL, NULL, NULL); + /* Set up next_recv_segs */ + next_recv_segs = (_Atomic int32_t *) malloc(sizeof(int32_t) * tree->tree_nextsize); + mca_coll_adapt_inbuf_t *temp_inbuf = + (mca_coll_adapt_inbuf_t *) opal_free_list_wait(inbuf_list); + distance = (char *) temp_inbuf->buff - lower_bound - (char *) temp_inbuf; //address of inbuf->buff to address of inbuf + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: distance %d, inbuf %p, inbuf->buff %p, inbuf->buff-lb %p, to_inbuf %p, inbuf_list %p\n", + rank, distance, (void *) temp_inbuf, (void *) temp_inbuf->buff, + (char *) temp_inbuf->buff - lower_bound, + (void *) to_inbuf((char *) temp_inbuf->buff - lower_bound, distance), + (void *) inbuf_list)); + opal_free_list_return(inbuf_list, (opal_free_list_item_t *) temp_inbuf); + } else { + inbuf_list = NULL; + next_recv_segs = NULL; + } + + 
ompi_request_t *temp_request = NULL; + /* Set up request */ + temp_request = OBJ_NEW(ompi_request_t); + OMPI_REQUEST_INIT(temp_request, false); + temp_request->req_state = OMPI_REQUEST_ACTIVE; + temp_request->req_type = 0; + temp_request->req_free = adapt_request_free; + temp_request->req_status.MPI_SOURCE = 0; + temp_request->req_status.MPI_TAG = 0; + temp_request->req_status.MPI_ERROR = 0; + temp_request->req_status._cancelled = 0; + temp_request->req_status._ucount = 0; + *request = temp_request; + + /* Set up mutex */ + mutex_recv_list = OBJ_NEW(opal_mutex_t); + mutex_num_recv_segs = OBJ_NEW(opal_mutex_t); + mutex_op_list = (opal_mutex_t **) malloc(sizeof(opal_mutex_t *) * num_segs); + for (i = 0; i < num_segs; i++) { + mutex_op_list[i] = OBJ_NEW(opal_mutex_t); + } + mutex_num_sent = OBJ_NEW(opal_mutex_t); + /* Create recv_list */ + recv_list = OBJ_NEW(opal_list_t); + + /* Set constant context for send and recv call back */ + mca_coll_adapt_constant_reduce_context_t *con = + OBJ_NEW(mca_coll_adapt_constant_reduce_context_t); + con->count = count; + con->seg_count = seg_count; + con->datatype = dtype; + con->comm = comm; + con->segment_increment = segment_increment; + con->num_segs = num_segs; + con->request = temp_request; + con->rank = rank; + con->num_recv_segs = 0; + con->num_sent_segs = 0; + con->next_recv_segs = next_recv_segs; + con->mutex_recv_list = mutex_recv_list; + con->mutex_num_recv_segs = mutex_num_recv_segs; + con->mutex_num_sent = mutex_num_sent; + con->mutex_op_list = mutex_op_list; + con->op = op; + con->tree = tree; + con->inbuf_list = inbuf_list; + con->recv_list = recv_list; + con->lower_bound = lower_bound; + con->ongoing_send = 0; + con->sbuf = (char *) sbuf; + con->rbuf = (char *) rbuf; + con->root = root; + con->distance = distance; + con->ireduce_tag = ireduce_tag; + con->real_seg_size = real_seg_size; + + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: start ireduce root %d tag %d\n", rank, tree->tree_root, + 
ireduce_tag)); + + /* If the current process is not leaf node */ + if (tree->tree_nextsize > 0) { + /* Set up accumbuf */ + accumbuf = (char **) malloc(sizeof(char *) * num_segs); + if (root == rank && sbuf == MPI_IN_PLACE) { + for (i = 0; i < num_segs; i++) { + accumbuf[i] = (char *) rbuf + (ptrdiff_t) i *(ptrdiff_t) segment_increment; + } + } else { + for (i = 0; i < num_segs; i++) { + accumbuf[i] = NULL; + } + } + + con->accumbuf = accumbuf; + + /* For the first batch of segments */ + if (num_segs <= mca_coll_adapt_component.adapt_ireduce_max_recv_requests) { + min = num_segs; + } else { + min = mca_coll_adapt_component.adapt_ireduce_max_recv_requests; + } + for (i = 0; i < tree->tree_nextsize; i++) { + next_recv_segs[i] = min - 1; + } + + for (j = 0; j < min; j++) { + /* For each child */ + for (i = 0; i < tree->tree_nextsize; i++) { + seg_index = j; + if (seg_index < num_segs) { + recv_count = seg_count; + if (seg_index == (num_segs - 1)) { + recv_count = count - (ptrdiff_t) seg_count *(ptrdiff_t) seg_index; + } + char *temp_recv_buf = NULL; + mca_coll_adapt_inbuf_t *inbuf = NULL; + /* Set inbuf, if it it first child, recv on rbuf, else recv on inbuf */ + if (i == 0 && sbuf != MPI_IN_PLACE && root == rank) { + temp_recv_buf = + (char *) rbuf + (ptrdiff_t) j *(ptrdiff_t) segment_increment; + } else { + inbuf = (mca_coll_adapt_inbuf_t *) opal_free_list_wait(inbuf_list); + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: In ireduce, alloc inbuf %p\n", rank, + (void *) inbuf)); + temp_recv_buf = inbuf->buff - lower_bound; + } + /* Get context */ + mca_coll_adapt_reduce_context_t *context = + (mca_coll_adapt_reduce_context_t *) + opal_free_list_wait(mca_coll_adapt_component. 
+ adapt_ireduce_context_free_list); + context->buff = temp_recv_buf; + context->frag_id = seg_index; + context->child_id = i; //the id of peer in in the tree + context->peer = tree->tree_next[i]; //the actural rank of the peer + context->con = con; + OBJ_RETAIN(con); + context->inbuf = inbuf; + + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: In ireduce, create irecv for seg %d, peer %d, recv_count %d, inbuf %p tag %d\n", + context->con->rank, context->frag_id, context->peer, + recv_count, (void *) inbuf, + (ireduce_tag << 16) + seg_index)); + + /* Create a recv request */ + ompi_request_t *recv_req; + err = + MCA_PML_CALL(irecv + (temp_recv_buf, recv_count, dtype, tree->tree_next[i], + (ireduce_tag << 16) + seg_index, comm, &recv_req)); + if (MPI_SUCCESS != err) { + return err; + } + /* Invoke recv call back */ + ompi_request_set_callback(recv_req, recv_cb, context); + } + } + } + } + + /* Leaf nodes */ + else { + mca_coll_adapt_item_t *item; + /* Set up recv_list */ + for (seg_index = 0; seg_index < num_segs; seg_index++) { + item = OBJ_NEW(mca_coll_adapt_item_t); + item->id = seg_index; + item->count = tree->tree_nextsize; + opal_list_append(recv_list, (opal_list_item_t *) item); + } + if (num_segs <= mca_coll_adapt_component.adapt_ireduce_max_send_requests) { + min = num_segs; + } else { + min = mca_coll_adapt_component.adapt_ireduce_max_send_requests; + } + con->accumbuf = accumbuf; + for (i = 0; i < min; i++) { + OPAL_THREAD_LOCK(mutex_recv_list); + item = get_next_ready_item(recv_list, tree->tree_nextsize); + OPAL_THREAD_UNLOCK(mutex_recv_list); + if (item != NULL) { + send_count = seg_count; + if (item->id == (num_segs - 1)) { + send_count = count - (ptrdiff_t) seg_count *(ptrdiff_t) item->id; + } + mca_coll_adapt_reduce_context_t *context = + (mca_coll_adapt_reduce_context_t *) + opal_free_list_wait(mca_coll_adapt_component.adapt_ireduce_context_free_list); + context->buff = + (char *) sbuf + (ptrdiff_t) item->id * (ptrdiff_t) 
segment_increment; + context->frag_id = item->id; + /* Actural rank of the peer */ + context->peer = tree->tree_prev; + context->con = con; + OBJ_RETAIN(con); + context->inbuf = NULL; + + opal_atomic_add_fetch_32(&(context->con->ongoing_send), 1); + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, + "[%d]: In ireduce, create isend to seg %d, peer %d, send_count %d tag %d\n", + context->con->rank, context->frag_id, context->peer, + send_count, (ireduce_tag << 16) + context->frag_id)); + + /* Create send request */ + ompi_request_t *send_req; + err = + MCA_PML_CALL(isend + (context->buff, send_count, dtype, tree->tree_prev, + (ireduce_tag << 16) + context->frag_id, + MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &send_req)); + if (MPI_SUCCESS != err) { + return err; + } + OBJ_RELEASE(item); + + /* Invoke send call back */ + ompi_request_set_callback(send_req, send_cb, context); + } + } + + } + + return MPI_SUCCESS; +} diff --git a/ompi/mca/coll/adapt/coll_adapt_item.c b/ompi/mca/coll/adapt/coll_adapt_item.c new file mode 100644 index 00000000000..dabe2ce37b8 --- /dev/null +++ b/ompi/mca/coll/adapt/coll_adapt_item.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2014-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_adapt_item.h" + +static void mca_coll_adapt_item_constructor(mca_coll_adapt_item_t * item) +{ +} + +static void mca_coll_adapt_item_destructor(mca_coll_adapt_item_t * item) +{ +} + +OBJ_CLASS_INSTANCE(mca_coll_adapt_item_t, opal_list_item_t, mca_coll_adapt_item_constructor, + mca_coll_adapt_item_destructor); diff --git a/ompi/mca/coll/adapt/coll_adapt_item.h b/ompi/mca/coll/adapt/coll_adapt_item.h new file mode 100644 index 00000000000..2fc6cbdbd03 --- /dev/null +++ b/ompi/mca/coll/adapt/coll_adapt_item.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal/class/opal_list.h" +#include "coll_adapt_inbuf.h" + +struct mca_coll_adapt_item_s { + opal_list_item_t super; + /* Fragment id */ + int id; + /* The number of children which have received the current segment from */ + int count; +}; + +typedef struct mca_coll_adapt_item_s mca_coll_adapt_item_t; + +OBJ_CLASS_DECLARATION(mca_coll_adapt_item_t); diff --git a/ompi/mca/coll/adapt/coll_adapt_module.c b/ompi/mca/coll/adapt/coll_adapt_module.c new file mode 100644 index 00000000000..e709313361f --- /dev/null +++ b/ompi/mca/coll/adapt/coll_adapt_module.c @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2014-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include <stdio.h> +#ifdef HAVE_STRING_H +#include <string.h> +#endif +#ifdef HAVE_SCHED_H +#include <sched.h> +#endif +#include <sys/types.h> +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif /* HAVE_SYS_MMAN_H */ +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif /* HAVE_UNISTD_H */ + +#include "mpi.h" +#include "opal_stdint.h" +#include "opal/util/os_path.h" + +#include "ompi/communicator/communicator.h" +#include "ompi/group/group.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +//#include "ompi/mca/rte/rte.h" +#include "ompi/proc/proc.h" +#include "coll_adapt.h" + +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_adapt_algorithms.h" + + +/* + * Local functions + */ +static int adapt_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); + +/* + * Module constructor + */ +static void mca_coll_adapt_module_construct(mca_coll_adapt_module_t * module) +{ + module->enabled = false; + module->adapt_component = &mca_coll_adapt_component; +} + +/* + * Module destructor + */ +static void mca_coll_adapt_module_destruct(mca_coll_adapt_module_t * module) +{ + module->enabled = false; +} + + +OBJ_CLASS_INSTANCE(mca_coll_adapt_module_t, + mca_coll_base_module_t, + mca_coll_adapt_module_construct, mca_coll_adapt_module_destruct); + +/* + * Initial query function that is invoked during MPI_INIT, allowing + * this component to disqualify itself if it doesn't support the + * required level of thread support. This function is invoked exactly + * once. + */ +int mca_coll_adapt_init_query(bool enable_progress_threads, bool enable_mpi_threads) +{ + return OMPI_SUCCESS; +} + + +/* + * Invoked when there's a new communicator that has been created. + * Look at the communicator and decide which set of functions and + * priority we want to return. 
+ */ +mca_coll_base_module_t *mca_coll_adapt_comm_query(struct ompi_communicator_t * comm, int *priority) +{ + mca_coll_adapt_module_t *adapt_module; + + /* If we're intercomm, or if there's only one process in the communicator */ + if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm)) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:adapt:comm_query (%d/%s): intercomm, comm is too small; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + /* Get the priority level attached to this module. If priority is less than or equal to 0, then the module is unavailable. */ + *priority = mca_coll_adapt_component.adapt_priority; + if (mca_coll_adapt_component.adapt_priority <= 0) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:adapt:comm_query (%d/%s): priority too low; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + adapt_module = OBJ_NEW(mca_coll_adapt_module_t); + if (NULL == adapt_module) { + return NULL; + } + + /* All is good -- return a module */ + adapt_module->super.coll_module_enable = adapt_module_enable; + adapt_module->super.ft_event = NULL; + adapt_module->super.coll_allgather = NULL; + adapt_module->super.coll_allgatherv = NULL; + adapt_module->super.coll_allreduce = NULL; + adapt_module->super.coll_alltoall = NULL; + adapt_module->super.coll_alltoallw = NULL; + adapt_module->super.coll_barrier = NULL; + adapt_module->super.coll_bcast = mca_coll_adapt_bcast; + adapt_module->super.coll_exscan = NULL; + adapt_module->super.coll_gather = NULL; + adapt_module->super.coll_gatherv = NULL; + adapt_module->super.coll_reduce = mca_coll_adapt_reduce; + adapt_module->super.coll_reduce_scatter = NULL; + adapt_module->super.coll_scan = NULL; + adapt_module->super.coll_scatter = NULL; + adapt_module->super.coll_scatterv = NULL; + adapt_module->super.coll_ibcast = mca_coll_adapt_ibcast; + adapt_module->super.coll_ireduce = mca_coll_adapt_ireduce; + 
adapt_module->super.coll_iallreduce = NULL; + + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:adapt:comm_query (%d/%s): pick me! pick me!", + comm->c_contextid, comm->c_name); + return &(adapt_module->super); +} + +/* + * Init module on the communicator + */ +static int adapt_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm) +{ + return OMPI_SUCCESS; +} + +/* + * Free ADAPT request + */ +int adapt_request_free(ompi_request_t ** request) +{ + (*request)->req_state = OMPI_REQUEST_INVALID; + OBJ_RELEASE(*request); + *request = MPI_REQUEST_NULL; + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/adapt/coll_adapt_reduce.c b/ompi/mca/coll/adapt/coll_adapt_reduce.c new file mode 100644 index 00000000000..f41afe21484 --- /dev/null +++ b/ompi/mca/coll/adapt/coll_adapt_reduce.c @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2014-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_adapt.h" +#include "coll_adapt_algorithms.h" + +/* MPI_Reduce and MPI_Ireduce in the ADAPT module only work for commutative operations */ +int mca_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + if (count == 0) { + return MPI_SUCCESS; + } else { + ompi_request_t *request; + int err = + mca_coll_adapt_ireduce(sbuf, rbuf, count, dtype, op, root, comm, &request, module); + ompi_request_wait(&request, MPI_STATUS_IGNORE); + return err; + } +} diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index 11b46ba47eb..fcea107e7c7 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -492,6 +492,10 @@ struct mca_coll_base_comm_t { /* in-order binary tree (root 
of the in-order binary tree is rank 0) */ ompi_coll_tree_t *cached_in_order_bintree; + + /* linear */ + ompi_coll_tree_t *cached_linear; + int cached_linear_root; }; typedef struct mca_coll_base_comm_t mca_coll_base_comm_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t); diff --git a/ompi/request/request.h b/ompi/request/request.h index c0a94a79255..f12882c033c 100644 --- a/ompi/request/request.h +++ b/ompi/request/request.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -436,12 +436,13 @@ static inline void ompi_request_wait_completion(ompi_request_t *req) static inline int ompi_request_complete(ompi_request_t* request, bool with_signal) { int rc = 0; - - if( NULL != request->req_complete_cb) { - rc = request->req_complete_cb( request ); + + if(NULL != request->req_complete_cb) { + ompi_request_complete_fn_t temp = request->req_complete_cb; request->req_complete_cb = NULL; + rc = temp( request ); } - + if (0 == rc) { if( OPAL_LIKELY(with_signal) ) { void *_tmp_ptr = REQUEST_PENDING; @@ -453,13 +454,30 @@ static inline int ompi_request_complete(ompi_request_t* request, bool with_signa if( REQUEST_PENDING != tmp_sync ) wait_sync_update(tmp_sync, 1, request->req_status.MPI_ERROR); } - } else + } else { request->req_complete = REQUEST_COMPLETED; + } } - + return OMPI_SUCCESS; } +static inline int ompi_request_set_callback(ompi_request_t* request, + ompi_request_complete_fn_t cb, + void* cb_data) +{ + request->req_complete_cb_data = cb_data; + request->req_complete_cb = cb; + int rc = 0; + /* If request is completed and the callback is not called, need to call 
callback */ + if ((NULL != request->req_complete_cb) && (request->req_complete == REQUEST_COMPLETED)) { + ompi_request_complete_fn_t temp = request->req_complete_cb; + request->req_complete_cb = NULL; + rc = temp( request ); + } + return rc; +} + END_C_DECLS #endif From a4be3bb93dcd6f83a7cbdcec020b478fe36f2a48 Mon Sep 17 00:00:00 2001 From: bsergentm Date: Wed, 6 May 2020 18:30:03 +0200 Subject: [PATCH 2/7] Coll/adapt Bull (#15) * piggybacking Bull functionalities * coll/adapt: Fix naming conventions and C11 atomic use This commit fixes some naming convention issues, such as function names which should follow the naming ompi_coll_adapt instead of mca_coll_adapt, reserved for component and module naming (cf. tuned collective component); It also fixes the use of _Atomic construct, which is only valid in C11. OPAL constructs have already been adapted to that use, so use opal_atomic_* types instead. * coll/adapt: Remove unused component field in module This commit removes an unneeded field referencing the component in the module of adapt, as it is already available through the mca_coll_adapt_component global variable. 
Signed-off-by: Marc Sergent Co-authored-by: Lemarinier, Pierre Co-authored-by: pierrele <31764860+pierrele@users.noreply.github.com> --- ompi/mca/coll/adapt/coll_adapt.h | 25 ++- ompi/mca/coll/adapt/coll_adapt_algorithms.h | 56 ++++--- ompi/mca/coll/adapt/coll_adapt_bcast.c | 4 +- ompi/mca/coll/adapt/coll_adapt_component.c | 82 ++++++---- ompi/mca/coll/adapt/coll_adapt_context.c | 42 +++-- ompi/mca/coll/adapt/coll_adapt_context.h | 40 ++--- ompi/mca/coll/adapt/coll_adapt_ibcast.c | 127 ++++++++------- ompi/mca/coll/adapt/coll_adapt_inbuf.c | 8 +- ompi/mca/coll/adapt/coll_adapt_inbuf.h | 6 +- ompi/mca/coll/adapt/coll_adapt_ireduce.c | 163 +++++++++++--------- ompi/mca/coll/adapt/coll_adapt_item.c | 8 +- ompi/mca/coll/adapt/coll_adapt_item.h | 6 +- ompi/mca/coll/adapt/coll_adapt_module.c | 68 ++++---- ompi/mca/coll/adapt/coll_adapt_reduce.c | 4 +- 14 files changed, 343 insertions(+), 296 deletions(-) diff --git a/ompi/mca/coll/adapt/coll_adapt.h b/ompi/mca/coll/adapt/coll_adapt.h index 0eaca96e5e7..b2a8fcb949c 100644 --- a/ompi/mca/coll/adapt/coll_adapt.h +++ b/ompi/mca/coll/adapt/coll_adapt.h @@ -21,13 +21,15 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_base_topo.h" -BEGIN_C_DECLS typedef struct mca_coll_adapt_module_t mca_coll_adapt_module_t; +BEGIN_C_DECLS + +typedef struct mca_coll_adapt_module_t mca_coll_adapt_module_t; /* * Structure to hold the adapt coll component. First it holds the * base coll component, and then holds a bunch of * adapt-coll-component-specific stuff (e.g., current MCA param - * values). + * values). 
*/ typedef struct mca_coll_adapt_component_t { /* Base coll component */ @@ -45,7 +47,7 @@ typedef struct mca_coll_adapt_component_t { /* MCA parameter: Minimum number of segment in context free list */ int adapt_context_free_list_min; - /* MCA parameter: Increasment number of segment in context free list */ + /* MCA parameter: Increasement number of segment in context free list */ int adapt_context_free_list_inc; /* Bcast MCA parameter */ @@ -55,7 +57,7 @@ typedef struct mca_coll_adapt_component_t { int adapt_ibcast_max_recv_requests; /* Bcast free list */ opal_free_list_t *adapt_ibcast_context_free_list; - _Atomic int32_t adapt_ibcast_context_free_list_enabled; + opal_atomic_int32_t adapt_ibcast_context_free_list_enabled; /* Reduce MCA parameter */ int adapt_ireduce_algorithm; @@ -68,7 +70,7 @@ typedef struct mca_coll_adapt_component_t { /* Reduce free list */ opal_free_list_t *adapt_ireduce_context_free_list; - _Atomic int32_t adapt_ireduce_context_free_list_enabled; + opal_atomic_int32_t adapt_ireduce_context_free_list_enabled; } mca_coll_adapt_component_t; @@ -78,9 +80,7 @@ struct mca_coll_adapt_module_t { mca_coll_base_module_t super; /* Whether this module has been lazily initialized or not yet */ - bool enabled; - /* Pointer to mca_coll_adapt_component */ - mca_coll_adapt_component_t *adapt_component; + bool adapt_enabled; }; OBJ_CLASS_DECLARATION(mca_coll_adapt_module_t); @@ -88,11 +88,10 @@ OBJ_CLASS_DECLARATION(mca_coll_adapt_module_t); OMPI_MODULE_DECLSPEC extern mca_coll_adapt_component_t mca_coll_adapt_component; /* ADAPT module functions */ -int mca_coll_adapt_init_query(bool enable_progress_threads, bool enable_mpi_threads); - -mca_coll_base_module_t *mca_coll_adapt_comm_query(struct ompi_communicator_t *comm, int *priority); +int ompi_coll_adapt_init_query(bool enable_progress_threads, bool enable_mpi_threads); +mca_coll_base_module_t * ompi_coll_adapt_comm_query(struct ompi_communicator_t *comm, int *priority); /* Free ADAPT quest */ -int 
adapt_request_free(ompi_request_t ** request); +int ompi_coll_adapt_request_free(ompi_request_t **request); -#endif /* MCA_COLL_ADAPT_EXPORT_H */ +#endif /* MCA_COLL_ADAPT_EXPORT_H */ diff --git a/ompi/mca/coll/adapt/coll_adapt_algorithms.h b/ompi/mca/coll/adapt/coll_adapt_algorithms.h index 8b7b7cebd4f..f0b67b787d8 100644 --- a/ompi/mca/coll/adapt/coll_adapt_algorithms.h +++ b/ompi/mca/coll/adapt/coll_adapt_algorithms.h @@ -14,82 +14,88 @@ #include "ompi/mca/coll/base/coll_base_functions.h" #include -typedef struct mca_coll_adapt_algorithm_index_s { +typedef struct ompi_coll_adapt_algorithm_index_s { int algorithm_index; uintptr_t algorithm_fn_ptr; -} mca_coll_adapt_algorithm_index_t; +} ompi_coll_adapt_algorithm_index_t; /* Bcast */ -int mca_coll_adapt_ibcast_init(void); -int mca_coll_adapt_ibcast_fini(void); -int mca_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast_init(void); +int ompi_coll_adapt_ibcast_fini(void); +int ompi_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module); -int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, ompi_coll_tree_t * tree, size_t seg_size, int ibcast_tag); -int mca_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, +int ompi_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, int root, 
struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag); -int mca_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, +int ompi_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag); -int mca_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag); -int mca_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype, +int ompi_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag); -int mca_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag); -int mca_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag); - +int ompi_coll_adapt_ibcast_tuned(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t *module, int ibcast_tag); /* Reduce */ -int mca_coll_adapt_ireduce_init(void); -int mca_coll_adapt_ireduce_fini(void); -int 
mca_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, +int ompi_coll_adapt_ireduce_init(void); +int ompi_coll_adapt_ireduce_fini(void); +int ompi_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, +int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module); -int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, ompi_coll_tree_t * tree, size_t seg_size, int ireduce_tag); -int mca_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_tuned(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t *module, int ireduce_tag); +int ompi_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag); -int mca_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int 
ireduce_tag); -int mca_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag); -int mca_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag); -int mca_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag); -int mca_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag); diff --git a/ompi/mca/coll/adapt/coll_adapt_bcast.c b/ompi/mca/coll/adapt/coll_adapt_bcast.c index 4348f2dc3b5..604898b2e54 100644 --- a/ompi/mca/coll/adapt/coll_adapt_bcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_bcast.c @@ -12,14 +12,14 @@ #include "coll_adapt.h" #include "coll_adapt_algorithms.h" -int mca_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { if (count == 0) { return MPI_SUCCESS; } else { ompi_request_t *request; - int err = mca_coll_adapt_ibcast(buff, 
count, datatype, root, comm, &request, module); + int err = ompi_coll_adapt_ibcast(buff, count, datatype, root, comm, &request, module); ompi_request_wait(&request, MPI_STATUS_IGNORE); return err; } diff --git a/ompi/mca/coll/adapt/coll_adapt_component.c b/ompi/mca/coll/adapt/coll_adapt_component.c index 6079c4d92ea..d38cd42b42b 100644 --- a/ompi/mca/coll/adapt/coll_adapt_component.c +++ b/ompi/mca/coll/adapt/coll_adapt_component.c @@ -36,35 +36,32 @@ static int adapt_register(void); */ mca_coll_adapt_component_t mca_coll_adapt_component = { - /* First, fill in the super */ - { - /* First, the mca_component_t struct containing meta - information about the component itself */ - - { - MCA_COLL_BASE_VERSION_2_0_0, - - /* Component name and version */ - "adapt", - OMPI_MAJOR_VERSION, - OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION, - - /* Component functions */ - adapt_open, /* open */ - adapt_close, - NULL, /* query */ - adapt_register}, - { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE}, - - /* Initialization / querying functions */ - mca_coll_adapt_init_query, - mca_coll_adapt_comm_query, - }, + /* First, the mca_component_t struct containing meta + information about the component itself */ + .collm_version = { + MCA_COLL_BASE_VERSION_2_0_0, + + /* Component name and version */ + .mca_component_name = "adapt", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + + /* Component functions */ + .mca_open_component = adapt_open, + .mca_close_component = adapt_close, + .mca_register_component_params = adapt_register, + }, + .collm_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + + /* Initialization / querying functions */ + .collm_init_query = ompi_coll_adapt_init_query, + .collm_comm_query = ompi_coll_adapt_comm_query, + }, /* adapt-component specific information */ @@ -81,6 +78,25 @@ mca_coll_adapt_component_t mca_coll_adapt_component = { /* Open the 
component */ static int adapt_open(void) { + int param; + mca_coll_adapt_component_t *cs = &mca_coll_adapt_component; + + /* + * Get the global coll verbosity: it will be ours + */ + param = mca_base_var_find("ompi", "coll", "base", "verbose"); + if (param >= 0) { + const int *verbose = NULL; + mca_base_var_get_value(param, &verbose, NULL, NULL); + if (verbose && verbose[0] > 0) { + cs->adapt_output = opal_output_open(NULL); + opal_output_set_verbosity(cs->adapt_output, verbose[0]); + } + } + + opal_output_verbose(1, cs->adapt_output, + "coll:adapt:component_open: done!"); + return OMPI_SUCCESS; } @@ -88,8 +104,8 @@ static int adapt_open(void) /* Shut down the component */ static int adapt_close(void) { - mca_coll_adapt_ibcast_fini(); - mca_coll_adapt_ireduce_fini(); + ompi_coll_adapt_ibcast_fini(); + ompi_coll_adapt_ireduce_fini(); return OMPI_SUCCESS; } @@ -125,7 +141,7 @@ static int adapt_register(void) opal_output_set_verbosity(cs->adapt_output, adapt_verbose); cs->adapt_context_free_list_min = 10; - (void) mca_base_component_var_register(c, "context_free_list_max", + (void) mca_base_component_var_register(c, "context_free_list_min", "Minimum number of segments in context free list", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, @@ -133,7 +149,7 @@ static int adapt_register(void) &cs->adapt_context_free_list_min); cs->adapt_context_free_list_max = 10000; - (void) mca_base_component_var_register(c, "context_free_list_min", + (void) mca_base_component_var_register(c, "context_free_list_max", "Maximum number of segments in context free list", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, @@ -147,8 +163,8 @@ static int adapt_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_context_free_list_inc); - mca_coll_adapt_ibcast_init(); - mca_coll_adapt_ireduce_init(); + ompi_coll_adapt_ibcast_init(); + ompi_coll_adapt_ireduce_init(); return adapt_verify_mca_variables(); } diff --git a/ompi/mca/coll/adapt/coll_adapt_context.c 
b/ompi/mca/coll/adapt/coll_adapt_context.c index 978739df9ab..be03127f23c 100644 --- a/ompi/mca/coll/adapt/coll_adapt_context.c +++ b/ompi/mca/coll/adapt/coll_adapt_context.c @@ -12,60 +12,58 @@ #include "ompi/mca/coll/coll.h" #include "coll_adapt_context.h" -static void mca_coll_adapt_bcast_context_constructor(mca_coll_adapt_bcast_context_t * bcast_context) +static void ompi_coll_adapt_bcast_context_constructor(ompi_coll_adapt_bcast_context_t * bcast_context) { } -static void mca_coll_adapt_bcast_context_destructor(mca_coll_adapt_bcast_context_t * bcast_context) +static void ompi_coll_adapt_bcast_context_destructor(ompi_coll_adapt_bcast_context_t * bcast_context) { - } static void -mca_coll_adapt_constant_bcast_context_constructor(mca_coll_adapt_constant_bcast_context_t * con) +ompi_coll_adapt_constant_bcast_context_constructor(ompi_coll_adapt_constant_bcast_context_t * con) { } -static void mca_coll_adapt_constant_bcast_context_destructor(mca_coll_adapt_constant_bcast_context_t +static void ompi_coll_adapt_constant_bcast_context_destructor(ompi_coll_adapt_constant_bcast_context_t * con) { } -OBJ_CLASS_INSTANCE(mca_coll_adapt_bcast_context_t, opal_free_list_item_t, - mca_coll_adapt_bcast_context_constructor, - mca_coll_adapt_bcast_context_destructor); +OBJ_CLASS_INSTANCE(ompi_coll_adapt_bcast_context_t, opal_free_list_item_t, + ompi_coll_adapt_bcast_context_constructor, + ompi_coll_adapt_bcast_context_destructor); -OBJ_CLASS_INSTANCE(mca_coll_adapt_constant_bcast_context_t, opal_object_t, - mca_coll_adapt_constant_bcast_context_constructor, - mca_coll_adapt_constant_bcast_context_destructor); +OBJ_CLASS_INSTANCE(ompi_coll_adapt_constant_bcast_context_t, opal_object_t, + ompi_coll_adapt_constant_bcast_context_constructor, + ompi_coll_adapt_constant_bcast_context_destructor); -static void mca_coll_adapt_reduce_context_constructor(mca_coll_adapt_reduce_context_t * +static void ompi_coll_adapt_reduce_context_constructor(ompi_coll_adapt_reduce_context_t * 
reduce_context) { } -static void mca_coll_adapt_reduce_context_destructor(mca_coll_adapt_reduce_context_t * +static void ompi_coll_adapt_reduce_context_destructor(ompi_coll_adapt_reduce_context_t * reduce_context) { - } static void -mca_coll_adapt_constant_reduce_context_constructor(mca_coll_adapt_constant_reduce_context_t * con) +ompi_coll_adapt_constant_reduce_context_constructor(ompi_coll_adapt_constant_reduce_context_t * con) { } static void -mca_coll_adapt_constant_reduce_context_destructor(mca_coll_adapt_constant_reduce_context_t * con) +ompi_coll_adapt_constant_reduce_context_destructor(ompi_coll_adapt_constant_reduce_context_t * con) { } -OBJ_CLASS_INSTANCE(mca_coll_adapt_reduce_context_t, opal_free_list_item_t, - mca_coll_adapt_reduce_context_constructor, - mca_coll_adapt_reduce_context_destructor); +OBJ_CLASS_INSTANCE(ompi_coll_adapt_reduce_context_t, opal_free_list_item_t, + ompi_coll_adapt_reduce_context_constructor, + ompi_coll_adapt_reduce_context_destructor); -OBJ_CLASS_INSTANCE(mca_coll_adapt_constant_reduce_context_t, opal_object_t, - mca_coll_adapt_constant_reduce_context_constructor, - mca_coll_adapt_constant_reduce_context_destructor); +OBJ_CLASS_INSTANCE(ompi_coll_adapt_constant_reduce_context_t, opal_object_t, + ompi_coll_adapt_constant_reduce_context_constructor, + ompi_coll_adapt_constant_reduce_context_destructor); diff --git a/ompi/mca/coll/adapt/coll_adapt_context.h b/ompi/mca/coll/adapt/coll_adapt_context.h index 917e3d48861..eea98fb872e 100644 --- a/ompi/mca/coll/adapt/coll_adapt_context.h +++ b/ompi/mca/coll/adapt/coll_adapt_context.h @@ -19,7 +19,7 @@ #include "coll_adapt_inbuf.h" /* Bcast constant context in bcast context */ -struct mca_coll_adapt_constant_bcast_context_s { +struct ompi_coll_adapt_constant_bcast_context_s { opal_object_t super; int root; size_t count; @@ -42,29 +42,29 @@ struct mca_coll_adapt_constant_bcast_context_s { int ibcast_tag; }; -typedef struct mca_coll_adapt_constant_bcast_context_s 
mca_coll_adapt_constant_bcast_context_t; +typedef struct ompi_coll_adapt_constant_bcast_context_s ompi_coll_adapt_constant_bcast_context_t; -OBJ_CLASS_DECLARATION(mca_coll_adapt_constant_bcast_context_t); +OBJ_CLASS_DECLARATION(ompi_coll_adapt_constant_bcast_context_t); /* Bcast context of each segment*/ -typedef struct mca_coll_adapt_bcast_context_s mca_coll_adapt_bcast_context_t; +typedef struct ompi_coll_adapt_bcast_context_s ompi_coll_adapt_bcast_context_t; -typedef int (*mca_coll_adapt_bcast_cuda_callback_fn_t) (mca_coll_adapt_bcast_context_t * context); +typedef int (*ompi_coll_adapt_bcast_cuda_callback_fn_t) (ompi_coll_adapt_bcast_context_t * context); -struct mca_coll_adapt_bcast_context_s { +struct ompi_coll_adapt_bcast_context_s { opal_free_list_item_t super; char *buff; int frag_id; int child_id; int peer; - mca_coll_adapt_constant_bcast_context_t *con; + ompi_coll_adapt_constant_bcast_context_t *con; }; -OBJ_CLASS_DECLARATION(mca_coll_adapt_bcast_context_t); +OBJ_CLASS_DECLARATION(ompi_coll_adapt_bcast_context_t); /* Reduce constant context in reduce context */ -struct mca_coll_adapt_constant_reduce_context_s { +struct ompi_coll_adapt_constant_reduce_context_s { opal_object_t super; size_t count; size_t seg_count; @@ -81,7 +81,7 @@ struct mca_coll_adapt_constant_reduce_context_s { /* Number of sent segments */ int32_t num_sent_segs; /* Next seg need to be received for every children */ - _Atomic int32_t *next_recv_segs; + opal_atomic_int32_t *next_recv_segs; /* Mutex to protect recv_list */ opal_mutex_t *mutex_recv_list; /* Mutex to protect num_recv_segs */ @@ -95,12 +95,14 @@ struct mca_coll_adapt_constant_reduce_context_s { ompi_coll_tree_t *tree; /* Accumulate buff */ char **accumbuf; + /* inbuf list address of accumbuf */ + ompi_coll_adapt_inbuf_t ** accumbuf_to_inbuf; opal_free_list_t *inbuf_list; /* A list to store the segments which are received and not yet be sent */ opal_list_t *recv_list; ptrdiff_t lower_bound; /* How many sends are posted but 
not finished */ - _Atomic int32_t ongoing_send; + opal_atomic_int32_t ongoing_send; char *sbuf; char *rbuf; int root; @@ -109,24 +111,24 @@ struct mca_coll_adapt_constant_reduce_context_s { int ireduce_tag; }; -typedef struct mca_coll_adapt_constant_reduce_context_s mca_coll_adapt_constant_reduce_context_t; +typedef struct ompi_coll_adapt_constant_reduce_context_s ompi_coll_adapt_constant_reduce_context_t; -OBJ_CLASS_DECLARATION(mca_coll_adapt_constant_reduce_context_t); +OBJ_CLASS_DECLARATION(ompi_coll_adapt_constant_reduce_context_t); /* Reduce context of each segment */ -typedef struct mca_coll_adapt_reduce_context_s mca_coll_adapt_reduce_context_t; +typedef struct ompi_coll_adapt_reduce_context_s ompi_coll_adapt_reduce_context_t; -typedef int (*mca_coll_adapt_reduce_cuda_callback_fn_t) (mca_coll_adapt_reduce_context_t * context); +typedef int (*ompi_coll_adapt_reduce_cuda_callback_fn_t) (ompi_coll_adapt_reduce_context_t * context); -struct mca_coll_adapt_reduce_context_s { +struct ompi_coll_adapt_reduce_context_s { opal_free_list_item_t super; char *buff; int frag_id; int child_id; int peer; - mca_coll_adapt_constant_reduce_context_t *con; + ompi_coll_adapt_constant_reduce_context_t *con; /* store the incoming segment */ - mca_coll_adapt_inbuf_t *inbuf; + ompi_coll_adapt_inbuf_t *inbuf; }; -OBJ_CLASS_DECLARATION(mca_coll_adapt_reduce_context_t); +OBJ_CLASS_DECLARATION(ompi_coll_adapt_reduce_context_t); diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index 3582bafcb62..c3f0868102a 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -21,33 +21,35 @@ #include "ompi/mca/pml/ob1/pml_ob1.h" -typedef int (*mca_coll_adapt_ibcast_fn_t) (void *buff, +typedef int (*ompi_coll_adapt_ibcast_fn_t) (void *buff, int count, struct ompi_datatype_t * datatype, int root, struct ompi_communicator_t * comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag); - 
-static mca_coll_adapt_algorithm_index_t mca_coll_adapt_ibcast_algorithm_index[] = { - {1, (uintptr_t) mca_coll_adapt_ibcast_binomial}, - {2, (uintptr_t) mca_coll_adapt_ibcast_in_order_binomial}, - {3, (uintptr_t) mca_coll_adapt_ibcast_binary}, - {4, (uintptr_t) mca_coll_adapt_ibcast_pipeline}, - {5, (uintptr_t) mca_coll_adapt_ibcast_chain}, - {6, (uintptr_t) mca_coll_adapt_ibcast_linear}, + mca_coll_base_module_t * module, + int ibcast_tag); + +static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ibcast_algorithm_index[] = { + {0, (uintptr_t) ompi_coll_adapt_ibcast_tuned}, + {1, (uintptr_t) ompi_coll_adapt_ibcast_binomial}, + {2, (uintptr_t) ompi_coll_adapt_ibcast_in_order_binomial}, + {3, (uintptr_t) ompi_coll_adapt_ibcast_binary}, + {4, (uintptr_t) ompi_coll_adapt_ibcast_pipeline}, + {5, (uintptr_t) ompi_coll_adapt_ibcast_chain}, + {6, (uintptr_t) ompi_coll_adapt_ibcast_linear}, }; /* * Set up MCA parameters of MPI_Bcast and MPI_IBcast */ -int mca_coll_adapt_ibcast_init(void) +int ompi_coll_adapt_ibcast_init(void) { mca_base_component_t *c = &mca_coll_adapt_component.super.collm_version; mca_coll_adapt_component.adapt_ibcast_algorithm = 1; mca_base_component_var_register(c, "bcast_algorithm", - "Algorithm of broadcast, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + "Algorithm of broadcast, 0: tuned, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_adapt_component.adapt_ibcast_algorithm); @@ -81,15 +83,15 @@ int mca_coll_adapt_ibcast_init(void) } /* - * Release the free list created in mca_coll_adapt_ibcast_generic + * Release the free list created in ompi_coll_adapt_ibcast_generic */ -int mca_coll_adapt_ibcast_fini(void) +int ompi_coll_adapt_ibcast_fini(void) { if (NULL != mca_coll_adapt_component.adapt_ibcast_context_free_list) { 
OBJ_RELEASE(mca_coll_adapt_component.adapt_ibcast_context_free_list); mca_coll_adapt_component.adapt_ibcast_context_free_list = NULL; mca_coll_adapt_component.adapt_ibcast_context_free_list_enabled = 0; - OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ibcast fini\n")); + OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ibcast fini\n")); } return OMPI_SUCCESS; } @@ -97,7 +99,7 @@ int mca_coll_adapt_ibcast_fini(void) /* * Finish a ibcast request */ -static int ibcast_request_fini(mca_coll_adapt_bcast_context_t * context) +static int ibcast_request_fini(ompi_coll_adapt_bcast_context_t * context) { ompi_request_t *temp_req = context->con->request; if (context->con->tree->tree_nextsize != 0) { @@ -121,8 +123,8 @@ static int ibcast_request_fini(mca_coll_adapt_bcast_context_t * context) */ static int send_cb(ompi_request_t * req) { - mca_coll_adapt_bcast_context_t *context = - (mca_coll_adapt_bcast_context_t *) req->req_complete_cb_data; + ompi_coll_adapt_bcast_context_t *context = + (ompi_coll_adapt_bcast_context_t *) req->req_complete_cb_data; int err; @@ -136,10 +138,11 @@ static int send_cb(ompi_request_t * req) /* If the current process has fragments in recv_array can be sent */ if (sent_id < context->con->num_recv_segs) { ompi_request_t *send_req; + ompi_coll_adapt_bcast_context_t *send_context; + opal_free_list_t *free_list; int new_id = context->con->recv_array[sent_id]; - mca_coll_adapt_bcast_context_t *send_context = - (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. 
- adapt_ibcast_context_free_list); + free_list = mca_coll_adapt_component.adapt_ibcast_context_free_list; + send_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(free_list); send_context->buff = context->buff + (new_id - context->frag_id) * context->con->real_seg_size; send_context->frag_id = new_id; @@ -206,8 +209,8 @@ static int send_cb(ompi_request_t * req) static int recv_cb(ompi_request_t * req) { /* Get necessary info from request */ - mca_coll_adapt_bcast_context_t *context = - (mca_coll_adapt_bcast_context_t *) req->req_complete_cb_data; + ompi_coll_adapt_bcast_context_t *context = + (ompi_coll_adapt_bcast_context_t *) req->req_complete_cb_data; int err, i; OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, @@ -220,14 +223,15 @@ static int recv_cb(ompi_request_t * req) int num_recv_segs_t = ++(context->con->num_recv_segs); context->con->recv_array[num_recv_segs_t - 1] = context->frag_id; + opal_free_list_t *free_list; int new_id = num_recv_segs_t + mca_coll_adapt_component.adapt_ibcast_max_recv_requests - 1; /* Receive new segment */ if (new_id < context->con->num_segs) { ompi_request_t *recv_req; + ompi_coll_adapt_bcast_context_t *recv_context; + free_list = mca_coll_adapt_component.adapt_ibcast_context_free_list; /* Get new context item from free list */ - mca_coll_adapt_bcast_context_t *recv_context = - (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. - adapt_ibcast_context_free_list); + recv_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(free_list); recv_context->buff = context->buff + (new_id - context->frag_id) * context->con->real_seg_size; recv_context->frag_id = new_id; @@ -266,9 +270,9 @@ static int recv_cb(ompi_request_t * req) send_count = context->con->count - context->frag_id * context->con->seg_count; } - mca_coll_adapt_bcast_context_t *send_context = - (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. 
- adapt_ibcast_context_free_list); + ompi_coll_adapt_bcast_context_t *send_context; + free_list = mca_coll_adapt_component.adapt_ibcast_context_free_list; + send_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(free_list); send_context->buff = context->buff; send_context->frag_id = context->frag_id; send_context->child_id = i; @@ -326,7 +330,7 @@ static int recv_cb(ompi_request_t * req) return 1; } -int mca_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module) { @@ -335,7 +339,7 @@ int mca_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatyp temp_request = OBJ_NEW(ompi_request_t); OMPI_REQUEST_INIT(temp_request, false); temp_request->req_type = 0; - temp_request->req_free = adapt_request_free; + temp_request->req_free = ompi_coll_adapt_request_free; temp_request->req_status.MPI_SOURCE = 0; temp_request->req_status.MPI_TAG = 0; temp_request->req_status.MPI_ERROR = 0; @@ -356,9 +360,9 @@ int mca_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatyp } int ibcast_tag = opal_atomic_add_fetch_32(&(comm->c_ibcast_tag), 1); ibcast_tag = ibcast_tag % 4096; - mca_coll_adapt_ibcast_fn_t bcast_func = - (mca_coll_adapt_ibcast_fn_t) - mca_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm]. + ompi_coll_adapt_ibcast_fn_t bcast_func = + (ompi_coll_adapt_ibcast_fn_t) + ompi_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm]. 
algorithm_fn_ptr; return bcast_func(buff, count, datatype, root, comm, request, module, ibcast_tag); } @@ -367,72 +371,81 @@ int mca_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatyp /* * Ibcast functions with different algorithms */ -int mca_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, +int ompi_coll_adapt_ibcast_tuned(void *buff, int count, struct ompi_datatype_t *datatype, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t *module, int ibcast_tag) +{ + OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n")); + return OMPI_SUCCESS; +} + +int ompi_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_bmtree(comm, root); int err = - mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ibcast_segment_size, ibcast_tag); return err; } -int mca_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, +int ompi_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_in_order_bmtree(comm, root); int err = - mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ibcast_segment_size, ibcast_tag); return err; } -int mca_coll_adapt_ibcast_binary(void *buff, int count, struct 
ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_tree(2, comm, root); int err = - mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ibcast_segment_size, ibcast_tag); return err; } -int mca_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype, +int ompi_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(1, comm, root); int err = - mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ibcast_segment_size, ibcast_tag); return err; } -int mca_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(4, comm, root); int err = - mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ibcast_segment_size, ibcast_tag); return err; } -int mca_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int 
root, +int ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ibcast_tag) { @@ -446,14 +459,14 @@ int mca_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t * tree = ompi_coll_base_topo_build_tree(MAXTREEFANOUT, comm, root); } int err = - mca_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ibcast_segment_size, ibcast_tag); return err; } -int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t *datatype, int root, +int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, ompi_coll_tree_t * tree, size_t seg_size, int ibcast_tag) @@ -494,9 +507,9 @@ int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t if (1 == context_free_list_enabled) { mca_coll_adapt_component.adapt_ibcast_context_free_list = OBJ_NEW(opal_free_list_t); opal_free_list_init(mca_coll_adapt_component.adapt_ibcast_context_free_list, - sizeof(mca_coll_adapt_bcast_context_t), + sizeof(ompi_coll_adapt_bcast_context_t), opal_cache_line_size, - OBJ_CLASS(mca_coll_adapt_bcast_context_t), + OBJ_CLASS(ompi_coll_adapt_bcast_context_t), 0, opal_cache_line_size, mca_coll_adapt_component.adapt_context_free_list_min, mca_coll_adapt_component.adapt_context_free_list_max, @@ -510,7 +523,7 @@ int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; temp_request->req_type = 0; - temp_request->req_free = adapt_request_free; + temp_request->req_free = ompi_coll_adapt_request_free; 
temp_request->req_status.MPI_SOURCE = 0; temp_request->req_status.MPI_TAG = 0; temp_request->req_status.MPI_ERROR = 0; @@ -540,7 +553,7 @@ int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t } /* Set constant context for send and recv call back */ - mca_coll_adapt_constant_bcast_context_t *con = OBJ_NEW(mca_coll_adapt_constant_bcast_context_t); + ompi_coll_adapt_constant_bcast_context_t *con = OBJ_NEW(ompi_coll_adapt_constant_bcast_context_t); con->root = root; con->count = count; con->seg_count = seg_count; @@ -582,7 +595,7 @@ int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t recv_array[i] = i; } con->num_recv_segs = num_segs; - /* Set send_array, will send adapt_ibcast_max_send_requests segments */ + /* Set send_array, will send adapt_ibcast_max_send_requests segments */ for (i = 0; i < tree->tree_nextsize; i++) { send_array[i] = mca_coll_adapt_component.adapt_ibcast_max_send_requests; } @@ -595,8 +608,8 @@ int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t send_count = count - i * seg_count; } for (j = 0; j < tree->tree_nextsize; j++) { - mca_coll_adapt_bcast_context_t *context = - (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. + ompi_coll_adapt_bcast_context_t *context = + (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. adapt_ibcast_context_free_list); context->buff = (char *) buff + i * real_seg_size; context->frag_id = i; @@ -656,8 +669,8 @@ int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t if (i == (num_segs - 1)) { recv_count = count - i * seg_count; } - mca_coll_adapt_bcast_context_t *context = - (mca_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component. + ompi_coll_adapt_bcast_context_t *context = + (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component.
adapt_ibcast_context_free_list); context->buff = (char *) buff + i * real_seg_size; context->frag_id = i; @@ -691,4 +704,4 @@ int mca_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t "[%d]: End of Ibcast\n", rank)); return MPI_SUCCESS; -} \ No newline at end of file +} diff --git a/ompi/mca/coll/adapt/coll_adapt_inbuf.c b/ompi/mca/coll/adapt/coll_adapt_inbuf.c index 79162966624..a1723ac13d0 100644 --- a/ompi/mca/coll/adapt/coll_adapt_inbuf.c +++ b/ompi/mca/coll/adapt/coll_adapt_inbuf.c @@ -12,13 +12,13 @@ #include "coll_adapt.h" #include "coll_adapt_inbuf.h" -static void mca_coll_adapt_inbuf_constructor(mca_coll_adapt_inbuf_t * inbuf) +static void ompi_coll_adapt_inbuf_constructor(ompi_coll_adapt_inbuf_t * inbuf) { } -static void mca_coll_adapt_inbuf_destructor(mca_coll_adapt_inbuf_t * inbuf) +static void ompi_coll_adapt_inbuf_destructor(ompi_coll_adapt_inbuf_t * inbuf) { } -OBJ_CLASS_INSTANCE(mca_coll_adapt_inbuf_t, opal_free_list_item_t, mca_coll_adapt_inbuf_constructor, - mca_coll_adapt_inbuf_destructor); +OBJ_CLASS_INSTANCE(ompi_coll_adapt_inbuf_t, opal_free_list_item_t, ompi_coll_adapt_inbuf_constructor, + ompi_coll_adapt_inbuf_destructor); diff --git a/ompi/mca/coll/adapt/coll_adapt_inbuf.h b/ompi/mca/coll/adapt/coll_adapt_inbuf.h index 1d450e59ff7..93c3060333b 100644 --- a/ompi/mca/coll/adapt/coll_adapt_inbuf.h +++ b/ompi/mca/coll/adapt/coll_adapt_inbuf.h @@ -14,13 +14,13 @@ #include "opal/class/opal_free_list.h" -struct mca_coll_adapt_inbuf_s { +struct ompi_coll_adapt_inbuf_s { opal_free_list_item_t super; char buff[1]; }; -typedef struct mca_coll_adapt_inbuf_s mca_coll_adapt_inbuf_t; +typedef struct ompi_coll_adapt_inbuf_s ompi_coll_adapt_inbuf_t; -OBJ_CLASS_DECLARATION(mca_coll_adapt_inbuf_t); +OBJ_CLASS_DECLARATION(ompi_coll_adapt_inbuf_t); #endif /* MCA_COLL_ADAPT_INBUF_H */ diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index d99bb87f998..f90c14874f8 100644 --- 
a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -24,7 +24,7 @@ /* MPI_Reduce and MPI_Ireduce in the ADAPT module only work for commutative operations */ -typedef int (*mca_coll_adapt_ireduce_fn_t) (const void *sbuf, +typedef int (*ompi_coll_adapt_ireduce_fn_t) (const void *sbuf, void *rbuf, int count, struct ompi_datatype_t * datatype, @@ -34,19 +34,20 @@ typedef int (*mca_coll_adapt_ireduce_fn_t) (const void *sbuf, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag); -static mca_coll_adapt_algorithm_index_t mca_coll_adapt_ireduce_algorithm_index[] = { - {1, (uintptr_t) mca_coll_adapt_ireduce_binomial}, - {2, (uintptr_t) mca_coll_adapt_ireduce_in_order_binomial}, - {3, (uintptr_t) mca_coll_adapt_ireduce_binary}, - {4, (uintptr_t) mca_coll_adapt_ireduce_pipeline}, - {5, (uintptr_t) mca_coll_adapt_ireduce_chain}, - {6, (uintptr_t) mca_coll_adapt_ireduce_linear}, +static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ireduce_algorithm_index[] = { + {0, (uintptr_t)ompi_coll_adapt_ireduce_tuned}, + {1, (uintptr_t) ompi_coll_adapt_ireduce_binomial}, + {2, (uintptr_t) ompi_coll_adapt_ireduce_in_order_binomial}, + {3, (uintptr_t) ompi_coll_adapt_ireduce_binary}, + {4, (uintptr_t) ompi_coll_adapt_ireduce_pipeline}, + {5, (uintptr_t) ompi_coll_adapt_ireduce_chain}, + {6, (uintptr_t) ompi_coll_adapt_ireduce_linear}, }; /* * Set up MCA parameters of MPI_Reduce and MPI_Ireduce */ -int mca_coll_adapt_ireduce_init(void) +int ompi_coll_adapt_ireduce_init(void) { mca_base_component_t *c = &mca_coll_adapt_component.super.collm_version; @@ -111,9 +112,9 @@ int mca_coll_adapt_ireduce_init(void) } /* - * Release the free list created in mca_coll_adapt_ireduce_generic + * Release the free list created in ompi_coll_adapt_ireduce_generic */ -int mca_coll_adapt_ireduce_fini(void) +int ompi_coll_adapt_ireduce_fini(void) { if (NULL != mca_coll_adapt_component.adapt_ireduce_context_free_list) { 
OBJ_RELEASE(mca_coll_adapt_component.adapt_ireduce_context_free_list); @@ -127,15 +128,15 @@ int mca_coll_adapt_ireduce_fini(void) /* * Functions to access list */ -static mca_coll_adapt_item_t *get_next_ready_item(opal_list_t * list, int num_children) +static ompi_coll_adapt_item_t *get_next_ready_item(opal_list_t * list, int num_children) { - mca_coll_adapt_item_t *item; + ompi_coll_adapt_item_t *item; if (opal_list_is_empty(list)) { return NULL; } - for (item = (mca_coll_adapt_item_t *) opal_list_get_first(list); - item != (mca_coll_adapt_item_t *) opal_list_get_end(list); - item = (mca_coll_adapt_item_t *) ((opal_list_item_t *) item)->opal_list_next) { + for (item = (ompi_coll_adapt_item_t *) opal_list_get_first(list); + item != (ompi_coll_adapt_item_t *) opal_list_get_end(list); + item = (ompi_coll_adapt_item_t *) ((opal_list_item_t *) item)->opal_list_next) { if (item->count == num_children) { opal_list_remove_item(list, (opal_list_item_t *) item); return item; @@ -146,11 +147,11 @@ static mca_coll_adapt_item_t *get_next_ready_item(opal_list_t * list, int num_ch static int add_to_list(opal_list_t * list, int id) { - mca_coll_adapt_item_t *item; + ompi_coll_adapt_item_t *item; int ret = 0; - for (item = (mca_coll_adapt_item_t *) opal_list_get_first(list); - item != (mca_coll_adapt_item_t *) opal_list_get_end(list); - item = (mca_coll_adapt_item_t *) ((opal_list_item_t *) item)->opal_list_next) { + for (item = (ompi_coll_adapt_item_t *) opal_list_get_first(list); + item != (ompi_coll_adapt_item_t *) opal_list_get_end(list); + item = (ompi_coll_adapt_item_t *) ((opal_list_item_t *) item)->opal_list_next) { if (item->id == id) { (item->count)++; ret = 1; @@ -158,7 +159,7 @@ static int add_to_list(opal_list_t * list, int id) } } if (ret == 0) { - item = OBJ_NEW(mca_coll_adapt_item_t); + item = OBJ_NEW(ompi_coll_adapt_item_t); item->id = id; item->count = 1; opal_list_append(list, (opal_list_item_t *) item); @@ -172,15 +173,15 @@ static int add_to_list(opal_list_t 
* list, int id) /* * Get the inbuf address */ -static mca_coll_adapt_inbuf_t *to_inbuf(char *buf, int distance) +static ompi_coll_adapt_inbuf_t *to_inbuf(char *buf, int distance) { - return (mca_coll_adapt_inbuf_t *) (buf - distance); + return (ompi_coll_adapt_inbuf_t *) (buf - distance); } /* * Finish a ireduce request */ -static int ireduce_request_fini(mca_coll_adapt_reduce_context_t * context) +static int ireduce_request_fini(ompi_coll_adapt_reduce_context_t * context) { /* Return the allocated recourses */ int i; @@ -227,8 +228,8 @@ static int ireduce_request_fini(mca_coll_adapt_reduce_context_t * context) */ static int send_cb(ompi_request_t * req) { - mca_coll_adapt_reduce_context_t *context = - (mca_coll_adapt_reduce_context_t *) req->req_complete_cb_data; + ompi_coll_adapt_reduce_context_t *context = + (ompi_coll_adapt_reduce_context_t *) req->req_complete_cb_data; OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: ireduce_send_cb, peer %d, seg_id %d\n", context->con->rank, context->peer, context->frag_id)); @@ -238,14 +239,14 @@ static int send_cb(ompi_request_t * req) /* Send a new segment */ OPAL_THREAD_LOCK(context->con->mutex_recv_list); - mca_coll_adapt_item_t *item = + ompi_coll_adapt_item_t *item = get_next_ready_item(context->con->recv_list, context->con->tree->tree_nextsize); OPAL_THREAD_UNLOCK(context->con->mutex_recv_list); if (item != NULL) { /* Get new context item from free list */ - mca_coll_adapt_reduce_context_t *send_context = - (mca_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. + ompi_coll_adapt_reduce_context_t *send_context = + (ompi_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. 
adapt_ireduce_context_free_list); if (context->con->tree->tree_nextsize > 0) { send_context->buff = context->con->accumbuf[item->id]; @@ -316,8 +317,8 @@ static int send_cb(ompi_request_t * req) */ static int recv_cb(ompi_request_t * req) { - mca_coll_adapt_reduce_context_t *context = - (mca_coll_adapt_reduce_context_t *) req->req_complete_cb_data; + ompi_coll_adapt_reduce_context_t *context = + (ompi_coll_adapt_reduce_context_t *) req->req_complete_cb_data; OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: ireduce_recv_cb, peer %d, seg_id %d\n", context->con->rank, context->peer, context->frag_id)); @@ -329,7 +330,7 @@ static int recv_cb(ompi_request_t * req) /* Receive new segment */ if (new_id < context->con->num_segs) { char *temp_recv_buf = NULL; - mca_coll_adapt_inbuf_t *inbuf = NULL; + ompi_coll_adapt_inbuf_t *inbuf = NULL; /* Set inbuf, if it it first child, recv on rbuf, else recv on inbuf */ if (context->child_id == 0 && context->con->sbuf != MPI_IN_PLACE && context->con->root == context->con->rank) { @@ -339,12 +340,12 @@ static int recv_cb(ompi_request_t * req) } else { OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: In recv_cb, alloc inbuf\n", context->con->rank)); - inbuf = (mca_coll_adapt_inbuf_t *) opal_free_list_wait(context->con->inbuf_list); + inbuf = (ompi_coll_adapt_inbuf_t *) opal_free_list_wait(context->con->inbuf_list); temp_recv_buf = inbuf->buff - context->con->lower_bound; } /* Get new context item from free list */ - mca_coll_adapt_reduce_context_t *recv_context = - (mca_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. + ompi_coll_adapt_reduce_context_t *recv_context = + (ompi_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. 
adapt_ireduce_context_free_list); recv_context->buff = temp_recv_buf; recv_context->frag_id = new_id; @@ -372,7 +373,7 @@ static int recv_cb(ompi_request_t * req) if (MPI_SUCCESS != err) { return err; } - /* Invoke recvive call back */ + /* Invoke receive call back */ ompi_request_set_callback(recv_req, recv_cb, recv_context); } @@ -443,14 +444,14 @@ static int recv_cb(ompi_request_t * req) if (context->con->rank != context->con->tree->tree_root && context->con->ongoing_send < mca_coll_adapt_component.adapt_ireduce_max_send_requests) { OPAL_THREAD_LOCK(context->con->mutex_recv_list); - mca_coll_adapt_item_t *item = + ompi_coll_adapt_item_t *item = get_next_ready_item(context->con->recv_list, context->con->tree->tree_nextsize); OPAL_THREAD_UNLOCK(context->con->mutex_recv_list); if (item != NULL) { - /* Gt new context item from free list */ - mca_coll_adapt_reduce_context_t *send_context = - (mca_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. + /* Get new context item from free list */ + ompi_coll_adapt_reduce_context_t *send_context = + (ompi_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. 
adapt_ireduce_context_free_list); send_context->buff = context->con->accumbuf[context->frag_id]; send_context->frag_id = item->id; @@ -523,7 +524,7 @@ static int recv_cb(ompi_request_t * req) return 1; } -int mca_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, +int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module) { @@ -543,9 +544,9 @@ int mca_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_ int ireduce_tag = opal_atomic_add_fetch_32(&(comm->c_ireduce_tag), 1); ireduce_tag = (ireduce_tag % 4096) + 4096; fflush(stdout); - mca_coll_adapt_ireduce_fn_t reduce_func = - (mca_coll_adapt_ireduce_fn_t) - mca_coll_adapt_ireduce_algorithm_index[mca_coll_adapt_component. + ompi_coll_adapt_ireduce_fn_t reduce_func = + (ompi_coll_adapt_ireduce_fn_t) + ompi_coll_adapt_ireduce_algorithm_index[mca_coll_adapt_component. 
adapt_ireduce_algorithm].algorithm_fn_ptr; return reduce_func(sbuf, rbuf, count, dtype, op, root, comm, request, module, ireduce_tag); } @@ -554,20 +555,30 @@ int mca_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_ /* * Ireduce functions with different algorithms */ -int mca_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_tuned(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t *module, int ireduce_tag) +{ + OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n")); + return OMPI_SUCCESS; +} + +int ompi_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_bmtree(comm, root); int err = - mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ireduce_segment_size, ireduce_tag); return err; } -int mca_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, @@ -575,53 +586,53 @@ int mca_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int c { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_in_order_bmtree(comm, root); int err = - mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, 
dtype, op, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ireduce_segment_size, ireduce_tag); return err; } -int mca_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_tree(2, comm, root); int err = - mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ireduce_segment_size, ireduce_tag); return err; } -int mca_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(1, comm, root); int err = - mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ireduce_segment_size, ireduce_tag); return err; } -int mca_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(4, comm, root); int err = - mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, 
+ ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ireduce_segment_size, ireduce_tag); return err; } -int mca_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, int ireduce_tag) @@ -636,14 +647,14 @@ int mca_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, tree = ompi_coll_base_topo_build_tree(MAXTREEFANOUT, comm, root); } int err = - mca_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, + ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, tree, mca_coll_adapt_component.adapt_ireduce_segment_size, ireduce_tag); return err; } -int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, +int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module, ompi_coll_tree_t * tree, @@ -655,7 +666,7 @@ int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, size_t typelng; int seg_count = count, num_segs, rank, recv_count, send_count, i, j, err, min, distance = 0; int32_t seg_index; - _Atomic int *next_recv_segs = NULL; + opal_atomic_int_t *next_recv_segs = NULL; /* Used to store the accumuate result, pointer to every segment */ char **accumbuf = NULL; /* A free list contains all recv data */ @@ -686,9 +697,9 @@ int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, if (1 == context_free_list_enabled) { mca_coll_adapt_component.adapt_ireduce_context_free_list = OBJ_NEW(opal_free_list_t); 
opal_free_list_init(mca_coll_adapt_component.adapt_ireduce_context_free_list, - sizeof(mca_coll_adapt_reduce_context_t), + sizeof(ompi_coll_adapt_reduce_context_t), opal_cache_line_size, - OBJ_CLASS(mca_coll_adapt_reduce_context_t), + OBJ_CLASS(ompi_coll_adapt_reduce_context_t), 0, opal_cache_line_size, mca_coll_adapt_component.adapt_context_free_list_min, mca_coll_adapt_component.adapt_context_free_list_max, @@ -701,18 +712,18 @@ int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, if (tree->tree_nextsize > 0) { inbuf_list = OBJ_NEW(opal_free_list_t); opal_free_list_init(inbuf_list, - sizeof(mca_coll_adapt_inbuf_t) + real_seg_size, + sizeof(ompi_coll_adapt_inbuf_t) + real_seg_size, opal_cache_line_size, - OBJ_CLASS(mca_coll_adapt_inbuf_t), + OBJ_CLASS(ompi_coll_adapt_inbuf_t), 0, opal_cache_line_size, mca_coll_adapt_component.adapt_inbuf_free_list_min, mca_coll_adapt_component.adapt_inbuf_free_list_max, mca_coll_adapt_component.adapt_inbuf_free_list_inc, NULL, 0, NULL, NULL, NULL); /* Set up next_recv_segs */ - next_recv_segs = (_Atomic int32_t *) malloc(sizeof(int32_t) * tree->tree_nextsize); - mca_coll_adapt_inbuf_t *temp_inbuf = - (mca_coll_adapt_inbuf_t *) opal_free_list_wait(inbuf_list); + next_recv_segs = (opal_atomic_int32_t *) malloc(sizeof(int32_t) * tree->tree_nextsize); + ompi_coll_adapt_inbuf_t *temp_inbuf = + (ompi_coll_adapt_inbuf_t *) opal_free_list_wait(inbuf_list); distance = (char *) temp_inbuf->buff - lower_bound - (char *) temp_inbuf; //address of inbuf->buff to address of inbuf OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: distance %d, inbuf %p, inbuf->buff %p, inbuf->buff-lb %p, to_inbuf %p, inbuf_list %p\n", @@ -732,7 +743,7 @@ int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; temp_request->req_type = 0; - temp_request->req_free = adapt_request_free; + temp_request->req_free = 
ompi_coll_adapt_request_free; temp_request->req_status.MPI_SOURCE = 0; temp_request->req_status.MPI_TAG = 0; temp_request->req_status.MPI_ERROR = 0; @@ -752,8 +763,8 @@ int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, recv_list = OBJ_NEW(opal_list_t); /* Set constant context for send and recv call back */ - mca_coll_adapt_constant_reduce_context_t *con = - OBJ_NEW(mca_coll_adapt_constant_reduce_context_t); + ompi_coll_adapt_constant_reduce_context_t *con = + OBJ_NEW(ompi_coll_adapt_constant_reduce_context_t); con->count = count; con->seg_count = seg_count; con->datatype = dtype; @@ -822,21 +833,21 @@ int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, recv_count = count - (ptrdiff_t) seg_count *(ptrdiff_t) seg_index; } char *temp_recv_buf = NULL; - mca_coll_adapt_inbuf_t *inbuf = NULL; + ompi_coll_adapt_inbuf_t *inbuf = NULL; /* Set inbuf, if it it first child, recv on rbuf, else recv on inbuf */ if (i == 0 && sbuf != MPI_IN_PLACE && root == rank) { temp_recv_buf = (char *) rbuf + (ptrdiff_t) j *(ptrdiff_t) segment_increment; } else { - inbuf = (mca_coll_adapt_inbuf_t *) opal_free_list_wait(inbuf_list); + inbuf = (ompi_coll_adapt_inbuf_t *) opal_free_list_wait(inbuf_list); OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: In ireduce, alloc inbuf %p\n", rank, (void *) inbuf)); temp_recv_buf = inbuf->buff - lower_bound; } /* Get context */ - mca_coll_adapt_reduce_context_t *context = - (mca_coll_adapt_reduce_context_t *) + ompi_coll_adapt_reduce_context_t *context = + (ompi_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component. 
adapt_ireduce_context_free_list); context->buff = temp_recv_buf; @@ -871,10 +882,10 @@ int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, /* Leaf nodes */ else { - mca_coll_adapt_item_t *item; + ompi_coll_adapt_item_t *item; /* Set up recv_list */ for (seg_index = 0; seg_index < num_segs; seg_index++) { - item = OBJ_NEW(mca_coll_adapt_item_t); + item = OBJ_NEW(ompi_coll_adapt_item_t); item->id = seg_index; item->count = tree->tree_nextsize; opal_list_append(recv_list, (opal_list_item_t *) item); @@ -894,8 +905,8 @@ int mca_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, if (item->id == (num_segs - 1)) { send_count = count - (ptrdiff_t) seg_count *(ptrdiff_t) item->id; } - mca_coll_adapt_reduce_context_t *context = - (mca_coll_adapt_reduce_context_t *) + ompi_coll_adapt_reduce_context_t *context = + (ompi_coll_adapt_reduce_context_t *) opal_free_list_wait(mca_coll_adapt_component.adapt_ireduce_context_free_list); context->buff = (char *) sbuf + (ptrdiff_t) item->id * (ptrdiff_t) segment_increment; diff --git a/ompi/mca/coll/adapt/coll_adapt_item.c b/ompi/mca/coll/adapt/coll_adapt_item.c index dabe2ce37b8..73258326a66 100644 --- a/ompi/mca/coll/adapt/coll_adapt_item.c +++ b/ompi/mca/coll/adapt/coll_adapt_item.c @@ -11,13 +11,13 @@ #include "coll_adapt_item.h" -static void mca_coll_adapt_item_constructor(mca_coll_adapt_item_t * item) +static void ompi_coll_adapt_item_constructor(ompi_coll_adapt_item_t * item) { } -static void mca_coll_adapt_item_destructor(mca_coll_adapt_item_t * item) +static void ompi_coll_adapt_item_destructor(ompi_coll_adapt_item_t * item) { } -OBJ_CLASS_INSTANCE(mca_coll_adapt_item_t, opal_list_item_t, mca_coll_adapt_item_constructor, - mca_coll_adapt_item_destructor); +OBJ_CLASS_INSTANCE(ompi_coll_adapt_item_t, opal_list_item_t, ompi_coll_adapt_item_constructor, + ompi_coll_adapt_item_destructor); diff --git a/ompi/mca/coll/adapt/coll_adapt_item.h b/ompi/mca/coll/adapt/coll_adapt_item.h index 
2fc6cbdbd03..768f9f29dc0 100644 --- a/ompi/mca/coll/adapt/coll_adapt_item.h +++ b/ompi/mca/coll/adapt/coll_adapt_item.h @@ -12,7 +12,7 @@ #include "opal/class/opal_list.h" #include "coll_adapt_inbuf.h" -struct mca_coll_adapt_item_s { +struct ompi_coll_adapt_item_s { opal_list_item_t super; /* Fragment id */ int id; @@ -20,6 +20,6 @@ struct mca_coll_adapt_item_s { int count; }; -typedef struct mca_coll_adapt_item_s mca_coll_adapt_item_t; +typedef struct ompi_coll_adapt_item_s ompi_coll_adapt_item_t; -OBJ_CLASS_DECLARATION(mca_coll_adapt_item_t); +OBJ_CLASS_DECLARATION(ompi_coll_adapt_item_t); diff --git a/ompi/mca/coll/adapt/coll_adapt_module.c b/ompi/mca/coll/adapt/coll_adapt_module.c index e709313361f..20f27d2ab24 100644 --- a/ompi/mca/coll/adapt/coll_adapt_module.c +++ b/ompi/mca/coll/adapt/coll_adapt_module.c @@ -14,17 +14,17 @@ #include #ifdef HAVE_STRING_H #include -#endif +#endif /* HAVE_STRING_H */ #ifdef HAVE_SCHED_H #include -#endif +#endif /* HAVE_SCHED_H */ #include #ifdef HAVE_SYS_MMAN_H #include -#endif /* HAVE_SYS_MMAN_H */ +#endif /* HAVE_SYS_MMAN_H */ #ifdef HAVE_UNISTD_H #include -#endif /* HAVE_UNISTD_H */ +#endif /* HAVE_UNISTD_H */ #include "mpi.h" #include "opal_stdint.h" @@ -35,7 +35,6 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" #include "ompi/mca/coll/base/coll_base_functions.h" -//#include "ompi/mca/rte/rte.h" #include "ompi/proc/proc.h" #include "coll_adapt.h" @@ -47,29 +46,37 @@ /* * Local functions */ -static int adapt_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); /* * Module constructor */ -static void mca_coll_adapt_module_construct(mca_coll_adapt_module_t * module) +static void adapt_module_construct(mca_coll_adapt_module_t * module) { - module->enabled = false; - module->adapt_component = &mca_coll_adapt_component; + module->adapt_enabled = false; } /* * Module destructor */ -static void mca_coll_adapt_module_destruct(mca_coll_adapt_module_t * module) +static void 
adapt_module_destruct(mca_coll_adapt_module_t * module) { - module->enabled = false; + module->adapt_enabled = false; } OBJ_CLASS_INSTANCE(mca_coll_adapt_module_t, - mca_coll_base_module_t, - mca_coll_adapt_module_construct, mca_coll_adapt_module_destruct); + mca_coll_base_module_t, + adapt_module_construct, + adapt_module_destruct); + +/* + * Init module on the communicator + */ +static int adapt_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + return OMPI_SUCCESS; +} /* * Initial query function that is invoked during MPI_INIT, allowing @@ -77,34 +84,37 @@ OBJ_CLASS_INSTANCE(mca_coll_adapt_module_t, * required level of thread support. This function is invoked exactly * once. */ -int mca_coll_adapt_init_query(bool enable_progress_threads, bool enable_mpi_threads) +int ompi_coll_adapt_init_query(bool enable_progress_threads, bool enable_mpi_threads) { return OMPI_SUCCESS; } - /* * Invoked when there's a new communicator that has been created. * Look at the communicator and decide which set of functions and * priority we want to return. */ -mca_coll_base_module_t *mca_coll_adapt_comm_query(struct ompi_communicator_t * comm, int *priority) +mca_coll_base_module_t *ompi_coll_adapt_comm_query(struct ompi_communicator_t * comm, + int *priority) { mca_coll_adapt_module_t *adapt_module; /* If we're intercomm, or if there's only one process in the communicator */ if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, - "coll:adapt:comm_query (%d/%s): intercomm, comm is too small; disqualifying myself", + "coll:adapt:comm_query (%d/%s): intercomm, " + "comm is too small; disqualifying myself", comm->c_contextid, comm->c_name); return NULL; } - /* Get the priority level attached to this module. If priority is less than or equal to 0, then the module is unavailable. */ + /* Get the priority level attached to this module. 
+ If priority is less than or equal to 0, then the module is unavailable. */ *priority = mca_coll_adapt_component.adapt_priority; if (mca_coll_adapt_component.adapt_priority <= 0) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, - "coll:adapt:comm_query (%d/%s): priority too low; disqualifying myself", + "coll:adapt:comm_query (%d/%s): priority too low; " + "disqualifying myself", comm->c_contextid, comm->c_name); return NULL; } @@ -123,17 +133,17 @@ mca_coll_base_module_t *mca_coll_adapt_comm_query(struct ompi_communicator_t * c adapt_module->super.coll_alltoall = NULL; adapt_module->super.coll_alltoallw = NULL; adapt_module->super.coll_barrier = NULL; - adapt_module->super.coll_bcast = mca_coll_adapt_bcast; + adapt_module->super.coll_bcast = ompi_coll_adapt_bcast; adapt_module->super.coll_exscan = NULL; adapt_module->super.coll_gather = NULL; adapt_module->super.coll_gatherv = NULL; - adapt_module->super.coll_reduce = mca_coll_adapt_reduce; + adapt_module->super.coll_reduce = ompi_coll_adapt_reduce; adapt_module->super.coll_reduce_scatter = NULL; adapt_module->super.coll_scan = NULL; adapt_module->super.coll_scatter = NULL; adapt_module->super.coll_scatterv = NULL; - adapt_module->super.coll_ibcast = mca_coll_adapt_ibcast; - adapt_module->super.coll_ireduce = mca_coll_adapt_ireduce; + adapt_module->super.coll_ibcast = ompi_coll_adapt_ibcast; + adapt_module->super.coll_ireduce = ompi_coll_adapt_ireduce; adapt_module->super.coll_iallreduce = NULL; opal_output_verbose(10, ompi_coll_base_framework.framework_output, @@ -143,17 +153,9 @@ mca_coll_base_module_t *mca_coll_adapt_comm_query(struct ompi_communicator_t * c } /* - * Init module on the communicator - */ -static int adapt_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm) -{ - return OMPI_SUCCESS; -} - -/* - * Free ADAPT request + * Free ADAPT request */ -int adapt_request_free(ompi_request_t ** request) +int ompi_coll_adapt_request_free(ompi_request_t ** 
request) { (*request)->req_state = OMPI_REQUEST_INVALID; OBJ_RELEASE(*request); diff --git a/ompi/mca/coll/adapt/coll_adapt_reduce.c b/ompi/mca/coll/adapt/coll_adapt_reduce.c index f41afe21484..e45bb3478a9 100644 --- a/ompi/mca/coll/adapt/coll_adapt_reduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_reduce.c @@ -13,7 +13,7 @@ #include "coll_adapt_algorithms.h" /* MPI_Reduce and MPI_Ireduce in the ADAPT module only work for commutative operations */ -int mca_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, +int ompi_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { @@ -22,7 +22,7 @@ int mca_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_d } else { ompi_request_t *request; int err = - mca_coll_adapt_ireduce(sbuf, rbuf, count, dtype, op, root, comm, &request, module); + ompi_coll_adapt_ireduce(sbuf, rbuf, count, dtype, op, root, comm, &request, module); ompi_request_wait(&request, MPI_STATUS_IGNORE); return err; } From d71264569e92f45d72edb26312206c29e7e2949a Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 7 May 2020 14:42:02 -0400 Subject: [PATCH 3/7] Fix the atomic management of the bcast and reduce freelist API consistent with other collective modules Add comments Other minor cleanups. 
Signed-off-by: George Bosilca --- ompi/communicator/communicator.h | 14 ++- ompi/mca/coll/adapt/Makefile.am | 6 +- ompi/mca/coll/adapt/coll_adapt.h | 7 +- ompi/mca/coll/adapt/coll_adapt_algorithms.h | 113 +++++++------------- ompi/mca/coll/adapt/coll_adapt_bcast.c | 13 ++- ompi/mca/coll/adapt/coll_adapt_component.c | 43 +++----- ompi/mca/coll/adapt/coll_adapt_context.c | 51 +-------- ompi/mca/coll/adapt/coll_adapt_ibcast.c | 92 +++++++--------- ompi/mca/coll/adapt/coll_adapt_inbuf.c | 12 +-- ompi/mca/coll/adapt/coll_adapt_ireduce.c | 97 ++++++++--------- ompi/mca/coll/adapt/coll_adapt_item.c | 12 +-- ompi/mca/coll/adapt/coll_adapt_item.h | 2 +- ompi/mca/coll/adapt/coll_adapt_reduce.c | 14 +-- ompi/mca/coll/base/coll_base_functions.h | 4 - ompi/mca/coll/tuned/coll_tuned_component.c | 13 +-- ompi/request/request.h | 21 ++-- 16 files changed, 185 insertions(+), 329 deletions(-) diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index be7e7acea4e..c642ab4bfb8 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -187,10 +187,14 @@ struct ompi_communicator_t { /* Collectives module interface and data */ mca_coll_base_comm_coll_t *c_coll; - - /* Non-blocking collective tag */ - _Atomic int32_t c_ibcast_tag; - _Atomic int32_t c_ireduce_tag; + + /* Non-blocking collective tag. 
These are added here as they should be + * shared between all non-blocking collective modules (to avoid message + * collisions between them in the case where multiple outstanding + * non-blocking collective coexists using multiple backends). + */ + opal_atomic_int32_t c_ibcast_tag; + opal_atomic_int32_t c_ireduce_tag; }; typedef struct ompi_communicator_t ompi_communicator_t; diff --git a/ompi/mca/coll/adapt/Makefile.am b/ompi/mca/coll/adapt/Makefile.am index 157304e3118..59c97a5a76d 100644 --- a/ompi/mca/coll/adapt/Makefile.am +++ b/ompi/mca/coll/adapt/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (c) 2014 The University of Tennessee and The University +# Copyright (c) 2014-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # $COPYRIGHT$ @@ -13,11 +13,11 @@ sources = \ coll_adapt_component.c \ coll_adapt_module.c \ - coll_adapt_bcast.c \ + coll_adapt_bcast.c \ coll_adapt_ibcast.c \ coll_adapt_reduce.c \ coll_adapt_ireduce.c \ - coll_adapt.h \ + coll_adapt.h \ coll_adapt_algorithms.h \ coll_adapt_context.h \ coll_adapt_context.c \ diff --git a/ompi/mca/coll/adapt/coll_adapt.h b/ompi/mca/coll/adapt/coll_adapt.h index b2a8fcb949c..a5c5b4a5f4a 100644 --- a/ompi/mca/coll/adapt/coll_adapt.h +++ b/ompi/mca/coll/adapt/coll_adapt.h @@ -38,8 +38,9 @@ typedef struct mca_coll_adapt_component_t { /* MCA parameter: Priority of this component */ int adapt_priority; - /* MCA parameter: Output verbose level */ + /* MCA parameter: Output stream and verbose level */ int adapt_output; + int adapt_verbose; /* MCA parameter: Maximum number of segment in context free list */ int adapt_context_free_list_max; @@ -57,7 +58,6 @@ typedef struct mca_coll_adapt_component_t { int adapt_ibcast_max_recv_requests; /* Bcast free list */ opal_free_list_t *adapt_ibcast_context_free_list; - opal_atomic_int32_t adapt_ibcast_context_free_list_enabled; /* Reduce MCA parameter */ int adapt_ireduce_algorithm; @@ -70,7 +70,6 @@ typedef struct 
mca_coll_adapt_component_t { /* Reduce free list */ opal_free_list_t *adapt_ireduce_context_free_list; - opal_atomic_int32_t adapt_ireduce_context_free_list_enabled; } mca_coll_adapt_component_t; @@ -91,7 +90,7 @@ OMPI_MODULE_DECLSPEC extern mca_coll_adapt_component_t mca_coll_adapt_component; int ompi_coll_adapt_init_query(bool enable_progress_threads, bool enable_mpi_threads); mca_coll_base_module_t * ompi_coll_adapt_comm_query(struct ompi_communicator_t *comm, int *priority); -/* Free ADAPT quest */ +/* ADAPT request free */ int ompi_coll_adapt_request_free(ompi_request_t **request); #endif /* MCA_COLL_ADAPT_EXPORT_H */ diff --git a/ompi/mca/coll/adapt/coll_adapt_algorithms.h b/ompi/mca/coll/adapt/coll_adapt_algorithms.h index f0b67b787d8..a25d8afb622 100644 --- a/ompi/mca/coll/adapt/coll_adapt_algorithms.h +++ b/ompi/mca/coll/adapt/coll_adapt_algorithms.h @@ -20,82 +20,45 @@ typedef struct ompi_coll_adapt_algorithm_index_s { } ompi_coll_adapt_algorithm_index_t; /* Bcast */ -int ompi_coll_adapt_ibcast_init(void); +int ompi_coll_adapt_ibcast_register(void); int ompi_coll_adapt_ibcast_fini(void); -int ompi_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module); -int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, ompi_coll_tree_t * tree, - size_t seg_size, int ibcast_tag); -int ompi_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, mca_coll_base_module_t * module, - int ibcast_tag); -int 
ompi_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag); -int ompi_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag); -int ompi_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, mca_coll_base_module_t * module, - int ibcast_tag); -int ompi_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag); -int ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag); -int ompi_coll_adapt_ibcast_tuned(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t *module, int ibcast_tag); +int ompi_coll_adapt_bcast(BCAST_ARGS); +int ompi_coll_adapt_ibcast(IBCAST_ARGS); +int ompi_coll_adapt_ibcast_generic(IBCAST_ARGS, + ompi_coll_tree_t * tree, size_t seg_size, int ibcast_tag); +int ompi_coll_adapt_ibcast_binomial(IBCAST_ARGS, + int ibcast_tag); +int ompi_coll_adapt_ibcast_in_order_binomial(IBCAST_ARGS, + int ibcast_tag); +int ompi_coll_adapt_ibcast_binary(IBCAST_ARGS, + int ibcast_tag); +int ompi_coll_adapt_ibcast_pipeline(IBCAST_ARGS, + int ibcast_tag); +int ompi_coll_adapt_ibcast_chain(IBCAST_ARGS, + int ibcast_tag); +int ompi_coll_adapt_ibcast_linear(IBCAST_ARGS, + int ibcast_tag); +int ompi_coll_adapt_ibcast_tuned(IBCAST_ARGS, + int ibcast_tag); /* Reduce */ 
-int ompi_coll_adapt_ireduce_init(void); +int ompi_coll_adapt_ireduce_register(void); int ompi_coll_adapt_ireduce_fini(void); -int ompi_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); -int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, mca_coll_base_module_t * module); -int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, ompi_coll_tree_t * tree, - size_t seg_size, int ireduce_tag); -int ompi_coll_adapt_ireduce_tuned(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t *module, int ireduce_tag); -int ompi_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag); -int ompi_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag); -int ompi_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag); -int ompi_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, - struct 
ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag); -int ompi_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag); -int ompi_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag); +int ompi_coll_adapt_reduce(REDUCE_ARGS); +int ompi_coll_adapt_ireduce(IREDUCE_ARGS); +int ompi_coll_adapt_ireduce_generic(IREDUCE_ARGS, + ompi_coll_tree_t * tree, size_t seg_size, int ireduce_tag); +int ompi_coll_adapt_ireduce_tuned(IREDUCE_ARGS, + int ireduce_tag); +int ompi_coll_adapt_ireduce_binomial(IREDUCE_ARGS, + int ireduce_tag); +int ompi_coll_adapt_ireduce_in_order_binomial(IREDUCE_ARGS, + int ireduce_tag); +int ompi_coll_adapt_ireduce_binary(IREDUCE_ARGS, + int ireduce_tag); +int ompi_coll_adapt_ireduce_pipeline(IREDUCE_ARGS, + int ireduce_tag); +int ompi_coll_adapt_ireduce_chain(IREDUCE_ARGS, + int ireduce_tag); +int ompi_coll_adapt_ireduce_linear(IREDUCE_ARGS, + int ireduce_tag); diff --git a/ompi/mca/coll/adapt/coll_adapt_bcast.c b/ompi/mca/coll/adapt/coll_adapt_bcast.c index 604898b2e54..2497b6b9905 100644 --- a/ompi/mca/coll/adapt/coll_adapt_bcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_bcast.c @@ -17,10 +17,13 @@ int ompi_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatyp { if (count == 0) { return MPI_SUCCESS; - } else { - ompi_request_t *request; - int err = ompi_coll_adapt_ibcast(buff, count, datatype, root, comm, &request, module); - ompi_request_wait(&request, MPI_STATUS_IGNORE); - return err; } + ompi_request_t *request = NULL; + int err = 
ompi_coll_adapt_ibcast(buff, count, datatype, root, comm, &request, module); + if( MPI_SUCCESS != err ) { + if( NULL == request ) + return err; + } + ompi_request_wait(&request, MPI_STATUS_IGNORE); + return err; } diff --git a/ompi/mca/coll/adapt/coll_adapt_component.c b/ompi/mca/coll/adapt/coll_adapt_component.c index d38cd42b42b..6ec9964e902 100644 --- a/ompi/mca/coll/adapt/coll_adapt_component.c +++ b/ompi/mca/coll/adapt/coll_adapt_component.c @@ -65,11 +65,10 @@ mca_coll_adapt_component_t mca_coll_adapt_component = { /* adapt-component specific information */ - /* (default) priority */ - 0, + 0, /* (default) priority */ - /* (default) verbose level */ - 0, + 0, /* (default) output stream */ + 0, /* (default) verbose level */ /* default values for non-MCA parameters */ /* Not specifying values here gives us all 0's */ @@ -78,25 +77,13 @@ mca_coll_adapt_component_t mca_coll_adapt_component = { /* Open the component */ static int adapt_open(void) { - int param; mca_coll_adapt_component_t *cs = &mca_coll_adapt_component; - /* - * Get the global coll verbosity: it will be ours - */ - param = mca_base_var_find("ompi", "coll", "base", "verbose"); - if (param >= 0) { - const int *verbose = NULL; - mca_base_var_get_value(param, &verbose, NULL, NULL); - if (verbose && verbose[0] > 0) { - cs->adapt_output = opal_output_open(NULL); - opal_output_set_verbosity(cs->adapt_output, verbose[0]); - } + if (cs->adapt_verbose > 0) { + cs->adapt_output = opal_output_open(NULL); + opal_output_set_verbosity(cs->adapt_output, cs->adapt_verbose); } - opal_output_verbose(1, cs->adapt_output, - "coll:adapt:component_open: done!"); - return OMPI_SUCCESS; } @@ -131,16 +118,14 @@ static int adapt_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_priority); - int adapt_verbose = 0; + cs->adapt_verbose = ompi_coll_base_framework.framework_verbose; (void) mca_base_component_var_register(c, "verbose", - "Verbose level", + "Verbose level (default set to the collective 
framework verbosity)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &adapt_verbose); - cs->adapt_output = opal_output_open(NULL); - opal_output_set_verbosity(cs->adapt_output, adapt_verbose); + MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_verbose); - cs->adapt_context_free_list_min = 10; + cs->adapt_context_free_list_min = 64; (void) mca_base_component_var_register(c, "context_free_list_min", "Minimum number of segments in context free list", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -148,7 +133,7 @@ static int adapt_register(void) MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_context_free_list_min); - cs->adapt_context_free_list_max = 10000; + cs->adapt_context_free_list_max = 1024; (void) mca_base_component_var_register(c, "context_free_list_max", "Maximum number of segments in context free list", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -156,15 +141,15 @@ static int adapt_register(void) MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_context_free_list_max); - cs->adapt_context_free_list_inc = 10; + cs->adapt_context_free_list_inc = 32; (void) mca_base_component_var_register(c, "context_free_list_inc", "Increasement number of segments in context free list", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_context_free_list_inc); - ompi_coll_adapt_ibcast_init(); - ompi_coll_adapt_ireduce_init(); + ompi_coll_adapt_ibcast_register(); + ompi_coll_adapt_ireduce_register(); return adapt_verify_mca_variables(); } diff --git a/ompi/mca/coll/adapt/coll_adapt_context.c b/ompi/mca/coll/adapt/coll_adapt_context.c index be03127f23c..087eccc9ba9 100644 --- a/ompi/mca/coll/adapt/coll_adapt_context.c +++ b/ompi/mca/coll/adapt/coll_adapt_context.c @@ -12,58 +12,15 @@ #include "ompi/mca/coll/coll.h" #include "coll_adapt_context.h" -static void ompi_coll_adapt_bcast_context_constructor(ompi_coll_adapt_bcast_context_t * bcast_context) -{ -} - -static void ompi_coll_adapt_bcast_context_destructor(ompi_coll_adapt_bcast_context_t * 
bcast_context) -{ -} - -static void -ompi_coll_adapt_constant_bcast_context_constructor(ompi_coll_adapt_constant_bcast_context_t * con) -{ -} - -static void ompi_coll_adapt_constant_bcast_context_destructor(ompi_coll_adapt_constant_bcast_context_t - * con) -{ -} - OBJ_CLASS_INSTANCE(ompi_coll_adapt_bcast_context_t, opal_free_list_item_t, - ompi_coll_adapt_bcast_context_constructor, - ompi_coll_adapt_bcast_context_destructor); + NULL, NULL); OBJ_CLASS_INSTANCE(ompi_coll_adapt_constant_bcast_context_t, opal_object_t, - ompi_coll_adapt_constant_bcast_context_constructor, - ompi_coll_adapt_constant_bcast_context_destructor); - -static void ompi_coll_adapt_reduce_context_constructor(ompi_coll_adapt_reduce_context_t * - reduce_context) -{ -} - -static void ompi_coll_adapt_reduce_context_destructor(ompi_coll_adapt_reduce_context_t * - reduce_context) -{ -} - -static void -ompi_coll_adapt_constant_reduce_context_constructor(ompi_coll_adapt_constant_reduce_context_t * con) -{ -} - -static void -ompi_coll_adapt_constant_reduce_context_destructor(ompi_coll_adapt_constant_reduce_context_t * con) -{ -} - + NULL, NULL); OBJ_CLASS_INSTANCE(ompi_coll_adapt_reduce_context_t, opal_free_list_item_t, - ompi_coll_adapt_reduce_context_constructor, - ompi_coll_adapt_reduce_context_destructor); + NULL, NULL); OBJ_CLASS_INSTANCE(ompi_coll_adapt_constant_reduce_context_t, opal_object_t, - ompi_coll_adapt_constant_reduce_context_constructor, - ompi_coll_adapt_constant_reduce_context_destructor); + NULL, NULL); diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index c3f0868102a..1b4e8de364f 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -43,7 +43,7 @@ static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ibcast_algorithm_index[ /* * Set up MCA parameters of MPI_Bcast and MPI_IBcast */ -int ompi_coll_adapt_ibcast_init(void) +int ompi_coll_adapt_ibcast_register(void) { mca_base_component_t *c 
= &mca_coll_adapt_component.super.collm_version; @@ -78,7 +78,6 @@ int ompi_coll_adapt_ibcast_init(void) &mca_coll_adapt_component.adapt_ibcast_max_recv_requests); mca_coll_adapt_component.adapt_ibcast_context_free_list = NULL; - mca_coll_adapt_component.adapt_ibcast_context_free_list_enabled = 0; return OMPI_SUCCESS; } @@ -90,7 +89,6 @@ int ompi_coll_adapt_ibcast_fini(void) if (NULL != mca_coll_adapt_component.adapt_ibcast_context_free_list) { OBJ_RELEASE(mca_coll_adapt_component.adapt_ibcast_context_free_list); mca_coll_adapt_component.adapt_ibcast_context_free_list = NULL; - mca_coll_adapt_component.adapt_ibcast_context_free_list_enabled = 0; OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ibcast fini\n")); } return OMPI_SUCCESS; @@ -179,7 +177,6 @@ static int send_cb(ompi_request_t * req) int num_sent = ++(context->con->num_sent_segs); int num_recv_fini_t = context->con->num_recv_fini; int rank = ompi_comm_rank(context->con->comm); - opal_mutex_t *mutex_temp = context->con->mutex; /* Check whether signal the condition */ if ((rank == context->con->root && num_sent == context->con->tree->tree_nextsize * context->con->num_segs) @@ -190,13 +187,13 @@ static int send_cb(ompi_request_t * req) context->con->num_segs)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n", ompi_comm_rank(context->con->comm))); - OPAL_THREAD_UNLOCK(mutex_temp); + OPAL_THREAD_UNLOCK(context->con->mutex); ibcast_request_fini(context); } else { OBJ_RELEASE(context->con); opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list, (opal_free_list_item_t *) context); - OPAL_THREAD_UNLOCK(mutex_temp); + OPAL_THREAD_UNLOCK(context->con->mutex); } req->req_free(&req); /* Call back function return 1, which means successful */ @@ -306,7 +303,6 @@ static int recv_cb(ompi_request_t * req) int num_sent = context->con->num_sent_segs; int num_recv_fini_t = ++(context->con->num_recv_fini); int rank = 
ompi_comm_rank(context->con->comm); - opal_mutex_t *mutex_temp = context->con->mutex; /* If this process is leaf and has received all the segments */ if ((rank == context->con->root @@ -318,13 +314,13 @@ static int recv_cb(ompi_request_t * req) context->con->num_segs)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n", ompi_comm_rank(context->con->comm))); - OPAL_THREAD_UNLOCK(mutex_temp); + OPAL_THREAD_UNLOCK(context->con->mutex); ibcast_request_fini(context); } else { OBJ_RELEASE(context->con); opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list, (opal_free_list_item_t *) context); - OPAL_THREAD_UNLOCK(mutex_temp); + OPAL_THREAD_UNLOCK(context->con->mutex); } req->req_free(&req); return 1; @@ -334,9 +330,8 @@ int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *dataty struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module) { - if (count == 0) { - ompi_request_t *temp_request; - temp_request = OBJ_NEW(ompi_request_t); + if (0 == count) { + ompi_request_t *temp_request = OBJ_NEW(ompi_request_t); OMPI_REQUEST_INIT(temp_request, false); temp_request->req_type = 0; temp_request->req_free = ompi_coll_adapt_request_free; @@ -348,24 +343,22 @@ int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *dataty ompi_request_complete(temp_request, 1); *request = temp_request; return MPI_SUCCESS; - } else { - int rank = ompi_comm_rank(comm); - if (rank == root) { - OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, - "ibcast root %d, algorithm %d, coll_adapt_ibcast_segment_size %zu, coll_adapt_ibcast_max_send_requests %d, coll_adapt_ibcast_max_recv_requests %d\n", - root, mca_coll_adapt_component.adapt_ibcast_algorithm, - mca_coll_adapt_component.adapt_ibcast_segment_size, - mca_coll_adapt_component.adapt_ibcast_max_send_requests, - mca_coll_adapt_component.adapt_ibcast_max_recv_requests)); - } - int ibcast_tag = 
opal_atomic_add_fetch_32(&(comm->c_ibcast_tag), 1); - ibcast_tag = ibcast_tag % 4096; - ompi_coll_adapt_ibcast_fn_t bcast_func = - (ompi_coll_adapt_ibcast_fn_t) - ompi_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm]. - algorithm_fn_ptr; - return bcast_func(buff, count, datatype, root, comm, request, module, ibcast_tag); } + int ibcast_tag = opal_atomic_add_fetch_32(&(comm->c_ibcast_tag), 1); + ibcast_tag = ibcast_tag % 4096; + + OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, + "ibcast tag %d root %d, algorithm %d, coll_adapt_ibcast_segment_size %zu, coll_adapt_ibcast_max_send_requests %d, coll_adapt_ibcast_max_recv_requests %d\n", + ibcast_tag, root, mca_coll_adapt_component.adapt_ibcast_algorithm, + mca_coll_adapt_component.adapt_ibcast_segment_size, + mca_coll_adapt_component.adapt_ibcast_max_send_requests, + mca_coll_adapt_component.adapt_ibcast_max_recv_requests)); + + ompi_coll_adapt_ibcast_fn_t bcast_func = + (ompi_coll_adapt_ibcast_fn_t) + ompi_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm]. 
+ algorithm_fn_ptr; + return bcast_func(buff, count, datatype, root, comm, request, module, ibcast_tag); } /* @@ -377,7 +370,7 @@ int ompi_coll_adapt_ibcast_tuned(void *buff, int count, struct ompi_datatype_t * mca_coll_base_module_t *module, int ibcast_tag) { OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n")); - return OMPI_SUCCESS; + return OMPI_ERR_NOT_IMPLEMENTED; } int ompi_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, @@ -471,12 +464,7 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t mca_coll_base_module_t * module, ompi_coll_tree_t * tree, size_t seg_size, int ibcast_tag) { - /* Tempory variables for iteration */ - int i, j; - /* Rank of this process */ - int rank; - /* Record return value */ - int err; + int i, j, rank, err; /* The min of num_segs and SEND_NUM or RECV_NUM, in case the num_segs is less than SEND_NUM or RECV_NUM */ int min; @@ -498,23 +486,21 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t /* Record how many isends have been issued for every child */ int *send_array = NULL; - /* Set up free list */ - if (0 == mca_coll_adapt_component.adapt_ibcast_context_free_list_enabled) { - int32_t context_free_list_enabled = - opal_atomic_add_fetch_32(& - (mca_coll_adapt_component. 
- adapt_ibcast_context_free_list_enabled), 1); - if (1 == context_free_list_enabled) { - mca_coll_adapt_component.adapt_ibcast_context_free_list = OBJ_NEW(opal_free_list_t); - opal_free_list_init(mca_coll_adapt_component.adapt_ibcast_context_free_list, - sizeof(ompi_coll_adapt_bcast_context_t), - opal_cache_line_size, - OBJ_CLASS(ompi_coll_adapt_bcast_context_t), - 0, opal_cache_line_size, - mca_coll_adapt_component.adapt_context_free_list_min, - mca_coll_adapt_component.adapt_context_free_list_max, - mca_coll_adapt_component.adapt_context_free_list_inc, - NULL, 0, NULL, NULL, NULL); + /* Atomically set up free list */ + if (NULL == mca_coll_adapt_component.adapt_ibcast_context_free_list) { + opal_free_list_t* fl = OBJ_NEW(opal_free_list_t); + opal_free_list_init(fl, + sizeof(ompi_coll_adapt_bcast_context_t), + opal_cache_line_size, + OBJ_CLASS(ompi_coll_adapt_bcast_context_t), + 0, opal_cache_line_size, + mca_coll_adapt_component.adapt_context_free_list_min, + mca_coll_adapt_component.adapt_context_free_list_max, + mca_coll_adapt_component.adapt_context_free_list_inc, + NULL, 0, NULL, NULL, NULL); + if( !OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR((opal_atomic_intptr_t *)&mca_coll_adapt_component.adapt_ibcast_context_free_list, + &(intptr_t){0}, fl) ) { + OBJ_RELEASE(fl); } } diff --git a/ompi/mca/coll/adapt/coll_adapt_inbuf.c b/ompi/mca/coll/adapt/coll_adapt_inbuf.c index a1723ac13d0..aed2f309e34 100644 --- a/ompi/mca/coll/adapt/coll_adapt_inbuf.c +++ b/ompi/mca/coll/adapt/coll_adapt_inbuf.c @@ -12,13 +12,5 @@ #include "coll_adapt.h" #include "coll_adapt_inbuf.h" -static void ompi_coll_adapt_inbuf_constructor(ompi_coll_adapt_inbuf_t * inbuf) -{ -} - -static void ompi_coll_adapt_inbuf_destructor(ompi_coll_adapt_inbuf_t * inbuf) -{ -} - -OBJ_CLASS_INSTANCE(ompi_coll_adapt_inbuf_t, opal_free_list_item_t, ompi_coll_adapt_inbuf_constructor, - ompi_coll_adapt_inbuf_destructor); +OBJ_CLASS_INSTANCE(ompi_coll_adapt_inbuf_t, opal_free_list_item_t, + NULL, NULL); diff --git 
a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index f90c14874f8..9fc7cb63ea0 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -47,7 +47,7 @@ static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ireduce_algorithm_index /* * Set up MCA parameters of MPI_Reduce and MPI_Ireduce */ -int ompi_coll_adapt_ireduce_init(void) +int ompi_coll_adapt_ireduce_register(void) { mca_base_component_t *c = &mca_coll_adapt_component.super.collm_version; @@ -107,7 +107,6 @@ int ompi_coll_adapt_ireduce_init(void) &mca_coll_adapt_component.adapt_inbuf_free_list_inc); mca_coll_adapt_component.adapt_ireduce_context_free_list = NULL; - mca_coll_adapt_component.adapt_ireduce_context_free_list_enabled = 0; return OMPI_SUCCESS; } @@ -119,14 +118,13 @@ int ompi_coll_adapt_ireduce_fini(void) if (NULL != mca_coll_adapt_component.adapt_ireduce_context_free_list) { OBJ_RELEASE(mca_coll_adapt_component.adapt_ireduce_context_free_list); mca_coll_adapt_component.adapt_ireduce_context_free_list = NULL; - mca_coll_adapt_component.adapt_ireduce_context_free_list_enabled = 0; OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ireduce fini\n")); } return OMPI_SUCCESS; } /* - * Functions to access list + * Functions to access list */ static ompi_coll_adapt_item_t *get_next_ready_item(opal_list_t * list, int num_children) { @@ -148,26 +146,22 @@ static ompi_coll_adapt_item_t *get_next_ready_item(opal_list_t * list, int num_c static int add_to_list(opal_list_t * list, int id) { ompi_coll_adapt_item_t *item; - int ret = 0; for (item = (ompi_coll_adapt_item_t *) opal_list_get_first(list); item != (ompi_coll_adapt_item_t *) opal_list_get_end(list); item = (ompi_coll_adapt_item_t *) ((opal_list_item_t *) item)->opal_list_next) { if (item->id == id) { (item->count)++; - ret = 1; - break; + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "add_to_list_return 1\n")); + return 1; } } - if 
(ret == 0) { - item = OBJ_NEW(ompi_coll_adapt_item_t); - item->id = id; - item->count = 1; - opal_list_append(list, (opal_list_item_t *) item); - ret = 2; - } - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "add_to_list_return %d\n", - ret)); - return ret; + /* Add a new object to the list with count set to 1 */ + item = OBJ_NEW(ompi_coll_adapt_item_t); + item->id = id; + item->count = 1; + opal_list_append(list, (opal_list_item_t *) item); + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "add_to_list_return 1\n")); + return 2; } /* @@ -250,7 +244,6 @@ static int send_cb(ompi_request_t * req) adapt_ireduce_context_free_list); if (context->con->tree->tree_nextsize > 0) { send_context->buff = context->con->accumbuf[item->id]; - } else { send_context->buff = context->buff + (item->id - context->frag_id) * context->con->segment_increment; @@ -530,26 +523,22 @@ int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi { if (count == 0) { return MPI_SUCCESS; - } else { - int rank = ompi_comm_rank(comm); - if (rank == root) { - OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, - "ireduce root %d, algorithm %d, coll_adapt_ireduce_segment_size %zu, coll_adapt_ireduce_max_send_requests %d, coll_adapt_ireduce_max_recv_requests %d\n", - root, mca_coll_adapt_component.adapt_ireduce_algorithm, - mca_coll_adapt_component.adapt_ireduce_segment_size, - mca_coll_adapt_component.adapt_ireduce_max_send_requests, - mca_coll_adapt_component.adapt_ireduce_max_recv_requests)); - } - /* Get ireduce tag */ - int ireduce_tag = opal_atomic_add_fetch_32(&(comm->c_ireduce_tag), 1); - ireduce_tag = (ireduce_tag % 4096) + 4096; - fflush(stdout); - ompi_coll_adapt_ireduce_fn_t reduce_func = - (ompi_coll_adapt_ireduce_fn_t) - ompi_coll_adapt_ireduce_algorithm_index[mca_coll_adapt_component. 
- adapt_ireduce_algorithm].algorithm_fn_ptr; - return reduce_func(sbuf, rbuf, count, dtype, op, root, comm, request, module, ireduce_tag); } + int ireduce_tag = opal_atomic_add_fetch_32(&(comm->c_ireduce_tag), 1); + ireduce_tag = (ireduce_tag % 4096) + 4096; + + OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, + "ireduce tag %d root %d, algorithm %d, coll_adapt_ireduce_segment_size %zu, coll_adapt_ireduce_max_send_requests %d, coll_adapt_ireduce_max_recv_requests %d\n", + ireduce_tag, root, mca_coll_adapt_component.adapt_ireduce_algorithm, + mca_coll_adapt_component.adapt_ireduce_segment_size, + mca_coll_adapt_component.adapt_ireduce_max_send_requests, + mca_coll_adapt_component.adapt_ireduce_max_recv_requests)); + + ompi_coll_adapt_ireduce_fn_t reduce_func = + (ompi_coll_adapt_ireduce_fn_t) + ompi_coll_adapt_ireduce_algorithm_index[mca_coll_adapt_component. + adapt_ireduce_algorithm].algorithm_fn_ptr; + return reduce_func(sbuf, rbuf, count, dtype, op, root, comm, request, module, ireduce_tag); } /* @@ -562,7 +551,7 @@ int ompi_coll_adapt_ireduce_tuned(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module, int ireduce_tag) { OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n")); - return OMPI_SUCCESS; + return OMPI_ERR_NOT_IMPLEMENTED; } int ompi_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, @@ -688,23 +677,21 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, ompi_datatype_get_true_extent(dtype, &true_lower_bound, &true_extent); real_seg_size = true_extent + (ptrdiff_t) (seg_count - 1) * extent; - /* Set up free list */ - if (0 == mca_coll_adapt_component.adapt_ireduce_context_free_list_enabled) { - int32_t context_free_list_enabled = - opal_atomic_add_fetch_32(& - (mca_coll_adapt_component. 
- adapt_ireduce_context_free_list_enabled), 1); - if (1 == context_free_list_enabled) { - mca_coll_adapt_component.adapt_ireduce_context_free_list = OBJ_NEW(opal_free_list_t); - opal_free_list_init(mca_coll_adapt_component.adapt_ireduce_context_free_list, - sizeof(ompi_coll_adapt_reduce_context_t), - opal_cache_line_size, - OBJ_CLASS(ompi_coll_adapt_reduce_context_t), - 0, opal_cache_line_size, - mca_coll_adapt_component.adapt_context_free_list_min, - mca_coll_adapt_component.adapt_context_free_list_max, - mca_coll_adapt_component.adapt_context_free_list_inc, - NULL, 0, NULL, NULL, NULL); + /* Atomically set up free list */ + if (NULL == mca_coll_adapt_component.adapt_ireduce_context_free_list) { + opal_free_list_t* fl = OBJ_NEW(opal_free_list_t); + opal_free_list_init(fl, + sizeof(ompi_coll_adapt_reduce_context_t), + opal_cache_line_size, + OBJ_CLASS(ompi_coll_adapt_reduce_context_t), + 0, opal_cache_line_size, + mca_coll_adapt_component.adapt_context_free_list_min, + mca_coll_adapt_component.adapt_context_free_list_max, + mca_coll_adapt_component.adapt_context_free_list_inc, + NULL, 0, NULL, NULL, NULL); + if( !OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR((opal_atomic_intptr_t *)&mca_coll_adapt_component.adapt_ireduce_context_free_list, + &(intptr_t){0}, fl) ) { + OBJ_RELEASE(fl); } } diff --git a/ompi/mca/coll/adapt/coll_adapt_item.c b/ompi/mca/coll/adapt/coll_adapt_item.c index 73258326a66..1cb144b309c 100644 --- a/ompi/mca/coll/adapt/coll_adapt_item.c +++ b/ompi/mca/coll/adapt/coll_adapt_item.c @@ -11,13 +11,5 @@ #include "coll_adapt_item.h" -static void ompi_coll_adapt_item_constructor(ompi_coll_adapt_item_t * item) -{ -} - -static void ompi_coll_adapt_item_destructor(ompi_coll_adapt_item_t * item) -{ -} - -OBJ_CLASS_INSTANCE(ompi_coll_adapt_item_t, opal_list_item_t, ompi_coll_adapt_item_constructor, - ompi_coll_adapt_item_destructor); +OBJ_CLASS_INSTANCE(ompi_coll_adapt_item_t, opal_list_item_t, + NULL, NULL); diff --git a/ompi/mca/coll/adapt/coll_adapt_item.h 
b/ompi/mca/coll/adapt/coll_adapt_item.h index 768f9f29dc0..0eb129704d3 100644 --- a/ompi/mca/coll/adapt/coll_adapt_item.h +++ b/ompi/mca/coll/adapt/coll_adapt_item.h @@ -16,7 +16,7 @@ struct ompi_coll_adapt_item_s { opal_list_item_t super; /* Fragment id */ int id; - /* The number of children which have received the current segment from */ + /* The number of children which have received the current segment */ int count; }; diff --git a/ompi/mca/coll/adapt/coll_adapt_reduce.c b/ompi/mca/coll/adapt/coll_adapt_reduce.c index e45bb3478a9..940673f7b97 100644 --- a/ompi/mca/coll/adapt/coll_adapt_reduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_reduce.c @@ -19,11 +19,13 @@ int ompi_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_ { if (count == 0) { return MPI_SUCCESS; - } else { - ompi_request_t *request; - int err = - ompi_coll_adapt_ireduce(sbuf, rbuf, count, dtype, op, root, comm, &request, module); - ompi_request_wait(&request, MPI_STATUS_IGNORE); - return err; } + ompi_request_t *request = NULL; + int err = ompi_coll_adapt_ireduce(sbuf, rbuf, count, dtype, op, root, comm, &request, module); + if( MPI_SUCCESS != err ) { + if( NULL == request ) + return err; + } + ompi_request_wait(&request, MPI_STATUS_IGNORE); + return err; } diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index fcea107e7c7..11b46ba47eb 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -492,10 +492,6 @@ struct mca_coll_base_comm_t { /* in-order binary tree (root of the in-order binary tree is rank 0) */ ompi_coll_tree_t *cached_in_order_bintree; - - /* linear */ - ompi_coll_tree_t *cached_linear; - int cached_linear_root; }; typedef struct mca_coll_base_comm_t mca_coll_base_comm_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t); diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index a17cfacb126..7f6764d5f98 100644 --- 
a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -215,17 +215,8 @@ static int tuned_open(void) int rc; #if OPAL_ENABLE_DEBUG - { - int param; - - param = mca_base_var_find("ompi", "coll", "base", "verbose"); - if (param >= 0) { - const int *verbose = NULL; - mca_base_var_get_value(param, &verbose, NULL, NULL); - if (verbose && verbose[0] > 0) { - ompi_coll_tuned_stream = opal_output_open(NULL); - } - } + if (ompi_coll_base_framework.framework_verbose) { + ompi_coll_tuned_stream = opal_output_open(NULL); } #endif /* OPAL_ENABLE_DEBUG */ diff --git a/ompi/request/request.h b/ompi/request/request.h index f12882c033c..706d98a930c 100644 --- a/ompi/request/request.h +++ b/ompi/request/request.h @@ -436,13 +436,14 @@ static inline void ompi_request_wait_completion(ompi_request_t *req) static inline int ompi_request_complete(ompi_request_t* request, bool with_signal) { int rc = 0; - + if(NULL != request->req_complete_cb) { - ompi_request_complete_fn_t temp = request->req_complete_cb; + /* Set the request cb to NULL to allow resetting in the callback */ + ompi_request_complete_fn_t fct = request->req_complete_cb; request->req_complete_cb = NULL; - rc = temp( request ); + rc = fct( request ); } - + if (0 == rc) { if( OPAL_LIKELY(with_signal) ) { void *_tmp_ptr = REQUEST_PENDING; @@ -454,11 +455,10 @@ static inline int ompi_request_complete(ompi_request_t* request, bool with_signa if( REQUEST_PENDING != tmp_sync ) wait_sync_update(tmp_sync, 1, request->req_status.MPI_ERROR); } - } else { + } else request->req_complete = REQUEST_COMPLETED; - } } - + return OMPI_SUCCESS; } @@ -468,14 +468,13 @@ static inline int ompi_request_set_callback(ompi_request_t* request, { request->req_complete_cb_data = cb_data; request->req_complete_cb = cb; - int rc = 0; /* If request is completed and the callback is not called, need to call callback */ if ((NULL != request->req_complete_cb) && (request->req_complete == REQUEST_COMPLETED)) { - 
ompi_request_complete_fn_t temp = request->req_complete_cb; + ompi_request_complete_fn_t fct = request->req_complete_cb; request->req_complete_cb = NULL; - rc = temp( request ); + return fct( request ); } - return rc; + return OMPI_SUCCESS; } END_C_DECLS From 8582e10d2be0294892499b9b3f6f486c943b8d7d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 7 May 2020 18:09:35 -0400 Subject: [PATCH 4/7] Consistent handling of zero counts in the MPI API. Signed-off-by: George Bosilca --- ompi/mpi/c/ibcast.c | 7 +++++++ ompi/mpi/c/ireduce_scatter_block.c | 5 +++++ ompi/mpi/c/reduce_scatter_block.c | 3 +++ 3 files changed, 15 insertions(+) diff --git a/ompi/mpi/c/ibcast.c b/ompi/mpi/c/ibcast.c index 94b821c5b4c..f3f248e949b 100644 --- a/ompi/mpi/c/ibcast.c +++ b/ompi/mpi/c/ibcast.c @@ -96,6 +96,13 @@ int MPI_Ibcast(void *buffer, int count, MPI_Datatype datatype, } } + /* If there's only one node, or if the count is 0, we're done */ + + if ((OMPI_COMM_IS_INTRA(comm) && ompi_comm_size(comm) <= 1) || + 0 == count) { + return MPI_SUCCESS; + } + OPAL_CR_ENTER_LIBRARY(); /* Invoke the coll component to perform the back-end operation */ diff --git a/ompi/mpi/c/ireduce_scatter_block.c b/ompi/mpi/c/ireduce_scatter_block.c index fc85df7f893..3c7cc6ec7ff 100644 --- a/ompi/mpi/c/ireduce_scatter_block.c +++ b/ompi/mpi/c/ireduce_scatter_block.c @@ -97,6 +97,11 @@ int MPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME); } + if (0 == recvcount) { + *request = &ompi_request_empty; + return MPI_SUCCESS; + } + OPAL_CR_ENTER_LIBRARY(); /* Invoke the coll component to perform the back-end operation */ diff --git a/ompi/mpi/c/reduce_scatter_block.c b/ompi/mpi/c/reduce_scatter_block.c index 7c12fdd495a..0883ab26773 100644 --- a/ompi/mpi/c/reduce_scatter_block.c +++ b/ompi/mpi/c/reduce_scatter_block.c @@ -94,6 +94,9 @@ int MPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, 
OMPI_CHECK_DATATYPE_FOR_SEND(err, datatype, recvcount); OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME); } + if (0 == recvcount) { + return MPI_SUCCESS; + } OPAL_CR_ENTER_LIBRARY(); From c2970a36953c8ab0b86ea3610258703affdf07c8 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 7 May 2020 23:59:56 -0400 Subject: [PATCH 5/7] Correctly handle non-blocking collectives tags As it is possible to have multiple outstanding non-blocking collectives provided by different collective modules, we need a consistent mechanism to allow them to select unique tags for each instance of a collective. Signed-off-by: George Bosilca --- ompi/communicator/comm_init.c | 6 +- ompi/communicator/communicator.h | 11 +- ompi/mca/coll/adapt/coll_adapt_algorithms.h | 46 +++---- ompi/mca/coll/adapt/coll_adapt_ibcast.c | 95 ++++++------- ompi/mca/coll/adapt/coll_adapt_ireduce.c | 143 +++++++++----------- ompi/mca/coll/base/coll_base_util.h | 18 +++ ompi/mca/coll/libnbc/coll_libnbc.h | 1 - ompi/mca/coll/libnbc/nbc.c | 23 +--- 8 files changed, 151 insertions(+), 192 deletions(-) diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 64dc9faf39c..639372548d0 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -40,6 +40,7 @@ #include "ompi/constants.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/base.h" +#include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/topo/base/base.h" #include "ompi/runtime/params.h" #include "ompi/communicator/communicator.h" @@ -382,9 +383,8 @@ static void ompi_comm_construct(ompi_communicator_t* comm) comm->c_pml_comm = NULL; comm->c_topo = NULL; comm->c_coll = NULL; - comm->c_ibcast_tag = 0; - comm->c_ireduce_tag = 0; - + comm->c_nbc_tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; + /* A keyhash will be created if/when an attribute is cached on this communicator */ comm->c_keyhash = NULL; diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 
c642ab4bfb8..8936b7f1df9 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -188,13 +188,12 @@ struct ompi_communicator_t { /* Collectives module interface and data */ mca_coll_base_comm_coll_t *c_coll; - /* Non-blocking collective tag. These are added here as they should be - * shared between all non-blocking collective modules (to avoid message - * collisions between them in the case where multiple outstanding - * non-blocking collective coexists using multiple backends). + /* Non-blocking collective tag. These tags might be shared between + * all non-blocking collective modules (to avoid message collision + * between them in the case where multiple outstanding non-blocking + * collective coexists using multiple backends). */ - opal_atomic_int32_t c_ibcast_tag; - opal_atomic_int32_t c_ireduce_tag; + opal_atomic_int32_t c_nbc_tag; }; typedef struct ompi_communicator_t ompi_communicator_t; diff --git a/ompi/mca/coll/adapt/coll_adapt_algorithms.h b/ompi/mca/coll/adapt/coll_adapt_algorithms.h index a25d8afb622..700adabea15 100644 --- a/ompi/mca/coll/adapt/coll_adapt_algorithms.h +++ b/ompi/mca/coll/adapt/coll_adapt_algorithms.h @@ -25,21 +25,14 @@ int ompi_coll_adapt_ibcast_fini(void); int ompi_coll_adapt_bcast(BCAST_ARGS); int ompi_coll_adapt_ibcast(IBCAST_ARGS); int ompi_coll_adapt_ibcast_generic(IBCAST_ARGS, - ompi_coll_tree_t * tree, size_t seg_size, int ibcast_tag); -int ompi_coll_adapt_ibcast_binomial(IBCAST_ARGS, - int ibcast_tag); -int ompi_coll_adapt_ibcast_in_order_binomial(IBCAST_ARGS, - int ibcast_tag); -int ompi_coll_adapt_ibcast_binary(IBCAST_ARGS, - int ibcast_tag); -int ompi_coll_adapt_ibcast_pipeline(IBCAST_ARGS, - int ibcast_tag); -int ompi_coll_adapt_ibcast_chain(IBCAST_ARGS, - int ibcast_tag); -int ompi_coll_adapt_ibcast_linear(IBCAST_ARGS, - int ibcast_tag); -int ompi_coll_adapt_ibcast_tuned(IBCAST_ARGS, - int ibcast_tag); + ompi_coll_tree_t * tree, size_t seg_size); +int 
ompi_coll_adapt_ibcast_binomial(IBCAST_ARGS); +int ompi_coll_adapt_ibcast_in_order_binomial(IBCAST_ARGS); +int ompi_coll_adapt_ibcast_binary(IBCAST_ARGS); +int ompi_coll_adapt_ibcast_pipeline(IBCAST_ARGS); +int ompi_coll_adapt_ibcast_chain(IBCAST_ARGS); +int ompi_coll_adapt_ibcast_linear(IBCAST_ARGS); +int ompi_coll_adapt_ibcast_tuned(IBCAST_ARGS); /* Reduce */ int ompi_coll_adapt_ireduce_register(void); @@ -47,18 +40,11 @@ int ompi_coll_adapt_ireduce_fini(void); int ompi_coll_adapt_reduce(REDUCE_ARGS); int ompi_coll_adapt_ireduce(IREDUCE_ARGS); int ompi_coll_adapt_ireduce_generic(IREDUCE_ARGS, - ompi_coll_tree_t * tree, size_t seg_size, int ireduce_tag); -int ompi_coll_adapt_ireduce_tuned(IREDUCE_ARGS, - int ireduce_tag); -int ompi_coll_adapt_ireduce_binomial(IREDUCE_ARGS, - int ireduce_tag); -int ompi_coll_adapt_ireduce_in_order_binomial(IREDUCE_ARGS, - int ireduce_tag); -int ompi_coll_adapt_ireduce_binary(IREDUCE_ARGS, - int ireduce_tag); -int ompi_coll_adapt_ireduce_pipeline(IREDUCE_ARGS, - int ireduce_tag); -int ompi_coll_adapt_ireduce_chain(IREDUCE_ARGS, - int ireduce_tag); -int ompi_coll_adapt_ireduce_linear(IREDUCE_ARGS, - int ireduce_tag); + ompi_coll_tree_t * tree, size_t seg_size); +int ompi_coll_adapt_ireduce_tuned(IREDUCE_ARGS); +int ompi_coll_adapt_ireduce_binomial(IREDUCE_ARGS); +int ompi_coll_adapt_ireduce_in_order_binomial(IREDUCE_ARGS); +int ompi_coll_adapt_ireduce_binary(IREDUCE_ARGS); +int ompi_coll_adapt_ireduce_pipeline(IREDUCE_ARGS); +int ompi_coll_adapt_ireduce_chain(IREDUCE_ARGS); +int ompi_coll_adapt_ireduce_linear(IREDUCE_ARGS); diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index 1b4e8de364f..3a8555e7fd2 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -14,7 +14,7 @@ #include "coll_adapt.h" #include "coll_adapt_algorithms.h" #include "coll_adapt_context.h" -#include "ompi/mca/coll/base/coll_tags.h" +#include 
"ompi/mca/coll/base/coll_base_util.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "opal/util/bit_ops.h" #include "opal/sys/atomic.h" @@ -27,8 +27,7 @@ typedef int (*ompi_coll_adapt_ibcast_fn_t) (void *buff, int root, struct ompi_communicator_t * comm, ompi_request_t ** request, - mca_coll_base_module_t * module, - int ibcast_tag); + mca_coll_base_module_t * module); static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ibcast_algorithm_index[] = { {0, (uintptr_t) ompi_coll_adapt_ibcast_tuned}, @@ -158,11 +157,11 @@ static int send_cb(ompi_request_t * req) "[%d]: Send(start in send cb): segment %d to %d at buff %p send_count %d tag %d\n", ompi_comm_rank(send_context->con->comm), send_context->frag_id, send_context->peer, (void *) send_context->buff, send_count, - (send_context->con->ibcast_tag << 16) + new_id)); + send_context->con->ibcast_tag - new_id)); err = MCA_PML_CALL(isend (send_buff, send_count, send_context->con->datatype, send_context->peer, - (send_context->con->ibcast_tag << 16) + new_id, + send_context->con->ibcast_tag - new_id, MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); if (MPI_SUCCESS != err) { OPAL_THREAD_UNLOCK(context->con->mutex); @@ -245,10 +244,10 @@ static int recv_cb(ompi_request_t * req) "[%d]: Recv(start in recv cb): segment %d from %d at buff %p recv_count %d tag %d\n", ompi_comm_rank(context->con->comm), context->frag_id, context->peer, (void *) recv_buff, recv_count, - (recv_context->con->ibcast_tag << 16) + recv_context->frag_id)); + recv_context->con->ibcast_tag - recv_context->frag_id)); MCA_PML_CALL(irecv (recv_buff, recv_count, recv_context->con->datatype, recv_context->peer, - (recv_context->con->ibcast_tag << 16) + recv_context->frag_id, + recv_context->con->ibcast_tag - recv_context->frag_id, recv_context->con->comm, &recv_req)); /* Invoke recvive call back */ @@ -282,12 +281,12 @@ static int recv_cb(ompi_request_t * req) "[%d]: Send(start in recv cb): segment %d to %d at buff %p 
send_count %d tag %d\n", ompi_comm_rank(send_context->con->comm), send_context->frag_id, send_context->peer, (void *) send_context->buff, send_count, - (send_context->con->ibcast_tag << 16) + send_context->frag_id)); + send_context->con->ibcast_tag - send_context->frag_id)); err = MCA_PML_CALL(isend (send_buff, send_count, send_context->con->datatype, send_context->peer, - (send_context->con->ibcast_tag << 16) + send_context->frag_id, + send_context->con->ibcast_tag - send_context->frag_id, MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); if (MPI_SUCCESS != err) { OPAL_THREAD_UNLOCK(context->con->mutex); @@ -344,12 +343,10 @@ int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *dataty *request = temp_request; return MPI_SUCCESS; } - int ibcast_tag = opal_atomic_add_fetch_32(&(comm->c_ibcast_tag), 1); - ibcast_tag = ibcast_tag % 4096; OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, - "ibcast tag %d root %d, algorithm %d, coll_adapt_ibcast_segment_size %zu, coll_adapt_ibcast_max_send_requests %d, coll_adapt_ibcast_max_recv_requests %d\n", - ibcast_tag, root, mca_coll_adapt_component.adapt_ibcast_algorithm, + "ibcast root %d, algorithm %d, coll_adapt_ibcast_segment_size %zu, coll_adapt_ibcast_max_send_requests %d, coll_adapt_ibcast_max_recv_requests %d\n", + root, mca_coll_adapt_component.adapt_ibcast_algorithm, mca_coll_adapt_component.adapt_ibcast_segment_size, mca_coll_adapt_component.adapt_ibcast_max_send_requests, mca_coll_adapt_component.adapt_ibcast_max_recv_requests)); @@ -358,89 +355,82 @@ int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *dataty (ompi_coll_adapt_ibcast_fn_t) ompi_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm]. 
algorithm_fn_ptr; - return bcast_func(buff, count, datatype, root, comm, request, module, ibcast_tag); + return bcast_func(buff, count, datatype, root, comm, request, module); } /* * Ibcast functions with different algorithms */ int ompi_coll_adapt_ibcast_tuned(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module, int ibcast_tag) + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t *module) { OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n")); return OMPI_ERR_NOT_IMPLEMENTED; } int ompi_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, mca_coll_base_module_t * module, - int ibcast_tag) + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_bmtree(comm, root); int err = ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size, - ibcast_tag); + mca_coll_adapt_component.adapt_ibcast_segment_size); return err; } int ompi_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag) + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_in_order_bmtree(comm, root); int err = ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size, - ibcast_tag); + mca_coll_adapt_component.adapt_ibcast_segment_size); return err; } int 
ompi_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag) + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_tree(2, comm, root); int err = ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size, - ibcast_tag); + mca_coll_adapt_component.adapt_ibcast_segment_size); return err; } int ompi_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, mca_coll_base_module_t * module, - int ibcast_tag) + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(1, comm, root); int err = ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size, - ibcast_tag); + mca_coll_adapt_component.adapt_ibcast_segment_size); return err; } int ompi_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag) + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(4, comm, root); int err = ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size, - ibcast_tag); + mca_coll_adapt_component.adapt_ibcast_segment_size); return err; } int ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct 
ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ibcast_tag) + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { int fanout = ompi_comm_size(comm) - 1; ompi_coll_tree_t *tree; @@ -453,16 +443,15 @@ int ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t } int err = ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size, - ibcast_tag); + mca_coll_adapt_component.adapt_ibcast_segment_size); return err; } int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, ompi_coll_tree_t * tree, - size_t seg_size, int ibcast_tag) + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, ompi_coll_tree_t * tree, + size_t seg_size) { int i, j, rank, err; /* The min of num_segs and SEND_NUM or RECV_NUM, in case the num_segs is less than SEND_NUM or RECV_NUM */ @@ -555,11 +544,11 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t con->mutex = mutex; con->request = temp_request; con->tree = tree; - con->ibcast_tag = ibcast_tag; + con->ibcast_tag = ompi_coll_base_nbc_reserve_tags(comm, num_segs); OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Ibcast, root %d, tag %d\n", rank, root, - ibcast_tag)); + con->ibcast_tag)); OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: con->mutex = %p, num_children = %d, num_segs = %d, real_seg_size = %d, seg_count = %d, tree_adreess = %p\n", rank, (void *) con->mutex, tree->tree_nextsize, num_segs, @@ -610,11 +599,11 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Send(start in main): 
segment %d to %d at buff %p send_count %d tag %d\n", rank, context->frag_id, context->peer, - (void *) send_buff, send_count, (ibcast_tag << 16) + i)); + (void *) send_buff, send_count, con->ibcast_tag - i)); err = MCA_PML_CALL(isend (send_buff, send_count, datatype, context->peer, - (ibcast_tag << 16) + i, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, + con->ibcast_tag - i, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &send_req)); if (MPI_SUCCESS != err) { return err; @@ -668,11 +657,11 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t "[%d]: Recv(start in main): segment %d from %d at buff %p recv_count %d tag %d\n", ompi_comm_rank(context->con->comm), context->frag_id, context->peer, (void *) recv_buff, recv_count, - (ibcast_tag << 16) + i)); + con->ibcast_tag - i)); err = MCA_PML_CALL(irecv (recv_buff, recv_count, datatype, context->peer, - (ibcast_tag << 16) + i, comm, &recv_req)); + con->ibcast_tag - i, comm, &recv_req)); if (MPI_SUCCESS != err) { return err; } diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index 9fc7cb63ea0..63de926ef53 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -17,7 +17,7 @@ #include "coll_adapt_item.h" #include "ompi/constants.h" #include "ompi/mca/coll/coll.h" -#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/coll/base/coll_base_util.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "ompi/mca/coll/base/coll_base_topo.h" @@ -32,7 +32,7 @@ typedef int (*ompi_coll_adapt_ireduce_fn_t) (const void *sbuf, int root, struct ompi_communicator_t * comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag); + mca_coll_base_module_t * module); static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ireduce_algorithm_index[] = { {0, (uintptr_t)ompi_coll_adapt_ireduce_tuned}, @@ -263,14 +263,14 @@ static int send_cb(ompi_request_t * req) 
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: In send_cb, create isend to seg %d, peer %d, tag %d\n", send_context->con->rank, send_context->frag_id, send_context->peer, - (send_context->con->ireduce_tag << 16) + send_context->frag_id)); + send_context->con->ireduce_tag - send_context->frag_id)); ompi_request_t *send_req; err = MCA_PML_CALL(isend (send_context->buff, send_count, send_context->con->datatype, send_context->peer, - (context->con->ireduce_tag << 16) + send_context->frag_id, + context->con->ireduce_tag - send_context->frag_id, MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); if (MPI_SUCCESS != err) { return err; @@ -355,13 +355,13 @@ static int recv_cb(ompi_request_t * req) "[%d]: In recv_cb, create irecv for seg %d, peer %d, inbuf %p, tag %d\n", context->con->rank, recv_context->frag_id, recv_context->peer, (void *) inbuf, - (recv_context->con->ireduce_tag << 16) + recv_context->frag_id)); + recv_context->con->ireduce_tag - recv_context->frag_id)); ompi_request_t *recv_req; err = MCA_PML_CALL(irecv (temp_recv_buf, recv_count, recv_context->con->datatype, recv_context->peer, - (recv_context->con->ireduce_tag << 16) + recv_context->frag_id, + recv_context->con->ireduce_tag - recv_context->frag_id, recv_context->con->comm, &recv_req)); if (MPI_SUCCESS != err) { return err; @@ -460,14 +460,14 @@ static int recv_cb(ompi_request_t * req) OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: In recv_cb, create isend to seg %d, peer %d, tag %d\n", send_context->con->rank, send_context->frag_id, send_context->peer, - (send_context->con->ireduce_tag << 16) + send_context->frag_id)); + send_context->con->ireduce_tag - send_context->frag_id)); ompi_request_t *send_req; err = MCA_PML_CALL(isend (send_context->buff, send_count, send_context->con->datatype, send_context->peer, - (send_context->con->ireduce_tag << 16) + send_context->frag_id, + send_context->con->ireduce_tag - send_context->frag_id, 
MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); if (MPI_SUCCESS != err) { return err; @@ -524,12 +524,10 @@ int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi if (count == 0) { return MPI_SUCCESS; } - int ireduce_tag = opal_atomic_add_fetch_32(&(comm->c_ireduce_tag), 1); - ireduce_tag = (ireduce_tag % 4096) + 4096; OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, - "ireduce tag %d root %d, algorithm %d, coll_adapt_ireduce_segment_size %zu, coll_adapt_ireduce_max_send_requests %d, coll_adapt_ireduce_max_recv_requests %d\n", - ireduce_tag, root, mca_coll_adapt_component.adapt_ireduce_algorithm, + "ireduce root %d, algorithm %d, coll_adapt_ireduce_segment_size %zu, coll_adapt_ireduce_max_send_requests %d, coll_adapt_ireduce_max_recv_requests %d\n", + root, mca_coll_adapt_component.adapt_ireduce_algorithm, mca_coll_adapt_component.adapt_ireduce_segment_size, mca_coll_adapt_component.adapt_ireduce_max_send_requests, mca_coll_adapt_component.adapt_ireduce_max_recv_requests)); @@ -538,93 +536,78 @@ int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi (ompi_coll_adapt_ireduce_fn_t) ompi_coll_adapt_ireduce_algorithm_index[mca_coll_adapt_component. 
adapt_ireduce_algorithm].algorithm_fn_ptr; - return reduce_func(sbuf, rbuf, count, dtype, op, root, comm, request, module, ireduce_tag); + return reduce_func(sbuf, rbuf, count, dtype, op, root, comm, request, module); } /* * Ireduce functions with different algorithms */ int ompi_coll_adapt_ireduce_tuned(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module, int ireduce_tag) + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t *module) { OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n")); return OMPI_ERR_NOT_IMPLEMENTED; } int ompi_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag) + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { - ompi_coll_tree_t *tree = ompi_coll_base_topo_build_bmtree(comm, root); - int err = - ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, - tree, mca_coll_adapt_component.adapt_ireduce_segment_size, - ireduce_tag); - return err; + return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, + request, module, ompi_coll_base_topo_build_bmtree(comm, root), + mca_coll_adapt_component.adapt_ireduce_segment_size); } int ompi_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag) + struct 
ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t * module) { - ompi_coll_tree_t *tree = ompi_coll_base_topo_build_in_order_bmtree(comm, root); - int err = - ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, - tree, mca_coll_adapt_component.adapt_ireduce_segment_size, - ireduce_tag); - return err; + return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, + request, module, ompi_coll_base_topo_build_in_order_bmtree(comm, root), + mca_coll_adapt_component.adapt_ireduce_segment_size); } int ompi_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag) + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { - ompi_coll_tree_t *tree = ompi_coll_base_topo_build_tree(2, comm, root); - int err = - ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, - tree, mca_coll_adapt_component.adapt_ireduce_segment_size, - ireduce_tag); - return err; + return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, + request, module, ompi_coll_base_topo_build_tree(2, comm, root), + mca_coll_adapt_component.adapt_ireduce_segment_size); } int ompi_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag) + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { - ompi_coll_tree_t *tree = 
ompi_coll_base_topo_build_chain(1, comm, root); - int err = - ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, - tree, mca_coll_adapt_component.adapt_ireduce_segment_size, - ireduce_tag); - return err; + return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, + request, module, ompi_coll_base_topo_build_chain(1, comm, root), + mca_coll_adapt_component.adapt_ireduce_segment_size); } int ompi_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag) + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { - ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(4, comm, root); - int err = - ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, - tree, mca_coll_adapt_component.adapt_ireduce_segment_size, - ireduce_tag); - return err; + return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, + request, module, ompi_coll_base_topo_build_chain(4, comm, root), + mca_coll_adapt_component.adapt_ireduce_segment_size); } int ompi_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, int ireduce_tag) + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { int fanout = ompi_comm_size(comm) - 1; ompi_coll_tree_t *tree; @@ -635,19 +618,17 @@ int ompi_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, } else { tree = ompi_coll_base_topo_build_tree(MAXTREEFANOUT, 
comm, root); } - int err = - ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, - tree, mca_coll_adapt_component.adapt_ireduce_segment_size, - ireduce_tag); - return err; + return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, + request, module, tree, + mca_coll_adapt_component.adapt_ireduce_segment_size); } int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module, ompi_coll_tree_t * tree, - size_t seg_size, int ireduce_tag) + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module, ompi_coll_tree_t * tree, + size_t seg_size) { ptrdiff_t extent, lower_bound, segment_increment; @@ -777,12 +758,12 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, con->rbuf = (char *) rbuf; con->root = root; con->distance = distance; - con->ireduce_tag = ireduce_tag; + con->ireduce_tag = ompi_coll_base_nbc_reserve_tags(comm, num_segs); con->real_seg_size = real_seg_size; OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: start ireduce root %d tag %d\n", rank, tree->tree_root, - ireduce_tag)); + con->ireduce_tag)); /* If the current process is not leaf node */ if (tree->tree_nextsize > 0) { @@ -849,14 +830,14 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, "[%d]: In ireduce, create irecv for seg %d, peer %d, recv_count %d, inbuf %p tag %d\n", context->con->rank, context->frag_id, context->peer, recv_count, (void *) inbuf, - (ireduce_tag << 16) + seg_index)); + con->ireduce_tag - seg_index)); /* Create a recv request */ ompi_request_t *recv_req; err = MCA_PML_CALL(irecv (temp_recv_buf, recv_count, dtype, tree->tree_next[i], - (ireduce_tag << 16) + seg_index, 
comm, &recv_req)); + con->ireduce_tag - seg_index, comm, &recv_req)); if (MPI_SUCCESS != err) { return err; } @@ -908,14 +889,14 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: In ireduce, create isend to seg %d, peer %d, send_count %d tag %d\n", context->con->rank, context->frag_id, context->peer, - send_count, (ireduce_tag << 16) + context->frag_id)); + send_count, con->ireduce_tag - context->frag_id)); /* Create send request */ ompi_request_t *send_req; err = MCA_PML_CALL(isend (context->buff, send_count, dtype, tree->tree_prev, - (ireduce_tag << 16) + context->frag_id, + con->ireduce_tag - context->frag_id, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &send_req)); if (MPI_SUCCESS != err) { return err; diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index c83e46c2ddb..b54fc70664b 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -27,6 +27,8 @@ #include "ompi/mca/mca.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/request/request.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/base/coll_tags.h" #include "ompi/op/op.h" #include "ompi/mca/pml/pml.h" @@ -60,6 +62,22 @@ struct ompi_coll_base_nbc_request_t { OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_coll_base_nbc_request_t); +static inline int32_t +ompi_coll_base_nbc_reserve_tags(ompi_communicator_t* comm, int32_t reserve) +{ + int32_t tag, old_tag; + assert( reserve > 0 ); + reread_tag: /* In case we fail to atomically update the tag */ + tag = old_tag = comm->c_nbc_tag; + if ((tag - reserve) < MCA_COLL_BASE_TAG_NONBLOCKING_END) { + tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; + } + if( !OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_32(&comm->c_nbc_tag, &old_tag, tag - reserve) ) { + goto reread_tag; + } + return tag; +} + typedef struct ompi_coll_base_nbc_request_t ompi_coll_base_nbc_request_t; /** diff --git 
a/ompi/mca/coll/libnbc/coll_libnbc.h b/ompi/mca/coll/libnbc/coll_libnbc.h index 682010b6910..3bdeb9419fa 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc.h +++ b/ompi/mca/coll/libnbc/coll_libnbc.h @@ -94,7 +94,6 @@ struct ompi_coll_libnbc_module_t { mca_coll_base_module_t super; opal_mutex_t mutex; bool comm_registered; - int tag; #ifdef NBC_CACHE_SCHEDULE void *NBC_Dict[NBC_NUM_COLL]; /* this should point to a struct hb_tree, but since this is a diff --git a/ompi/mca/coll/libnbc/nbc.c b/ompi/mca/coll/libnbc/nbc.c index 171f5a37e9c..35e02fe87bf 100644 --- a/ompi/mca/coll/libnbc/nbc.c +++ b/ompi/mca/coll/libnbc/nbc.c @@ -25,7 +25,7 @@ * Additional copyrights may follow */ #include "nbc_internal.h" -#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/coll/base/coll_base_util.h" #include "ompi/op/op.h" #include "ompi/mca/pml/pml.h" @@ -595,7 +595,6 @@ void NBC_Return_handle(ompi_coll_libnbc_request_t *request) { } int NBC_Init_comm(MPI_Comm comm, NBC_Comminfo *comminfo) { - comminfo->tag= MCA_COLL_BASE_TAG_NONBLOCKING_BASE; #ifdef NBC_CACHE_SCHEDULE /* initialize the NBC_ALLTOALL SchedCache tree */ @@ -672,7 +671,7 @@ int NBC_Start(NBC_Handle *handle) { int NBC_Schedule_request(NBC_Schedule *schedule, ompi_communicator_t *comm, ompi_coll_libnbc_module_t *module, bool persistent, ompi_request_t **request, void *tmpbuf) { - int ret, tmp_tag; + int ret; bool need_register = false; ompi_coll_libnbc_request_t *handle; @@ -685,13 +684,7 @@ int NBC_Schedule_request(NBC_Schedule *schedule, ompi_communicator_t *comm, /* update the module->tag here because other processes may have operations * and they may update the module->tag */ - OPAL_THREAD_LOCK(&module->mutex); - tmp_tag = module->tag--; - if (tmp_tag == MCA_COLL_BASE_TAG_NONBLOCKING_END) { - tmp_tag = module->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; - NBC_DEBUG(2,"resetting tags ...\n"); - } - OPAL_THREAD_UNLOCK(&module->mutex); + (void)ompi_coll_base_nbc_reserve_tags(comm, 1); OBJ_RELEASE(schedule); 
free(tmpbuf); @@ -712,20 +705,15 @@ int NBC_Schedule_request(NBC_Schedule *schedule, ompi_communicator_t *comm, /******************** Do the tag and shadow comm administration ... ***************/ - OPAL_THREAD_LOCK(&module->mutex); - tmp_tag = module->tag--; - if (tmp_tag == MCA_COLL_BASE_TAG_NONBLOCKING_END) { - tmp_tag = module->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; - NBC_DEBUG(2,"resetting tags ...\n"); - } + handle->tag = ompi_coll_base_nbc_reserve_tags(comm, 1); + OPAL_THREAD_LOCK(&module->mutex); if (true != module->comm_registered) { module->comm_registered = true; need_register = true; } OPAL_THREAD_UNLOCK(&module->mutex); - handle->tag = tmp_tag; /* register progress */ if (need_register) { @@ -737,7 +725,6 @@ int NBC_Schedule_request(NBC_Schedule *schedule, ompi_communicator_t *comm, } handle->comm=comm; - /*printf("got module: %lu tag: %i\n", module, module->tag);*/ /******************** end of tag and shadow comm administration ... ***************/ handle->comminfo = module; From e59bde912e7da2e0d500ebf45e4521474051e8ff Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Fri, 8 May 2020 01:04:27 -0400 Subject: [PATCH 6/7] Remove the code handling zero count cases in ADAPT. Set request in ibcast.c to empty when the count is 0. 
Signed-off-by: Xi Luo Signed-off-by: George Bosilca --- ompi/mca/coll/adapt/coll_adapt_bcast.c | 3 --- ompi/mca/coll/adapt/coll_adapt_ibcast.c | 15 --------------- ompi/mca/coll/adapt/coll_adapt_ireduce.c | 4 ---- ompi/mca/coll/adapt/coll_adapt_reduce.c | 3 --- ompi/mpi/c/ibcast.c | 1 + 5 files changed, 1 insertion(+), 25 deletions(-) diff --git a/ompi/mca/coll/adapt/coll_adapt_bcast.c b/ompi/mca/coll/adapt/coll_adapt_bcast.c index 2497b6b9905..9cfebd97859 100644 --- a/ompi/mca/coll/adapt/coll_adapt_bcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_bcast.c @@ -15,9 +15,6 @@ int ompi_coll_adapt_bcast(void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - if (count == 0) { - return MPI_SUCCESS; - } ompi_request_t *request = NULL; int err = ompi_coll_adapt_ibcast(buff, count, datatype, root, comm, &request, module); if( MPI_SUCCESS != err ) { diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index 3a8555e7fd2..624e3955332 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -329,21 +329,6 @@ int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *dataty struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module) { - if (0 == count) { - ompi_request_t *temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); - temp_request->req_type = 0; - temp_request->req_free = ompi_coll_adapt_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; - ompi_request_complete(temp_request, 1); - *request = temp_request; - return MPI_SUCCESS; - } - OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ibcast root %d, algorithm %d, coll_adapt_ibcast_segment_size %zu, 
coll_adapt_ibcast_max_send_requests %d, coll_adapt_ibcast_max_recv_requests %d\n", root, mca_coll_adapt_component.adapt_ibcast_algorithm, diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index 63de926ef53..b105181be7a 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -521,10 +521,6 @@ int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t * module) { - if (count == 0) { - return MPI_SUCCESS; - } - OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "ireduce root %d, algorithm %d, coll_adapt_ireduce_segment_size %zu, coll_adapt_ireduce_max_send_requests %d, coll_adapt_ireduce_max_recv_requests %d\n", root, mca_coll_adapt_component.adapt_ireduce_algorithm, diff --git a/ompi/mca/coll/adapt/coll_adapt_reduce.c b/ompi/mca/coll/adapt/coll_adapt_reduce.c index 940673f7b97..e3559ec20df 100644 --- a/ompi/mca/coll/adapt/coll_adapt_reduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_reduce.c @@ -17,9 +17,6 @@ int ompi_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_ struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - if (count == 0) { - return MPI_SUCCESS; - } ompi_request_t *request = NULL; int err = ompi_coll_adapt_ireduce(sbuf, rbuf, count, dtype, op, root, comm, &request, module); if( MPI_SUCCESS != err ) { diff --git a/ompi/mpi/c/ibcast.c b/ompi/mpi/c/ibcast.c index f3f248e949b..f25f9d3ea95 100644 --- a/ompi/mpi/c/ibcast.c +++ b/ompi/mpi/c/ibcast.c @@ -100,6 +100,7 @@ int MPI_Ibcast(void *buffer, int count, MPI_Datatype datatype, if ((OMPI_COMM_IS_INTRA(comm) && ompi_comm_size(comm) <= 1) || 0 == count) { + *request = &ompi_request_empty; return MPI_SUCCESS; } From ee592f367277783c5761abb5b08645f55749b9f6 Mon Sep 17 00:00:00 2001 From: George Bosilca 
Date: Thu, 23 Jul 2020 16:39:36 -0400 Subject: [PATCH 7/7] Address the comments on the PR. Signed-off-by: George Bosilca --- ompi/mca/coll/adapt/coll_adapt_algorithms.h | 27 +-- ompi/mca/coll/adapt/coll_adapt_context.h | 2 +- ompi/mca/coll/adapt/coll_adapt_ibcast.c | 197 ++++++++++---------- ompi/mca/coll/adapt/coll_adapt_inbuf.h | 2 +- ompi/mca/coll/adapt/coll_adapt_ireduce.c | 123 ++++++------ 5 files changed, 183 insertions(+), 168 deletions(-) diff --git a/ompi/mca/coll/adapt/coll_adapt_algorithms.h b/ompi/mca/coll/adapt/coll_adapt_algorithms.h index 700adabea15..cfece373043 100644 --- a/ompi/mca/coll/adapt/coll_adapt_algorithms.h +++ b/ompi/mca/coll/adapt/coll_adapt_algorithms.h @@ -14,9 +14,15 @@ #include "ompi/mca/coll/base/coll_base_functions.h" #include +typedef int (*ompi_mca_coll_adapt_ibcast_function_t)(IBCAST_ARGS); +typedef int (*ompi_mca_coll_adapt_ireduce_function_t)(IREDUCE_ARGS); + typedef struct ompi_coll_adapt_algorithm_index_s { int algorithm_index; - uintptr_t algorithm_fn_ptr; + union { + ompi_mca_coll_adapt_ibcast_function_t ibcast_fn_ptr; + ompi_mca_coll_adapt_ireduce_function_t ireduce_fn_ptr; + }; } ompi_coll_adapt_algorithm_index_t; /* Bcast */ @@ -24,27 +30,10 @@ int ompi_coll_adapt_ibcast_register(void); int ompi_coll_adapt_ibcast_fini(void); int ompi_coll_adapt_bcast(BCAST_ARGS); int ompi_coll_adapt_ibcast(IBCAST_ARGS); -int ompi_coll_adapt_ibcast_generic(IBCAST_ARGS, - ompi_coll_tree_t * tree, size_t seg_size); -int ompi_coll_adapt_ibcast_binomial(IBCAST_ARGS); -int ompi_coll_adapt_ibcast_in_order_binomial(IBCAST_ARGS); -int ompi_coll_adapt_ibcast_binary(IBCAST_ARGS); -int ompi_coll_adapt_ibcast_pipeline(IBCAST_ARGS); -int ompi_coll_adapt_ibcast_chain(IBCAST_ARGS); -int ompi_coll_adapt_ibcast_linear(IBCAST_ARGS); -int ompi_coll_adapt_ibcast_tuned(IBCAST_ARGS); /* Reduce */ int ompi_coll_adapt_ireduce_register(void); int ompi_coll_adapt_ireduce_fini(void); int ompi_coll_adapt_reduce(REDUCE_ARGS); int 
ompi_coll_adapt_ireduce(IREDUCE_ARGS); -int ompi_coll_adapt_ireduce_generic(IREDUCE_ARGS, - ompi_coll_tree_t * tree, size_t seg_size); -int ompi_coll_adapt_ireduce_tuned(IREDUCE_ARGS); -int ompi_coll_adapt_ireduce_binomial(IREDUCE_ARGS); -int ompi_coll_adapt_ireduce_in_order_binomial(IREDUCE_ARGS); -int ompi_coll_adapt_ireduce_binary(IREDUCE_ARGS); -int ompi_coll_adapt_ireduce_pipeline(IREDUCE_ARGS); -int ompi_coll_adapt_ireduce_chain(IREDUCE_ARGS); -int ompi_coll_adapt_ireduce_linear(IREDUCE_ARGS); + diff --git a/ompi/mca/coll/adapt/coll_adapt_context.h b/ompi/mca/coll/adapt/coll_adapt_context.h index eea98fb872e..e96ad5266ff 100644 --- a/ompi/mca/coll/adapt/coll_adapt_context.h +++ b/ompi/mca/coll/adapt/coll_adapt_context.h @@ -89,7 +89,7 @@ struct ompi_coll_adapt_constant_reduce_context_s { /* Mutex to protect num_sent */ opal_mutex_t *mutex_num_sent; /* Mutex to protect each segment when do the reduce op */ - opal_mutex_t **mutex_op_list; + opal_mutex_t *mutex_op_list; /* Reduce operation */ ompi_op_t *op; ompi_coll_tree_t *tree; diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index 624e3955332..35a4dda8ee5 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -20,6 +20,15 @@ #include "opal/sys/atomic.h" #include "ompi/mca/pml/ob1/pml_ob1.h" +static int ompi_coll_adapt_ibcast_generic(IBCAST_ARGS, + ompi_coll_tree_t * tree, size_t seg_size); +static int ompi_coll_adapt_ibcast_binomial(IBCAST_ARGS); +static int ompi_coll_adapt_ibcast_in_order_binomial(IBCAST_ARGS); +static int ompi_coll_adapt_ibcast_binary(IBCAST_ARGS); +static int ompi_coll_adapt_ibcast_pipeline(IBCAST_ARGS); +static int ompi_coll_adapt_ibcast_chain(IBCAST_ARGS); +static int ompi_coll_adapt_ibcast_linear(IBCAST_ARGS); +static int ompi_coll_adapt_ibcast_tuned(IBCAST_ARGS); typedef int (*ompi_coll_adapt_ibcast_fn_t) (void *buff, int count, @@ -30,13 +39,13 @@ typedef int (*ompi_coll_adapt_ibcast_fn_t) 
(void *buff, mca_coll_base_module_t * module); static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ibcast_algorithm_index[] = { - {0, (uintptr_t) ompi_coll_adapt_ibcast_tuned}, - {1, (uintptr_t) ompi_coll_adapt_ibcast_binomial}, - {2, (uintptr_t) ompi_coll_adapt_ibcast_in_order_binomial}, - {3, (uintptr_t) ompi_coll_adapt_ibcast_binary}, - {4, (uintptr_t) ompi_coll_adapt_ibcast_pipeline}, - {5, (uintptr_t) ompi_coll_adapt_ibcast_chain}, - {6, (uintptr_t) ompi_coll_adapt_ibcast_linear}, + {0, {ompi_coll_adapt_ibcast_tuned}}, + {1, {ompi_coll_adapt_ibcast_binomial}}, + {2, {ompi_coll_adapt_ibcast_in_order_binomial}}, + {3, {ompi_coll_adapt_ibcast_binary}}, + {4, {ompi_coll_adapt_ibcast_pipeline}}, + {5, {ompi_coll_adapt_ibcast_chain}}, + {6, {ompi_coll_adapt_ibcast_linear}}, }; /* @@ -51,6 +60,10 @@ int ompi_coll_adapt_ibcast_register(void) "Algorithm of broadcast, 0: tuned, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_adapt_component.adapt_ibcast_algorithm); + if( (mca_coll_adapt_component.adapt_ibcast_algorithm < 0) || + (mca_coll_adapt_component.adapt_ibcast_algorithm > (int32_t)(sizeof(ompi_coll_adapt_ibcast_algorithm_index) / sizeof(ompi_coll_adapt_algorithm_index_t))) ) { + mca_coll_adapt_component.adapt_ibcast_algorithm = 1; + } mca_coll_adapt_component.adapt_ibcast_segment_size = 0; mca_base_component_var_register(c, "bcast_segment_size", @@ -107,7 +120,6 @@ static int ibcast_request_fini(ompi_coll_adapt_bcast_context_t * context) } OBJ_RELEASE(context->con->mutex); OBJ_RELEASE(context->con); - OBJ_RELEASE(context->con); opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list, (opal_free_list_item_t *) context); ompi_request_complete(temp_req, 1); @@ -122,7 +134,6 @@ static int send_cb(ompi_request_t * req) { ompi_coll_adapt_bcast_context_t *context = (ompi_coll_adapt_bcast_context_t *) 
req->req_complete_cb_data; - int err; OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, @@ -134,19 +145,17 @@ static int send_cb(ompi_request_t * req) int sent_id = context->con->send_array[context->child_id]; /* If the current process has fragments in recv_array can be sent */ if (sent_id < context->con->num_recv_segs) { - ompi_request_t *send_req; ompi_coll_adapt_bcast_context_t *send_context; - opal_free_list_t *free_list; int new_id = context->con->recv_array[sent_id]; - free_list = mca_coll_adapt_component.adapt_ibcast_context_free_list; - send_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(free_list); + ompi_request_t *send_req; + + send_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component.adapt_ibcast_context_free_list); send_context->buff = context->buff + (new_id - context->frag_id) * context->con->real_seg_size; send_context->frag_id = new_id; send_context->child_id = context->child_id; send_context->peer = context->peer; send_context->con = context->con; - OBJ_RETAIN(context->con); int send_count = send_context->con->seg_count; if (new_id == (send_context->con->num_segs - 1)) { send_count = send_context->con->count - new_id * send_context->con->seg_count; @@ -158,38 +167,42 @@ static int send_cb(ompi_request_t * req) ompi_comm_rank(send_context->con->comm), send_context->frag_id, send_context->peer, (void *) send_context->buff, send_count, send_context->con->ibcast_tag - new_id)); - err = - MCA_PML_CALL(isend - (send_buff, send_count, send_context->con->datatype, send_context->peer, - send_context->con->ibcast_tag - new_id, - MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); + err = MCA_PML_CALL(isend + (send_buff, send_count, send_context->con->datatype, send_context->peer, + send_context->con->ibcast_tag - new_id, + MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); if (MPI_SUCCESS != err) { + 
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list, + (opal_free_list_item_t *)send_context); OPAL_THREAD_UNLOCK(context->con->mutex); + OBJ_RELEASE(context->con); return err; } - /* Invoke send call back */ + /* Set send callback */ OPAL_THREAD_UNLOCK(context->con->mutex); ompi_request_set_callback(send_req, send_cb, send_context); OPAL_THREAD_LOCK(context->con->mutex); + } else { + /* No future send here, we can release the ref */ + OBJ_RELEASE(context->con); } int num_sent = ++(context->con->num_sent_segs); - int num_recv_fini_t = context->con->num_recv_fini; + int num_recv_fini = context->con->num_recv_fini; int rank = ompi_comm_rank(context->con->comm); /* Check whether signal the condition */ if ((rank == context->con->root && num_sent == context->con->tree->tree_nextsize * context->con->num_segs) || (context->con->tree->tree_nextsize > 0 && rank != context->con->root && num_sent == context->con->tree->tree_nextsize * context->con->num_segs - && num_recv_fini_t == context->con->num_segs) || (context->con->tree->tree_nextsize == 0 - && num_recv_fini_t == - context->con->num_segs)) { + && num_recv_fini == context->con->num_segs) + || (context->con->tree->tree_nextsize == 0 + && num_recv_fini == context->con->num_segs)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n", ompi_comm_rank(context->con->comm))); OPAL_THREAD_UNLOCK(context->con->mutex); ibcast_request_fini(context); } else { - OBJ_RELEASE(context->con); opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list, (opal_free_list_item_t *) context); OPAL_THREAD_UNLOCK(context->con->mutex); @@ -216,18 +229,16 @@ static int recv_cb(ompi_request_t * req) /* Store the frag_id to seg array */ OPAL_THREAD_LOCK(context->con->mutex); - int num_recv_segs_t = ++(context->con->num_recv_segs); - context->con->recv_array[num_recv_segs_t - 1] = context->frag_id; + int num_recv_segs = ++(context->con->num_recv_segs); + 
context->con->recv_array[num_recv_segs - 1] = context->frag_id; - opal_free_list_t *free_list; - int new_id = num_recv_segs_t + mca_coll_adapt_component.adapt_ibcast_max_recv_requests - 1; + int new_id = num_recv_segs + mca_coll_adapt_component.adapt_ibcast_max_recv_requests - 1; /* Receive new segment */ if (new_id < context->con->num_segs) { ompi_request_t *recv_req; ompi_coll_adapt_bcast_context_t *recv_context; - free_list = mca_coll_adapt_component.adapt_ibcast_context_free_list; /* Get new context item from free list */ - recv_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(free_list); + recv_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component.adapt_ibcast_context_free_list); recv_context->buff = context->buff + (new_id - context->frag_id) * context->con->real_seg_size; recv_context->frag_id = new_id; @@ -250,16 +261,16 @@ static int recv_cb(ompi_request_t * req) recv_context->con->ibcast_tag - recv_context->frag_id, recv_context->con->comm, &recv_req)); - /* Invoke recvive call back */ + /* Set the receive callback */ OPAL_THREAD_UNLOCK(context->con->mutex); ompi_request_set_callback(recv_req, recv_cb, recv_context); OPAL_THREAD_LOCK(context->con->mutex); } - /* Send segment to its children */ + /* Propagate segment to all children */ for (i = 0; i < context->con->tree->tree_nextsize; i++) { /* If the current process can send the segment now, which means the only segment need to be sent is the just arrived one */ - if (num_recv_segs_t - 1 == context->con->send_array[i]) { + if (num_recv_segs - 1 == context->con->send_array[i]) { ompi_request_t *send_req; int send_count = context->con->seg_count; if (context->frag_id == (context->con->num_segs - 1)) { @@ -267,8 +278,7 @@ static int recv_cb(ompi_request_t * req) } ompi_coll_adapt_bcast_context_t *send_context; - free_list = mca_coll_adapt_component.adapt_ibcast_context_free_list; - send_context = (ompi_coll_adapt_bcast_context_t *) 
opal_free_list_wait(free_list); + send_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component.adapt_ibcast_context_free_list); send_context->buff = context->buff; send_context->frag_id = context->frag_id; send_context->child_id = i; @@ -289,18 +299,23 @@ static int recv_cb(ompi_request_t * req) send_context->con->ibcast_tag - send_context->frag_id, MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req)); if (MPI_SUCCESS != err) { + opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list, + (opal_free_list_item_t *)send_context); OPAL_THREAD_UNLOCK(context->con->mutex); + OBJ_RELEASE(context->con); return err; } - /* Invoke send call back */ + /* Set send callback */ OPAL_THREAD_UNLOCK(context->con->mutex); ompi_request_set_callback(send_req, send_cb, send_context); OPAL_THREAD_LOCK(context->con->mutex); } } + OBJ_RELEASE(context->con); + int num_sent = context->con->num_sent_segs; - int num_recv_fini_t = ++(context->con->num_recv_fini); + int num_recv_fini = ++(context->con->num_recv_fini); int rank = ompi_comm_rank(context->con->comm); /* If this process is leaf and has received all the segments */ @@ -308,15 +323,14 @@ static int recv_cb(ompi_request_t * req) && num_sent == context->con->tree->tree_nextsize * context->con->num_segs) || (context->con->tree->tree_nextsize > 0 && rank != context->con->root && num_sent == context->con->tree->tree_nextsize * context->con->num_segs - && num_recv_fini_t == context->con->num_segs) || (context->con->tree->tree_nextsize == 0 - && num_recv_fini_t == - context->con->num_segs)) { + && num_recv_fini == context->con->num_segs) + || (context->con->tree->tree_nextsize == 0 + && num_recv_fini == context->con->num_segs)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n", ompi_comm_rank(context->con->comm))); OPAL_THREAD_UNLOCK(context->con->mutex); ibcast_request_fini(context); } else { - OBJ_RELEASE(context->con); 
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list, (opal_free_list_item_t *) context); OPAL_THREAD_UNLOCK(context->con->mutex); @@ -337,85 +351,80 @@ int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *dataty mca_coll_adapt_component.adapt_ibcast_max_recv_requests)); ompi_coll_adapt_ibcast_fn_t bcast_func = - (ompi_coll_adapt_ibcast_fn_t) - ompi_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm]. - algorithm_fn_ptr; + ompi_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm].ibcast_fn_ptr; return bcast_func(buff, count, datatype, root, comm, request, module); } /* * Ibcast functions with different algorithms */ -int ompi_coll_adapt_ibcast_tuned(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) +static int +ompi_coll_adapt_ibcast_tuned(void *buff, int count, struct ompi_datatype_t *datatype, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t *module) { OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n")); return OMPI_ERR_NOT_IMPLEMENTED; } -int ompi_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_bmtree(comm, root); - int err = - ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size); - return err; + return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, 
comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size); } -int ompi_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_in_order_bmtree(comm, root); - int err = - ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size); - return err; + return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size); } -int ompi_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_tree(2, comm, root); - int err = - ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size); - return err; + return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size); } -int ompi_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ibcast_pipeline(void *buff, 
int count, struct ompi_datatype_t *datatype, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(1, comm, root); - int err = - ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size); - return err; + return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size); } -int ompi_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(4, comm, root); - int err = - ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size); - return err; + return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size); } -int ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { int fanout = ompi_comm_size(comm) - 1; ompi_coll_tree_t *tree; @@ -426,10 +435,8 @@ int ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t } else { tree = ompi_coll_base_topo_build_tree(MAXTREEFANOUT, 
comm, root); } - int err = - ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, - mca_coll_adapt_component.adapt_ibcast_segment_size); - return err; + return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree, + mca_coll_adapt_component.adapt_ibcast_segment_size); } @@ -482,7 +489,7 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t temp_request = OBJ_NEW(ompi_request_t); OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = ompi_coll_adapt_request_free; temp_request->req_status.MPI_SOURCE = 0; temp_request->req_status.MPI_TAG = 0; @@ -593,7 +600,7 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t if (MPI_SUCCESS != err) { return err; } - /* Invoke send call back */ + /* Set send callback */ OPAL_THREAD_UNLOCK(mutex); ompi_request_set_callback(send_req, send_cb, context); OPAL_THREAD_LOCK(mutex); @@ -650,7 +657,7 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t if (MPI_SUCCESS != err) { return err; } - /* Invoke receive call back */ + /* Set receive callback */ OPAL_THREAD_UNLOCK(mutex); ompi_request_set_callback(recv_req, recv_cb, context); OPAL_THREAD_LOCK(mutex); diff --git a/ompi/mca/coll/adapt/coll_adapt_inbuf.h b/ompi/mca/coll/adapt/coll_adapt_inbuf.h index 93c3060333b..d339256b856 100644 --- a/ompi/mca/coll/adapt/coll_adapt_inbuf.h +++ b/ompi/mca/coll/adapt/coll_adapt_inbuf.h @@ -16,7 +16,7 @@ struct ompi_coll_adapt_inbuf_s { opal_free_list_item_t super; - char buff[1]; + char buff[]; }; typedef struct ompi_coll_adapt_inbuf_s ompi_coll_adapt_inbuf_t; diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index b105181be7a..230c9a60cb8 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ 
b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -22,6 +22,16 @@ #include "ompi/mca/coll/base/coll_base_functions.h" #include "ompi/mca/coll/base/coll_base_topo.h" +static int ompi_coll_adapt_ireduce_tuned(IREDUCE_ARGS); +static int ompi_coll_adapt_ireduce_binomial(IREDUCE_ARGS); +static int ompi_coll_adapt_ireduce_in_order_binomial(IREDUCE_ARGS); +static int ompi_coll_adapt_ireduce_binary(IREDUCE_ARGS); +static int ompi_coll_adapt_ireduce_pipeline(IREDUCE_ARGS); +static int ompi_coll_adapt_ireduce_chain(IREDUCE_ARGS); +static int ompi_coll_adapt_ireduce_linear(IREDUCE_ARGS); +static int ompi_coll_adapt_ireduce_generic(IREDUCE_ARGS, + ompi_coll_tree_t * tree, size_t seg_size); + /* MPI_Reduce and MPI_Ireduce in the ADAPT module only work for commutative operations */ typedef int (*ompi_coll_adapt_ireduce_fn_t) (const void *sbuf, @@ -35,13 +45,13 @@ typedef int (*ompi_coll_adapt_ireduce_fn_t) (const void *sbuf, mca_coll_base_module_t * module); static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ireduce_algorithm_index[] = { - {0, (uintptr_t)ompi_coll_adapt_ireduce_tuned}, - {1, (uintptr_t) ompi_coll_adapt_ireduce_binomial}, - {2, (uintptr_t) ompi_coll_adapt_ireduce_in_order_binomial}, - {3, (uintptr_t) ompi_coll_adapt_ireduce_binary}, - {4, (uintptr_t) ompi_coll_adapt_ireduce_pipeline}, - {5, (uintptr_t) ompi_coll_adapt_ireduce_chain}, - {6, (uintptr_t) ompi_coll_adapt_ireduce_linear}, + {0, {.ireduce_fn_ptr = ompi_coll_adapt_ireduce_tuned}}, + {1, {.ireduce_fn_ptr = ompi_coll_adapt_ireduce_binomial}}, + {2, {.ireduce_fn_ptr = ompi_coll_adapt_ireduce_in_order_binomial}}, + {3, {.ireduce_fn_ptr = ompi_coll_adapt_ireduce_binary}}, + {4, {.ireduce_fn_ptr = ompi_coll_adapt_ireduce_pipeline}}, + {5, {.ireduce_fn_ptr = ompi_coll_adapt_ireduce_chain}}, + {6, {.ireduce_fn_ptr = ompi_coll_adapt_ireduce_linear}}, }; /* @@ -56,6 +66,10 @@ int ompi_coll_adapt_ireduce_register(void) "Algorithm of reduce, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: 
linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_adapt_component.adapt_ireduce_algorithm); + if( (mca_coll_adapt_component.adapt_ireduce_algorithm < 0) || + (mca_coll_adapt_component.adapt_ireduce_algorithm > (int32_t)(sizeof(ompi_coll_adapt_ireduce_algorithm_index) / sizeof(ompi_coll_adapt_algorithm_index_t))) ) { + mca_coll_adapt_component.adapt_ireduce_algorithm = 1; + } mca_coll_adapt_component.adapt_ireduce_segment_size = 163740; mca_base_component_var_register(c, "reduce_segment_size", @@ -197,7 +211,7 @@ static int ireduce_request_fini(ompi_coll_adapt_reduce_context_t * context) } OBJ_RELEASE(context->con->recv_list); for (i = 0; i < context->con->num_segs; i++) { - OBJ_RELEASE(context->con->mutex_op_list[i]); + OBJ_DESTRUCT(&context->con->mutex_op_list[i]); } free(context->con->mutex_op_list); OBJ_RELEASE(context->con->mutex_num_recv_segs); @@ -279,7 +293,7 @@ static int send_cb(ompi_request_t * req) /* Release the item */ OBJ_RELEASE(item); - /* Invoke send call back */ + /* Set the send callback */ ompi_request_set_callback(send_req, send_cb, send_context); } @@ -366,7 +380,7 @@ static int recv_cb(ompi_request_t * req) if (MPI_SUCCESS != err) { return err; } - /* Invoke receive call back */ + /* Set the receive callback */ ompi_request_set_callback(recv_req, recv_cb, recv_context); } @@ -377,7 +391,7 @@ static int recv_cb(ompi_request_t * req) } int keep_inbuf = 0; - OPAL_THREAD_LOCK(context->con->mutex_op_list[context->frag_id]); + OPAL_THREAD_LOCK(&context->con->mutex_op_list[context->frag_id]); if (context->con->accumbuf[context->frag_id] == NULL) { if (context->inbuf == NULL) { OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, @@ -424,7 +438,7 @@ static int recv_cb(ompi_request_t * req) } } - OPAL_THREAD_UNLOCK(context->con->mutex_op_list[context->frag_id]); + OPAL_THREAD_UNLOCK(&context->con->mutex_op_list[context->frag_id]); /* Set recv list */ if (context->con->rank != 
context->con->tree->tree_root) { @@ -474,7 +488,7 @@ static int recv_cb(ompi_request_t * req) } OBJ_RELEASE(item); - /* Invoke send call back */ + /* Set the send callback */ ompi_request_set_callback(send_req, send_cb, send_context); } } @@ -529,60 +543,63 @@ int ompi_coll_adapt_ireduce(const void *sbuf, void *rbuf, int count, struct ompi mca_coll_adapt_component.adapt_ireduce_max_recv_requests)); ompi_coll_adapt_ireduce_fn_t reduce_func = - (ompi_coll_adapt_ireduce_fn_t) - ompi_coll_adapt_ireduce_algorithm_index[mca_coll_adapt_component. - adapt_ireduce_algorithm].algorithm_fn_ptr; + ompi_coll_adapt_ireduce_algorithm_index[mca_coll_adapt_component.adapt_ireduce_algorithm].ireduce_fn_ptr; return reduce_func(sbuf, rbuf, count, dtype, op, root, comm, request, module); } /* * Ireduce functions with different algorithms */ -int ompi_coll_adapt_ireduce_tuned(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module) +static int +ompi_coll_adapt_ireduce_tuned(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t *module) { OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n")); return OMPI_ERR_NOT_IMPLEMENTED; } -int ompi_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ireduce_binomial(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, 
count, dtype, op, root, comm, request, module, ompi_coll_base_topo_build_bmtree(comm, root), mca_coll_adapt_component.adapt_ireduce_segment_size); } -int ompi_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ireduce_in_order_binomial(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t * module) { return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, ompi_coll_base_topo_build_in_order_bmtree(comm, root), mca_coll_adapt_component.adapt_ireduce_segment_size); } -int ompi_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ireduce_binary(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, ompi_coll_base_topo_build_tree(2, comm, root), mca_coll_adapt_component.adapt_ireduce_segment_size); } -int ompi_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, 
ompi_request_t ** request, + mca_coll_base_module_t * module) { return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, ompi_coll_base_topo_build_chain(1, comm, root), @@ -590,20 +607,22 @@ int ompi_coll_adapt_ireduce_pipeline(const void *sbuf, void *rbuf, int count, } -int ompi_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ireduce_chain(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { return ompi_coll_adapt_ireduce_generic(sbuf, rbuf, count, dtype, op, root, comm, request, module, ompi_coll_base_topo_build_chain(4, comm, root), mca_coll_adapt_component.adapt_ireduce_segment_size); } -int ompi_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, ompi_request_t ** request, - mca_coll_base_module_t * module) +static int +ompi_coll_adapt_ireduce_linear(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, ompi_request_t ** request, + mca_coll_base_module_t * module) { int fanout = ompi_comm_size(comm) - 1; ompi_coll_tree_t *tree; @@ -640,7 +659,7 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, opal_mutex_t *mutex_recv_list; opal_mutex_t *mutex_num_recv_segs; opal_mutex_t *mutex_num_sent; - opal_mutex_t **mutex_op_list; + opal_mutex_t *mutex_op_list; /* A list to store the segments need to be sent */ opal_list_t *recv_list; @@ -706,7 +725,7 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, temp_request = 
OBJ_NEW(ompi_request_t); OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = ompi_coll_adapt_request_free; temp_request->req_status.MPI_SOURCE = 0; temp_request->req_status.MPI_TAG = 0; @@ -718,9 +737,9 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, /* Set up mutex */ mutex_recv_list = OBJ_NEW(opal_mutex_t); mutex_num_recv_segs = OBJ_NEW(opal_mutex_t); - mutex_op_list = (opal_mutex_t **) malloc(sizeof(opal_mutex_t *) * num_segs); + mutex_op_list = (opal_mutex_t *) malloc(sizeof(opal_mutex_t) * num_segs); for (i = 0; i < num_segs; i++) { - mutex_op_list[i] = OBJ_NEW(opal_mutex_t); + OBJ_CONSTRUCT(&mutex_op_list[i], opal_mutex_t); } mutex_num_sent = OBJ_NEW(opal_mutex_t); /* Create recv_list */ @@ -837,7 +856,7 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, if (MPI_SUCCESS != err) { return err; } - /* Invoke recv call back */ + /* Set the recv callback */ ompi_request_set_callback(recv_req, recv_cb, context); } } @@ -899,7 +918,7 @@ int ompi_coll_adapt_ireduce_generic(const void *sbuf, void *rbuf, int count, } OBJ_RELEASE(item); - /* Invoke send call back */ + /* Set the send callback */ ompi_request_set_callback(send_req, send_cb, context); } }