diff --git a/config/ompi_check_ucg.m4 b/config/ompi_check_ucg.m4 new file mode 100644 index 0000000000000000000000000000000000000000..8e2d376f5a4c6f28be01951d6c926eb983af6a68 --- /dev/null +++ b/config/ompi_check_ucg.m4 @@ -0,0 +1,66 @@ +dnl -*- shell-script -*- +dnl +dnl Copyright (c) 2022 Huawei Technologies Co., Ltd. All rights reserved. +dnl +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + +# OMPI_CHECK_UCG(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +# check if ucg support can be found. sets prefix_{CPPFLAGS, +# LDFLAGS, LIBS} as needed and runs action-if-found if there is +# support, otherwise executes action-if-not-found +AC_DEFUN([OMPI_CHECK_UCG],[ + OPAL_VAR_SCOPE_PUSH([ompi_check_ucg_happy CPPFLAGS_save LDFLAGS_save LIBS_save]) + + m4_ifblank([$1], [m4_fatal([First argument to OMPI_CHECK_UCG cannot be blank])]) + + AC_ARG_WITH([ucg], + [AS_HELP_STRING([--with-ucg(=DIR)], + [Build UCG (Unified Collective Group)])]) + + AS_IF([test "$with_ucg" != "no"], + [CPPFLAGS_save=$CPPFLAGS + LDFLAGS_save=$LDFLAGS + LIBS_save=$LIBS + + OPAL_LOG_MSG([$1_CPPFLAGS : $$1_CPPFLAGS], 1) + OPAL_LOG_MSG([$1_LDFLAGS : $$1_LDFLAGS], 1) + OPAL_LOG_MSG([$1_LIBS : $$1_LIBS], 1) + + OAC_CHECK_PACKAGE([ucg], + [$1], + [ucg/api/ucg.h], + [ucg], + [ucg_cleanup], + [ompi_check_ucg_happy="yes"], + [ompi_check_ucg_happy="no"]) + AS_IF([test "$ompi_check_ucg_happy" = "yes"], + [ + CPPFLAGS=$coll_ucg_CPPFLAGS + LDFLAGS=$coll_ucg_LDFLAGS + LIBS=$coll_ucg_LIBS + ], + []) + + CPPFLAGS=$CPPFLAGS_save + LDFLAGS=$LDFLAGS_save + LIBS=$LIBS_save], + [ompi_check_ucg_happy=no]) + + AS_IF([test "$ompi_check_ucg_happy" = "yes" && test "$enable_progress_threads" = "yes"], + [AC_MSG_WARN([ucg driver does not currently support progress threads. Disabling UCG.]) + ompi_check_ucg_happy="no"]) + + AS_IF([test "$ompi_check_ucg_happy" = "yes"], + [$2], + [AS_IF([test ! 
-z "$with_ucg" && test "$with_ucg" != "no"], + [AC_MSG_ERROR([UCG support requested but not found. Aborting])]) + $3]) + + OPAL_VAR_SCOPE_POP +]) \ No newline at end of file diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index 4c2cd35a412d52b77bfb4e1b95c80a1ae0a0a87f..482287163ecf189f6a7973a2ea38b45f6fc6ab4b 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -91,8 +91,7 @@ AC_DEFUN([OMPI_CHECK_UCX],[ [#include ]) AC_CHECK_DECLS([ucp_ep_flush_nb, ucp_worker_flush_nb, ucp_request_check_status, ucp_put_nb, ucp_get_nb, - ucp_put_nbx, ucp_get_nbx, ucp_atomic_op_nbx, - ucp_ep_flush_nbx], + ucp_put_nbx, ucp_get_nbx, ucp_atomic_op_nbx], [], [], [#include ]) AC_CHECK_DECLS([ucm_test_events, @@ -107,7 +106,6 @@ AC_DEFUN([OMPI_CHECK_UCX],[ UCP_ATOMIC_FETCH_OP_FXOR, UCP_PARAM_FIELD_ESTIMATED_NUM_PPN, UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK, - UCP_OP_ATTR_FLAG_MULTI_SEND, UCP_MEM_MAP_SYMMETRIC_RKEY, UCS_MEMORY_TYPE_RDMA], [], [], diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index 4aa521fdefb6ca31c2b7455dcd20f1a76822bd94..31df5a37592d60b7b50cc14518e28c3e66fc205c 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -114,8 +114,7 @@ OBJ_CLASS_INSTANCE(ompi_instance_t, opal_infosubscriber_t, ompi_instance_constru static mca_base_framework_t *ompi_framework_dependencies[] = { &ompi_hook_base_framework, &ompi_op_base_framework, &opal_allocator_base_framework, &opal_rcache_base_framework, &opal_mpool_base_framework, &opal_smsc_base_framework, - &ompi_bml_base_framework, &ompi_pml_base_framework, &ompi_coll_base_framework, - &ompi_osc_base_framework, NULL, + &ompi_bml_base_framework, &ompi_pml_base_framework, NULL, }; static mca_base_framework_t *ompi_lazy_frameworks[] = { @@ -519,6 +518,25 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("mca_pml_base_select() failed", ret); } + ret = mca_base_framework_open (&ompi_coll_base_framework, 0); + if 
(OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + char error_msg[256]; + snprintf (error_msg, sizeof(error_msg), "mca_base_framework_open on %s_%s failed", + (&ompi_coll_base_framework)->framework_project, + (&ompi_coll_base_framework)->framework_name); + return ompi_instance_print_error (error_msg, ret); + } + + ret = mca_base_framework_open (&ompi_osc_base_framework, 0); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + char error_msg[256]; + snprintf (error_msg, sizeof(error_msg), "mca_base_framework_open on %s_%s failed", + (&ompi_osc_base_framework)->framework_project, + (&ompi_osc_base_framework)->framework_name); + return ompi_instance_print_error (error_msg, ret); + } + + OMPI_TIMING_IMPORT_OPAL("orte_init"); OMPI_TIMING_NEXT("rte_init-commit"); diff --git a/ompi/mca/coll/ucg/Makefile.am b/ompi/mca/coll/ucg/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..925f41c9f57e40043253a424873cf2eb7f16ae33 --- /dev/null +++ b/ompi/mca/coll/ucg/Makefile.am @@ -0,0 +1,52 @@ +# +# Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = $(coll_ucg_CPPFLAGS) -DCOLL_UCG_HOME=\"$(coll_ucg_HOME)\" $(coll_ucg_extra_CPPFLAGS) + +#dist_ompidata_DATA = help-coll-ucg.txt +coll_ucg_sources = \ + coll_ucg.h \ + coll_ucg_debug.h \ + coll_ucg_request.h \ + coll_ucg_dt.h \ + coll_ucg_component.c \ + coll_ucg_module.c \ + coll_ucg_request.c \ + coll_ucg_dt.c \ + coll_ucg_allreduce.c \ + coll_ucg_barrier.c \ + coll_ucg_bcast.c \ + coll_ucg_alltoallv.c \ + coll_ucg_scatterv.c \ + coll_ucg_gatherv.c \ + coll_ucg_allgatherv.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_ompi_coll_ucg_DSO +component_noinst = +component_install = mca_coll_ucg.la +else +component_noinst = libmca_coll_ucg.la +component_install = +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_ucg_la_SOURCES = $(coll_ucg_sources) +mca_coll_ucg_la_LIBADD = $(coll_ucg_LIBS) +mca_coll_ucg_la_LDFLAGS = -module -avoid-version $(coll_ucg_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_ucg_la_SOURCES =$(coll_ucg_sources) +libmca_coll_ucg_la_LIBADD = $(coll_ucg_LIBS) +libmca_coll_ucg_la_LDFLAGS = -module -avoid-version $(coll_ucg_LDFLAGS) diff --git a/ompi/mca/coll/ucg/coll_ucg.h b/ompi/mca/coll/ucg/coll_ucg.h new file mode 100644 index 0000000000000000000000000000000000000000..7b97b99870029d18075e106a33280d8843f7a195 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg.h @@ -0,0 +1,317 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. + * All rights reserved. 
+ * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#ifndef MCA_COLL_UCG_H +#define MCA_COLL_UCG_H + +#include "ompi_config.h" +#include "opal/class/opal_free_list.h" +#include "opal/class/opal_list.h" +#include "ompi/communicator/communicator.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/datatype/ompi_datatype_internal.h" +#include "ompi/op/op.h" + +#include +#include + +BEGIN_C_DECLS + +#ifndef container_of + #define container_of(ptr, type, member) ((type *) (((char *) (ptr)) - offsetof(type, member))) +#endif + +#if OMPI_MAJOR_VERSION < 5 + #define PMIX_NODEID OPAL_PMIX_NODEID + #define PMIX_UINT32 OPAL_UINT32 + #define PMIX_LOCALITY_STRING OPAL_PMIX_LOCALITY_STRING + #define PMIX_STRING OPAL_STRING +#endif + +typedef struct { + /** Base coll component */ + mca_coll_base_component_t super; + bool initialized; + + /** MCA parameter */ + int priority; /* Priority of this component */ + int verbose; /* Verbose level of this component */ + int max_rcache_size; /* Max size of request cache */ + char *disable_coll; /* JUST FOR TEST, may remove later */ + char *topology; /* Topology file path */ + int npolls; /* test progress npolls */ + + ucg_context_h ucg_context; + + char **blacklist; /** disabled collective operations */ +} mca_coll_ucg_component_t; +OMPI_DECLSPEC extern mca_coll_ucg_component_t mca_coll_ucg_component; + +typedef struct { + mca_coll_base_module_t super; + ompi_communicator_t *comm; + ucg_group_h group; + + /* blocking fallback */ + mca_coll_base_module_allreduce_fn_t previous_allreduce; + mca_coll_base_module_t *previous_allreduce_module; + + mca_coll_base_module_bcast_fn_t previous_bcast; + mca_coll_base_module_t *previous_bcast_module; + + mca_coll_base_module_barrier_fn_t previous_barrier; + mca_coll_base_module_t *previous_barrier_module; + + mca_coll_base_module_alltoallv_fn_t previous_alltoallv; + mca_coll_base_module_t *previous_alltoallv_module; + + mca_coll_base_module_scatterv_fn_t 
previous_scatterv; + mca_coll_base_module_t *previous_scatterv_module; + + mca_coll_base_module_gatherv_fn_t previous_gatherv; + mca_coll_base_module_t *previous_gatherv_module; + + mca_coll_base_module_allgatherv_fn_t previous_allgatherv; + mca_coll_base_module_t *previous_allgatherv_module; + + /* non-blocking fallback */ + mca_coll_base_module_iallreduce_fn_t previous_iallreduce; + mca_coll_base_module_t *previous_iallreduce_module; + + mca_coll_base_module_ibcast_fn_t previous_ibcast; + mca_coll_base_module_t *previous_ibcast_module; + + mca_coll_base_module_ibarrier_fn_t previous_ibarrier; + mca_coll_base_module_t *previous_ibarrier_module; + + mca_coll_base_module_ialltoallv_fn_t previous_ialltoallv; + mca_coll_base_module_t *previous_ialltoallv_module; + + mca_coll_base_module_iscatterv_fn_t previous_iscatterv; + mca_coll_base_module_t *previous_iscatterv_module; + + mca_coll_base_module_igatherv_fn_t previous_igatherv; + mca_coll_base_module_t *previous_igatherv_module; + + mca_coll_base_module_iallgatherv_fn_t previous_iallgatherv; + mca_coll_base_module_t *previous_iallgatherv_module; + + /* persistent fallback */ + mca_coll_base_module_allreduce_init_fn_t previous_allreduce_init; + mca_coll_base_module_t *previous_allreduce_init_module; + + mca_coll_base_module_bcast_init_fn_t previous_bcast_init; + mca_coll_base_module_t *previous_bcast_init_module; + + mca_coll_base_module_barrier_init_fn_t previous_barrier_init; + mca_coll_base_module_t *previous_barrier_init_module; + + mca_coll_base_module_alltoallv_init_fn_t previous_alltoallv_init; + mca_coll_base_module_t *previous_alltoallv_init_module; + + mca_coll_base_module_scatterv_init_fn_t previous_scatterv_init; + mca_coll_base_module_t *previous_scatterv_init_module; + + mca_coll_base_module_gatherv_init_fn_t previous_gatherv_init; + mca_coll_base_module_t *previous_gatherv_init_module; + + mca_coll_base_module_allgatherv_init_fn_t previous_allgatherv_init; + mca_coll_base_module_t 
*previous_allgatherv_init_module; +} mca_coll_ucg_module_t; +OBJ_CLASS_DECLARATION(mca_coll_ucg_module_t); + +int mca_coll_ucg_init_query(bool enable_progress_threads, bool enable_mpi_threads); +mca_coll_base_module_t *mca_coll_ucg_comm_query(ompi_communicator_t *comm, int *priority); + +int mca_coll_ucg_init_once(void); +void mca_coll_ucg_cleanup_once(void); + +/* allreduce */ +int mca_coll_ucg_allreduce(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *dtype, ompi_op_t *op, + ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_ucg_allreduce_cache(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *dtype, ompi_op_t *op, + ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_ucg_iallreduce(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_iallreduce_cache(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_allreduce_init(const void *sbuf, void *rbuf, int count, ompi_datatype_t *datatype, + ompi_op_t *op, ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module); + +/* bcast */ +int mca_coll_ucg_bcast(void *buff, int count, ompi_datatype_t *datatype, + int root, ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_ucg_bcast_cache(void *buff, int count, ompi_datatype_t *datatype, + int root, ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_ucg_ibcast(void *buffer, int count, MPI_Datatype datatype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_ibcast_cache(void *buffer, int count, MPI_Datatype datatype, int root, + ompi_communicator_t *comm, 
ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_bcast_init(void *buffer, int count, MPI_Datatype datatype, int root, + ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module); + +/* alltoallv */ +int mca_coll_ucg_alltoallv(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_ucg_alltoallv_cache(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_ucg_ialltoallv(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_ialltoallv_cache(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_alltoallv_init(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module); + +/* barrier */ +int mca_coll_ucg_barrier(ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_ucg_barrier_cache(ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_ucg_ibarrier(ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int 
mca_coll_ucg_ibarrier_cache(ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_barrier_init(ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module); + +/* scatterv */ +int mca_coll_ucg_scatterv(const void *sbuf, const int *scounts, const int *disps, + ompi_datatype_t *sdtype, void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_ucg_scatterv_cache(const void *sbuf, const int *scounts, const int *disps, + ompi_datatype_t *sdtype, void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_ucg_iscatterv(const void *sbuf, const int *scounts, const int *disps, + ompi_datatype_t *sdtype, void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_iscatterv_cache(const void *sbuf, const int *scounts, const int *disps, + ompi_datatype_t *sdtype, void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_scatterv_init(const void *sbuf, const int *scounts, const int *disps, + ompi_datatype_t *sdtype, void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module); + +/* gatherv */ +int mca_coll_ucg_gatherv(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, const int *recvcounts, + const int *disps, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_ucg_gatherv_cache(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, const int *recvcounts, + const int *disps, + ompi_datatype_t 
*rdtype, int root, + ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_ucg_igatherv(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, const int *recvcounts, + const int *disps, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_igatherv_cache(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, const int *recvcounts, + const int *disps, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module); + +int mca_coll_ucg_gatherv_init(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, const int *recvcounts, + const int *disps, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module); + +/* allgatherv */ +int mca_coll_ucg_allgatherv(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_ucg_allgatherv_cache(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_ucg_iallgatherv(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + ompi_request_t **request, mca_coll_base_module_t *module); + +int mca_coll_ucg_iallgatherv_cache(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + ompi_request_t **request, mca_coll_base_module_t *module); + +int mca_coll_ucg_allgatherv_init(const void *sbuf, int scount, ompi_datatype_t 
*sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + ompi_info_t *info, ompi_request_t **request, + mca_coll_base_module_t *module); +END_C_DECLS +#endif //MCA_COLL_UCG_H \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_allgatherv.c b/ompi/mca/coll/ucg/coll_ucg_allgatherv.c new file mode 100644 index 0000000000000000000000000000000000000000..6dc80b7578637bbad7179282f23f637b270e5370 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_allgatherv.c @@ -0,0 +1,261 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg.h" +#include "coll_ucg_request.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_dt.h" + + +static int mca_coll_ucg_request_allgatherv_init(mca_coll_ucg_req_t *coll_req, + const void *sbuf, int scount, + ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, + const int *disps, + ompi_datatype_t *rdtype, + mca_coll_ucg_module_t *module, + ucg_request_type_t nb) +{ + ucg_dt_h ucg_send_dt; + int rc = mca_coll_ucg_type_adapt(sdtype, &ucg_send_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + ucg_dt_h ucg_recv_dt; + rc = mca_coll_ucg_type_adapt(rdtype, &ucg_recv_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + ucg_request_h ucg_req; + ucg_status_t status = ucg_request_allgatherv_init(sbuf, scount, ucg_send_dt, + rbuf, rcounts, disps, ucg_recv_dt, + module->group, &coll_req->info, + nb, &ucg_req); + if (status != UCG_OK) { + UCG_DEBUG("Failed to initialize ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + coll_req->ucg_req = ucg_req; + return OMPI_SUCCESS; +} + +int mca_coll_ucg_allgatherv(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, 
const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg allgatherv"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_req_t coll_req; + OBJ_CONSTRUCT(&coll_req, mca_coll_ucg_req_t); + int rc; + rc = mca_coll_ucg_request_common_init(&coll_req, false, false); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_allgatherv_init(&coll_req, sbuf, scount, sdtype, + rbuf, rcounts, disps, rdtype, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_execute(&coll_req); + mca_coll_ucg_request_cleanup(&coll_req); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + OBJ_DESTRUCT(&coll_req); + return OMPI_SUCCESS; + +fallback: + OBJ_DESTRUCT(&coll_req); + UCG_DEBUG("fallback allgatherv"); + return ucg_module->previous_allgatherv(sbuf, scount, sdtype, rbuf, rcounts, + disps, rdtype, comm, + ucg_module->previous_allgatherv_module); +} + +int mca_coll_ucg_allgatherv_cache(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg allgatherv cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_ALLGATHERV, + .comm = comm, + .allgatherv.sbuf = sbuf, + .allgatherv.scount = scount, + .allgatherv.sdtype = sdtype, + .allgatherv.rbuf = rbuf, + .allgatherv.rcounts = rcounts, + .allgatherv.disps = disps, + .allgatherv.rdtype = rdtype, + }; + + int rc; + rc = mca_coll_ucg_request_execute_cache(&args); + if (rc == OMPI_SUCCESS) { + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. 
*/ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN(&args, mca_coll_ucg_request_allgatherv_init, + sbuf, scount, sdtype, rbuf, rcounts, disps, + rdtype, ucg_module, UCG_REQUEST_BLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback allgatherv"); + return ucg_module->previous_allgatherv(sbuf, scount, sdtype, rbuf, rcounts, + disps, rdtype, comm, + ucg_module->previous_allgatherv_module); +} + +int mca_coll_ucg_iallgatherv(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + ompi_request_t **request, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg iallgatherv"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, true, false); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_allgatherv_init(coll_req, sbuf, scount, sdtype, + rbuf, rcounts, disps, rdtype, + ucg_module, UCG_REQUEST_NONBLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_execute_nb(coll_req); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + *request = &coll_req->super.super; + + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback iallgatherv"); + return ucg_module->previous_iallgatherv(sbuf, scount, sdtype, rbuf, rcounts, + disps, rdtype, comm, request, + ucg_module->previous_iallgatherv_module); +} + +int mca_coll_ucg_iallgatherv_cache(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + ompi_request_t **request, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg 
iallgatherv cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_IALLGATHERV, + .comm = comm, + .allgatherv.sbuf = sbuf, + .allgatherv.scount = scount, + .allgatherv.sdtype = sdtype, + .allgatherv.rbuf = rbuf, + .allgatherv.rcounts = rcounts, + .allgatherv.disps = disps, + .allgatherv.rdtype = rdtype, + }; + + int rc; + mca_coll_ucg_req_t *coll_req = NULL; + rc = mca_coll_ucg_request_execute_cache_nb(&args, &coll_req); + if (rc == OMPI_SUCCESS) { + *request = &coll_req->super.super; + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. */ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN_NB(request, &args, mca_coll_ucg_request_allgatherv_init, + sbuf, scount, sdtype, rbuf, rcounts, disps, + rdtype, ucg_module, UCG_REQUEST_NONBLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback iallgatherv"); + return ucg_module->previous_iallgatherv(sbuf, scount, sdtype, rbuf, rcounts, + disps, rdtype, comm, request, + ucg_module->previous_iallgatherv_module); +} + +int mca_coll_ucg_allgatherv_init(const void *sbuf, int scount, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, const int *disps, + ompi_datatype_t *rdtype, ompi_communicator_t *comm, + ompi_info_t *info, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg allgatherv init"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, false, true); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_allgatherv_init(coll_req, sbuf, scount, sdtype, + rbuf, rcounts, disps, rdtype, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + 
mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + *request = &coll_req->super.super; + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback allgatherv init"); + return ucg_module->previous_allgatherv_init(sbuf, scount, sdtype, rbuf, rcounts, + disps, rdtype, comm, info, request, + ucg_module->previous_allgatherv_module); +} \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_allreduce.c b/ompi/mca/coll/ucg/coll_ucg_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..4f06b816600cb9a077eac5de3079f9a128245ffb --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_allreduce.c @@ -0,0 +1,239 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg.h" +#include "coll_ucg_request.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_dt.h" + +static int mca_coll_ucg_request_allreduce_init(mca_coll_ucg_req_t *coll_req, + const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, + mca_coll_ucg_module_t *module, + ucg_request_type_t nb) +{ + /* Trick: Prepare sufficient space for storing ucg_op_h. */ + char tmp[UCG_OP_SIZE]; + ucg_dt_h ucg_dt; + ucg_op_h ucg_op = (ucg_op_h)tmp; + int rc = mca_coll_ucg_type_adapt(datatype, &ucg_dt, op, &ucg_op); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + // TODO: Check the memory type of buffer if possible + ucg_request_h ucg_req; + const void *tmp_sbuf = sbuf == MPI_IN_PLACE ? 
UCG_IN_PLACE : sbuf; + ucg_status_t status = ucg_request_allreduce_init(tmp_sbuf, rbuf, count, + ucg_dt, ucg_op, module->group, + &coll_req->info, nb, &ucg_req); + if (status != UCG_OK) { + UCG_DEBUG("Failed to initialize ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + coll_req->ucg_req = ucg_req; + return OMPI_SUCCESS; +} + +int mca_coll_ucg_allreduce(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg allreduce"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_req_t coll_req; + OBJ_CONSTRUCT(&coll_req, mca_coll_ucg_req_t); + int rc; + rc = mca_coll_ucg_request_common_init(&coll_req, false, false); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_allreduce_init(&coll_req, sbuf, rbuf, count, datatype, + op, ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_execute(&coll_req); + mca_coll_ucg_request_cleanup(&coll_req); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + OBJ_DESTRUCT(&coll_req); + return OMPI_SUCCESS; + +fallback: + OBJ_DESTRUCT(&coll_req); + UCG_DEBUG("fallback allreduce"); + return ucg_module->previous_allreduce(sbuf, rbuf, count, datatype, op, comm, + ucg_module->previous_allreduce_module); +} + +int mca_coll_ucg_allreduce_cache(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg allreduce cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_ALLREDUCE, + .comm = comm, + .allreduce.sbuf = sbuf, + .allreduce.rbuf = rbuf, + .allreduce.count = count, + .allreduce.datatype = datatype, + .allreduce.op = op, + }; + + int rc; + rc = mca_coll_ucg_request_execute_cache(&args); + if 
(rc == OMPI_SUCCESS) { + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. */ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN(&args, mca_coll_ucg_request_allreduce_init, + sbuf, rbuf, count, datatype, op, + ucg_module, UCG_REQUEST_BLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback allreduce"); + return ucg_module->previous_allreduce(sbuf, rbuf, count, datatype, op, comm, + ucg_module->previous_allreduce_module); +} + +int mca_coll_ucg_iallreduce(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg iallreduce"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, true, false); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_allreduce_init(coll_req, sbuf, rbuf, count, datatype, op, + ucg_module, UCG_REQUEST_NONBLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_execute_nb(coll_req); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + *request = &coll_req->super.super; + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback iallreduce"); + return ucg_module->previous_iallreduce(sbuf, rbuf, count, datatype, op, comm, + request, ucg_module->previous_iallreduce_module); +} + +int mca_coll_ucg_iallreduce_cache(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg 
iallreduce cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_IALLREDUCE, + .comm = comm, + .allreduce.sbuf = sbuf, + .allreduce.rbuf = rbuf, + .allreduce.count = count, + .allreduce.datatype = datatype, + .allreduce.op = op, + }; + + int rc; + mca_coll_ucg_req_t *coll_req = NULL; + rc = mca_coll_ucg_request_execute_cache_nb(&args, &coll_req); + if (rc == OMPI_SUCCESS) { + *request = &coll_req->super.super; + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. */ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN_NB(request, &args, mca_coll_ucg_request_allreduce_init, + sbuf, rbuf, count, datatype, op, ucg_module, + UCG_REQUEST_NONBLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback iallreduce"); + return ucg_module->previous_iallreduce(sbuf, rbuf, count, datatype, op, comm, + request, ucg_module->previous_iallreduce_module); +} + +int mca_coll_ucg_allreduce_init(const void *sbuf, void *rbuf, int count, ompi_datatype_t *datatype, + ompi_op_t *op, ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg allreduce init"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, false, true); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_allreduce_init(coll_req, sbuf, rbuf, count, datatype, op, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + *request = &coll_req->super.super; + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback allreduce init"); + return 
ucg_module->previous_allreduce_init(sbuf, rbuf, count, datatype, op, comm, info, + request, ucg_module->previous_allreduce_module); +} \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_alltoallv.c b/ompi/mca/coll/ucg/coll_ucg_alltoallv.c new file mode 100644 index 0000000000000000000000000000000000000000..03c103c2bdde71d9bb3967f250ede69261519f59 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_alltoallv.c @@ -0,0 +1,272 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg.h" +#include "coll_ucg_request.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_dt.h" + + +static int mca_coll_ucg_request_alltoallv_init(mca_coll_ucg_req_t *coll_req, + const void *sbuf, const int *scounts, + const int *sdispls, ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + mca_coll_ucg_module_t *module, + ucg_request_type_t nb) +{ + ucg_dt_h ucg_send_dt; + int rc = mca_coll_ucg_type_adapt(sdtype, &ucg_send_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + ucg_dt_h ucg_recv_dt; + rc = mca_coll_ucg_type_adapt(rdtype, &ucg_recv_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + // TODO: Check the memory type of buffer if possible + ucg_request_h ucg_req; + const void *tmp_sbuf = sbuf == MPI_IN_PLACE ? 
UCG_IN_PLACE : sbuf; + ucg_status_t status = ucg_request_alltoallv_init(tmp_sbuf, scounts, sdispls, ucg_send_dt, + rbuf, rcounts, rdispls, ucg_recv_dt, + module->group, &coll_req->info, + nb, &ucg_req); + if (status != UCG_OK) { + UCG_DEBUG("Failed to initialize ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + coll_req->ucg_req = ucg_req; + return OMPI_SUCCESS; +} + +int mca_coll_ucg_alltoallv(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg alltoallv"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_req_t coll_req; + OBJ_CONSTRUCT(&coll_req, mca_coll_ucg_req_t); + int rc; + rc = mca_coll_ucg_request_common_init(&coll_req, false, false); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_alltoallv_init(&coll_req, + sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_execute(&coll_req); + mca_coll_ucg_request_cleanup(&coll_req); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + OBJ_DESTRUCT(&coll_req); + return OMPI_SUCCESS; + +fallback: + OBJ_DESTRUCT(&coll_req); + UCG_DEBUG("fallback alltoallv"); + return ucg_module->previous_alltoallv(sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + comm, ucg_module->previous_alltoallv_module); +} + +int mca_coll_ucg_alltoallv_cache(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg alltoallv cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + 
.coll_type = MCA_COLL_UCG_TYPE_ALLTOALLV, + .comm = comm, + .alltoallv.sbuf = sbuf, + .alltoallv.scounts = scounts, + .alltoallv.sdispls = sdispls, + .alltoallv.sdtype = sdtype, + .alltoallv.rbuf = rbuf, + .alltoallv.rcounts = rcounts, + .alltoallv.rdispls = rdispls, + .alltoallv.rdtype = rdtype, + }; + + int rc; + rc = mca_coll_ucg_request_execute_cache(&args); + if (rc == OMPI_SUCCESS) { + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. */ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN(&args, mca_coll_ucg_request_alltoallv_init, + sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + ucg_module, UCG_REQUEST_BLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback alltoallv"); + return ucg_module->previous_alltoallv(sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + comm, ucg_module->previous_alltoallv_module); +} + +int mca_coll_ucg_ialltoallv(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg ialltoallv"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, true, false); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_alltoallv_init(coll_req, + sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + ucg_module, UCG_REQUEST_NONBLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_execute_nb(coll_req); + if (rc != OMPI_SUCCESS) { + 
mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + *request = &coll_req->super.super; + return OMPI_SUCCESS; +fallback: + UCG_DEBUG("fallback ialltoallv"); + return ucg_module->previous_ialltoallv(sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + comm, request, + ucg_module->previous_ialltoallv_module); +} + +int mca_coll_ucg_ialltoallv_cache(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg ialltoallv cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_IALLTOALLV, + .comm = comm, + .alltoallv.sbuf = sbuf, + .alltoallv.scounts = scounts, + .alltoallv.sdispls = sdispls, + .alltoallv.sdtype = sdtype, + .alltoallv.rbuf = rbuf, + .alltoallv.rcounts = rcounts, + .alltoallv.rdispls = rdispls, + .alltoallv.rdtype = rdtype, + }; + + int rc; + mca_coll_ucg_req_t *coll_req = NULL; + rc = mca_coll_ucg_request_execute_cache_nb(&args, &coll_req); + if (rc == OMPI_SUCCESS) { + *request = &coll_req->super.super; + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. 
*/ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN_NB(request, &args, mca_coll_ucg_request_alltoallv_init, + sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + ucg_module, UCG_REQUEST_NONBLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback ialltoallv"); + return ucg_module->previous_ialltoallv(sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + comm, request, + ucg_module->previous_ialltoallv_module); +} + +int mca_coll_ucg_alltoallv_init(const void *sbuf, const int *scounts, const int *sdispls, + ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, + const int *rdispls, ompi_datatype_t *rdtype, + ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg alltoallv init"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, false, true); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_alltoallv_init(coll_req, + sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + *request = &coll_req->super.super; + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback alltoallv init"); + return ucg_module->previous_alltoallv_init(sbuf, scounts, sdispls, sdtype, + rbuf, rcounts, rdispls, rdtype, + comm, info, request, + ucg_module->previous_alltoallv_module); +} \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_barrier.c b/ompi/mca/coll/ucg/coll_ucg_barrier.c new file mode 100644 index 0000000000000000000000000000000000000000..2fd18d23c539999007d329f78ec5e0866ff00199 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_barrier.c @@ -0,0 +1,199 @@ +/* -*- Mode: C; 
c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg.h" +#include "coll_ucg_request.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_dt.h" + + +static int mca_coll_ucg_request_barrier_init(mca_coll_ucg_req_t *coll_req, + mca_coll_ucg_module_t *module, + ucg_request_type_t nb) +{ + /* The UCG cannot automatically detect the memory environment where the barrier is executed. + TODO: Allows users to pass hints. */ + coll_req->info.field_mask |= UCG_REQUEST_INFO_FIELD_MEM_TYPE; + coll_req->info.mem_type = UCG_MEM_TYPE_HOST; + + ucg_request_h ucg_req; + ucg_status_t status = ucg_request_barrier_init(module->group, &coll_req->info, nb, &ucg_req); + if (status != UCG_OK) { + UCG_DEBUG("Failed to initialize ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + coll_req->ucg_req = ucg_req; + return OMPI_SUCCESS; +} + +int mca_coll_ucg_barrier(ompi_communicator_t *comm, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg barrier"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_req_t coll_req; + OBJ_CONSTRUCT(&coll_req, mca_coll_ucg_req_t); + int rc; + rc = mca_coll_ucg_request_common_init(&coll_req, false, false); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_barrier_init(&coll_req, ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_execute(&coll_req); + mca_coll_ucg_request_cleanup(&coll_req); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + OBJ_DESTRUCT(&coll_req); + return OMPI_SUCCESS; + +fallback: + OBJ_DESTRUCT(&coll_req); + UCG_DEBUG("fallback barrier"); + return ucg_module->previous_barrier(comm, ucg_module->previous_barrier_module); +} + +int mca_coll_ucg_barrier_cache(ompi_communicator_t *comm, mca_coll_base_module_t *module) +{ + 
UCG_DEBUG("ucg barrier cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_BARRIER, + .comm = comm, + }; + int rc; + rc = mca_coll_ucg_request_execute_cache(&args); + if (rc == OMPI_SUCCESS) { + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. */ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN(&args, mca_coll_ucg_request_barrier_init, + ucg_module, UCG_REQUEST_BLOCKING); + + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback barrier"); + return ucg_module->previous_barrier(comm, ucg_module->previous_barrier_module); +} + +int mca_coll_ucg_ibarrier(ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg ibarrier"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, true, false); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_barrier_init(coll_req, ucg_module, UCG_REQUEST_NONBLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_execute_nb(coll_req); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + *request = &coll_req->super.super; + + return OMPI_SUCCESS; +fallback: + UCG_DEBUG("fallback ibarrier"); + return ucg_module->previous_ibarrier(comm, request, ucg_module->previous_barrier_module); +} + +int mca_coll_ucg_ibarrier_cache(ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg ibarrier cache"); + + mca_coll_ucg_module_t *ucg_module = 
(mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_IBARRIER, + .comm = comm, + }; + + int rc; + mca_coll_ucg_req_t *coll_req = NULL; + rc = mca_coll_ucg_request_execute_cache_nb(&args, &coll_req); + if (rc == OMPI_SUCCESS) { + *request = &coll_req->super.super; + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. */ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN_NB(request, &args, mca_coll_ucg_request_barrier_init, + ucg_module, UCG_REQUEST_NONBLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback ibarrier"); + return ucg_module->previous_ibarrier(comm, request, ucg_module->previous_barrier_module); +} + +int mca_coll_ucg_barrier_init(ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg barrier init"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, false, true); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_barrier_init(coll_req, ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + *request = &coll_req->super.super; + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback barrier init"); + return ucg_module->previous_barrier_init(comm, info, request, + ucg_module->previous_barrier_module); +} \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_bcast.c b/ompi/mca/coll/ucg/coll_ucg_bcast.c new file mode 100644 index 0000000000000000000000000000000000000000..9b20a6d2a77a271b091d694c57bdab44f977077c --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_bcast.c @@ -0,0 +1,231 @@ +/* -*- 
Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg.h" +#include "coll_ucg_request.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_dt.h" + + +static int mca_coll_ucg_request_bcast_init(mca_coll_ucg_req_t *coll_req, + void *buff, int count, + ompi_datatype_t *datatype, int root, + mca_coll_ucg_module_t *module, + ucg_request_type_t nb) +{ + ucg_dt_h ucg_dt; + int rc = mca_coll_ucg_type_adapt(datatype, &ucg_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + // TODO: Check the memory type of buffer if possible + ucg_request_h ucg_req; + ucg_status_t status = ucg_request_bcast_init(buff, count, ucg_dt, root, + module->group, &coll_req->info, + nb, &ucg_req); + if (status != UCG_OK) { + UCG_DEBUG("Failed to initialize ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + coll_req->ucg_req = ucg_req; + return OMPI_SUCCESS; +} + +int mca_coll_ucg_bcast(void *buff, int count, ompi_datatype_t *datatype, + int root, ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg bcast"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_req_t coll_req; + OBJ_CONSTRUCT(&coll_req, mca_coll_ucg_req_t); + int rc; + rc = mca_coll_ucg_request_common_init(&coll_req, false, false); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_bcast_init(&coll_req, buff, count, datatype, root, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_execute(&coll_req); + mca_coll_ucg_request_cleanup(&coll_req); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + OBJ_DESTRUCT(&coll_req); + return OMPI_SUCCESS; + +fallback: + OBJ_DESTRUCT(&coll_req); + UCG_DEBUG("fallback bcast"); + return 
ucg_module->previous_bcast(buff, count, datatype, root, + comm, ucg_module->previous_bcast_module); +} + +int mca_coll_ucg_bcast_cache(void *buff, int count, ompi_datatype_t *datatype, + int root, ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg bcast cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_BCAST, + .comm = comm, + .bcast.buffer = buff, + .bcast.count = count, + .bcast.datatype = datatype, + .bcast.root = root, + }; + + int rc; + rc = mca_coll_ucg_request_execute_cache(&args); + if (rc == OMPI_SUCCESS) { + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. */ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN(&args, mca_coll_ucg_request_bcast_init, + buff, count, datatype, root, ucg_module, + UCG_REQUEST_BLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback bcast"); + return ucg_module->previous_bcast(buff, count, datatype, root, + comm, ucg_module->previous_bcast_module); +} + +int mca_coll_ucg_ibcast(void *buff, int count, MPI_Datatype datatype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg ibcast"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, true, false); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_bcast_init(coll_req, buff, count, datatype, root, + ucg_module, UCG_REQUEST_NONBLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_execute_nb(coll_req); + if (rc != OMPI_SUCCESS) { + 
mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + *request = &coll_req->super.super; + + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback ibcast"); + return ucg_module->previous_ibcast(buff, count, datatype, root, comm, + request, ucg_module->previous_ibcast_module); +} + +int mca_coll_ucg_ibcast_cache(void *buffer, int count, MPI_Datatype datatype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg ibcast cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_IBCAST, + .comm = comm, + .bcast.buffer = buffer, + .bcast.count = count, + .bcast.datatype = datatype, + .bcast.root = root, + }; + + int rc; + mca_coll_ucg_req_t *coll_req = NULL; + rc = mca_coll_ucg_request_execute_cache_nb(&args, &coll_req); + if (rc == OMPI_SUCCESS) { + *request = &coll_req->super.super; + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. 
*/ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN_NB(request, &args, mca_coll_ucg_request_bcast_init, + buffer, count, datatype, root, + ucg_module, UCG_REQUEST_NONBLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback ibcast"); + return ucg_module->previous_ibcast(buffer, count, datatype, root, comm, + request, ucg_module->previous_ibcast_module); +} + +int mca_coll_ucg_bcast_init(void *buffer, int count, MPI_Datatype datatype, int root, + ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg bcast init"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, false, true); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_bcast_init(coll_req, buffer, count, datatype, root, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + *request = &coll_req->super.super; + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback bcast init"); + return ucg_module->previous_bcast_init(buffer, count, datatype, root, comm, info, + request, ucg_module->previous_bcast_module); +} \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_component.c b/ompi/mca/coll/ucg/coll_ucg_component.c new file mode 100644 index 0000000000000000000000000000000000000000..7ecadfe802eeee691134122cac7ca1ad8c4d1bb7 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_component.c @@ -0,0 +1,293 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. + * All rights reserved. 
+ * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "ompi_config.h" +#include "coll_ucg.h" +#include "coll_ucg_dt.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_request.h" + +#include "opal/util/argv.h" + +/* + * Public string showing the coll ompi_ucg component version number + */ +const char *mca_coll_ucg_component_version_string = + "Open MPI UCG collective MCA component version " OMPI_VERSION; + +/* + * Global variable + */ +int mca_coll_ucg_output = -1; + +/* + * Local function + */ +static int mca_coll_ucg_register(void); +static int mca_coll_ucg_open(void); +static int mca_coll_ucg_close(void); + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ + +mca_coll_ucg_component_t mca_coll_ucg_component = { + /* First, fill in the super */ + { + /* First, the mca_component_t struct containing meta information + about the component itself */ + .collm_version = { +#if OMPI_MAJOR_VERSION > 4 + MCA_COLL_BASE_VERSION_2_4_0, +#else + MCA_COLL_BASE_VERSION_2_0_0, +#endif + + /* Component name and version */ + .mca_component_name = "ucg", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + + /* Component open and close functions */ + .mca_open_component = mca_coll_ucg_open, + .mca_close_component = mca_coll_ucg_close, + .mca_register_component_params = mca_coll_ucg_register, + }, + .collm_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + + /* Initialization / querying functions */ + .collm_init_query = mca_coll_ucg_init_query, + .collm_comm_query = mca_coll_ucg_comm_query, + }, + .initialized = false, + /* MCA parameter */ + .priority = 90, /* priority */ + .verbose = 2, /* verbose level */ + .max_rcache_size = 10, + .disable_coll = NULL, + .topology = NULL, + .npolls = 10, + + .ucg_context = NULL, + //TODO: More parameters should be added below. 
+}; + +static int mca_coll_ucg_register(void) +{ + (void)mca_base_component_var_register(&mca_coll_ucg_component.super.collm_version, "priority", + "Priority of the UCG component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_ucg_component.priority); + + (void)mca_base_component_var_register(&mca_coll_ucg_component.super.collm_version, "verbose", + "Verbosity of the UCG component, " + "0:fatal, 1:error, 2:warn, 3:info, 4:debug, >4:fine-grained trace logs", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_coll_ucg_component.verbose); + + (void)mca_base_component_var_register(&mca_coll_ucg_component.super.collm_version, "max_rcache_size", + "Max size of request cache", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_ucg_component.max_rcache_size); + + (void)mca_base_component_var_register(&mca_coll_ucg_component.super.collm_version, "disable_coll", + "Comma separated list of collective operations to disable", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_ucg_component.disable_coll); + + (void)mca_base_component_var_register(&mca_coll_ucg_component.super.collm_version, "topology", + "Path of the topology file required by the net-topo-aware algorithm", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_ucg_component.topology); + + (void)mca_base_component_var_register(&mca_coll_ucg_component.super.collm_version, "npolls", + "Set how many poll counts of ucg progress before opal_progress, " + "can fine tune performance by setting this value, range [1, 100]", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_ucg_component.npolls); + + return OMPI_SUCCESS; +} + +/** + * @brief Parse the topology file and find the subnet ID corresponding to the rank + * + * Temporary solution, 
which does not consider the overhead of repeatedly + * opening and traversing files. This solution will be changed later. + */ +static ucg_status_t mca_coll_ucg_get_subnet_id(ucg_rank_t myrank, char *topology, + int32_t *subnet_id) +{ + if (topology == NULL) { + UCG_DEBUG("No topology file is specified"); + return UCG_ERR_NOT_FOUND; + } + + FILE *fp = fopen(topology, "r"); + if (fp == NULL) { + UCG_DEBUG("Topology file %s doesn't seem to exist", topology); + return UCG_ERR_NOT_FOUND; + } + + ucg_status_t status = UCG_OK; + char line[1024]; + ucg_rank_t temp_rank; + int32_t temp_id; + while (!feof(fp)) { + fgets(line, sizeof(line) - 1, fp); + int rc = sscanf(line, "rank %d subnet_id %d", &temp_rank, &temp_id); + if (rc != 2) { + goto err; + } else if (temp_rank == myrank) { + *subnet_id = temp_id; + goto out; + } + } +err: + status = UCG_ERR_INVALID_PARAM; + UCG_DEBUG("Failed to parse the topology file. Rank %d is not found", myrank); +out: + fclose(fp); + return status; +} + +static ucg_status_t mca_coll_ucg_set_local_location(ucg_location_t *location) +{ + if (location == NULL) { + return UCG_ERR_INVALID_PARAM; + } + + int rc; + opal_process_name_t proc_name = { + .jobid = OMPI_PROC_MY_NAME->jobid, + .vpid = OMPI_PROC_MY_NAME->vpid, + }; + location->field_mask = 0; + location->subnet_id = -1; + location->node_id = -1; + location->socket_id = -1; + + // get subnet id + int32_t subnet_id = 0; + ucg_status_t status; + status = mca_coll_ucg_get_subnet_id(OMPI_PROC_MY_NAME->vpid, + mca_coll_ucg_component.topology, + &subnet_id); + if (status == UCG_OK) { + location->field_mask |= UCG_LOCATION_FIELD_SUBNET_ID; + location->subnet_id = subnet_id; + } + + // get node id + uint32_t node_id = 0; + uint32_t *pnode_id = &node_id; + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_NODEID, &proc_name, &pnode_id, PMIX_UINT32); + if (rc != OPAL_SUCCESS) { + goto out; + } + location->field_mask |= UCG_LOCATION_FIELD_NODE_ID; + location->node_id = (int32_t)node_id; + + // get socket id + 
char *locality = NULL; + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, + &proc_name, &locality, PMIX_STRING); + if (rc != OPAL_SUCCESS || locality == NULL) { + goto out; + } + char *socket = strstr(locality, "SK"); + if (socket == NULL) { + goto out_free_locality; + } + location->field_mask |= UCG_LOCATION_FIELD_SOCKET_ID; + location->socket_id = atoi(socket + 2); + +out_free_locality: + free(locality); +out: + return UCG_OK; +} + +/* + * Call pmix_put for sending my own process infor to pmix server + */ +static int mca_coll_ucg_send_local_proc_info(void) +{ + char rank_addr_identify[32] = {0}; + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + const char *mca_type_name = cm->super.collm_version.mca_type_name; + const char *mca_component_name = cm->super.collm_version.mca_component_name; + uint32_t jobid = (uint32_t)OMPI_PROC_MY_NAME->jobid; + uint32_t vpid = (uint32_t)OMPI_PROC_MY_NAME->vpid; + sprintf(rank_addr_identify, "%s.%s.%u.%u", mca_type_name, mca_component_name, jobid, vpid); + + int rc; + ucg_proc_info_t *proc = ucg_get_allocated_local_proc_info(cm->ucg_context); + if (!proc) { + UCG_ERROR("Failed to get local proc info!"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + mca_coll_ucg_set_local_location(&proc->location); + + uint32_t proc_size = *(uint32_t *)proc; + UCG_DEBUG("key: %s value: %p (size: %d, location: %d,%d,%d)", rank_addr_identify, proc, proc_size, + proc->location.subnet_id, proc->location.node_id, proc->location.socket_id); + + OPAL_MODEX_SEND_STRING(rc, PMIX_GLOBAL, rank_addr_identify, proc, proc_size); + ucg_free_proc_info(proc); + if (rc != OMPI_SUCCESS) { + return rc; + } + return OMPI_SUCCESS; +} + +static int mca_coll_ucg_open(void) +{ + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + mca_coll_ucg_output = opal_output_open(NULL); + opal_output_set_verbosity(mca_coll_ucg_output, cm->verbose); + + int rc = mca_coll_ucg_init_once(); + if (rc != OMPI_SUCCESS) { + return rc; + } + + rc = 
mca_coll_ucg_send_local_proc_info(); + if (rc != OMPI_SUCCESS) { + return rc; + } + + return OMPI_SUCCESS; +} + +static int mca_coll_ucg_close(void) +{ + /* In some cases, mpi_comm_world is not the last comm to free. + * call mca_coll_ucg_cleanup_once here, ensure cleanup ucg resources at last. + */ + mca_coll_ucg_cleanup_once(); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/ucg/coll_ucg_debug.h b/ompi/mca/coll/ucg/coll_ucg_debug.h new file mode 100644 index 0000000000000000000000000000000000000000..e245567fda687ff2efc8322e81191b16d070aafb --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_debug.h @@ -0,0 +1,51 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#ifndef MCA_COLL_UCG_DEBUG_H +#define MCA_COLL_UCG_DEBUG_H + +#include "ompi_config.h" +#pragma GCC system_header + +#ifdef __BASE_FILE__ +#define __UCG_FILE__ __BASE_FILE__ +#else +#define __UCG_FILE__ __FILE__ +#endif + +#define UCG_FATAL(_format, ... ) \ + opal_output_verbose(0, mca_coll_ucg_output, "[%s:%d] FATAL " _format, \ + __UCG_FILE__, __LINE__, ## __VA_ARGS__); \ + abort() + +#define UCG_ERROR(_format, ... ) \ + opal_output_verbose(1, mca_coll_ucg_output, "[%s:%d] ERROR " _format, \ + __UCG_FILE__, __LINE__, ## __VA_ARGS__) + +#define UCG_WARN(_format, ... ) \ + opal_output_verbose(2, mca_coll_ucg_output, "[%s:%d] WARN " _format, \ + __UCG_FILE__, __LINE__, ## __VA_ARGS__) + +#define UCG_INFO(_format, ... ) \ + opal_output_verbose(3, mca_coll_ucg_output, "[%s:%d] INFO " _format, \ + __UCG_FILE__, __LINE__, ## __VA_ARGS__) + +#define UCG_INFO_IF(_cond, _format, ... ) \ + if (_cond) { \ + opal_output_verbose(3, mca_coll_ucg_output, "[%s:%d] INFO " _format, \ + __UCG_FILE__, __LINE__, ## __VA_ARGS__); \ + } + +#define UCG_DEBUG(_format, ... 
) \ + opal_output_verbose(4, mca_coll_ucg_output, "[%s:%d] DEBUG " _format, \ + __UCG_FILE__, __LINE__, ## __VA_ARGS__) + +extern int mca_coll_ucg_output; +#endif //MCA_COLL_UCG_DEBUG_H \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_dt.c b/ompi/mca/coll/ucg/coll_ucg_dt.c new file mode 100644 index 0000000000000000000000000000000000000000..f0b11aca0182ab7306b2f9dba8a514917469c627 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_dt.c @@ -0,0 +1,478 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg_dt.h" +#include "coll_ucg_debug.h" + +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/datatype/ompi_datatype_internal.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/attribute/attribute.h" + +static mca_coll_ucg_type_table_t ucg_type_table = {.attr_key = MPI_KEYVAL_INVALID}; +static mca_coll_ucg_conv_pool_t mca_coll_ucg_conv_pool; + +static ucg_status_t mca_coll_ucg_op_reduce(void *op, const void *source, void *target, + int32_t count, void *dt) +{ + ompi_op_t *ompi_op = (ompi_op_t*)op; + ompi_datatype_t *ompi_dt = (ompi_datatype_t*)dt; + ompi_op_reduce(ompi_op, (void*)source, target, count, ompi_dt); + return UCG_OK; +} + +static inline mca_coll_ucg_convertor_t* mca_coll_ucg_conv_pool_get(void) +{ + return (mca_coll_ucg_convertor_t *)opal_free_list_wait(&mca_coll_ucg_conv_pool.flist); +} + +static inline void mca_coll_ucg_conv_pool_put(mca_coll_ucg_convertor_t *conv) +{ + opal_free_list_return(&mca_coll_ucg_conv_pool.flist, (opal_free_list_item_t*)conv); + return; +} + +static void* mca_coll_ucg_conv_start_pack(const void *user_buf, void *user_dt, int32_t count) +{ + ompi_datatype_t *datatype = (ompi_datatype_t *)user_dt; + mca_coll_ucg_convertor_t *convertor; + + convertor = mca_coll_ucg_conv_pool_get(); + OMPI_DATATYPE_RETAIN(datatype); 
+ convertor->datatype = datatype; + int rc = opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, + &datatype->super, + count, user_buf, 0, + &convertor->opal_conv); + if (rc != OPAL_SUCCESS) { + OMPI_DATATYPE_RELEASE(datatype); + mca_coll_ucg_conv_pool_put(convertor); + convertor = NULL; + } + return convertor; +} + +static ucg_status_t mca_coll_ucg_conv_pack(void *conv, uint64_t offset, void *buffer, uint64_t *length) +{ + mca_coll_ucg_convertor_t *convertor = (mca_coll_ucg_convertor_t *)conv; + uint32_t iov_count; + struct iovec iov; + size_t pack_length; + int rc; + + iov_count = 1; + iov.iov_base = buffer; + iov.iov_len = *length; + + opal_convertor_set_position(&convertor->opal_conv, &offset); + pack_length = *length; + rc = opal_convertor_pack(&convertor->opal_conv, &iov, &iov_count, &pack_length); + if (OPAL_UNLIKELY(rc < 0)) { + UCG_ERROR("Failed to pack datatype structure"); + return UCG_ERR_NO_RESOURCE; + } + + *length = pack_length; + return UCG_OK; +} + +static void* mca_coll_ucg_conv_start_unpack(void *user_buf, void *user_dt, int32_t count) +{ + ompi_datatype_t *datatype = (ompi_datatype_t *)user_dt; + mca_coll_ucg_convertor_t *convertor; + + convertor = mca_coll_ucg_conv_pool_get(); + OMPI_DATATYPE_RETAIN(datatype); + convertor->datatype = datatype; + convertor->offset = 0; + int rc = opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, + &datatype->super, + count, user_buf, 0, + &convertor->opal_conv); + if (rc != OPAL_SUCCESS) { + OMPI_DATATYPE_RELEASE(datatype); + mca_coll_ucg_conv_pool_put(convertor); + convertor = NULL; + } + return convertor; +} + +static ucg_status_t mca_coll_ucg_conv_unpack(void *conv, uint64_t offset, const void *buffer, uint64_t *length) +{ + mca_coll_ucg_convertor_t *convertor = (mca_coll_ucg_convertor_t *)conv; + int rc; + uint32_t iov_count; + uint64_t unpack_length; + struct iovec iov; + opal_convertor_t opal_conv; + + iov_count = 1; + iov.iov_base = (void*)buffer; + iov.iov_len = *length; + 
+ /* in case if unordered message arrived - create separate convertor to + * unpack data. */ + if (offset != convertor->offset) { + OBJ_CONSTRUCT(&opal_conv, opal_convertor_t); + opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, + &convertor->datatype->super, + convertor->opal_conv.count, + convertor->opal_conv.pBaseBuf, 0, + &opal_conv); + opal_convertor_set_position(&opal_conv, &offset); + rc = opal_convertor_unpack(&opal_conv, &iov, &iov_count, &unpack_length); + opal_convertor_cleanup(&opal_conv); + OBJ_DESTRUCT(&opal_conv); + /* permanently switch to un-ordered mode */ + convertor->offset = 0; + } else { + rc = opal_convertor_unpack(&convertor->opal_conv, &iov, &iov_count, &unpack_length); + convertor->offset += unpack_length; + } + if (OPAL_UNLIKELY(rc < 0)) { + UCG_ERROR("Failed to unpack datatype structure"); + return UCG_ERR_NO_RESOURCE; + } + + *length = unpack_length; + return UCG_OK; +} + +static void mca_coll_ucg_conv_finish(void *conv) +{ + mca_coll_ucg_convertor_t *convertor = (mca_coll_ucg_convertor_t *)conv; + + opal_convertor_cleanup(&convertor->opal_conv); + OMPI_DATATYPE_RELEASE(convertor->datatype); + mca_coll_ucg_conv_pool_put(convertor); +} + +static void mca_coll_ucg_convertor_construct(mca_coll_ucg_convertor_t *convertor) +{ + OBJ_CONSTRUCT(&convertor->opal_conv, opal_convertor_t); +} + +static void mca_coll_ucg_convertor_destruct(mca_coll_ucg_convertor_t *convertor) +{ + OBJ_DESTRUCT(&convertor->opal_conv); +} + +OBJ_CLASS_INSTANCE(mca_coll_ucg_convertor_t, + opal_free_list_item_t, + mca_coll_ucg_convertor_construct, + mca_coll_ucg_convertor_destruct); + +static inline ucg_dt_type_t ompi_dt_2_ucg_dt_type(ompi_datatype_t *ompi_dt) +{ + switch (ompi_dt->id) { + case OMPI_DATATYPE_MPI_INT8_T: + return UCG_DT_TYPE_INT8; + case OMPI_DATATYPE_MPI_INT16_T: + return UCG_DT_TYPE_INT16; + case OMPI_DATATYPE_MPI_INT32_T: + return UCG_DT_TYPE_INT32; + case OMPI_DATATYPE_MPI_INT64_T: + return UCG_DT_TYPE_INT64; + + case 
OMPI_DATATYPE_MPI_UINT8_T: + return UCG_DT_TYPE_UINT8; + case OMPI_DATATYPE_MPI_UINT16_T: + return UCG_DT_TYPE_UINT16; + case OMPI_DATATYPE_MPI_UINT32_T: + return UCG_DT_TYPE_UINT32; + case OMPI_DATATYPE_MPI_UINT64_T: + return UCG_DT_TYPE_UINT64; + +#if OMPI_MAJOR_VERSION > 4 + case OMPI_DATATYPE_MPI_SHORT_FLOAT: + return UCG_DT_TYPE_FP16; +#endif + case OMPI_DATATYPE_MPI_FLOAT: + return UCG_DT_TYPE_FP32; + case OMPI_DATATYPE_MPI_DOUBLE: + return UCG_DT_TYPE_FP64; + + default: + return UCG_DT_TYPE_USER; + } +} + +static inline ucg_op_type_t ompi_op_2_ucg_op_type(ompi_op_t *ompi_op) +{ + switch (ompi_op->op_type) { + case OMPI_OP_MAX: + return UCG_OP_TYPE_MAX; + case OMPI_OP_MIN: + return UCG_OP_TYPE_MIN; + case OMPI_OP_SUM: + return UCG_OP_TYPE_SUM; + case OMPI_OP_PROD: + return UCG_OP_TYPE_PROD; + default: + return UCG_OP_TYPE_USER; + } +} + +static int mca_coll_ucg_type_destroy_user_dt(ompi_datatype_t* datatype, int keyval, + void *attr_val, void *extra) +{ + ucg_dt_h ucg_dt = (ucg_dt_h)attr_val; + ucg_dt_destroy(ucg_dt); + return OMPI_SUCCESS; +} + +static int mca_coll_ucg_type_create_user_dt(ompi_datatype_t *ompi_dt, ucg_dt_h *ucg_dt) +{ + size_t size; + ompi_datatype_type_size(ompi_dt, &size); + ptrdiff_t lb; + ptrdiff_t extent; + ptrdiff_t true_lb; + ptrdiff_t true_extent; + ompi_datatype_get_extent(ompi_dt, &lb, &extent); + ompi_datatype_get_true_extent(ompi_dt, &true_lb, &true_extent); + ucg_dt_params_t params; + params.field_mask = UCG_DT_PARAMS_FIELD_TYPE | + UCG_DT_PARAMS_FIELD_USER_DT | + UCG_DT_PARAMS_FIELD_SIZE | + UCG_DT_PARAMS_FIELD_EXTENT | + UCG_DT_PARAMS_FIELD_TRUE_LB | + UCG_DT_PARAMS_FIELD_TRUE_EXTENT; + params.type = UCG_DT_TYPE_USER; + params.user_dt = ompi_dt; + params.size = (uint32_t)size; + params.extent = (uint32_t)extent; + params.true_lb = (int32_t)true_lb; + params.true_extent = (uint32_t)true_extent; + if (size != (size_t)extent) { + params.field_mask |= UCG_DT_PARAMS_FIELD_CONV; + params.conv.start_pack = 
mca_coll_ucg_conv_start_pack; + params.conv.pack = mca_coll_ucg_conv_pack; + params.conv.start_unpack = mca_coll_ucg_conv_start_unpack; + params.conv.unpack = mca_coll_ucg_conv_unpack; + params.conv.finish = mca_coll_ucg_conv_finish; + } + ucg_status_t status = ucg_dt_create(¶ms, ucg_dt); + if (status != UCG_OK) { + UCG_ERROR("Failed to create user-defined dt"); + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + +void mca_coll_ucg_type_free_kv(void) +{ + if (ucg_type_table.attr_key != MPI_KEYVAL_INVALID) { + ompi_attr_free_keyval(TYPE_ATTR, &ucg_type_table.attr_key, false); + ucg_type_table.attr_key = MPI_KEYVAL_INVALID; + } +} + +static void mca_coll_ucg_type_destroy_dt(void) +{ + ucg_dt_type_t type = UCG_DT_TYPE_INT8; + for (; type < UCG_DT_TYPE_PREDEFINED_LAST; ++type) { + if (ucg_type_table.predefined_dt[type] != NULL) { + ucg_dt_destroy(ucg_type_table.predefined_dt[type]); + ucg_type_table.predefined_dt[type] = NULL; + } + } + + return; +} + +static int mca_coll_ucg_type_create_dt(void) +{ + ucg_dt_params_t params; + params.field_mask = UCG_DT_PARAMS_FIELD_TYPE; + ucg_dt_type_t type = UCG_DT_TYPE_INT8; + for (; type < UCG_DT_TYPE_PREDEFINED_LAST; ++type) { + params.type = type; + ucg_status_t status = ucg_dt_create(¶ms, &ucg_type_table.predefined_dt[type]); + if (status != UCG_OK) { + goto err_destroy_dt; + } + } + + /* Create a key for adding user-defined ucg_dt to ompi_dt */ + ompi_attribute_fn_ptr_union_t copy_fn; + ompi_attribute_fn_ptr_union_t del_fn; + copy_fn.attr_datatype_copy_fn = (MPI_Type_copy_attr_function*)MPI_TYPE_NULL_COPY_FN; + del_fn.attr_datatype_delete_fn = mca_coll_ucg_type_destroy_user_dt; + int rc = ompi_attr_create_keyval(TYPE_ATTR, copy_fn, del_fn, + &ucg_type_table.attr_key, NULL, 0, + NULL); + if (rc != OMPI_SUCCESS) { + UCG_ERROR("Failed to create keyval: %d", rc); + goto err_destroy_dt; + } + return OMPI_SUCCESS; + +err_destroy_dt: + mca_coll_ucg_type_destroy_dt(); + return OMPI_ERROR; +} + +static void 
mca_coll_ucg_type_destroy_op(void)
+{
+    ucg_op_type_t type = UCG_OP_TYPE_MAX;
+    for (; type < UCG_OP_TYPE_PREDEFINED_LAST; ++type) {
+        if (ucg_type_table.predefined_op[type] != NULL) {
+            ucg_op_destroy(ucg_type_table.predefined_op[type]);
+            ucg_type_table.predefined_op[type] = NULL;
+        }
+    }
+
+    return;
+}
+
+static int mca_coll_ucg_type_create_op(void)
+{
+    ucg_op_params_t params;
+    /* Use the op-params flag: the original set UCG_DT_PARAMS_FIELD_TYPE,
+     * which belongs to ucg_dt_params_t, on a ucg_op_params_t field_mask. */
+    params.field_mask = UCG_OP_PARAMS_FIELD_TYPE;
+    ucg_op_type_t type = UCG_OP_TYPE_MAX;
+    for (; type < UCG_OP_TYPE_PREDEFINED_LAST; ++type) {
+        params.type = type;
+        ucg_status_t status = ucg_op_create(&params, &ucg_type_table.predefined_op[type]);
+        if (status != UCG_OK) {
+            goto err_destroy_op;
+        }
+    }
+    return OMPI_SUCCESS;
+
+err_destroy_op:
+    mca_coll_ucg_type_destroy_op();
+    return OMPI_ERROR;
+}
+
+static int mca_coll_ucg_type_adapt_dt(ompi_datatype_t *ompi_dt,
+                                      ucg_dt_type_t type,
+                                      ucg_dt_h *ucg_dt)
+{
+    if (type != UCG_DT_TYPE_USER) {
+        *ucg_dt = ucg_type_table.predefined_dt[type];
+        return OMPI_SUCCESS;
+    }
+
+    int rc;
+    int found = 0;
+    rc = ompi_attr_get_c(ompi_dt->d_keyhash, ucg_type_table.attr_key, (void**)ucg_dt, &found);
+    if (rc == OMPI_SUCCESS && found) {
+        return OMPI_SUCCESS;
+    }
+
+    rc = mca_coll_ucg_type_create_user_dt(ompi_dt, ucg_dt);
+    if (rc != OMPI_SUCCESS) {
+        return rc;
+    }
+
+    rc = ompi_attr_set_c(TYPE_ATTR, ompi_dt, &ompi_dt->d_keyhash,
+                         ucg_type_table.attr_key, (void*)*ucg_dt, false);
+    if (rc != OMPI_SUCCESS) {
+        mca_coll_ucg_type_destroy_user_dt(ompi_dt, ucg_type_table.attr_key, (void*)*ucg_dt, NULL);
+        UCG_ERROR("Failed to add UCG datatype attribute for %s: %d", ompi_dt->name, rc);
+        return rc;
+    }
+    return OMPI_SUCCESS;
+}
+
+static int mca_coll_ucg_type_adapt_op(ompi_op_t *ompi_op,
+                                      ucg_op_type_t type,
+                                      ucg_op_h *ucg_op)
+{
+    if (type != UCG_OP_TYPE_USER) {
+        *ucg_op = ucg_type_table.predefined_op[type];
+        return OMPI_SUCCESS;
+    }
+    /* *ucg_op should point to a memory space of size UCG_OP_SIZE */
+    assert(*ucg_op != NULL);
+    ucg_op_params_t params;
+
params.field_mask = UCG_OP_PARAMS_FIELD_TYPE | + UCG_OP_PARAMS_FIELD_USER_OP | + UCG_OP_PARAMS_FIELD_USER_FUNC | + UCG_OP_PARAMS_FIELD_COMMUTATIVE; + params.type = type; + params.user_op = (void*)ompi_op; + params.user_func = mca_coll_ucg_op_reduce; + params.commutative = ompi_op_is_commute(ompi_op); + ucg_status_t status = ucg_op_init(¶ms, *ucg_op, UCG_OP_SIZE); + if (status != UCG_OK) { + UCG_ERROR("Failed to initialize ucg op: %d", status); + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + +int mca_coll_ucg_conv_pool_init(void) +{ + OBJ_CONSTRUCT(&mca_coll_ucg_conv_pool.flist, opal_free_list_t); + int rc = opal_free_list_init(&mca_coll_ucg_conv_pool.flist, sizeof(mca_coll_ucg_convertor_t), + opal_cache_line_size, OBJ_CLASS(mca_coll_ucg_convertor_t), + 0, 0, + 128, INT_MAX, 128, + NULL, 0, NULL, NULL, NULL); + return rc == OPAL_SUCCESS ? OMPI_SUCCESS : OMPI_ERROR; +} + +void mca_coll_ucg_conv_pool_cleanup(void) +{ + OBJ_DESTRUCT(&mca_coll_ucg_conv_pool.flist); + return; +} + +int mca_coll_ucg_type_init(void) +{ + int rc; + rc = mca_coll_ucg_type_create_dt(); + if (rc != OMPI_SUCCESS) { + return rc; + } + + rc = mca_coll_ucg_type_create_op(); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_type_destroy_dt(); + return rc; + } + + return OMPI_SUCCESS; +} + +void mca_coll_ucg_type_cleanup(void) +{ + mca_coll_ucg_type_destroy_op(); + mca_coll_ucg_type_destroy_dt(); + return; +} + +int mca_coll_ucg_type_adapt(ompi_datatype_t *ompi_dt, ucg_dt_h *ucg_dt, + ompi_op_t *ompi_op, ucg_op_h *ucg_op) +{ + if (!ompi_datatype_is_valid(ompi_dt)) { + return OMPI_ERR_NOT_SUPPORTED; + } + + ucg_dt_type_t dt_type = ompi_dt_2_ucg_dt_type(ompi_dt); + if (ompi_op == NULL) { + return mca_coll_ucg_type_adapt_dt(ompi_dt, dt_type, ucg_dt); + } + + /* Both the dt_type and op_type must be predefined or user-defined.*/ + ucg_op_type_t op_type = ompi_op_2_ucg_op_type(ompi_op); + if (dt_type == UCG_DT_TYPE_USER || op_type == UCG_OP_TYPE_USER) { + dt_type = UCG_DT_TYPE_USER; + op_type = 
UCG_OP_TYPE_USER; + } + + int rc = mca_coll_ucg_type_adapt_op(ompi_op, op_type, ucg_op); + if (rc != OMPI_SUCCESS) { + return rc; + } + return mca_coll_ucg_type_adapt_dt(ompi_dt, dt_type, ucg_dt); +} \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_dt.h b/ompi/mca/coll/ucg/coll_ucg_dt.h new file mode 100644 index 0000000000000000000000000000000000000000..523204b5c3e1df05cf89b8caba072ccc89243cc5 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_dt.h @@ -0,0 +1,59 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#ifndef MCA_COLL_UCG_DT_H +#define MCA_COLL_UCG_DT_H + +#include "coll_ucg.h" + + +typedef struct { + opal_free_list_t flist; +} mca_coll_ucg_conv_pool_t; + +typedef struct mca_coll_ucg_convertor { + opal_free_list_item_t super; + ompi_datatype_t *datatype; + opal_convertor_t opal_conv; + size_t offset; +} mca_coll_ucg_convertor_t; +OBJ_CLASS_DECLARATION(mca_coll_ucg_convertor_t); + +typedef struct mca_coll_ucg_type_table { + ucg_dt_h predefined_dt[UCG_DT_TYPE_PREDEFINED_LAST]; + ucg_op_h predefined_op[UCG_OP_TYPE_PREDEFINED_LAST]; + int attr_key; /* key of saving user-defined ucg dt */ +} mca_coll_ucg_type_table_t; + +/* Initialize the convertor pool */ +int mca_coll_ucg_conv_pool_init(void); +/* Cleanup the convertor pool */ +void mca_coll_ucg_conv_pool_cleanup(void); + +/** + * @brief Initialize ucg type. + * + * It depends on ompi attr, should be invoked after ompi_attr_init() + */ +int mca_coll_ucg_type_init(void); +/* Cleanup ucg type */ +void mca_coll_ucg_type_cleanup(void); + +void mca_coll_ucg_type_free_kv(void); +/** + * @brief Adapt ompi type to ucg type. + * + * For operations such as allreduce, *ucg_op should point to a memory space + * of size UCG_OP_SIZE. 
+ */ +int mca_coll_ucg_type_adapt(ompi_datatype_t *ompi_dt, ucg_dt_h *ucg_dt, + ompi_op_t *ompi_op, ucg_op_h *ucg_op); + +#endif //MCA_COLL_UCG_DT_H \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_gatherv.c b/ompi/mca/coll/ucg/coll_ucg_gatherv.c new file mode 100644 index 0000000000000000000000000000000000000000..a8d4f32b8d74b765916cbd67b6fffc28cc359b22 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_gatherv.c @@ -0,0 +1,272 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg.h" +#include "coll_ucg_request.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_dt.h" + + +static int mca_coll_ucg_request_gatherv_init(mca_coll_ucg_req_t *coll_req, + const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, + void *rbuf, const int *recvcounts, + const int *disps, + ompi_datatype_t *rdtype, int root, + mca_coll_ucg_module_t *module, + ucg_request_type_t nb) +{ + ucg_dt_h ucg_send_dt; + int rc = mca_coll_ucg_type_adapt(sdtype, &ucg_send_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + ucg_dt_h ucg_recv_dt; + rc = mca_coll_ucg_type_adapt(rdtype, &ucg_recv_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + ucg_request_h ucg_req; + ucg_status_t status = ucg_request_gatherv_init(sbuf, sendcount, ucg_send_dt, + rbuf, recvcounts, disps, ucg_recv_dt, root, + module->group, &coll_req->info, + nb, &ucg_req); + if (status != UCG_OK) { + UCG_DEBUG("Failed to initialize ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + coll_req->ucg_req = ucg_req; + return OMPI_SUCCESS; +} + +int mca_coll_ucg_gatherv(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, + const int *recvcounts, const int *disps, + ompi_datatype_t *rdtype, int 
root, + ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg gatherv"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_req_t coll_req; + OBJ_CONSTRUCT(&coll_req, mca_coll_ucg_req_t); + int rc; + rc = mca_coll_ucg_request_common_init(&coll_req, false, false); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_gatherv_init(&coll_req, sbuf, sendcount, sdtype, + rbuf, recvcounts, disps, rdtype, root, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_execute(&coll_req); + mca_coll_ucg_request_cleanup(&coll_req); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + OBJ_DESTRUCT(&coll_req); + return OMPI_SUCCESS; + +fallback: + OBJ_DESTRUCT(&coll_req); + UCG_DEBUG("fallback gatherv"); + return ucg_module->previous_gatherv(sbuf, sendcount, sdtype, rbuf, + recvcounts, disps, + rdtype, root, comm, + ucg_module->previous_gatherv_module); +} + +int mca_coll_ucg_gatherv_cache(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, const int *recvcounts, + const int *disps, ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg gatherv cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_GATHERV, + .comm = comm, + .gatherv.sbuf = sbuf, + .gatherv.scount = sendcount, + .gatherv.sdtype = sdtype, + .gatherv.rbuf = rbuf, + .gatherv.rcounts = recvcounts, + .gatherv.disps = disps, + .gatherv.rdtype = rdtype, + .gatherv.root = root, + }; + + int rc; + rc = mca_coll_ucg_request_execute_cache(&args); + if (rc == OMPI_SUCCESS) { + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. 
*/ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN(&args, mca_coll_ucg_request_gatherv_init, + sbuf, sendcount, sdtype, rbuf, recvcounts, disps, + rdtype, root, ucg_module, UCG_REQUEST_BLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback gatherv"); + return ucg_module->previous_gatherv(sbuf, sendcount, sdtype, rbuf, + recvcounts, disps, rdtype, root, comm, + ucg_module->previous_gatherv_module); +} + +int mca_coll_ucg_igatherv(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, + const int *recvcounts, const int *disps, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg igatherv"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, true, false); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_gatherv_init(coll_req, sbuf, sendcount, sdtype, + rbuf, recvcounts, disps, rdtype, root, + ucg_module, UCG_REQUEST_NONBLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_execute_nb(coll_req); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + *request = &coll_req->super.super; + + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback igatherv"); + return ucg_module->previous_igatherv(sbuf, sendcount, sdtype, rbuf, + recvcounts, disps, rdtype, root, comm, request, + ucg_module->previous_igatherv_module); +} + +int mca_coll_ucg_igatherv_cache(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, + const int *recvcounts, const int *disps, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + 
mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg igatherv cache"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_args_t args = { + .coll_type = MCA_COLL_UCG_TYPE_IGATHERV, + .comm = comm, + .gatherv.sbuf = sbuf, + .gatherv.scount = sendcount, + .gatherv.sdtype = sdtype, + .gatherv.rbuf = rbuf, + .gatherv.rcounts = recvcounts, + .gatherv.disps = disps, + .gatherv.rdtype = rdtype, + .gatherv.root = root, + }; + + int rc; + mca_coll_ucg_req_t *coll_req = NULL; + rc = mca_coll_ucg_request_execute_cache_nb(&args, &coll_req); + if (rc == OMPI_SUCCESS) { + *request = &coll_req->super.super; + return rc; + } + + if (rc != OMPI_ERR_NOT_FOUND) { + /* The failure may is caused by a UCG internal error. Retry may also fail + and should do fallback immediately. */ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN_NB(request, &args, mca_coll_ucg_request_gatherv_init, + sbuf, sendcount, sdtype, rbuf, recvcounts, disps, + rdtype, root, ucg_module, UCG_REQUEST_NONBLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback igatherv"); + return ucg_module->previous_igatherv(sbuf, sendcount, sdtype, rbuf, + recvcounts, disps, rdtype, root, comm, request, + ucg_module->previous_igatherv_module); +} + +int mca_coll_ucg_gatherv_init(const void *sbuf, int sendcount, + ompi_datatype_t *sdtype, void *rbuf, + const int *recvcounts, const int *disps, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_info_t *info, + ompi_request_t **request, mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg gatherv init"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, false, true); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_gatherv_init(coll_req, sbuf, sendcount, + sdtype, rbuf, recvcounts, disps, rdtype, root, + 
ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + *request = &coll_req->super.super; + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback gatherv init"); + return ucg_module->previous_gatherv_init(sbuf, sendcount, sdtype, rbuf, + recvcounts, disps, rdtype, root, comm, info, + request, ucg_module->previous_gatherv_module); +} \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_module.c b/ompi/mca/coll/ucg/coll_ucg_module.c new file mode 100644 index 0000000000000000000000000000000000000000..d315b841e0a916414e98a87c588bf59bf5f682c1 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_module.c @@ -0,0 +1,589 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_dt.h" +#include "coll_ucg_request.h" + +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/ucx/pml_ucx.h" +#include "opal/util/argv.h" + +/* Ensure coll ucg can be dlopened if global var "ompi_pml_ucx" is not existed*/ +mca_pml_ucx_module_t ompi_pml_ucx __attribute__((weak)); + +#define MCA_COLL_UCG_SET_HANDLER(_api) \ + if (mca_coll_ucg_is_api_enable(#_api)) { \ + module->super.coll_ ## _api = mca_coll_ucg_ ## _api;\ + } + +#define MCA_COLL_UCG_SET_CACHE_HANDLER(_api) \ + if (mca_coll_ucg_is_api_enable(#_api)) { \ + module->super.coll_ ## _api = mca_coll_ucg_ ## _api ## _cache; \ + } + +#define MCA_COLL_UCG_SAVE_FALLBACK(_api) \ + do {\ + ucg_module->previous_ ## _api = comm->c_coll->coll_ ## _api;\ + ucg_module->previous_ ## _api ## _module = comm->c_coll->coll_ ## _api ## _module;\ + if (!comm->c_coll->coll_ ## _api || !comm->c_coll->coll_ ## _api ## _module) {\ + return 
OMPI_ERROR;\ + }\ + OBJ_RETAIN(ucg_module->previous_ ## _api ## _module);\ + } while(0) + +#define MCA_COLL_UCG_FREE_FALLBACK(_api) \ + if (NULL != ucg_module->previous_ ## _api ## _module) { \ + OBJ_RELEASE(ucg_module->previous_ ## _api ## _module); \ + } + + +static int mca_coll_ucg_progress(void) +{ + ucg_progress(mca_coll_ucg_component.ucg_context); + return OMPI_SUCCESS; +} + +static ucg_status_t mca_coll_ucg_oob_allgather(const void *sendbuf, void *recvbuf, int count, void *group) +{ + int rc; + ompi_communicator_t *comm = (ompi_communicator_t *)group; + rc = ompi_coll_base_allgather_intra_bruck(sendbuf, count, MPI_CHAR, + recvbuf, count, MPI_CHAR, + comm, NULL); + return (rc == OMPI_SUCCESS) ? UCG_OK : UCG_ERR_NO_RESOURCE; +} + +static ucg_status_t mca_coll_ucg_oob_blocking_allgather(const void *sendbuf, + void *recvbuf, + int count, + void *group) +{ + int rc, i; + ompi_communicator_t *comm = (ompi_communicator_t *)group; + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + + if (rank == 0) { + //gather all rank data to recvbuf + for (i = 0; i < size; i++) { + if (i == 0) { + memcpy(recvbuf, sendbuf, count); + } else { + rc = MCA_PML_CALL(recv((char *)recvbuf + i * count, count, MPI_CHAR, i, + MCA_COLL_BASE_TAG_ALLGATHER, comm, + MPI_STATUS_IGNORE)); + if (rc != OMPI_SUCCESS) { + goto out; + } + } + } + + //bcast recvbuf to all rank + for (i = 1; i < size; i++) { + rc = MCA_PML_CALL(send((char *)recvbuf, size * count, MPI_CHAR, i, + MCA_COLL_BASE_TAG_ALLGATHER, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (rc != OMPI_SUCCESS) { + goto out; + } + } + } else { + //send data to rank 0 + rc = MCA_PML_CALL(send((char *)sendbuf, count, MPI_CHAR, 0, MCA_COLL_BASE_TAG_ALLGATHER, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (rc != OMPI_SUCCESS) { + goto out; + } + + //recv gather data from rank 0 + rc = MCA_PML_CALL(recv((char *)recvbuf, size * count, MPI_CHAR, 0, + MCA_COLL_BASE_TAG_ALLGATHER, comm, + MPI_STATUS_IGNORE)); + if (rc != OMPI_SUCCESS) { 
+ goto out; + } + } + +out: + return (rc == OMPI_SUCCESS) ? UCG_OK : UCG_ERR_NO_RESOURCE; +} + +static ucg_status_t mca_coll_ucg_get_proc_info(ucg_rank_t rank, ucg_proc_info_t **proc) +{ + int comm_size = ompi_process_info.num_procs; + if (rank >= comm_size) { + return UCG_ERR_INVALID_PARAM; + } + + char rank_addr_identify[32] = {0}; + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + const char *mca_type_name = cm->super.collm_version.mca_type_name; + const char *mca_component_name = cm->super.collm_version.mca_component_name; + uint32_t jobid = (uint32_t)OMPI_PROC_MY_NAME->jobid; + uint32_t vpid = rank; + sprintf(rank_addr_identify, "%s.%s.%u.%u", mca_type_name, mca_component_name, jobid, vpid); + + int rc; + uint32_t proc_size; + opal_process_name_t proc_name = {.vpid = rank, .jobid = OMPI_PROC_MY_NAME->jobid}; + OPAL_MODEX_RECV_STRING(rc, rank_addr_identify, &proc_name, (void**)proc, &proc_size); + if (rc != OPAL_SUCCESS) { + return UCG_ERR_NOT_FOUND; + } + return UCG_OK; +} + +static int mca_coll_ucg_get_world_rank(void *arg, int rank) +{ + ompi_communicator_t* comm = (ompi_communicator_t*)arg; + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, rank); + return ((ompi_process_name_t*)&proc->super.proc_name)->vpid; +} + +/* mca_coll_ucg_fill_oob_group is used in ucg_init */ +static void mca_coll_ucg_fill_oob_group(ucg_oob_group_t *oob_group) +{ + oob_group->myrank = ompi_process_info.my_name.vpid; + oob_group->size = ompi_process_info.num_procs; + oob_group->num_local_procs = ompi_process_info.num_local_peers + 1; + return; +} + +/* mca_coll_ucg_fill_group_oob_group is used in ucg_group_create. + * If ompi_mpi_thread_multiple is true, ompi_sync_wait_mt will be nested + * called which will cause the program hangs. */ +static void mca_coll_ucg_fill_group_oob_group(ucg_oob_group_t *oob_group, + ompi_communicator_t *comm) +{ + oob_group->allgather = ompi_mpi_thread_multiple ? 
+ mca_coll_ucg_oob_blocking_allgather : + mca_coll_ucg_oob_allgather; + oob_group->myrank = (ucg_rank_t)ompi_comm_rank(comm); + oob_group->size = (uint32_t)ompi_comm_size(comm); + oob_group->group = (void *)comm; + return; +} + +static void mca_coll_ucg_fill_rank_map(ucg_rank_map_t *rank_map, + ompi_communicator_t *comm) +{ + rank_map->size = (uint32_t)ompi_comm_size(comm); + if (comm == &ompi_mpi_comm_world.comm) { + rank_map->type = UCG_RANK_MAP_TYPE_FULL; + } else { + rank_map->type = UCG_RANK_MAP_TYPE_CB; + rank_map->cb.mapping = mca_coll_ucg_get_world_rank; + rank_map->cb.arg = (void *)comm; + } + return; +} + +static void *mca_coll_ucg_get_ucp_ep(void *arg, void *oob_group, int rank) +{ + ompi_communicator_t *comm = (ompi_communicator_t*)oob_group; + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, rank); + ucp_ep_h ep = proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + if (OPAL_LIKELY(ep != NULL)) { + return (void*)ep; + } + + const int nprocs = 1; + int ret = MCA_PML_CALL(add_procs(&proc, nprocs)); + if (ret != OMPI_SUCCESS) { + return NULL; + } + + return (void*)ompi_comm_peer_lookup(comm, rank)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; +} + +static void *mca_coll_ucg_get_ucp_worker(void *arg) +{ + return (void*)ompi_pml_ucx.ucp_worker; +} + +static void *mca_coll_ucg_get_ucp_context(void *arg) +{ + return (void*)ompi_pml_ucx.ucp_context; +} + +static int mca_coll_ucg_init(void) +{ + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + ucg_status_t status; + ucg_config_h config; + + ucg_global_params_t global_params = { + .field_mask = UCG_GLOBAL_PARAMS_FIELD_OOB_RESOURCE, + .oob_resource.get_ucp_ep = mca_coll_ucg_get_ucp_ep, + .oob_resource.get_ucp_worker = mca_coll_ucg_get_ucp_worker, + .oob_resource.get_ucp_context = mca_coll_ucg_get_ucp_context + }; + + status = ucg_global_init(&global_params); + if (status != UCG_OK) { + UCG_ERROR("UCG global init failed: %s", ucg_status_string(status)); + return OMPI_ERROR; + } + + status = 
ucg_config_read(NULL, NULL, &config); + if (status != UCG_OK) { + UCG_ERROR("UCG config read failed: %s", ucg_status_string(status)); + ucg_global_cleanup(); + return OMPI_ERROR; + } + + ucg_params_t params; + params.field_mask = UCG_PARAMS_FIELD_OOB_GROUP | + UCG_PARAMS_FIELD_THREAD_MODE | + UCG_PARAMS_FIELD_PROC_INFO_CB; + mca_coll_ucg_fill_oob_group(¶ms.oob_group); + params.get_proc_info = mca_coll_ucg_get_proc_info; + params.thread_mode = ompi_mpi_thread_multiple ? UCG_THREAD_MODE_MULTI : UCG_THREAD_MODE_SINGLE; + status = ucg_init(¶ms, config, &cm->ucg_context); + ucg_config_release(config); + if (status != UCG_OK) { + UCG_ERROR("UCG context init failed: %s", ucg_status_string(status)); + ucg_global_cleanup(); + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + +static void mca_coll_ucg_cleanup(void) +{ + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + ucg_cleanup(cm->ucg_context); + cm->ucg_context = NULL; + ucg_global_cleanup(); + return; +} + +int mca_coll_ucg_init_once() +{ + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + if (cm->initialized) { + return OMPI_SUCCESS; + } + + int rc; + rc = mca_coll_ucg_conv_pool_init(); + if (rc != OMPI_SUCCESS) { + goto err; + } + + rc = mca_coll_ucg_rpool_init(); + if (rc != OMPI_SUCCESS) { + goto err_cleanup_conv_pool; + } + + uint32_t size = ompi_process_info.num_procs; + rc = mca_coll_ucg_subargs_pool_init(size); + if (rc != OMPI_SUCCESS) { + UCG_ERROR("Failed to init subargs mpool, %d", rc); + goto err_cleanup_rpool; + } + + if (ompi_mpi_thread_multiple) { + UCG_DEBUG("rcache is non-thread-safe, disable it"); + cm->max_rcache_size = 0; + } + + if (cm->max_rcache_size > 0) { + UCG_DEBUG("max rcache size is %d", cm->max_rcache_size); + rc = mca_coll_ucg_rcache_init(cm->max_rcache_size); + if (rc != OMPI_SUCCESS) { + goto err_cleanup_subargs_pool; + } + } + + if (cm->disable_coll != NULL) { + UCG_DEBUG("Disable %s", cm->disable_coll); + cm->blacklist = opal_argv_split(cm->disable_coll, ','); + } + 
+ mca_coll_ucg_npolls_init(cm->npolls); + + rc = mca_coll_ucg_init(); + if (rc != OMPI_SUCCESS) { + goto err_free_blacklist; + } + + /* everything is ready, register progress function. */ + opal_progress_register(mca_coll_ucg_progress); + cm->initialized = true; + return OMPI_SUCCESS; + +err_free_blacklist: + if (cm->blacklist != NULL) { + opal_argv_free(cm->blacklist); + } + if (cm->max_rcache_size > 0) { + mca_coll_ucg_rcache_cleanup(); + } +err_cleanup_subargs_pool: + mca_coll_ucg_subargs_pool_cleanup(); +err_cleanup_rpool: + mca_coll_ucg_rpool_cleanup(); +err_cleanup_conv_pool: + mca_coll_ucg_conv_pool_cleanup(); +err: + return rc; +} + +void mca_coll_ucg_cleanup_once(void) +{ + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + if (!cm->initialized) { + return; + } + + opal_progress_unregister(mca_coll_ucg_progress); + mca_coll_ucg_type_cleanup(); + mca_coll_ucg_cleanup(); + if (cm->blacklist != NULL) { + opal_argv_free(cm->blacklist); + } + if (cm->max_rcache_size > 0) { + mca_coll_ucg_rcache_cleanup(); + } + mca_coll_ucg_rpool_cleanup(); + mca_coll_ucg_conv_pool_cleanup(); + return; +} + +static int mca_coll_ucg_save_fallback(mca_coll_ucg_module_t *ucg_module, + ompi_communicator_t *comm) +{ + MCA_COLL_UCG_SAVE_FALLBACK(allreduce); + MCA_COLL_UCG_SAVE_FALLBACK(bcast); + MCA_COLL_UCG_SAVE_FALLBACK(barrier); + MCA_COLL_UCG_SAVE_FALLBACK(alltoallv); + MCA_COLL_UCG_SAVE_FALLBACK(scatterv); + MCA_COLL_UCG_SAVE_FALLBACK(gatherv); + MCA_COLL_UCG_SAVE_FALLBACK(allgatherv); + + MCA_COLL_UCG_SAVE_FALLBACK(iallreduce); + MCA_COLL_UCG_SAVE_FALLBACK(ibcast); + MCA_COLL_UCG_SAVE_FALLBACK(ibarrier); + MCA_COLL_UCG_SAVE_FALLBACK(ialltoallv); + MCA_COLL_UCG_SAVE_FALLBACK(iscatterv); + MCA_COLL_UCG_SAVE_FALLBACK(igatherv); + MCA_COLL_UCG_SAVE_FALLBACK(iallgatherv); + + MCA_COLL_UCG_SAVE_FALLBACK(allreduce_init); + MCA_COLL_UCG_SAVE_FALLBACK(bcast_init); + MCA_COLL_UCG_SAVE_FALLBACK(barrier_init); + MCA_COLL_UCG_SAVE_FALLBACK(alltoallv_init); + 
MCA_COLL_UCG_SAVE_FALLBACK(scatterv_init); + MCA_COLL_UCG_SAVE_FALLBACK(gatherv_init); + MCA_COLL_UCG_SAVE_FALLBACK(allgatherv_init); + + return OMPI_SUCCESS; +} + +static void mca_coll_ucg_free_fallback(mca_coll_ucg_module_t *ucg_module) +{ + MCA_COLL_UCG_FREE_FALLBACK(allreduce); + MCA_COLL_UCG_FREE_FALLBACK(bcast); + MCA_COLL_UCG_FREE_FALLBACK(barrier); + MCA_COLL_UCG_FREE_FALLBACK(alltoallv); + MCA_COLL_UCG_FREE_FALLBACK(scatterv); + MCA_COLL_UCG_FREE_FALLBACK(gatherv); + MCA_COLL_UCG_FREE_FALLBACK(allgatherv); + + MCA_COLL_UCG_FREE_FALLBACK(iallreduce); + MCA_COLL_UCG_FREE_FALLBACK(ibcast); + MCA_COLL_UCG_FREE_FALLBACK(ibarrier); + MCA_COLL_UCG_FREE_FALLBACK(ialltoallv); + MCA_COLL_UCG_FREE_FALLBACK(iscatterv); + MCA_COLL_UCG_FREE_FALLBACK(igatherv); + MCA_COLL_UCG_FREE_FALLBACK(iallgatherv); + + MCA_COLL_UCG_FREE_FALLBACK(allreduce_init); + MCA_COLL_UCG_FREE_FALLBACK(bcast_init); + MCA_COLL_UCG_FREE_FALLBACK(barrier_init); + MCA_COLL_UCG_FREE_FALLBACK(alltoallv_init); + MCA_COLL_UCG_FREE_FALLBACK(scatterv_init); + MCA_COLL_UCG_FREE_FALLBACK(gatherv_init); + MCA_COLL_UCG_FREE_FALLBACK(allgatherv_init); + + return; +} + +static int mca_coll_ucg_create_group(mca_coll_ucg_module_t *module, ompi_communicator_t *comm) +{ + ucg_status_t rc; + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + ucg_group_params_t params; + + /* Set UCG group parameter*/ + params.field_mask = UCG_GROUP_PARAMS_FIELD_ID | + UCG_GROUP_PARAMS_FIELD_SIZE | + UCG_GROUP_PARAMS_FIELD_MYRANK | + UCG_GROUP_PARAMS_FIELD_RANK_MAP | + UCG_GROUP_PARAMS_FIELD_OOB_GROUP; + params.id = ompi_comm_get_local_cid(comm); + params.size = (uint32_t)ompi_comm_size(comm); + params.myrank = (ucg_rank_t)ompi_comm_rank(comm); + mca_coll_ucg_fill_rank_map(¶ms.rank_map, comm); + mca_coll_ucg_fill_group_oob_group(¶ms.oob_group, comm); + + /* Initialize UCG group*/ + rc = ucg_group_create(cm->ucg_context, ¶ms, &module->group); + if (rc != UCG_OK) { + UCG_ERROR("UCG create group failed: %s", 
ucg_status_string(rc)); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int mca_coll_ucg_module_enable(mca_coll_base_module_t *module, + ompi_communicator_t *comm) +{ + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t *)module; + int rc; + /* if any fails, resources will be freed in mca_coll_ucg_module_destruct() */ + rc = mca_coll_ucg_save_fallback(ucg_module, comm); + if (rc != OMPI_SUCCESS) { + UCG_ERROR("Failed to save coll fallback, %d", rc); + return rc; + } + + rc = mca_coll_ucg_type_init(); + if (rc != OMPI_SUCCESS) { + UCG_ERROR("Failed to init ucg type, %d", rc); + return rc; + } + + rc = mca_coll_ucg_create_group(ucg_module, comm); + if (rc != OMPI_SUCCESS) { + UCG_ERROR("Failed to create ucg group, %d", rc); + return rc; + } + + UCG_DEBUG("Module initialized"); + return OMPI_SUCCESS; +} + +static bool mca_coll_ucg_is_api_enable(const char *api) +{ + char **blacklist = mca_coll_ucg_component.blacklist; + if (blacklist == NULL) { + return true; + } + + for (; *blacklist != NULL; ++blacklist) { + if (!strcmp(*blacklist, api)) { + return false; + } + } + + return true; +} + +static void mca_coll_ucg_module_construct(mca_coll_ucg_module_t *module) +{ + memset((char*)module + sizeof(module->super), 0, sizeof(*module) - sizeof(module->super)); + module->super.coll_module_enable = mca_coll_ucg_module_enable; + if (mca_coll_ucg_component.max_rcache_size > 0) { + MCA_COLL_UCG_SET_CACHE_HANDLER(allreduce); + MCA_COLL_UCG_SET_CACHE_HANDLER(barrier); + MCA_COLL_UCG_SET_CACHE_HANDLER(bcast); + MCA_COLL_UCG_SET_CACHE_HANDLER(alltoallv); + MCA_COLL_UCG_SET_CACHE_HANDLER(scatterv); + MCA_COLL_UCG_SET_CACHE_HANDLER(gatherv); + MCA_COLL_UCG_SET_CACHE_HANDLER(allgatherv); + + MCA_COLL_UCG_SET_CACHE_HANDLER(iallreduce); + MCA_COLL_UCG_SET_CACHE_HANDLER(ibarrier); + MCA_COLL_UCG_SET_CACHE_HANDLER(ibcast); + MCA_COLL_UCG_SET_CACHE_HANDLER(ialltoallv); + MCA_COLL_UCG_SET_CACHE_HANDLER(iscatterv); + MCA_COLL_UCG_SET_CACHE_HANDLER(igatherv); + 
MCA_COLL_UCG_SET_CACHE_HANDLER(iallgatherv); + } else { + MCA_COLL_UCG_SET_HANDLER(allreduce); + MCA_COLL_UCG_SET_HANDLER(barrier); + MCA_COLL_UCG_SET_HANDLER(bcast); + MCA_COLL_UCG_SET_HANDLER(alltoallv); + MCA_COLL_UCG_SET_HANDLER(scatterv); + MCA_COLL_UCG_SET_HANDLER(gatherv); + MCA_COLL_UCG_SET_HANDLER(allgatherv); + + MCA_COLL_UCG_SET_HANDLER(iallreduce); + MCA_COLL_UCG_SET_HANDLER(ibarrier); + MCA_COLL_UCG_SET_HANDLER(ibcast); + MCA_COLL_UCG_SET_HANDLER(ialltoallv); + MCA_COLL_UCG_SET_HANDLER(iscatterv); + MCA_COLL_UCG_SET_HANDLER(igatherv); + MCA_COLL_UCG_SET_HANDLER(iallgatherv); + } + + MCA_COLL_UCG_SET_HANDLER(allreduce_init); + MCA_COLL_UCG_SET_HANDLER(barrier_init); + MCA_COLL_UCG_SET_HANDLER(bcast_init); + MCA_COLL_UCG_SET_HANDLER(alltoallv_init); + MCA_COLL_UCG_SET_HANDLER(scatterv_init); + MCA_COLL_UCG_SET_HANDLER(gatherv_init); + MCA_COLL_UCG_SET_HANDLER(allgatherv_init); + return; +} + +static void mca_coll_ucg_module_destruct(mca_coll_ucg_module_t *ucg_module) +{ + if (ucg_module->group != NULL) { + if (mca_coll_ucg_component.max_rcache_size > 0) { + mca_coll_ucg_rcache_del_by_comm(ucg_module->comm); + } + ucg_group_destroy(ucg_module->group); + ucg_module->group = NULL; + } + + /* kv must be freed before component close */ + if (ucg_module->comm == &ompi_mpi_comm_world.comm) { + mca_coll_ucg_type_free_kv(); + } + + mca_coll_ucg_free_fallback(ucg_module); + return; +} + +OBJ_CLASS_INSTANCE(mca_coll_ucg_module_t, + mca_coll_base_module_t, + mca_coll_ucg_module_construct, + mca_coll_ucg_module_destruct); + +int mca_coll_ucg_init_query(bool enable_progress_threads, bool enable_mpi_threads) +{ + return OMPI_SUCCESS; +} + +mca_coll_base_module_t *mca_coll_ucg_comm_query(ompi_communicator_t *comm, int *priority) +{ + mca_coll_ucg_component_t *cm = &mca_coll_ucg_component; + mca_coll_ucg_module_t *ucg_module; + + if ((OMPI_COMM_IS_INTER(comm)) || (ompi_comm_size(comm) < 2)) { + return NULL; + } + + ucg_module = OBJ_NEW(mca_coll_ucg_module_t); + if 
(ucg_module == NULL) { + return NULL; + } + ucg_module->comm = comm; + + *priority = cm->priority; + return &(ucg_module->super); +} diff --git a/ompi/mca/coll/ucg/coll_ucg_request.c b/ompi/mca/coll/ucg/coll_ucg_request.c new file mode 100644 index 0000000000000000000000000000000000000000..555df135353c8045b4cde04034d8c8d35163443b --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_request.c @@ -0,0 +1,725 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. + * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg_request.h" +#include "coll_ucg_debug.h" +#include "coll_ucg.h" + + +/* todo: move to op.h ? */ +#define OMPI_OP_RETAIN(_op) \ + if (!ompi_op_is_intrinsic(_op)) { \ + OBJ_RETAIN(_op); \ + } + +#define OMPI_OP_RELEASE(_op) \ + if (!ompi_op_is_intrinsic(_op)) { \ + OBJ_RELEASE(_op); \ + } + +mca_coll_ucg_rpool_t mca_coll_ucg_rpool = {0}; +static mca_coll_ucg_rcache_t mca_coll_ucg_rcache; +static int npolls = 10; +const int MAX_NPOLLS = 100; + +mca_coll_ucg_subargs_pool_t mca_coll_ucg_subargs_pool = {0}; + +static void ucg_coll_ucg_rcache_ref(mca_coll_ucg_req_t *coll_req) +{ + mca_coll_ucg_args_t *args = &coll_req->args; + switch (args->coll_type) { + case MCA_COLL_UCG_TYPE_BCAST: + case MCA_COLL_UCG_TYPE_IBCAST: + OMPI_DATATYPE_RETAIN(args->bcast.datatype); + break; + case MCA_COLL_UCG_TYPE_ALLREDUCE: + case MCA_COLL_UCG_TYPE_IALLREDUCE: + OMPI_DATATYPE_RETAIN(args->allreduce.datatype); + OMPI_OP_RETAIN(args->allreduce.op); + break; + case MCA_COLL_UCG_TYPE_ALLTOALLV: + case MCA_COLL_UCG_TYPE_IALLTOALLV: + OMPI_DATATYPE_RETAIN(args->alltoallv.sdtype); + OMPI_DATATYPE_RETAIN(args->alltoallv.rdtype); + case MCA_COLL_UCG_TYPE_BARRIER: + case MCA_COLL_UCG_TYPE_IBARRIER: + break; + case MCA_COLL_UCG_TYPE_SCATTERV: + case MCA_COLL_UCG_TYPE_ISCATTERV: + OMPI_DATATYPE_RETAIN(args->scatterv.sdtype); + 
OMPI_DATATYPE_RETAIN(args->scatterv.rdtype); + break; + case MCA_COLL_UCG_TYPE_GATHERV: + case MCA_COLL_UCG_TYPE_IGATHERV: + OMPI_DATATYPE_RETAIN(args->gatherv.sdtype); + OMPI_DATATYPE_RETAIN(args->gatherv.rdtype); + break; + case MCA_COLL_UCG_TYPE_ALLGATHERV: + case MCA_COLL_UCG_TYPE_IALLGATHERV: + OMPI_DATATYPE_RETAIN(args->allgatherv.sdtype); + OMPI_DATATYPE_RETAIN(args->allgatherv.rdtype); + break; + default: + UCG_FATAL("Unsupported collective type(%d).", args->coll_type); + break; + } + return; +} + +static void ucg_coll_ucg_rcache_deref(mca_coll_ucg_req_t *coll_req) +{ + mca_coll_ucg_args_t *args = &coll_req->args; + switch (args->coll_type) { + case MCA_COLL_UCG_TYPE_BCAST: + case MCA_COLL_UCG_TYPE_IBCAST: + OMPI_DATATYPE_RELEASE(args->bcast.datatype); + break; + case MCA_COLL_UCG_TYPE_ALLREDUCE: + case MCA_COLL_UCG_TYPE_IALLREDUCE: + OMPI_DATATYPE_RELEASE(args->allreduce.datatype); + OMPI_OP_RELEASE(args->allreduce.op); + break; + case MCA_COLL_UCG_TYPE_ALLTOALLV: + case MCA_COLL_UCG_TYPE_IALLTOALLV: + OMPI_DATATYPE_RELEASE(args->alltoallv.sdtype); + OMPI_DATATYPE_RELEASE(args->alltoallv.rdtype); + case MCA_COLL_UCG_TYPE_BARRIER: + case MCA_COLL_UCG_TYPE_IBARRIER: + break; + case MCA_COLL_UCG_TYPE_SCATTERV: + case MCA_COLL_UCG_TYPE_ISCATTERV: + OMPI_DATATYPE_RELEASE(args->scatterv.sdtype); + OMPI_DATATYPE_RELEASE(args->scatterv.rdtype); + break; + case MCA_COLL_UCG_TYPE_GATHERV: + case MCA_COLL_UCG_TYPE_IGATHERV: + OMPI_DATATYPE_RELEASE(args->gatherv.sdtype); + OMPI_DATATYPE_RELEASE(args->gatherv.rdtype); + break; + case MCA_COLL_UCG_TYPE_ALLGATHERV: + case MCA_COLL_UCG_TYPE_IALLGATHERV: + OMPI_DATATYPE_RELEASE(args->allgatherv.sdtype); + OMPI_DATATYPE_RELEASE(args->allgatherv.rdtype); + break; + default: + UCG_FATAL("Unsupported collective type(%d).", args->coll_type); + break; + } + return; +} + +static inline void mca_coll_ucg_rcache_full_adjust(void) +{ + // LRU, remove the last item + opal_list_t *requests = &mca_coll_ucg_rcache.requests; + if 
((int)opal_list_get_size(requests) == mca_coll_ucg_rcache.max_size) { + opal_list_item_t *item = opal_list_remove_last(requests); + mca_coll_ucg_req_t *coll_req = container_of(item, mca_coll_ucg_req_t, list); + mca_coll_ucg_rcache_del(coll_req); + } + return; +} + +static int mca_coll_ucg_request_start(size_t count, ompi_request_t **requests) +{ + for (size_t i = 0; i < count; ++i) { + mca_coll_ucg_req_t *coll_req = (mca_coll_ucg_req_t*)requests[i]; + if (coll_req == NULL) { + continue; + } + + int rc = mca_coll_ucg_request_execute_nb(coll_req); + if (rc != OMPI_SUCCESS) { + return rc; + } + } + return OMPI_SUCCESS; +} + +static void mca_coll_ucg_request_complete(void *arg, ucg_status_t status) +{ + mca_coll_ucg_req_t *coll_req = (mca_coll_ucg_req_t*)arg; + ompi_request_t *ompi_req = &coll_req->super.super; + if (status == UCG_OK) { + ompi_req->req_status.MPI_ERROR = MPI_SUCCESS; + } else { + ompi_req->req_status.MPI_ERROR = MPI_ERR_INTERN; + } + ompi_request_complete(ompi_req, true); + return; +} + +static int mca_coll_ucg_request_free(ompi_request_t **ompi_req) +{ + mca_coll_ucg_req_t *coll_req = (mca_coll_ucg_req_t*)(*ompi_req); + if (!REQUEST_COMPLETE(*ompi_req)) { + return MPI_ERR_REQUEST; + } + + if (coll_req->cacheable) { + if ((*ompi_req)->req_status.MPI_ERROR == MPI_SUCCESS) { + mca_coll_ucg_rcache_put(coll_req); + } else { + mca_coll_ucg_rcache_del(coll_req); + } + } else { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + } + *ompi_req = MPI_REQUEST_NULL; + return MPI_SUCCESS; +} + +static int mca_coll_ucg_request_cancel(ompi_request_t* request, int flag) +{ + return MPI_ERR_REQUEST; +} + +OBJ_CLASS_INSTANCE(mca_coll_ucg_req_t, + ompi_coll_base_nbc_request_t, + NULL, + NULL); + +OBJ_CLASS_INSTANCE(mca_coll_ucg_subargs_t, + opal_free_list_item_t, + NULL, + NULL); + +int mca_coll_ucg_rpool_init(void) +{ + OBJ_CONSTRUCT(&mca_coll_ucg_rpool.flist, opal_free_list_t); + int rc = opal_free_list_init(&mca_coll_ucg_rpool.flist, 
sizeof(mca_coll_ucg_req_t), + opal_cache_line_size, OBJ_CLASS(mca_coll_ucg_req_t), + 0, 0, + 0, INT_MAX, 128, + NULL, 0, NULL, NULL, NULL); + return rc == OPAL_SUCCESS ? OMPI_SUCCESS : OMPI_ERROR; +} + +void mca_coll_ucg_rpool_cleanup(void) +{ + OBJ_DESTRUCT(&mca_coll_ucg_rpool.flist); + return; +} + +int mca_coll_ucg_subargs_pool_init(uint32_t size) +{ + OBJ_CONSTRUCT(&mca_coll_ucg_subargs_pool.flist, opal_free_list_t); + int rc = opal_free_list_init(&mca_coll_ucg_subargs_pool.flist, + sizeof(mca_coll_ucg_subargs_t) + 4 * size * sizeof(int), + opal_cache_line_size, OBJ_CLASS(mca_coll_ucg_subargs_t), + 0, 0, + 0, INT_MAX, 128, + NULL, 0, NULL, NULL, NULL); + return rc == OPAL_SUCCESS ? OMPI_SUCCESS : OMPI_ERROR; +} + +void mca_coll_ucg_subargs_pool_cleanup(void) +{ + OBJ_DESTRUCT(&mca_coll_ucg_subargs_pool.flist); + return; +} + +void mca_coll_ucg_npolls_init(int n) +{ + if (n < 1) { + n = 1; + } else if (n > MAX_NPOLLS) { + n = MAX_NPOLLS; + } + npolls = n; + return; +} + +int mca_coll_ucg_rcache_init(int size) +{ + if (size <= 0) { + return OMPI_ERROR; + } + mca_coll_ucg_rcache.max_size = size; + mca_coll_ucg_rcache.total = 0; + mca_coll_ucg_rcache.hit = 0; + OBJ_CONSTRUCT(&mca_coll_ucg_rcache.requests, opal_list_t); + return OMPI_SUCCESS; +} + +void mca_coll_ucg_rcache_cleanup(void) +{ + UCG_INFO_IF(mca_coll_ucg_rcache.total > 0, "rcache hit rate: %.2f%% (%lu/%lu)", + 100.0 * mca_coll_ucg_rcache.hit / mca_coll_ucg_rcache.total , + mca_coll_ucg_rcache.hit, mca_coll_ucg_rcache.total); + opal_list_t *requests = &mca_coll_ucg_rcache.requests; + if (!opal_list_is_empty(requests)) { + UCG_WARN("%zu requests are not deleted from the cache.", opal_list_get_size(requests)); + } + OBJ_DESTRUCT(&mca_coll_ucg_rcache.requests); + return; +} + +static void mca_coll_ucg_rcache_coll_req_args_init(mca_coll_ucg_args_t *dst, + const mca_coll_ucg_args_t *src) +{ + *dst = *src; + int *scounts, *sdispls, *rcounts, *rdispls, *disps; + uint32_t i, size = 
(uint32_t)ompi_comm_size(src->comm); + mca_coll_ucg_subargs_t *args = NULL; + + switch (src->coll_type) { + case MCA_COLL_UCG_TYPE_ALLTOALLV: + case MCA_COLL_UCG_TYPE_IALLTOALLV: + if (src->alltoallv.scounts == NULL || + src->alltoallv.sdispls == NULL || + src->alltoallv.rcounts == NULL || + src->alltoallv.rdispls == NULL) { + return; + } + args = mca_coll_ucg_subargs_pool_get(); + scounts = args->buf; + sdispls = scounts + size; + rcounts = sdispls + size; + rdispls = rcounts + size; + for (i = 0; i < size; ++i) { + scounts[i] = src->alltoallv.scounts[i]; + sdispls[i] = src->alltoallv.sdispls[i]; + rcounts[i] = src->alltoallv.rcounts[i]; + rdispls[i] = src->alltoallv.rdispls[i]; + } + dst->alltoallv.scounts = scounts; + dst->alltoallv.sdispls = sdispls; + dst->alltoallv.rcounts = rcounts; + dst->alltoallv.rdispls = rdispls; + dst->scounts = src->alltoallv.scounts; + dst->sdispls = src->alltoallv.sdispls; + dst->rcounts = src->alltoallv.rcounts; + dst->rdispls = src->alltoallv.rdispls; + break; + case MCA_COLL_UCG_TYPE_SCATTERV: + case MCA_COLL_UCG_TYPE_ISCATTERV: + if (src->scatterv.scounts == NULL || + src->scatterv.disps == NULL || + ompi_comm_rank(src->comm) != src->scatterv.root) { + return; + } + args = mca_coll_ucg_subargs_pool_get(); + scounts = args->buf; + disps = scounts + size; + for (i = 0; i < size; ++i) { + scounts[i] = src->scatterv.scounts[i]; + disps[i] = src->scatterv.disps[i]; + } + dst->scatterv.scounts = scounts; + dst->scatterv.disps = disps; + dst->scounts = src->scatterv.scounts; + dst->sdispls = src->scatterv.disps; + break; + case MCA_COLL_UCG_TYPE_GATHERV: + case MCA_COLL_UCG_TYPE_IGATHERV: + if (src->gatherv.rcounts == NULL || + src->gatherv.disps == NULL || + ompi_comm_rank(src->comm) != src->gatherv.root) { + return; + } + args = mca_coll_ucg_subargs_pool_get(); + rcounts = args->buf; + disps = rcounts + size; + for (i = 0; i < size; ++i) { + rcounts[i] = src->gatherv.rcounts[i]; + disps[i] = src->gatherv.disps[i]; + } + 
dst->gatherv.rcounts = rcounts; + dst->gatherv.disps = disps; + dst->rcounts = src->gatherv.rcounts; + dst->rdispls = src->gatherv.disps; + break; + case MCA_COLL_UCG_TYPE_ALLGATHERV: + case MCA_COLL_UCG_TYPE_IALLGATHERV: + if (src->allgatherv.rcounts == NULL || + src->allgatherv.disps == NULL) { + return; + } + args = mca_coll_ucg_subargs_pool_get(); + rcounts = args->buf; + disps = rcounts + size; + for (i = 0; i < size; ++i) { + rcounts[i] = src->allgatherv.rcounts[i]; + disps[i] = src->allgatherv.disps[i]; + } + dst->allgatherv.rcounts = rcounts; + dst->allgatherv.disps = disps; + dst->rcounts = src->allgatherv.rcounts; + dst->rdispls = src->allgatherv.disps; + break; + default: + break; + } + return; +} + +static void mca_coll_ucg_rcache_coll_req_args_uninit(mca_coll_ucg_args_t *args) +{ + void *buf = NULL; + switch (args->coll_type) { + case MCA_COLL_UCG_TYPE_ALLTOALLV: + case MCA_COLL_UCG_TYPE_IALLTOALLV: + buf = (void *)args->alltoallv.scounts; + break; + case MCA_COLL_UCG_TYPE_SCATTERV: + case MCA_COLL_UCG_TYPE_ISCATTERV: + if (ompi_comm_rank(args->comm) == args->scatterv.root) { + buf = (void *)args->scatterv.scounts; + } + break; + case MCA_COLL_UCG_TYPE_GATHERV: + case MCA_COLL_UCG_TYPE_IGATHERV: + if (ompi_comm_rank(args->comm) == args->gatherv.root) { + buf = (void *)args->gatherv.rcounts; + } + break; + case MCA_COLL_UCG_TYPE_ALLGATHERV: + case MCA_COLL_UCG_TYPE_IALLGATHERV: + buf = (void *)args->allgatherv.rcounts; + break; + default: + break; + } + if (buf != NULL) { + mca_coll_ucg_subargs_t *data = container_of(buf, mca_coll_ucg_subargs_t, buf); + mca_coll_ucg_subargs_pool_put(data); + } + return; +} + +void mca_coll_ucg_rcache_mark_cacheable(mca_coll_ucg_req_t *coll_req, + mca_coll_ucg_args_t *key) +{ + OBJ_CONSTRUCT(&coll_req->list, opal_list_item_t); + mca_coll_ucg_rcache_coll_req_args_init(&coll_req->args, key); // deep copy + ucg_coll_ucg_rcache_ref(coll_req); + coll_req->cacheable = true; + return; +} + +int 
mca_coll_ucg_rcache_add(mca_coll_ucg_req_t *coll_req, mca_coll_ucg_args_t *key) +{ + opal_list_t *requests = &mca_coll_ucg_rcache.requests; + + mca_coll_ucg_rcache_mark_cacheable(coll_req, key); + + mca_coll_ucg_rcache_full_adjust(); + opal_list_prepend(requests, &coll_req->list); + return OMPI_SUCCESS; +} + +static bool mca_coll_ucg_rcache_compare(int size, const int *array1, const int *array2, const int32_t *src) +{ + if (array1 == NULL || array2 == NULL) { + return true; + } + if (array1 != src) { + return false; + } + for (int i = 0; i < size; ++i) { + if (array1[i] != array2[i]) { + return false; + } + } + return true; +} + +static bool mca_coll_ucg_rcache_is_same(const mca_coll_ucg_args_t *key1, + const mca_coll_ucg_args_t *key2) +{ + if (key1->coll_type != key2->coll_type) { + return false; + } + + if (key1->comm != key2->comm) { + return false; + } + + uint32_t comm_size = (uint32_t)ompi_comm_size(key1->comm); + bool is_same = false; + switch (key1->coll_type) { + case MCA_COLL_UCG_TYPE_BCAST: + case MCA_COLL_UCG_TYPE_IBCAST: { + const mca_coll_bcast_args_t *args1 = &key1->bcast; + const mca_coll_bcast_args_t *args2 = &key2->bcast; + is_same = args1->buffer == args2->buffer && + args1->count == args2->count && + args1->datatype == args2->datatype && + args1->root == args2->root; + break; + } + case MCA_COLL_UCG_TYPE_BARRIER: + case MCA_COLL_UCG_TYPE_IBARRIER: { + is_same = true; + break; + } + case MCA_COLL_UCG_TYPE_ALLREDUCE: + case MCA_COLL_UCG_TYPE_IALLREDUCE: { + const mca_coll_allreduce_args_t *args1 = &key1->allreduce; + const mca_coll_allreduce_args_t *args2 = &key2->allreduce; + is_same = args1->sbuf == args2->sbuf && + args1->rbuf == args2->rbuf && + args1->count == args2->count && + args1->datatype == args2->datatype && + args1->op == args2->op; + break; + } + case MCA_COLL_UCG_TYPE_ALLTOALLV: + case MCA_COLL_UCG_TYPE_IALLTOALLV: { + const mca_coll_alltoallv_args_t *args1 = &key1->alltoallv; + const mca_coll_alltoallv_args_t *args2 = 
&key2->alltoallv; + is_same = args1->sbuf == args2->sbuf && + args1->sdtype == args2->sdtype && + args1->rbuf == args2->rbuf && + args1->rdtype == args2->rdtype; + is_same = is_same && + mca_coll_ucg_rcache_compare(comm_size, args1->scounts, args2->scounts, key2->scounts) && + mca_coll_ucg_rcache_compare(comm_size, args1->sdispls, args2->sdispls, key2->sdispls) && + mca_coll_ucg_rcache_compare(comm_size, args1->rcounts, args2->rcounts, key2->rcounts) && + mca_coll_ucg_rcache_compare(comm_size, args1->rdispls, args2->rdispls, key2->rdispls); + break; + } + case MCA_COLL_UCG_TYPE_SCATTERV: + case MCA_COLL_UCG_TYPE_ISCATTERV: { + const mca_coll_scatterv_args_t *args1 = &key1->scatterv; + const mca_coll_scatterv_args_t *args2 = &key2->scatterv; + is_same = args1->rbuf == args2->rbuf && + args1->rcount == args2->rcount && + args1->rdtype == args2->rdtype && + args1->root == args2->root; + if (ompi_comm_rank(key1->comm) != args1->root) { // Non-root processes don't compare send parms + break; + } + is_same = is_same && + args1->sbuf == args2->sbuf && + args1->sdtype == args2->sdtype && + mca_coll_ucg_rcache_compare(comm_size, args1->scounts, args2->scounts, key2->scounts) && + mca_coll_ucg_rcache_compare(comm_size, args1->disps, args2->disps, key2->sdispls); + break; + } + case MCA_COLL_UCG_TYPE_GATHERV: + case MCA_COLL_UCG_TYPE_IGATHERV: { + const mca_coll_gatherv_args_t *args1 = &key1->gatherv; + const mca_coll_gatherv_args_t *args2 = &key2->gatherv; + is_same = args1->sbuf == args2->sbuf && + args1->scount == args2->scount && + args1->sdtype == args2->sdtype && + args1->root == args2->root; + if (ompi_comm_rank(key1->comm) != args1->root) { // Non-root processes don't compare recv parms + break; + } + is_same = is_same && + args1->rbuf == args2->rbuf && + args1->rdtype == args2->rdtype && + mca_coll_ucg_rcache_compare(comm_size, args1->rcounts, args2->rcounts, key2->rcounts) && + mca_coll_ucg_rcache_compare(comm_size, args1->disps, args2->disps, key2->rdispls); + 
break; + } + case MCA_COLL_UCG_TYPE_ALLGATHERV: + case MCA_COLL_UCG_TYPE_IALLGATHERV: { + const mca_coll_allgatherv_args_t *args1 = &key1->allgatherv; + const mca_coll_allgatherv_args_t *args2 = &key2->allgatherv; + is_same = args1->sbuf == args2->sbuf && + args1->scount == args2->scount && + args1->sdtype == args2->sdtype && + args1->rbuf == args2->rbuf && + args1->rdtype == args2->rdtype; + is_same = is_same && + mca_coll_ucg_rcache_compare(comm_size, args1->rcounts, args2->rcounts, key2->rcounts) && + mca_coll_ucg_rcache_compare(comm_size, args1->disps, args2->disps, key2->rdispls); + break; + } + default: + UCG_FATAL("Unsupported collective type(%d).", key1->coll_type); + break; + } + + return is_same; +} + +mca_coll_ucg_req_t* mca_coll_ucg_rcache_get(mca_coll_ucg_args_t *key) +{ + opal_list_t *requests = &mca_coll_ucg_rcache.requests; + mca_coll_ucg_req_t *coll_req = NULL; + opal_list_item_t *item = NULL; + + ++mca_coll_ucg_rcache.total; + OPAL_LIST_FOREACH(item, requests, opal_list_item_t) { + coll_req = container_of(item, mca_coll_ucg_req_t, list); + if (mca_coll_ucg_rcache_is_same(key, &coll_req->args)) { + opal_list_remove_item(requests, item); + ++mca_coll_ucg_rcache.hit; + return coll_req; + } + } + return NULL; +} + +void mca_coll_ucg_rcache_put(mca_coll_ucg_req_t *coll_req) +{ + if (!coll_req->cacheable) { + return; + } + mca_coll_ucg_rcache_full_adjust(); + opal_list_prepend(&mca_coll_ucg_rcache.requests, &coll_req->list); + return; +} + +void mca_coll_ucg_rcache_del(mca_coll_ucg_req_t *coll_req) +{ + if (!coll_req->cacheable) { + return; + } + + coll_req->cacheable = false; + ucg_coll_ucg_rcache_deref(coll_req); + mca_coll_ucg_rcache_coll_req_args_uninit(&coll_req->args); + OBJ_DESTRUCT(&coll_req->list); + + mca_coll_ucg_request_cleanup(coll_req); + // Convention: All requests in the cache are from the rpool. 
+ mca_coll_ucg_rpool_put(coll_req); + return; +} + +void mca_coll_ucg_rcache_del_by_comm(ompi_communicator_t *comm) +{ + opal_list_t *requests = &mca_coll_ucg_rcache.requests; + opal_list_item_t *item; + opal_list_item_t *next; + OPAL_LIST_FOREACH_SAFE(item, next, requests, opal_list_item_t) { + mca_coll_ucg_req_t *coll_req = container_of(item, mca_coll_ucg_req_t, list); + if (comm == coll_req->args.comm) { + opal_list_remove_item(requests, item); + mca_coll_ucg_rcache_del(coll_req); + } + } + return; +} + +int mca_coll_ucg_request_common_init(mca_coll_ucg_req_t *coll_req, + bool nb, + bool persistent) +{ + ompi_request_t *ompi_req = &coll_req->super.super; + OMPI_REQUEST_INIT(ompi_req, persistent); + + ucg_request_info_t *info = &coll_req->info; + info->field_mask = 0; + if (nb || persistent) { + // For those case, the request is not done in the current call stack. + info->field_mask |= UCG_REQUEST_INFO_FIELD_CB; + info->complete_cb.cb = mca_coll_ucg_request_complete; + info->complete_cb.arg = coll_req; + + ompi_req->req_free = mca_coll_ucg_request_free; + ompi_req->req_cancel = mca_coll_ucg_request_cancel; + } + + if (persistent) { + ompi_req->req_type = OMPI_REQUEST_COLL; + ompi_req->req_start = mca_coll_ucg_request_start; + } + coll_req->ucg_req = NULL; + coll_req->cacheable = false; + return OMPI_SUCCESS; +} + +void mca_coll_ucg_request_cleanup(mca_coll_ucg_req_t *coll_req) +{ + // clean up resource initialized by ${coll_type}_init + if (coll_req->ucg_req != NULL) { + ucg_status_t status = ucg_request_cleanup(coll_req->ucg_req); + if (status != UCG_OK) { + UCG_ERROR("Failed to cleanup ucg request, %s", ucg_status_string(status)); + } + } + // clean up resource initialized by common_init + OMPI_REQUEST_FINI(&coll_req->super.super); + return; +} + +int mca_coll_ucg_request_execute(mca_coll_ucg_req_t *coll_req) +{ + ucg_request_h ucg_req = coll_req->ucg_req; + + ucg_status_t status; + status = ucg_request_start(ucg_req); + if (status != UCG_OK) { + 
UCG_DEBUG("Failed to start ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + + int count = 0; + while (UCG_INPROGRESS == (status = ucg_request_test(ucg_req))) { + // TODO: test wether opal_progress() can be removed + if (++count % npolls == 0) { + opal_progress(); + } + } + if (status != UCG_OK) { + UCG_DEBUG("Failed to progress ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + +int mca_coll_ucg_request_execute_nb(mca_coll_ucg_req_t *coll_req) +{ + /* ompi_req may be completed in ucg_request_start(), set the state first. */ + ompi_request_t *ompi_req = &coll_req->super.super; + ompi_req->req_complete = REQUEST_PENDING; + ompi_req->req_state = OMPI_REQUEST_ACTIVE; + + ucg_status_t status = ucg_request_start(coll_req->ucg_req); + if (status != UCG_OK) { + mca_coll_ucg_request_complete(coll_req, status); + UCG_DEBUG("Failed to start ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +int mca_coll_ucg_request_execute_cache(mca_coll_ucg_args_t *key) +{ + mca_coll_ucg_req_t *coll_req = NULL; + coll_req = mca_coll_ucg_rcache_get(key); + if (coll_req == NULL) { + return OMPI_ERR_NOT_FOUND; + } + int rc = mca_coll_ucg_request_execute(coll_req); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rcache_del(coll_req); + return rc; + } + mca_coll_ucg_rcache_put(coll_req); + return OMPI_SUCCESS; +} + +int mca_coll_ucg_request_execute_cache_nb(mca_coll_ucg_args_t *key, + mca_coll_ucg_req_t **coll_req) +{ + mca_coll_ucg_req_t *tmp_coll_req; + tmp_coll_req = mca_coll_ucg_rcache_get(key); + if (tmp_coll_req == NULL) { + return OMPI_ERR_NOT_FOUND; + } + int rc = mca_coll_ucg_request_execute_nb(tmp_coll_req); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rcache_del(tmp_coll_req); + return rc; + } + *coll_req = tmp_coll_req; + // mca_coll_ucg_request_free() will put the coll_req into cache again. 
+    return OMPI_SUCCESS;
+}
diff --git a/ompi/mca/coll/ucg/coll_ucg_request.h b/ompi/mca/coll/ucg/coll_ucg_request.h
new file mode 100644
index 0000000000000000000000000000000000000000..335ffeff8370f56d17bb2cfd3c3a97c64f96452d
--- /dev/null
+++ b/ompi/mca/coll/ucg/coll_ucg_request.h
@@ -0,0 +1,312 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2022-2022 Huawei Technologies Co., Ltd.
+ *                         All rights reserved.
+ * COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * HEADER$
+ */
+#ifndef MCA_COLL_UCG_REQUEST_H
+#define MCA_COLL_UCG_REQUEST_H
+
+#include "ompi_config.h"
+#include "opal/class/opal_free_list.h"
+#include "opal/class/opal_list.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/datatype/ompi_datatype_internal.h"
+#include "ompi/op/op.h"
+#include "ompi/mca/coll/base/coll_base_util.h"
+
+#include <ucg/api/ucg.h>
+
+/*
+ * Blocking execution pattern:
+ * 1. Initialize coll request
+ * 2. Execute
+ * 3. Add to cache
+ * If any failure, goto fallback.
+ * NOTE: requires an `int rc` variable and a `fallback:` label in the calling scope.
+ */
+#define MCA_COLL_UCG_REQUEST_PATTERN(_cache_key, _coll_request_init, ...) \
+    do {\
+        mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); \
+        rc = mca_coll_ucg_request_common_init(coll_req, false, false); \
+        if (rc != OMPI_SUCCESS) { \
+            mca_coll_ucg_rpool_put(coll_req); \
+            goto fallback; \
+        } \
+        \
+        rc = _coll_request_init(coll_req, ##__VA_ARGS__); \
+        if (rc != OMPI_SUCCESS) { \
+            mca_coll_ucg_rpool_put(coll_req); \
+            goto fallback; \
+        } \
+        \
+        rc = mca_coll_ucg_request_execute(coll_req); \
+        if (rc != OMPI_SUCCESS) { \
+            mca_coll_ucg_request_cleanup(coll_req); \
+            mca_coll_ucg_rpool_put(coll_req); \
+            goto fallback; \
+        } \
+        \
+        rc = mca_coll_ucg_rcache_add(coll_req, _cache_key); \
+        if (rc != OMPI_SUCCESS) { \
+            mca_coll_ucg_request_cleanup(coll_req); \
+            mca_coll_ucg_rpool_put(coll_req); \
+        }\
+    } while(0)
+
+/*
+ * Non-blocking execution pattern:
+ * 1. Initialize coll request
+ * 2. Non-blocking execute
+ * 3. Mark coll request cacheable
+ * 4. Assign ompi request
+ * If any failure, goto fallback.
+ * NOTE: requires an `int rc` variable and a `fallback:` label in the calling scope.
+ */
+#define MCA_COLL_UCG_REQUEST_PATTERN_NB(_ompi_req, _cache_key, _coll_request_init, ...) \
+    do {\
+        mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); \
+        rc = mca_coll_ucg_request_common_init(coll_req, true, false); \
+        if (rc != OMPI_SUCCESS) { \
+            mca_coll_ucg_rpool_put(coll_req); \
+            goto fallback; \
+        } \
+        \
+        rc = _coll_request_init(coll_req, ##__VA_ARGS__); \
+        if (rc != OMPI_SUCCESS) { \
+            mca_coll_ucg_rpool_put(coll_req); \
+            goto fallback; \
+        } \
+        \
+        rc = mca_coll_ucg_request_execute_nb(coll_req); \
+        if (rc != OMPI_SUCCESS) { \
+            mca_coll_ucg_request_cleanup(coll_req); \
+            mca_coll_ucg_rpool_put(coll_req); \
+            goto fallback; \
+        } \
+        /* mark cacheable, so when request is completed, it will be added to cache.*/ \
+        mca_coll_ucg_rcache_mark_cacheable(coll_req, _cache_key); \
+        *_ompi_req = &coll_req->super.super; \
+    } while(0)
+
+
+typedef enum {
+    MCA_COLL_UCG_TYPE_BCAST,
+    MCA_COLL_UCG_TYPE_IBCAST,
+    MCA_COLL_UCG_TYPE_BARRIER,
+    MCA_COLL_UCG_TYPE_IBARRIER,
+    MCA_COLL_UCG_TYPE_ALLREDUCE,
+    MCA_COLL_UCG_TYPE_IALLREDUCE,
+    MCA_COLL_UCG_TYPE_ALLTOALLV,
+    MCA_COLL_UCG_TYPE_IALLTOALLV,
+    MCA_COLL_UCG_TYPE_SCATTERV,
+    MCA_COLL_UCG_TYPE_ISCATTERV,
+    MCA_COLL_UCG_TYPE_GATHERV,
+    MCA_COLL_UCG_TYPE_IGATHERV,
+    MCA_COLL_UCG_TYPE_ALLGATHERV,
+    MCA_COLL_UCG_TYPE_IALLGATHERV,
+    MCA_COLL_UCG_TYPE_LAST,
+} mca_coll_ucg_type_t;
+
+typedef struct {
+    opal_list_t requests;
+    int max_size;
+
+    /* access statistics */
+    uint64_t total;
+    uint64_t hit;
+} mca_coll_ucg_rcache_t;
+
+typedef struct {
+    opal_free_list_t flist;
+} mca_coll_ucg_rpool_t;
+
+typedef struct {
+    opal_free_list_item_t super;
+    int buf[];
+} mca_coll_ucg_subargs_t;
+OBJ_CLASS_DECLARATION(mca_coll_ucg_subargs_t);
+
+typedef struct {
+    opal_free_list_t flist;
+} mca_coll_ucg_subargs_pool_t;
+
+typedef struct mca_coll_bcast_args {
+    void *buffer;
+    int count;
+    ompi_datatype_t *datatype;
+    int root;
+} 
mca_coll_bcast_args_t; + +typedef struct mca_coll_allreduce_args { + const void *sbuf; + void *rbuf; + int count; + ompi_datatype_t *datatype; + ompi_op_t *op; +} mca_coll_allreduce_args_t; + +typedef struct mca_coll_alltoallv_args { + const void *sbuf; + const int *scounts; + const int *sdispls; + ompi_datatype_t *sdtype; + void *rbuf; + const int *rcounts; + const int *rdispls; + ompi_datatype_t *rdtype; +} mca_coll_alltoallv_args_t; + +typedef struct mca_coll_scatterv_args { + const void *sbuf; + const int *scounts; + const int *disps; + ompi_datatype_t *sdtype; + void *rbuf; + int rcount; + ompi_datatype_t *rdtype; + int root; +} mca_coll_scatterv_args_t; + +typedef struct mca_coll_gatherv_args { + const void *sbuf; + int scount; + ompi_datatype_t *sdtype; + void *rbuf; + const int *rcounts; + const int *disps; + ompi_datatype_t *rdtype; + int root; +} mca_coll_gatherv_args_t; + +typedef struct mca_coll_allgatherv_args { + const void *sbuf; + int scount; + ompi_datatype_t *sdtype; + void *rbuf; + const int *rcounts; + const int *disps; + ompi_datatype_t *rdtype; +} mca_coll_allgatherv_args_t; + +typedef struct mca_coll_ucg_args { + mca_coll_ucg_type_t coll_type; + ompi_communicator_t *comm; + union { + mca_coll_bcast_args_t bcast; + mca_coll_allreduce_args_t allreduce; + mca_coll_alltoallv_args_t alltoallv; + mca_coll_scatterv_args_t scatterv; + mca_coll_gatherv_args_t gatherv; + mca_coll_allgatherv_args_t allgatherv; + }; + /* Stores pointers in the rcache, combine with deep copy content */ + const int32_t *scounts; + const int32_t *sdispls; + const int32_t *rcounts; + const int32_t *rdispls; +} mca_coll_ucg_args_t; + +typedef struct mca_coll_ucg_req { + ompi_coll_base_nbc_request_t super; + ucg_request_h ucg_req; + ucg_request_info_t info; + /* only cached request need to fill the following fields */ + opal_list_item_t list; + mca_coll_ucg_args_t args; + bool cacheable; +} mca_coll_ucg_req_t; +OBJ_CLASS_DECLARATION(mca_coll_ucg_req_t); + + +extern 
mca_coll_ucg_rpool_t mca_coll_ucg_rpool; +/* Initialize coll request pool */ +int mca_coll_ucg_rpool_init(void); +/* cleanup the coll request pool */ +void mca_coll_ucg_rpool_cleanup(void); +/* get an empty coll request */ +static inline mca_coll_ucg_req_t* mca_coll_ucg_rpool_get(void) +{ + return (mca_coll_ucg_req_t*)opal_free_list_wait(&mca_coll_ucg_rpool.flist); +} + +/* give back the coll request */ +static inline void mca_coll_ucg_rpool_put(mca_coll_ucg_req_t *coll_req) +{ + opal_free_list_return(&mca_coll_ucg_rpool.flist, (opal_free_list_item_t*)coll_req); + return; +} + +extern mca_coll_ucg_subargs_pool_t mca_coll_ucg_subargs_pool; +/* Initialize coll subargs pool */ +int mca_coll_ucg_subargs_pool_init(uint32_t size); +/* cleanup the coll subargs pool */ +void mca_coll_ucg_subargs_pool_cleanup(void); +/* get an empty coll subargs */ +static inline mca_coll_ucg_subargs_t* mca_coll_ucg_subargs_pool_get(void) +{ + return (mca_coll_ucg_subargs_t*)opal_free_list_wait(&mca_coll_ucg_subargs_pool.flist); +} +/* give back the coll subargs */ +static inline void mca_coll_ucg_subargs_pool_put(mca_coll_ucg_subargs_t *subargs) +{ + opal_free_list_return(&mca_coll_ucg_subargs_pool.flist, (opal_free_list_item_t*)subargs); + return; +} + +/* Initialize request cache */ +int mca_coll_ucg_rcache_init(int size); + +/* Init ucg progress npolls */ +void mca_coll_ucg_npolls_init(int n); + +/* Cleanup request cache */ +void mca_coll_ucg_rcache_cleanup(void); + +/* Used in non-blocking and persistent requests, so that coll request will be + cached when it's completed. */ +void mca_coll_ucg_rcache_mark_cacheable(mca_coll_ucg_req_t *coll_req, mca_coll_ucg_args_t *key); + +/* add a new coll request to cache, the coll request should be allocated from rpool. 
*/ +int mca_coll_ucg_rcache_add(mca_coll_ucg_req_t *coll_req, mca_coll_ucg_args_t *key); + +/* find the matched coll request in cache and return it */ +mca_coll_ucg_req_t* mca_coll_ucg_rcache_get(mca_coll_ucg_args_t *key); + +/* put the coll request that returned by get() routine into cache */ +void mca_coll_ucg_rcache_put(mca_coll_ucg_req_t *coll_req); + +/* delete the coll request from request cache and return it to rpool */ +void mca_coll_ucg_rcache_del(mca_coll_ucg_req_t *coll_req); + +/* Delete requests of the specified comm. */ +void mca_coll_ucg_rcache_del_by_comm(ompi_communicator_t *comm); + + +/* Initialize the common part of the request */ +int mca_coll_ucg_request_common_init(mca_coll_ucg_req_t *coll_req, + bool nb, + bool persistent); + +/* cleanup coll request */ +void mca_coll_ucg_request_cleanup(mca_coll_ucg_req_t *coll_req); + +/* execute request in blocking mode */ +int mca_coll_ucg_request_execute(mca_coll_ucg_req_t *coll_req); + +/* execute request in non-blocking mode */ +int mca_coll_ucg_request_execute_nb(mca_coll_ucg_req_t *coll_req); + +/* Try to find the request in the cache and execute */ +int mca_coll_ucg_request_execute_cache(mca_coll_ucg_args_t *key); + +/* Try to find the request in the cache and execute */ +int mca_coll_ucg_request_execute_cache_nb(mca_coll_ucg_args_t *key, + mca_coll_ucg_req_t **coll_req); + +#endif //MCA_COLL_UCG_REQUEST_H \ No newline at end of file diff --git a/ompi/mca/coll/ucg/coll_ucg_scatterv.c b/ompi/mca/coll/ucg/coll_ucg_scatterv.c new file mode 100644 index 0000000000000000000000000000000000000000..d98fc07d640cbcfe0b12f13523bf7c16dd13fb95 --- /dev/null +++ b/ompi/mca/coll/ucg/coll_ucg_scatterv.c @@ -0,0 +1,266 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2022-2024 Huawei Technologies Co., Ltd. + * All rights reserved. 
+ * COPYRIGHT$ + * + * Additional copyrights may follow + * + * HEADER$ + */ +#include "coll_ucg.h" +#include "coll_ucg_request.h" +#include "coll_ucg_debug.h" +#include "coll_ucg_dt.h" + + +static int mca_coll_ucg_request_scatterv_init(mca_coll_ucg_req_t *coll_req, + const void *sbuf, const int *scounts, + const int *disps, ompi_datatype_t *sdtype, + void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + mca_coll_ucg_module_t *module, + ucg_request_type_t nb) +{ + ucg_dt_h ucg_send_dt; + int rc = mca_coll_ucg_type_adapt(sdtype, &ucg_send_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + ucg_dt_h ucg_recv_dt; + rc = mca_coll_ucg_type_adapt(rdtype, &ucg_recv_dt, NULL, NULL); + if (rc != OMPI_SUCCESS) { + UCG_DEBUG("Failed to adapt type"); + return rc; + } + + ucg_request_h ucg_req; + ucg_status_t status = ucg_request_scatterv_init(sbuf, scounts, disps, ucg_send_dt, + rbuf, rcount, ucg_recv_dt, root, + module->group, &coll_req->info, + nb, &ucg_req); + if (status != UCG_OK) { + UCG_DEBUG("Failed to initialize ucg request, %s", ucg_status_string(status)); + return OMPI_ERROR; + } + coll_req->ucg_req = ucg_req; + return OMPI_SUCCESS; +} + +int mca_coll_ucg_scatterv(const void *sbuf, const int *scounts, const int *disps, + ompi_datatype_t *sdtype, void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg scatterv"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + mca_coll_ucg_req_t coll_req; + OBJ_CONSTRUCT(&coll_req, mca_coll_ucg_req_t); + int rc; + rc = mca_coll_ucg_request_common_init(&coll_req, false, false); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = mca_coll_ucg_request_scatterv_init(&coll_req, sbuf, scounts, disps, + sdtype, rbuf, rcount, rdtype, root, + ucg_module, UCG_REQUEST_BLOCKING); + if (rc != OMPI_SUCCESS) { + goto fallback; + } + + rc = 
mca_coll_ucg_request_execute(&coll_req);
+    mca_coll_ucg_request_cleanup(&coll_req);
+    if (rc != OMPI_SUCCESS) {
+        goto fallback;
+    }
+
+    OBJ_DESTRUCT(&coll_req);
+    return OMPI_SUCCESS;
+
+fallback:
+    OBJ_DESTRUCT(&coll_req);
+    UCG_DEBUG("fallback scatterv");
+    return ucg_module->previous_scatterv(sbuf, scounts, disps, sdtype, rbuf, rcount,
+                                         rdtype, root, comm,
+                                         ucg_module->previous_scatterv_module);
+}
+
+int mca_coll_ucg_scatterv_cache(const void *sbuf, const int *scounts, const int *disps,
+                                ompi_datatype_t *sdtype, void *rbuf, int rcount,
+                                ompi_datatype_t *rdtype, int root,
+                                ompi_communicator_t *comm,
+                                mca_coll_base_module_t *module)
+{
+    UCG_DEBUG("ucg scatterv cache");
+
+    mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module;
+    mca_coll_ucg_args_t args = {
+        .coll_type = MCA_COLL_UCG_TYPE_SCATTERV,
+        .comm = comm,
+        .scatterv.sbuf = sbuf,
+        .scatterv.scounts = scounts,
+        .scatterv.disps = disps,
+        .scatterv.sdtype = sdtype,
+        .scatterv.rbuf = rbuf,
+        .scatterv.rcount = rcount,
+        .scatterv.rdtype = rdtype,
+        .scatterv.root = root,
+    };
+
+    int rc;
+    rc = mca_coll_ucg_request_execute_cache(&args);
+    if (rc == OMPI_SUCCESS) {
+        return rc;
+    }
+
+    if (rc != OMPI_ERR_NOT_FOUND) {
+        /* The failure may be caused by a UCG internal error. Retry may also fail
+           and should do fallback immediately. 
*/ + goto fallback; + } + + MCA_COLL_UCG_REQUEST_PATTERN(&args, mca_coll_ucg_request_scatterv_init, + sbuf, scounts, disps, sdtype, rbuf, rcount, + rdtype, root, ucg_module, UCG_REQUEST_BLOCKING); + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback scatterv"); + return ucg_module->previous_scatterv(sbuf, scounts, disps, sdtype, rbuf, + rcount, rdtype, root, comm, + ucg_module->previous_scatterv_module); +} + +int mca_coll_ucg_iscatterv(const void *sbuf, const int *scounts, const int *disps, + ompi_datatype_t *sdtype, void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t *module) +{ + UCG_DEBUG("ucg iscatterv"); + + mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module; + + int rc; + mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get(); + rc = mca_coll_ucg_request_common_init(coll_req, true, false); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_scatterv_init(coll_req, sbuf, scounts, disps, + sdtype, rbuf, rcount, rdtype, root, + ucg_module, UCG_REQUEST_NONBLOCKING); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + rc = mca_coll_ucg_request_execute_nb(coll_req); + if (rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + *request = &coll_req->super.super; + + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback iscatterv"); + return ucg_module->previous_iscatterv(sbuf, scounts, disps, sdtype, rbuf, + rcount, rdtype, root, comm, request, + ucg_module->previous_iscatterv_module); +} + +int mca_coll_ucg_iscatterv_cache(const void *sbuf, const int *scounts, const int *disps, + ompi_datatype_t *sdtype, void *rbuf, int rcount, + ompi_datatype_t *rdtype, int root, + ompi_communicator_t *comm, ompi_request_t **request, + mca_coll_base_module_t 
*module)
+{
+    UCG_DEBUG("ucg iscatterv cache");
+
+    mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module;
+    mca_coll_ucg_args_t args = {
+        .coll_type = MCA_COLL_UCG_TYPE_ISCATTERV,
+        .comm = comm,
+        .scatterv.sbuf = sbuf,
+        .scatterv.scounts = scounts,
+        .scatterv.disps = disps,
+        .scatterv.sdtype = sdtype,
+        .scatterv.rbuf = rbuf,
+        .scatterv.rcount = rcount,
+        .scatterv.rdtype = rdtype,
+        .scatterv.root = root,
+    };
+
+    int rc;
+    mca_coll_ucg_req_t *coll_req = NULL;
+    rc = mca_coll_ucg_request_execute_cache_nb(&args, &coll_req);
+    if (rc == OMPI_SUCCESS) {
+        *request = &coll_req->super.super;
+        return rc;
+    }
+
+    if (rc != OMPI_ERR_NOT_FOUND) {
+        /* The failure may be caused by a UCG internal error. Retry may also fail
+           and should do fallback immediately. */
+        goto fallback;
+    }
+
+    MCA_COLL_UCG_REQUEST_PATTERN_NB(request, &args, mca_coll_ucg_request_scatterv_init,
+                                    sbuf, scounts, disps, sdtype, rbuf, rcount,
+                                    rdtype, root, ucg_module, UCG_REQUEST_NONBLOCKING);
+    return OMPI_SUCCESS;
+
+fallback:
+    UCG_DEBUG("fallback iscatterv");
+    return ucg_module->previous_iscatterv(sbuf, scounts, disps, sdtype, rbuf,
+                                          rcount, rdtype, root, comm, request,
+                                          ucg_module->previous_iscatterv_module);
+}
+
+int mca_coll_ucg_scatterv_init(const void *sbuf, const int *scounts, const int *disps,
+                               ompi_datatype_t *sdtype, void *rbuf, int rcount,
+                               ompi_datatype_t *rdtype, int root,
+                               ompi_communicator_t *comm, ompi_info_t *info,
+                               ompi_request_t **request, mca_coll_base_module_t *module)
+{
+    UCG_DEBUG("ucg scatterv init");
+
+    mca_coll_ucg_module_t *ucg_module = (mca_coll_ucg_module_t*)module;
+
+    int rc;
+    mca_coll_ucg_req_t *coll_req = mca_coll_ucg_rpool_get();
+    rc = mca_coll_ucg_request_common_init(coll_req, false, true);
+    if (rc != OMPI_SUCCESS) {
+        mca_coll_ucg_rpool_put(coll_req);
+        goto fallback;
+    }
+
+    rc = mca_coll_ucg_request_scatterv_init(coll_req, sbuf, scounts, disps,
+                                            sdtype, rbuf, rcount, rdtype, root,
+                                            ucg_module, UCG_REQUEST_BLOCKING);
+    if 
(rc != OMPI_SUCCESS) { + mca_coll_ucg_request_cleanup(coll_req); + mca_coll_ucg_rpool_put(coll_req); + goto fallback; + } + + *request = &coll_req->super.super; + return OMPI_SUCCESS; + +fallback: + UCG_DEBUG("fallback scatterv init"); + return ucg_module->previous_scatterv_init(sbuf, scounts, disps, sdtype, rbuf, + rcount, rdtype, root, comm, info, + request, ucg_module->previous_scatterv_module); +} \ No newline at end of file diff --git a/ompi/mca/coll/ucg/configure.m4 b/ompi/mca/coll/ucg/configure.m4 new file mode 100644 index 0000000000000000000000000000000000000000..31b33095a371637871e4e3441e9c7ad7b8fe4648 --- /dev/null +++ b/ompi/mca/coll/ucg/configure.m4 @@ -0,0 +1,32 @@ +# +# Copyright (c) 2022-2022 Huawei Technologies Co., Ltd. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +# MCA_coll_ucg_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_ompi_coll_ucg_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/coll/ucg/Makefile]) + + OMPI_CHECK_UCG([coll_ucg], + [coll_ucg_happy="yes"], + [coll_ucg_happy="no"]) + + AS_IF([test "$coll_ucg_happy" = "yes"], + [$1], + [$2]) + + # substitute in the things needed to build ucg + AC_SUBST([coll_ucg_CFLAGS]) + AC_SUBST([coll_ucg_CPPFLAGS]) + AC_SUBST([coll_ucg_LDFLAGS]) + AC_SUBST([coll_ucg_LIBS]) +])dnl + diff --git a/ompi/mca/coll/ucg/owner.txt b/ompi/mca/coll/ucg/owner.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c08c938c34a5a9e7ebf057549c0d4ad4b2493ad --- /dev/null +++ b/ompi/mca/coll/ucg/owner.txt @@ -0,0 +1,7 @@ +# +# owner/status file +# owner: institution that is responsible for this package +# status: e.g. 
active, maintenance, unmaintained +# +owner: HUAWEI +status: active \ No newline at end of file diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index dd16a27b1541348ef36cb2577a32a05b4c19118f..ddca065be88df1ad158bf253e63a9b4c245ad668 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -9,6 +9,7 @@ * Copyright (c) 2019 Intel, Inc. All rights reserved. * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2022-2025 Huawei Technologies Co., Ltd. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -233,10 +234,17 @@ int mca_pml_ucx_open(void) UCP_PARAM_FIELD_REQUEST_CLEANUP | UCP_PARAM_FIELD_TAG_SENDER_MASK | UCP_PARAM_FIELD_MT_WORKERS_SHARED | - UCP_PARAM_FIELD_ESTIMATED_NUM_EPS; + UCP_PARAM_FIELD_ESTIMATED_NUM_EPS | + UCP_PARAM_FIELD_TIMEOUT_WARN; params.features = UCP_FEATURE_TAG; +#ifdef HAVE_UCG_API_UCG_H + /* Adapt to UCG internal ucp requests. */ + params.request_size = sizeof(ompi_request_t) + 4; +#else params.request_size = sizeof(ompi_request_t); +#endif params.request_init = mca_pml_ucx_request_init; + params.timeout_warn = mca_pml_ucx_request_timeout_warn; params.request_cleanup = mca_pml_ucx_request_cleanup; params.tag_sender_mask = PML_UCX_SPECIFIC_SOURCE_MASK; params.mt_workers_shared = 0; /* we do not need mt support for context @@ -428,8 +436,10 @@ static ucp_ep_h mca_pml_ucx_add_proc_common(ompi_proc_t *proc) status = ucp_ep_create(ompi_pml_ucx.ucp_worker, &ep_params, &ep); free(address); if (UCS_OK != status) { - PML_UCX_ERROR("ucp_ep_create(proc=%d) failed: %s", + char *errhost = opal_get_proc_hostname(&proc->super); + PML_UCX_ERROR("ucp_ep_create(peer proc=%d peer hostname=%s) failed: %s", proc->super.proc_name.vpid, + errhost, ucs_status_string(status)); return NULL; } diff --git a/ompi/mca/pml/ucx/pml_ucx.h b/ompi/mca/pml/ucx/pml_ucx.h index 88837f997bcf602f0b60e0c78f63a0d8563570e9..8f44efce7fd6842ea3da93bc175fd1a3aee01541 100644 --- 
a/ompi/mca/pml/ucx/pml_ucx.h +++ b/ompi/mca/pml/ucx/pml_ucx.h @@ -1,6 +1,7 @@ /* * Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED. * $COPYRIGHT$ + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. * * Additional copyrights may follow * diff --git a/ompi/mca/pml/ucx/pml_ucx_request.c b/ompi/mca/pml/ucx/pml_ucx_request.c index fccb9f6a6f66f086ff890bf3e94cddeae596a0de..0fade8536d90ce52b10e157a4a465e9045ac9689 100644 --- a/ompi/mca/pml/ucx/pml_ucx_request.c +++ b/ompi/mca/pml/ucx/pml_ucx_request.c @@ -4,6 +4,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -213,6 +214,21 @@ void mca_pml_ucx_request_init(void *request) mca_pml_ucx_request_cancel); } +void mca_pml_ucx_request_timeout_warn(uint64_t tag) +{ + int rc; + char *peer_hostname = NULL; + int32_t rank = PML_UCX_TAG_GET_SOURCE(tag); + opal_process_name_t proc_name = {.vpid = rank, .jobid = OMPI_PROC_MY_NAME->jobid}; + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_HOSTNAME, &proc_name, + (char**)&(peer_hostname), PMIX_STRING); + if (rc != OPAL_SUCCESS) { + peer_hostname = "unknown"; + } + PML_UCX_WARN("UCP request timeout! request tag 0x%lX local proc: %u peer proc: %d peer hostname: %s\n", + tag, OMPI_PROC_MY_NAME->vpid, rank, peer_hostname); +} + void mca_pml_ucx_request_cleanup(void *request) { ompi_request_t* ompi_req = request; diff --git a/ompi/mca/pml/ucx/pml_ucx_request.h b/ompi/mca/pml/ucx/pml_ucx_request.h index 8132f6b54ba4f511a12b63bbd780b29768877f2f..178dd3f42b34067e9a8481addd0489578ad9f008 100644 --- a/ompi/mca/pml/ucx/pml_ucx_request.h +++ b/ompi/mca/pml/ucx/pml_ucx_request.h @@ -4,6 +4,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. 
All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -147,6 +148,8 @@ void mca_pml_ucx_completed_request_init(ompi_request_t *ompi_req); void mca_pml_ucx_request_init(void *request); +void mca_pml_ucx_request_timeout_warn(uint64_t tag); + void mca_pml_ucx_request_cleanup(void *request); int mca_pml_ucx_request_cancel(ompi_request_t *req, int flag);