diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c
index 133112d06653..2281eeec185d 100644
--- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c
+++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c
@@ -112,38 +112,69 @@ uct_ib_mlx5_devx_reg_ksm(uct_ib_mlx5_md_t *md, int atomic, uint64_t address,
     return UCS_OK;
 }
 
+static void uct_ib_mlx5_devx_ksm_log(uct_ib_mlx5_md_t *md, int atomic,
+                                     void *address, size_t length,
+                                     uint64_t iova, ucs_status_t status, int mt)
+{
+    ucs_debug("KSM %s-thread memory registration status \"%s\" "
+              "range %p..%p iova 0x%" PRIx64 "%s on %s",
+              mt ? "multi" : "single", ucs_status_string(status), address,
+              UCS_PTR_BYTE_OFFSET(address, length), iova,
+              atomic ? " atomic" : "", uct_ib_device_name(&md->super.dev));
+}
+
+/*
+ * Register KSM data for the given memory handle. Works only with MRs that
+ * were registered in multi-threaded mode.
+ */
 static ucs_status_t
-uct_ib_mlx5_devx_reg_ksm_data(uct_ib_mlx5_md_t *md, int atomic, void *address,
-                              uct_ib_mlx5_devx_ksm_data_t *ksm_data,
-                              size_t length, uint64_t iova, uint32_t mkey_index,
-                              const char *reason, struct mlx5dv_devx_obj **mr_p,
-                              uint32_t *mkey)
+uct_ib_mlx5_devx_reg_ksm_data_mt(uct_ib_mlx5_md_t *md, int atomic,
+                                 void *address,
+                                 uct_ib_mlx5_devx_ksm_data_t *ksm_data,
+                                 uint64_t iova, uint32_t mkey_index,
+                                 const char *reason,
+                                 struct mlx5dv_devx_obj **mr_p, uint32_t *mkey)
 {
-    void *mr_address = address;
+    struct ibv_mr *last_mr = ksm_data->mrs[ksm_data->mr_num - 1];
+    uint64_t iova_offset   = iova - (uint64_t)address;
+    void *mr_address       = address;
+    size_t list_size       = ksm_data->mr_num;
     ucs_status_t status;
     char *in;
     void *klm;
     int i;
 
-    status = uct_ib_mlx5_alloc_mkey_inbox(ksm_data->mr_num, &in);
+    /* Add offset to work around the CREATE_MKEY range check issue */
+    if (iova_offset > 0) {
+        ++list_size;
+    }
+
+    status = uct_ib_mlx5_alloc_mkey_inbox(list_size, &in);
     if (status != UCS_OK) {
-        return UCS_ERR_NO_MEMORY;
+        goto out;
     }
 
     klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
     for (i = 0; i < ksm_data->mr_num; i++) {
         UCT_IB_MLX5DV_SET64(klm, klm, address, (uintptr_t)mr_address);
-        UCT_IB_MLX5DV_SET(klm, klm, byte_count, ksm_data->mrs[i]->length);
         UCT_IB_MLX5DV_SET(klm, klm, mkey, ksm_data->mrs[i]->lkey);
         klm        = UCS_PTR_BYTE_OFFSET(klm, UCT_IB_MLX5DV_ST_SZ_BYTES(klm));
         mr_address = UCS_PTR_BYTE_OFFSET(mr_address, ksm_data->mrs[i]->length);
     }
 
-    status = uct_ib_mlx5_devx_reg_ksm(md, atomic, iova, length,
-                                      ksm_data->mr_num,
-                                      ksm_data->mrs[0]->length, in, mkey_index,
-                                      reason, mr_p, mkey);
+    if (iova_offset > 0) {
+        UCT_IB_MLX5DV_SET64(klm, klm, address, (uintptr_t)mr_address);
+        UCT_IB_MLX5DV_SET(klm, klm, mkey, last_mr->lkey);
+    }
+
+    status = uct_ib_mlx5_devx_reg_ksm(md, atomic, iova, ksm_data->length,
+                                      list_size, ksm_data->mrs[0]->length, in,
+                                      mkey_index, reason, mr_p, mkey);
     ucs_free(in);
+
+    uct_ib_mlx5_devx_ksm_log(md, atomic, address, ksm_data->length, iova,
+                             status, 1);
+out:
     return status;
 }
 
@@ -180,12 +211,17 @@ uct_ib_mlx5_devx_reg_ksm_data_addr(uct_ib_mlx5_md_t *md, struct ibv_mr *mr,
     return status;
 }
 
+/*
+ * Register KSM data for the given memory handle. Works only with MRs that
+ * were registered in single-threaded mode.
+ */
 static ucs_status_t uct_ib_mlx5_devx_reg_ksm_data_contig(
         uct_ib_mlx5_md_t *md, uct_ib_mlx5_devx_mr_t *mr, void *address,
         uint64_t iova, int atomic, uint32_t mkey_index, const char *reason,
         struct mlx5dv_devx_obj **mr_p, uint32_t *mkey)
 {
     size_t mr_length = mr->super.ib->length;
+    ucs_status_t status;
     uint64_t ksm_address;
     uint64_t ksm_iova;
     size_t ksm_length;
@@ -202,10 +238,13 @@ static ucs_status_t uct_ib_mlx5_devx_reg_ksm_data_contig(
     list_size = ucs_div_round_up(ksm_length + ucs_get_page_size(),
                                  UCT_IB_MD_MAX_MR_SIZE);
 
-    return uct_ib_mlx5_devx_reg_ksm_data_addr(md, mr->super.ib, ksm_address,
-                                              ksm_length, ksm_iova, atomic,
-                                              list_size, mkey_index, reason,
-                                              mr_p, mkey);
+    status = uct_ib_mlx5_devx_reg_ksm_data_addr(md, mr->super.ib, ksm_address,
+                                                ksm_length, ksm_iova, atomic,
+                                                list_size, mkey_index, reason,
+                                                mr_p, mkey);
+
+    uct_ib_mlx5_devx_ksm_log(md, atomic, address, mr_length, ksm_iova, status, 0);
+    return status;
 }
 
 static void *
@@ -362,6 +401,37 @@ static void uct_ib_mlx5_devx_mr_lru_cleanup(uct_ib_mlx5_md_t *md)
     kh_destroy_inplace(rkeys, &md->lru_rkeys.hash);
 }
 
+/*
+ * Register KSM data for the given memory handle. Choose how the KSM creation
+ * structures are filled by checking the UCT_IB_MEM_MULTITHREADED flag.
+ */
+static ucs_status_t
+uct_ib_mlx5_devx_reg_ksm_data(uct_ib_mlx5_md_t *md,
+                              uct_ib_mlx5_devx_mem_t *memh,
+                              uct_ib_mr_type_t mr_type, uint32_t iova_offset,
+                              int atomic, uint32_t mkey_index,
+                              const char *reason, struct mlx5dv_devx_obj **mr_p,
+                              uint32_t *mkey)
+{
+    uct_ib_mlx5_devx_mr_t *mr = &memh->mrs[mr_type];
+    void *address             = uct_ib_mlx5_devx_memh_base_address(memh);
+    uint64_t iova             = (uint64_t)address + iova_offset;
+    ucs_status_t status;
+
+    if (memh->super.flags & UCT_IB_MEM_MULTITHREADED) {
+        status = uct_ib_mlx5_devx_reg_ksm_data_mt(md, atomic, address,
+                                                  mr->ksm_data, iova,
+                                                  mkey_index, reason, mr_p,
+                                                  mkey);
+    } else {
+        status = uct_ib_mlx5_devx_reg_ksm_data_contig(md, mr, address, iova,
+                                                      atomic, mkey_index,
+                                                      reason, mr_p, mkey);
+    }
+
+    return status;
+}
+
 UCS_PROFILE_FUNC_ALWAYS(ucs_status_t, uct_ib_mlx5_devx_reg_indirect_key,
                         (md, memh), uct_ib_mlx5_md_t *md,
                         uct_ib_mlx5_devx_mem_t *memh)
@@ -372,11 +442,10 @@ UCS_PROFILE_FUNC_ALWAYS(ucs_status_t, uct_ib_mlx5_devx_reg_indirect_key,
                         md->super.name);
 
     do {
-        status = uct_ib_mlx5_devx_reg_ksm_data_contig(
-                md, &memh->mrs[UCT_IB_MR_DEFAULT],
-                uct_ib_mlx5_devx_memh_base_address(memh),
-                (uint64_t)memh->address, 0, 0, "indirect key",
-                &memh->indirect_dvmr, &memh->indirect_rkey);
+        status = uct_ib_mlx5_devx_reg_ksm_data(md, memh, UCT_IB_MR_DEFAULT, 0,
+                                               0, 0, "indirect key",
+                                               &memh->indirect_dvmr,
+                                               &memh->indirect_rkey);
         if (status != UCS_OK) {
             break;
         }
@@ -420,12 +489,9 @@ UCS_PROFILE_FUNC_ALWAYS(ucs_status_t, uct_ib_mlx5_devx_reg_atomic_key,
                         uct_ib_mlx5_devx_mem_t *memh)
 {
     uct_ib_mr_type_t mr_type = uct_ib_devx_get_atomic_mr_type(&md->super, memh);
-    uct_ib_mlx5_devx_mr_t *mr = &memh->mrs[mr_type];
-    uint8_t mr_id             = uct_ib_md_get_atomic_mr_id(&md->super);
-    uint32_t atomic_offset    = uct_ib_md_atomic_offset(mr_id);
+    uint8_t mr_id            = uct_ib_md_get_atomic_mr_id(&md->super);
+    uint32_t atomic_offset   = uct_ib_md_atomic_offset(mr_id);
     uint32_t mkey_index;
-    uint64_t iova;
-    ucs_status_t status;
     int is_atomic;
 
     if (memh->smkey_mr != NULL) {
@@ -436,31 +502,11 @@ UCS_PROFILE_FUNC_ALWAYS(ucs_status_t, uct_ib_mlx5_devx_reg_atomic_key,
     }
 
     is_atomic = memh->super.flags & UCT_IB_MEM_ACCESS_REMOTE_ATOMIC;
-    iova      = (uint64_t)memh->address + atomic_offset;
-
-    if (memh->super.flags & UCT_IB_MEM_MULTITHREADED) {
-        return uct_ib_mlx5_devx_reg_ksm_data(md, is_atomic, memh->address,
-                                             mr->ksm_data, mr->ksm_data->length,
-                                             iova, mkey_index,
-                                             "multi-thread atomic key",
-                                             &memh->atomic_dvmr,
-                                             &memh->atomic_rkey);
-    }
-
-    status = uct_ib_mlx5_devx_reg_ksm_data_contig(
-            md, mr, uct_ib_mlx5_devx_memh_base_address(memh), iova, is_atomic,
-            mkey_index, "atomic key", &memh->atomic_dvmr, &memh->atomic_rkey);
-    if (status != UCS_OK) {
-        return status;
-    }
-
-    ucs_debug("KSM registered memory %p..%p lkey 0x%x offset 0x%x%s on %s rkey "
-              "0x%x",
-              memh->address,
-              UCS_PTR_BYTE_OFFSET(memh->address, mr->super.ib->length),
-              mr->super.ib->lkey, atomic_offset, is_atomic ? " atomic" : "",
-              uct_ib_device_name(&md->super.dev), memh->atomic_rkey);
-    return UCS_OK;
+    return uct_ib_mlx5_devx_reg_ksm_data(md, memh, mr_type, atomic_offset,
+                                         is_atomic, mkey_index, "atomic key",
+                                         &memh->atomic_dvmr,
+                                         &memh->atomic_rkey);
 }
 
 static ucs_status_t
@@ -506,10 +552,10 @@ uct_ib_mlx5_devx_reg_mt(uct_ib_mlx5_md_t *md, void *address, size_t length,
         goto err_free;
     }
 
-    status = uct_ib_mlx5_devx_reg_ksm_data(md, is_atomic, address, ksm_data,
-                                           length, (uint64_t)address, 0,
-                                           "multi-thread key", &ksm_data->dvmr,
-                                           mkey_p);
+    status = uct_ib_mlx5_devx_reg_ksm_data_mt(md, is_atomic, address, ksm_data,
+                                              (uint64_t)address, 0,
+                                              "multi-thread key",
+                                              &ksm_data->dvmr, mkey_p);
     if (status != UCS_OK) {
         goto err_dereg;
     }
@@ -2022,6 +2068,7 @@ UCS_PROFILE_FUNC_ALWAYS(ucs_status_t, uct_ib_mlx5_devx_reg_exported_key,
         goto out_umem_mr;
     }
 
+    ucs_assert(!(memh->super.flags & UCT_IB_MEM_MULTITHREADED));
     status = uct_ib_mlx5_devx_reg_ksm_data_contig(md,
                                                   &memh->mrs[UCT_IB_MR_DEFAULT],
                                                   memh->address,
diff --git a/test/gtest/uct/ib/test_ib_md.cc b/test/gtest/uct/ib/test_ib_md.cc
index 0839727a46d4..464c4762b998 100644
--- a/test/gtest/uct/ib/test_ib_md.cc
+++ b/test/gtest/uct/ib/test_ib_md.cc
@@ -27,6 +27,8 @@ class test_ib_md : public test_md
                      uct_rkey_t *rkey_p = NULL);
     void check_smkeys(uct_rkey_t rkey1, uct_rkey_t rkey2);
 
+    void test_mkey_pack_mt(bool invalidate);
+    void test_mkey_pack_mt_internal(unsigned access_mask, bool invalidate);
     void test_smkey_reg_atomic(void);
 
 private:
@@ -224,6 +226,79 @@ void test_ib_md::test_smkey_reg_atomic(void)
     ucs_mmap_free(buffer, size);
 }
 
+void test_ib_md::test_mkey_pack_mt_internal(unsigned access_mask,
+                                            bool invalidate)
+{
+    constexpr size_t size = UCS_MBYTE;
+    unsigned pack_flags, dereg_flags;
+    void *buffer;
+    int ret;
+    uct_mem_h memh;
+
+    if (!check_invalidate_support(access_mask)) {
+        UCS_TEST_SKIP_R("mkey invalidation isn't supported");
+    }
+
+    if (!has_ksm()) {
+        UCS_TEST_SKIP_R("KSM is required for MT registration");
+    }
+
+    ret = ucs_posix_memalign(&buffer, size, size, "mkey_pack_mt");
+    ASSERT_EQ(0, ret) << "Allocation failed";
+
+    if (invalidate) {
+        pack_flags  = UCT_MD_MKEY_PACK_FLAG_INVALIDATE_RMA;
+        dereg_flags = UCT_MD_MEM_DEREG_FLAG_INVALIDATE;
+    } else {
+        pack_flags = dereg_flags = 0;
+    }
+
+    ASSERT_UCS_OK(reg_mem(access_mask, buffer, size, &memh));
+
+    uct_ib_mem_t *ib_memh = (uct_ib_mem_t*)memh;
+    EXPECT_TRUE(ib_memh->flags & UCT_IB_MEM_MULTITHREADED);
+
+    std::vector<uint8_t> rkey(md_attr().rkey_packed_size);
+    uct_md_mkey_pack_params_t pack_params;
+    pack_params.field_mask = UCT_MD_MKEY_PACK_FIELD_FLAGS;
+    pack_params.flags      = pack_flags;
+    ASSERT_UCS_OK(uct_md_mkey_pack_v2(md(), memh, buffer, size,
+                                      &pack_params, rkey.data()));
+
+    uct_md_mem_dereg_params_t params;
+    params.field_mask = UCT_MD_MEM_DEREG_FIELD_MEMH |
+                        UCT_MD_MEM_DEREG_FIELD_COMPLETION |
+                        UCT_MD_MEM_DEREG_FIELD_FLAGS;
+    params.memh        = memh;
+    params.flags       = dereg_flags;
+    comp().comp.func   = dereg_cb;
+    comp().comp.count  = 1;
+    comp().comp.status = UCS_OK;
+    comp().self        = this;
+    params.comp        = &comp().comp;
+    ASSERT_UCS_OK(uct_md_mem_dereg_v2(md(), &params));
+
+    ucs_free(buffer);
+}
+
+void test_ib_md::test_mkey_pack_mt(bool invalidate)
+{
+    test_mkey_pack_mt_internal(UCT_MD_MEM_ACCESS_REMOTE_ATOMIC, invalidate);
+    test_mkey_pack_mt_internal(UCT_MD_MEM_ACCESS_RMA, invalidate);
+    test_mkey_pack_mt_internal(UCT_MD_MEM_ACCESS_ALL, invalidate);
+}
+
+UCS_TEST_P(test_ib_md, pack_mkey_mt, "REG_MT_THRESH=128K", "REG_MT_CHUNK=128K")
+{
+    test_mkey_pack_mt(false);
+}
+
+UCS_TEST_P(test_ib_md, pack_mkey_mt_invalidate, "REG_MT_THRESH=128K",
+           "REG_MT_CHUNK=128K")
+{
+    test_mkey_pack_mt(true);
+}
+
 UCS_TEST_P(test_ib_md, smkey_reg_atomic)
 {
     test_smkey_reg_atomic();
diff --git a/test/gtest/uct/ib/test_ib_xfer.cc b/test/gtest/uct/ib/test_ib_xfer.cc
index 952105a152ca..2e753f5d47a8 100644
--- a/test/gtest/uct/ib/test_ib_xfer.cc
+++ b/test/gtest/uct/ib/test_ib_xfer.cc
@@ -7,6 +7,10 @@
 #include 
 #include 
+#include 
+#ifdef HAVE_MLX5_DV
+#include 
+#endif
 
 class uct_p2p_rma_test_xfer : public uct_p2p_rma_test {};
 
@@ -129,13 +133,92 @@ UCS_TEST_P(uct_p2p_mix_test_alloc_methods, mix1000)
     run(1000);
 }
 
-UCS_TEST_P(uct_p2p_mix_test_alloc_methods, mix1000_multithreaded,
-           "REG_MT_THRESH=1", "REG_MT_CHUNK=1K", "REG_MT_BIND=y")
+UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_mix_test_alloc_methods)
+
+
+class uct_p2p_mix_test_mt : public uct_p2p_mix_test {
+protected:
+    bool is_page_size_aligned(const mapped_buffer &buffer)
+    {
+        return ucs_padding((size_t)buffer.reg_addr(), ucs_get_page_size()) == 0;
+    }
+
+    mapped_buffer alloc_buffer(const entity &entity, size_t offset) override
+    {
+        mapped_buffer buf = uct_p2p_mix_test::alloc_buffer(entity, offset);
+        if (!is_page_size_aligned(buf)) {
+            UCS_TEST_SKIP_R("Skip MT registration for unaligned buffers");
+        }
+
+        auto *ib_memh = static_cast<uct_ib_mem_t*>(buf.memh());
+        EXPECT_TRUE(ib_memh->flags & UCT_IB_MEM_MULTITHREADED);
+        return buf;
+    }
+
+    bool check_md_flags()
+    {
+#if HAVE_DEVX
+        auto *ib_md = ucs_derived_of(sender().md(), uct_ib_md_t);
+        if (strcmp(ib_md->name, UCT_IB_MD_NAME(mlx5))) {
+            return false;
+        }
+
+        auto *ib_mlx5_md = ucs_derived_of(sender().md(), uct_ib_mlx5_md_t);
+        return (ib_mlx5_md->flags & UCT_IB_MLX5_MD_FLAG_KSM) &&
+               (ib_mlx5_md->flags & UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS);
+#else
+        return false;
+#endif
+    }
+
+    virtual void init() override
+    {
+        push_config();
+        modify_config("REG_MT_THRESH", ucs::to_string(reg_mt_chunk + 1));
+        modify_config("REG_MT_CHUNK", ucs::to_string(reg_mt_chunk));
+
+        uct_p2p_mix_test::init();
+
+        if (!check_md_flags()) {
+            UCS_TEST_SKIP_R("KSM and indirect atomics are required for MT "
+                            "registration");
+        }
+
+        /* Too many chunks cause MT registration failure since the DEVX
+         * input structure becomes too big */
+        m_buffer_size = ucs_min(m_buffer_size, 256 * reg_mt_chunk);
+        /* We need at least two chunks */
+        m_buffer_size = ucs_max(m_buffer_size, reg_mt_chunk + 1);
+    }
+
+    virtual void cleanup() override
+    {
+        uct_p2p_mix_test::cleanup();
+        pop_config();
+    }
+
+    constexpr static size_t reg_mt_chunk = 16 * UCS_KBYTE;
+};
+
+constexpr size_t uct_p2p_mix_test_mt::reg_mt_chunk;
+
+UCS_TEST_P(uct_p2p_mix_test_mt, mix1000_alloc_methods, "REG_MT_BIND=y")
+{
+    run(1000);
+}
+
+UCS_TEST_P(uct_p2p_mix_test_mt, mix1000)
 {
     run(1000);
 }
 
-UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_mix_test_alloc_methods)
+UCS_TEST_P(uct_p2p_mix_test_mt, mix1000_last_byte_offset)
+{
+    /* Alloc a 2-chunk buffer, but run the operations on the last 8 bytes */
+    run(1000, (reg_mt_chunk * 2) - 8, 8);
+}
+
+UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_mix_test_mt)
 
 class uct_p2p_mix_test_indirect_atomic : public uct_p2p_mix_test {};
 
diff --git a/test/gtest/uct/test_md.cc b/test/gtest/uct/test_md.cc
index e4c007289235..cdb76d7fdf8b 100644
--- a/test/gtest/uct/test_md.cc
+++ b/test/gtest/uct/test_md.cc
@@ -101,7 +101,7 @@ void test_md::test_reg_mem(unsigned access_mask,
     params.flags = UCT_MD_MEM_DEREG_FLAG_INVALIDATE;
     params.comp  = &comp().comp;
 
-    if (!is_supported_reg_mem_flags(access_mask)) {
+    if (!check_invalidate_support(access_mask)) {
         params.field_mask = UCT_MD_MEM_DEREG_FIELD_COMPLETION |
                             UCT_MD_MEM_DEREG_FIELD_FLAGS |
                             UCT_MD_MEM_DEREG_FIELD_MEMH;
@@ -173,7 +173,7 @@ test_md::test_md()
     /* coverity[uninit_member] */
 }
 
-bool test_md::is_supported_reg_mem_flags(unsigned reg_flags) const
+bool test_md::check_invalidate_support(unsigned reg_flags) const
 {
     return (reg_flags & md_flags_remote_rma) ?
            check_caps(UCT_MD_FLAG_INVALIDATE_RMA) :
diff --git a/test/gtest/uct/test_md.h b/test/gtest/uct/test_md.h
index 35b722d3f350..19a554a2385c 100644
--- a/test/gtest/uct/test_md.h
+++ b/test/gtest/uct/test_md.h
@@ -30,7 +30,7 @@ class test_md : public testing::TestWithParam,
 
     test_md();
 
-    bool is_supported_reg_mem_flags(unsigned reg_flags) const;
+    bool check_invalidate_support(unsigned reg_flags) const;
 
     bool is_bf_arm() const;
 
diff --git a/test/gtest/uct/test_p2p_mix.cc b/test/gtest/uct/test_p2p_mix.cc
index 7dba63d162eb..0383f70d3d45 100644
--- a/test/gtest/uct/test_p2p_mix.cc
+++ b/test/gtest/uct/test_p2p_mix.cc
@@ -12,7 +12,13 @@ extern "C" {
 #include 
 }
 
-uct_p2p_mix_test::uct_p2p_mix_test() : uct_p2p_test(0), m_send_size(0) {
+uct_p2p_mix_test::uct_p2p_mix_test() :
+    uct_p2p_test(0),
+    m_buffer_size(0),
+    m_max_short(0),
+    m_max_bcopy(0),
+    m_max_zcopy(0)
+{
 }
 
 ucs_status_t uct_p2p_mix_test::am_callback(void *arg, void *data, size_t length,
@@ -54,20 +60,30 @@ ucs_status_t uct_p2p_mix_test::put_short(const mapped_buffer &sendbuf,
                                          const mapped_buffer &recvbuf,
                                          uct_completion_t *comp)
 {
-    return uct_ep_put_short(sender().ep(0), sendbuf.ptr(),
-                            sendbuf.length(), recvbuf.addr(),
-                            recvbuf.rkey());
+    return uct_ep_put_short(sender().ep(0), sendbuf.ptr(), m_max_short,
+                            recvbuf.addr(), recvbuf.rkey());
+}
+
+
+size_t uct_p2p_mix_test::pack_bcopy(void *dest, void *arg)
+{
+    auto pack_arg = static_cast<bcopy_pack_arg*>(arg);
+
+    mem_buffer::copy_from(dest, pack_arg->sendbuf->ptr(), pack_arg->max_bcopy,
+                          pack_arg->sendbuf->mem_type());
+    return pack_arg->max_bcopy;
 }
 
 ucs_status_t uct_p2p_mix_test::put_bcopy(const mapped_buffer &sendbuf,
                                          const mapped_buffer &recvbuf,
                                          uct_completion_t *comp)
 {
-    ssize_t packed_len;
-    packed_len = uct_ep_put_bcopy(sender().ep(0), mapped_buffer::pack,
-                                  (void*)&sendbuf, recvbuf.addr(), recvbuf.rkey());
+    bcopy_pack_arg pack_arg = {&sendbuf, m_max_bcopy};
+    ssize_t packed_len      = uct_ep_put_bcopy(sender().ep(0), pack_bcopy,
+                                               (void*)&pack_arg, recvbuf.addr(),
+                                               recvbuf.rkey());
     if (packed_len >= 0) {
-        EXPECT_EQ(sendbuf.length(), (size_t)packed_len);
+        EXPECT_EQ(m_max_bcopy, (size_t)packed_len);
         return UCS_OK;
     } else {
         return (ucs_status_t)packed_len;
@@ -81,7 +97,7 @@ ucs_status_t uct_p2p_mix_test::am_short(const mapped_buffer &sendbuf,
     ucs_status_t status;
     status = uct_ep_am_short(sender().ep(0), AM_ID, *(uint64_t*)sendbuf.ptr(),
                              (uint64_t*)sendbuf.ptr() + 1,
-                             sendbuf.length() - sizeof(uint64_t));
+                             m_max_short - sizeof(uint64_t));
     if (status == UCS_OK) {
         ucs_atomic_add32(&am_pending, +1);
     }
@@ -96,7 +112,7 @@ ucs_status_t uct_p2p_mix_test::am_short_iov(const mapped_buffer &sendbuf,
     uct_iov_t iov;
 
     iov.buffer = sendbuf.ptr();
-    iov.length = sendbuf.length();
+    iov.length = m_max_short - sizeof(uint64_t);
     iov.count  = 1;
     iov.stride = 0;
     iov.memh   = sendbuf.memh();
@@ -117,11 +133,11 @@ ucs_status_t uct_p2p_mix_test::am_zcopy(const mapped_buffer &sendbuf,
     uct_iov_t iov;
     header_length = ucs_min(ucs::rand() % sender().iface_attr().cap.am.max_hdr,
-                            sendbuf.length());
+                            m_max_zcopy);
 
     iov.buffer = (char*)sendbuf.ptr() + header_length;
     iov.count  = 1;
-    iov.length = sendbuf.length() - header_length;
+    iov.length = m_max_zcopy - header_length;
     iov.memh   = sendbuf.memh();
 
     status = uct_ep_am_zcopy(sender().ep(0), AM_ID, sendbuf.ptr(),
                              header_length, &iov, 1, 0, comp);
@@ -161,7 +177,14 @@ void uct_p2p_mix_test::random_op(const mapped_buffer &sendbuf,
     }
 }
 
-void uct_p2p_mix_test::run(unsigned count) {
+uct_test::mapped_buffer
+uct_p2p_mix_test::alloc_buffer(const entity &entity, size_t offset)
+{
+    return mapped_buffer(m_buffer_size, 0, entity, offset);
+}
+
+void uct_p2p_mix_test::run(unsigned count, size_t offset, size_t size_cap)
+{
     if (m_avail_send_funcs.size() == 0) {
         UCS_TEST_SKIP_R("unsupported");
     }
@@ -169,8 +192,13 @@
         UCS_TEST_SKIP_R("skipping on non-host memory");
     }
 
-    mapped_buffer sendbuf(m_send_size, 0, sender());
-    mapped_buffer recvbuf(m_send_size, 0, receiver());
+    m_buffer_size = std::min(size_cap, m_buffer_size);
+    m_max_short   = std::min(size_cap, m_max_short);
+    m_max_bcopy   = std::min(size_cap, m_max_bcopy);
+    m_max_zcopy   = std::min(size_cap, m_max_zcopy);
+
+    mapped_buffer sendbuf = alloc_buffer(sender(), offset);
+    mapped_buffer recvbuf = alloc_buffer(receiver(), offset);
 
     for (unsigned i = 0; i < count; ++i) {
         random_op(sendbuf, recvbuf);
@@ -179,33 +207,48 @@ void uct_p2p_mix_test::run(unsigned count) {
     flush();
 }
 
-void uct_p2p_mix_test::init() {
+size_t uct_p2p_mix_test::max_buffer_size() const
+{
+    if (has_mm() || has_transport("self")) {
+        /* Reduce testing time */
+        return UCS_MBYTE;
+    }
+    return UCS_GBYTE;
+}
+
+void uct_p2p_mix_test::init()
+{
     uct_p2p_test::init();
 
     ucs_status_t status = uct_iface_set_am_handler(receiver().iface(), AM_ID,
                                                    am_callback, NULL,
                                                    UCT_CB_FLAG_ASYNC);
     ASSERT_UCS_OK(status);
 
-    m_send_size = MAX_SIZE;
+    m_max_short = m_max_bcopy = m_max_zcopy = max_buffer_size();
     if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_SHORT) {
         m_avail_send_funcs.push_back(&uct_p2p_mix_test::am_short);
-        m_send_size = ucs_min(m_send_size, sender().iface_attr().cap.am.max_short);
-        m_avail_send_funcs.push_back(&uct_p2p_mix_test::am_short_iov);
-        m_send_size = ucs_min(m_send_size, sender().iface_attr().cap.am.max_short);
+        m_max_short = ucs_min(m_max_short,
+                              sender().iface_attr().cap.am.max_short);
     }
     if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) {
         m_avail_send_funcs.push_back(&uct_p2p_mix_test::am_zcopy);
-        m_send_size = ucs_min(m_send_size, sender().iface_attr().cap.am.max_zcopy);
+        m_max_zcopy = ucs_min(m_max_zcopy,
+                              sender().iface_attr().cap.am.max_zcopy);
    }
     if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_PUT_SHORT) {
         m_avail_send_funcs.push_back(&uct_p2p_mix_test::put_short);
-        m_send_size = ucs_min(m_send_size, sender().iface_attr().cap.put.max_short);
+        m_max_short = ucs_min(m_max_short,
+                              sender().iface_attr().cap.put.max_short);
     }
     if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_PUT_BCOPY) {
         m_avail_send_funcs.push_back(&uct_p2p_mix_test::put_bcopy);
-        m_send_size = ucs_min(m_send_size, sender().iface_attr().cap.put.max_bcopy);
+        m_max_bcopy = ucs_min(m_max_bcopy,
+                              sender().iface_attr().cap.put.max_bcopy);
     }
+
+    m_buffer_size = std::max({m_max_short, m_max_bcopy, m_max_zcopy});
+
     if (sender().iface_attr().cap.atomic64.fop_flags & UCS_BIT(UCT_ATOMIC_OP_CSWAP)) {
         m_avail_send_funcs.push_back(&uct_p2p_mix_test::cswap64);
     }
@@ -241,7 +284,8 @@ void uct_p2p_mix_test::init() {
     }
 }
 
-void uct_p2p_mix_test::cleanup() {
+void uct_p2p_mix_test::cleanup()
+{
     while (am_pending) {
         progress();
     }
@@ -255,4 +299,10 @@ UCS_TEST_P(uct_p2p_mix_test, mix_10000) {
     run(10000);
 }
 
+UCS_TEST_P(uct_p2p_mix_test, mix1000_last_byte_offset)
+{
+    /* Alloc a page-size buffer, but run the operations on the last 8 bytes */
+    run(1000, ucs_get_page_size() - 8, 8);
+}
+
 UCT_INSTANTIATE_TEST_CASE(uct_p2p_mix_test)
diff --git a/test/gtest/uct/test_p2p_mix.h b/test/gtest/uct/test_p2p_mix.h
index 0765730088f5..8a3a0b662824 100644
--- a/test/gtest/uct/test_p2p_mix.h
+++ b/test/gtest/uct/test_p2p_mix.h
@@ -41,6 +41,8 @@ class uct_p2p_mix_test : public uct_p2p_test {
                            const mapped_buffer &recvbuf,
                            uct_completion_t *comp);
 
+    static size_t pack_bcopy(void *dest, void *arg);
+
     ucs_status_t put_bcopy(const mapped_buffer &sendbuf,
                            const mapped_buffer &recvbuf,
                            uct_completion_t *comp);
@@ -58,16 +60,28 @@ class uct_p2p_mix_test : public uct_p2p_test {
     void random_op(const mapped_buffer &sendbuf, const mapped_buffer &recvbuf);
 
-    void run(unsigned count);
+    virtual mapped_buffer alloc_buffer(const entity &entity, size_t offset);
+
+    void run(unsigned count, size_t offset = 0, size_t size_cap = SIZE_MAX);
 
     virtual void init();
 
     virtual void cleanup();
 
+    size_t max_buffer_size() const;
+
+    size_t m_buffer_size;
+
 private:
     std::vector m_avail_send_funcs;
-    size_t m_send_size;
+    size_t m_max_short, m_max_bcopy, m_max_zcopy;
+
     static uint32_t am_pending;
+
+    struct bcopy_pack_arg {
+        const mapped_buffer *sendbuf;
+        size_t max_bcopy;
+    };
 };
 
 #endif
diff --git a/test/gtest/uct/uct_test.cc b/test/gtest/uct/uct_test.cc
index 20585b0322fc..b963b7200b09 100644
--- a/test/gtest/uct/uct_test.cc
+++ b/test/gtest/uct/uct_test.cc
@@ -1367,6 +1367,20 @@ std::ostream& operator<<(std::ostream& os, const uct_tl_resource_desc_t& resource)
     return os << resource.tl_name << "/" << resource.dev_name;
 }
 
+void uct_test::mapped_buffer::reset()
+{
+    m_mem.method   = UCT_ALLOC_METHOD_LAST;
+    m_mem.address  = NULL;
+    m_mem.md       = NULL;
+    m_mem.memh     = UCT_MEM_HANDLE_NULL;
+    m_mem.mem_type = UCS_MEMORY_TYPE_HOST;
+    m_mem.length   = 0;
+    m_buf          = NULL;
+    m_end          = NULL;
+    m_rkey.rkey    = UCT_INVALID_RKEY;
+    m_rkey.handle  = NULL;
+}
+
 uct_test::mapped_buffer::mapped_buffer(size_t size, uint64_t seed,
                                        const entity &entity, size_t offset,
                                        ucs_memory_type_t mem_type,
@@ -1390,16 +1404,7 @@ uct_test::mapped_buffer::mapped_buffer(size_t size, uint64_t seed,
         m_end = (char*)m_buf + size;
         pattern_fill(seed);
     } else {
-        m_mem.method  = UCT_ALLOC_METHOD_LAST;
-        m_mem.address = NULL;
-        m_mem.md      = NULL;
-        m_mem.memh    = UCT_MEM_HANDLE_NULL;
-        m_mem.mem_type= UCS_MEMORY_TYPE_HOST;
-        m_mem.length  = 0;
-        m_buf         = NULL;
-        m_end         = NULL;
-        m_rkey.rkey   = UCT_INVALID_RKEY;
-        m_rkey.handle = NULL;
+        reset();
     }
     m_iov.buffer = ptr();
     m_iov.length = length();
@@ -1411,6 +1416,23 @@ uct_test::mapped_buffer::mapped_buffer(size_t size, uint64_t seed,
     m_rkey.type = NULL;
 }
 
+uct_test::mapped_buffer::mapped_buffer(mapped_buffer &&other) :
+    m_entity(other.m_entity)
+{
+    m_mem.method   = other.m_mem.method;
+    m_mem.address  = other.m_mem.address;
+    m_mem.md       = other.m_mem.md;
+    m_mem.memh     = other.m_mem.memh;
+    m_mem.mem_type = other.m_mem.mem_type;
+    m_mem.length   = other.m_mem.length;
+    m_buf          = other.m_buf;
+    m_end          = other.m_end;
+    m_rkey.rkey    = other.m_rkey.rkey;
+    m_rkey.handle  = other.m_rkey.handle;
+
+    other.reset();
+}
+
 uct_test::mapped_buffer::~mapped_buffer() {
     m_entity.rkey_release(&m_rkey);
     if (m_mem.mem_type == UCS_MEMORY_TYPE_HOST) {
@@ -1451,6 +1473,16 @@ uct_mem_h uct_test::mapped_buffer::memh() const {
     return m_mem.memh;
 }
 
+ucs_memory_type_t uct_test::mapped_buffer::mem_type() const
+{
+    return m_mem.mem_type;
+}
+
+void *uct_test::mapped_buffer::reg_addr() const
+{
+    return m_mem.address;
+}
+
 uct_rkey_t uct_test::mapped_buffer::rkey() const {
     return m_rkey.rkey;
 }
@@ -1461,7 +1493,7 @@ const uct_iov_t* uct_test::mapped_buffer::iov() const {
 
 size_t uct_test::mapped_buffer::pack(void *dest, void *arg) {
     const mapped_buffer* buf = (const mapped_buffer*)arg;
-    mem_buffer::copy_from(dest, buf->ptr(), buf->length(), buf->m_mem.mem_type);
+    mem_buffer::copy_from(dest, buf->ptr(), buf->length(), buf->mem_type());
     return buf->length();
 }
 
diff --git a/test/gtest/uct/uct_test.h b/test/gtest/uct/uct_test.h
index df500a930d8f..b5584d5ca996 100644
--- a/test/gtest/uct/uct_test.h
+++ b/test/gtest/uct/uct_test.h
@@ -246,10 +246,14 @@ class uct_test : public testing::TestWithParam,
                       unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL);
         virtual ~mapped_buffer();
 
+        mapped_buffer(mapped_buffer &&other);
+
        void *ptr() const;
         uintptr_t addr() const;
         size_t length() const;
         uct_mem_h memh() const;
+        ucs_memory_type_t mem_type() const;
+        void *reg_addr() const;
         uct_rkey_t rkey() const;
         const uct_iov_t* iov() const;
 
@@ -260,6 +264,7 @@
         static size_t pack(void *dest, void *arg);
 
     private:
+        void reset();
         const uct_test::entity& m_entity;