uct/gdr_copy: add an optional registration cache to the gdr_copy memory domain

What the patch does
  * gdr_copy_ep.c: follows the rename of the handle type uct_gdr_copy_mem_h to
    uct_gdr_copy_mem_t.
  * gdr_copy_md.c: moves the gdr_pin_buffer/gdr_map and gdr_unmap/unpin steps into
    uct_gdr_copy_mem_reg_internal() / uct_gdr_copy_mem_dereg_internal(), which fill
    or release a caller-provided uct_gdr_copy_mem_t. uct_gdr_copy_mem_reg() and
    uct_gdr_copy_mem_dereg() keep their signatures and wrap the helpers with a
    heap-allocated handle. A second ops table (md_rcache_ops) registers through
    ucs_rcache_get() / ucs_rcache_region_put(); uct_gdr_copy_md_open() switches to
    it when ucs_rcache_create() succeeds.
  * gdr_copy_md.h: adds rcache and reg_cost to the MD, the rcache / uc_reg_cost
    configuration fields, and uct_gdr_copy_rcache_region_t.

Review fixes folded into this revision
  * The handle obtained from ucs_malloc() is now released with ucs_free() instead of
    free(), matching the ucs_free(md) already used by uct_gdr_copy_md_close().
  * uct_gdr_copy_mem_reg_internal() unpins the buffer when gdr_map() fails; before,
    that path returned with the pinning still held.
  * uct_gdr_copy_mem_reg() computes the pinned size from the page-aligned start to
    the page-aligned end. Rounding only `length` under-registers an unaligned buffer
    that crosses a GPU page boundary.
  * Added the "RCACHE" (ternary) entry: rcache.enable is read in
    uct_gdr_copy_md_open() but no config table entry was writing it.
  * rcache.alignment is declared `unsigned`, because it is parsed as
    UCS_CONFIG_TYPE_UINT.
  * uct_gdr_copy_md_open() closes the gdrcopy context and frees md when a mandatory
    registration cache cannot be created.
  * The rcache registration callback reads its argument as `unsigned *`, which is
    what uct_gdr_copy_mem_rcache_reg() passes.
  * Help text: added the missing space in "... and system page size".
  * Restored `&region`, which had been mangled to a (R) sign by text extraction.

Open points that cannot be settled from the visible hunks (TODO confirm)
  * The targets of the angle-bracket #include lines were lost in extraction and are
    left bare here; the one added to gdr_copy_md.h must provide ucs_rcache_t and
    ucs_rcache_region_t.
  * Blank context lines were re-derived from the hunk line counts.
  * The "index" lines still carry the blob ids of the original patch.
  * uct_gdr_copy_md_close() is only partly visible; check that it destroys
    md->rcache when it is not NULL.
  * The gdr_open() failure branch is only partly visible; check that it frees md.
  * uct_gdr_copy_rcache_dump_region_cb() leaves `buf` untouched; check that the
    rcache code never prints it unformatted.

diff --git a/src/uct/cuda/gdr_copy/gdr_copy_ep.c b/src/uct/cuda/gdr_copy/gdr_copy_ep.c
index e753578fc60..58f85050eca 100644
--- a/src/uct/cuda/gdr_copy/gdr_copy_ep.c
+++ b/src/uct/cuda/gdr_copy/gdr_copy_ep.c
@@ -38,7 +38,7 @@ ucs_status_t uct_gdr_copy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, siz
 {
     uct_gdr_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_gdr_copy_iface_t);
     uct_gdr_copy_md_t *md = (uct_gdr_copy_md_t *)iface->super.md;
-    uct_gdr_copy_mem_h *mem_hndl = (uct_gdr_copy_mem_h *) rkey;
+    uct_gdr_copy_mem_t *mem_hndl = (uct_gdr_copy_mem_t *) rkey;
     gdr_info_t gdr_info;
     size_t bar_off;
 
diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.c b/src/uct/cuda/gdr_copy/gdr_copy_md.c
index c35ad06d2a8..8eea9c3914d 100644
--- a/src/uct/cuda/gdr_copy/gdr_copy_md.c
+++ b/src/uct/cuda/gdr_copy/gdr_copy_md.c
@@ -15,9 +15,31 @@
 #include
 #include
 
+#define UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN (GPU_PAGE_SIZE)
+
 static ucs_config_field_t uct_gdr_copy_md_config_table[] = {
     {"", "", NULL,
-    ucs_offsetof(uct_gdr_copy_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)},
+     ucs_offsetof(uct_gdr_copy_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)},
+
+    {"RCACHE", "try", "Enable using memory registration cache",
+     ucs_offsetof(uct_gdr_copy_md_config_t, rcache.enable), UCS_CONFIG_TYPE_TERNARY},
+
+    {"RCACHE_ADDR_ALIGN", UCS_PP_MAKE_STRING(UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN),
+     "Registration cache address alignment, must be power of 2\n"
+     "between "UCS_PP_MAKE_STRING(UCS_PGT_ADDR_ALIGN)" and system page size",
+     ucs_offsetof(uct_gdr_copy_md_config_t, rcache.alignment), UCS_CONFIG_TYPE_UINT},
+
+    {"RCACHE_MEM_PRIO", "1000", "Registration cache memory event priority",
+     ucs_offsetof(uct_gdr_copy_md_config_t, rcache.event_prio), UCS_CONFIG_TYPE_UINT},
+
+    {"RCACHE_OVERHEAD", "90ns", "Registration cache lookup overhead",
+     ucs_offsetof(uct_gdr_copy_md_config_t, rcache.overhead), UCS_CONFIG_TYPE_TIME},
+
+    {"MEM_REG_OVERHEAD", "16us", "Memory registration overhead", /* TODO take default from device */
+     ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.overhead), UCS_CONFIG_TYPE_TIME},
+
+    {"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */
+     ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.growth), UCS_CONFIG_TYPE_TIME},
 
     {NULL}
 };
@@ -56,53 +78,43 @@ static ucs_status_t uct_gdr_copy_rkey_release(uct_md_component_t *mdc, uct_rkey_
     return UCS_OK;
 }
 
-
-static ucs_status_t uct_gdr_copy_mem_reg(uct_md_h uct_md, void *address, size_t length,
-                                         unsigned flags, uct_mem_h *memh_p)
+static ucs_status_t uct_gdr_copy_mem_reg_internal(uct_md_h uct_md, void *address, size_t length,
+                                                  unsigned flags, uct_gdr_copy_mem_t *mem_hndl)
 {
-    uct_gdr_copy_mem_h * mem_hndl = NULL;
     uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+    CUdeviceptr d_ptr = ((CUdeviceptr )(char *) address);
     gdr_mh_t mh;
-    size_t reg_size;
     void *bar_ptr;
-
-    CUdeviceptr d_ptr = ((CUdeviceptr )(char *) address);
-
-    mem_hndl = ucs_malloc(sizeof(uct_gdr_copy_mem_h), "gdr_copy handle");
-    if (NULL == mem_hndl) {
-        ucs_error("Failed to allocate memory for uct_gdr_copy_mem_h");
-        return UCS_ERR_NO_MEMORY;
-    }
 
-    reg_size = (length + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
-
-    if (gdr_pin_buffer(md->gdrcpy_ctx, (d_ptr & GPU_PAGE_MASK), reg_size, 0, 0, &mh) != 0) {
-        ucs_error("gdr_pin_buffer Failed. length :%lu pin_size:%lu ", length, reg_size);
+    if (gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mh) != 0) {
+        ucs_error("gdr_pin_buffer Failed. length :%lu ", length);
         return UCS_ERR_IO_ERROR;
     }
 
     if (mh == 0) {
-        ucs_error("gdr_pin_buffer Failed. length :%lu pin_size:%lu ", length, reg_size);
+        ucs_error("gdr_pin_buffer Failed. length :%lu ", length);
         return UCS_ERR_IO_ERROR;
     }
 
-    if (gdr_map(md->gdrcpy_ctx, mh, &bar_ptr, reg_size) !=0) {
-        ucs_error("gdr_map failed. length :%lu pin_size:%lu ", length, reg_size);
+    if (gdr_map(md->gdrcpy_ctx, mh, &bar_ptr, length) !=0) {
+        ucs_error("gdr_map failed. length :%lu ", length);
+        /* Undo the pinning; nothing else would release it on this path */
+        (void)gdr_unpin_buffer(md->gdrcpy_ctx, mh);
         return UCS_ERR_IO_ERROR;
     }
 
     mem_hndl->mh = mh;
     mem_hndl->bar_ptr = bar_ptr;
-    mem_hndl->reg_size = reg_size;
-
-    *memh_p = mem_hndl;
+    mem_hndl->reg_size = length;
+
     return UCS_OK;
+
 }
 
-static ucs_status_t uct_gdr_copy_mem_dereg(uct_md_h uct_md, uct_mem_h memh)
+static ucs_status_t uct_gdr_copy_mem_dereg_internal(uct_md_h uct_md, uct_gdr_copy_mem_t *mem_hndl)
 {
+
     uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
-    uct_gdr_copy_mem_h *mem_hndl = memh;
 
     if (gdr_unmap(md->gdrcpy_ctx, mem_hndl->mh, mem_hndl->bar_ptr, mem_hndl->reg_size) !=0) {
         ucs_error("gdr_unmap Failed. unpin_size:%lu ", mem_hndl->reg_size);
@@ -112,11 +124,50 @@ static ucs_status_t uct_gdr_copy_mem_dereg(uct_md_h uct_md, uct_mem_h memh)
         ucs_error("gdr_unpin_buffer failed ");
         return UCS_ERR_IO_ERROR;
     }
+    return UCS_OK;
+}
 
-    free(mem_hndl);
+static ucs_status_t uct_gdr_copy_mem_reg(uct_md_h uct_md, void *address, size_t length,
+                                         unsigned flags, uct_mem_h *memh_p)
+{
+    uct_gdr_copy_mem_t * mem_hndl = NULL;
+    size_t reg_size;
+    void *ptr;
+    ucs_status_t status;
+
+
+    mem_hndl = ucs_malloc(sizeof(uct_gdr_copy_mem_t), "gdr_copy handle");
+    if (NULL == mem_hndl) {
+        ucs_error("Failed to allocate memory for uct_gdr_copy_mem_t");
+        return UCS_ERR_NO_MEMORY;
+    }
+
+    /* Pin whole GPU pages: round the start down and the end up, so that an
+     * unaligned buffer which crosses a page boundary is fully covered */
+    ptr      = (void *) ((uintptr_t)address & GPU_PAGE_MASK);
+    reg_size = (((uintptr_t)address + length + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK) -
+               (uintptr_t)ptr;
+
+    status = uct_gdr_copy_mem_reg_internal(uct_md, ptr, reg_size, 0, mem_hndl);
+    if (status != UCS_OK) {
+        ucs_free(mem_hndl);
+        return status;
+    }
+
+    *memh_p = mem_hndl;
     return UCS_OK;
 }
 
+static ucs_status_t uct_gdr_copy_mem_dereg(uct_md_h uct_md, uct_mem_h memh)
+{
+    uct_gdr_copy_mem_t *mem_hndl = memh;
+    ucs_status_t status;
+
+    status = uct_gdr_copy_mem_dereg_internal(uct_md, mem_hndl);
+    ucs_free(mem_hndl);
+    return status;
+}
+
 static ucs_status_t uct_gdr_copy_mem_detect(uct_md_h md, void *addr, uint64_t *dn_mask)
 {
     int memory_type;
@@ -165,19 +216,105 @@ static void uct_gdr_copy_md_close(uct_md_h uct_md)
     ucs_free(md);
 }
 
-static ucs_status_t uct_gdr_copy_md_open(const char *md_name, const uct_md_config_t *md_config,
-                                         uct_md_h *md_p)
+static uct_md_ops_t md_ops = {
+    .close        = uct_gdr_copy_md_close,
+    .query        = uct_gdr_copy_md_query,
+    .mkey_pack    = uct_gdr_copy_mkey_pack,
+    .mem_reg      = uct_gdr_copy_mem_reg,
+    .mem_dereg    = uct_gdr_copy_mem_dereg,
+    .mem_detect   = uct_gdr_copy_mem_detect
+};
+
+static inline uct_gdr_copy_rcache_region_t* uct_gdr_copy_rache_region_from_memh(uct_mem_h memh)
 {
-    uct_gdr_copy_md_t *md;
+    return ucs_container_of(memh, uct_gdr_copy_rcache_region_t, memh);
+}
+
+static ucs_status_t uct_gdr_copy_mem_rcache_reg(uct_md_h uct_md, void *address,
+                                                size_t length, unsigned flags,
+                                                uct_mem_h *memh_p)
+{
+    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+    ucs_rcache_region_t *rregion;
+    ucs_status_t status;
+    uct_gdr_copy_mem_t *memh;
+
+    status = ucs_rcache_get(md->rcache, address, length, PROT_READ|PROT_WRITE,
+                            &flags, &rregion);
+    if (status != UCS_OK) {
+        return status;
+    }
 
-    static uct_md_ops_t md_ops = {
-        .close        = uct_gdr_copy_md_close,
-        .query        = uct_gdr_copy_md_query,
-        .mkey_pack    = uct_gdr_copy_mkey_pack,
-        .mem_reg      = uct_gdr_copy_mem_reg,
-        .mem_dereg    = uct_gdr_copy_mem_dereg,
-        .mem_detect   = uct_gdr_copy_mem_detect
-    };
+    ucs_assert(rregion->refcount > 0);
+    memh = &ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t)->memh;
+    *memh_p = memh;
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh)
+{
+    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+    uct_gdr_copy_rcache_region_t *region = uct_gdr_copy_rache_region_from_memh(memh);
+
+    ucs_rcache_region_put(md->rcache, &region->super);
+    return UCS_OK;
+}
+
+static uct_md_ops_t md_rcache_ops = {
+    .close        = uct_gdr_copy_md_close,
+    .query        = uct_gdr_copy_md_query,
+    .mkey_pack    = uct_gdr_copy_mkey_pack,
+    .mem_reg      = uct_gdr_copy_mem_rcache_reg,
+    .mem_dereg    = uct_gdr_copy_mem_rcache_dereg,
+    .mem_detect   = uct_gdr_copy_mem_detect
+};
+static ucs_status_t uct_gdr_copy_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache,
+                                                   void *arg, ucs_rcache_region_t *rregion)
+{
+    uct_gdr_copy_rcache_region_t *region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t);
+    uct_gdr_copy_md_t *md = context;
+    unsigned *flags = arg;
+    ucs_status_t status;
+
+    status = uct_gdr_copy_mem_reg_internal(&md->super, (void*)region->super.super.start,
+                                           region->super.super.end - region->super.super.start,
+                                           *flags, &region->memh);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    return UCS_OK;
+}
+
+static void uct_gdr_copy_rcache_mem_dereg_cb(void *context, ucs_rcache_t *rcache,
+                                             ucs_rcache_region_t *rregion)
+{
+    uct_gdr_copy_rcache_region_t *region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t);
+    uct_gdr_copy_md_t *md = context;
+
+    (void)uct_gdr_copy_mem_dereg_internal(&md->super, &region->memh);
+}
+
+static void uct_gdr_copy_rcache_dump_region_cb(void *context, ucs_rcache_t *rcache,
+                                               ucs_rcache_region_t *rregion, char *buf,
+                                               size_t max)
+{
+
+}
+
+static ucs_rcache_ops_t uct_gdr_copy_rcache_ops = {
+    .mem_reg     = uct_gdr_copy_rcache_mem_reg_cb,
+    .mem_dereg   = uct_gdr_copy_rcache_mem_dereg_cb,
+    .dump_region = uct_gdr_copy_rcache_dump_region_cb
+};
+
+static ucs_status_t uct_gdr_copy_md_open(const char *md_name, const uct_md_config_t *uct_md_config,
+                                         uct_md_h *md_p)
+{
+    ucs_status_t status;
+    uct_gdr_copy_md_t *md;
+    const uct_gdr_copy_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_gdr_copy_md_config_t);
+    ucs_rcache_params_t rcache_params;
 
     md = ucs_malloc(sizeof(uct_gdr_copy_md_t), "uct_gdr_copy_md_t");
     if (NULL == md) {
@@ -187,6 +324,10 @@ static ucs_status_t uct_gdr_copy_md_open(const char *md_name, const uct_md_confi
 
     md->super.ops = &md_ops;
     md->super.component = &uct_gdr_copy_md_component;
+    md->rcache = NULL;
+    md->reg_cost = md_config->uc_reg_cost;
+
+
 
     md->gdrcpy_ctx = gdr_open();
     if (md->gdrcpy_ctx == (void *)0) {
@@ -194,6 +335,34 @@ static ucs_status_t uct_gdr_copy_md_open(const char *md_name, const uct_md_confi
         return UCS_ERR_IO_ERROR;
     }
 
+    if (md_config->rcache.enable != UCS_NO) {
+        // UCS_STATIC_ASSERT(UCS_PGT_ADDR_ALIGN >= UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN);
+        rcache_params.region_struct_size = sizeof(uct_gdr_copy_rcache_region_t);
+        rcache_params.alignment          = md_config->rcache.alignment;
+        rcache_params.ucm_event_priority = md_config->rcache.event_prio;
+        rcache_params.context            = md;
+        rcache_params.ops                = &uct_gdr_copy_rcache_ops;
+        status = ucs_rcache_create(&rcache_params, "gdr_copy" UCS_STATS_ARG(NULL), &md->rcache);
+        if (status == UCS_OK) {
+            md->super.ops         = &md_rcache_ops;
+            md->reg_cost.overhead = 0;
+            md->reg_cost.growth   = 0; /* It's close enough to 0 */
+        } else {
+            ucs_assert(md->rcache == NULL);
+            if (md_config->rcache.enable == UCS_YES) {
+                ucs_error("Failed to create registration cache: %s",
+                          ucs_status_string(status));
+                /* md is not handed to the caller on this path: release it */
+                (void)gdr_close(md->gdrcpy_ctx);
+                ucs_free(md);
+                return UCS_ERR_IO_ERROR;
+            } else {
+                ucs_debug("Could not create registration cache for: %s",
+                          ucs_status_string(status));
+            }
+        }
+    }
+
     *md_p = (uct_md_h) md;
     return UCS_OK;
 }
diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.h b/src/uct/cuda/gdr_copy/gdr_copy_md.h
index 7447d92110d..b3747f6c7ae 100644
--- a/src/uct/cuda/gdr_copy/gdr_copy_md.h
+++ b/src/uct/cuda/gdr_copy/gdr_copy_md.h
@@ -8,6 +8,7 @@
 #define UCT_CUDA_CONTEXT_H
 
 #include
+#include
 #include "gdrapi.h"
 
 #define UCT_GDR_COPY_MD_NAME "gdr_copy"
@@ -19,7 +20,9 @@ extern uct_md_component_t uct_gdr_copy_md_component;
  */
 typedef struct uct_gdr_copy_md {
     struct uct_md super;   /**< Domain info */
-    gdr_t gdrcpy_ctx;   /**< gdr copy context */
+    gdr_t gdrcpy_ctx;      /**< gdr copy context */
+    ucs_rcache_t *rcache;  /**< Registration cache (can be NULL) */
+    uct_linear_growth_t reg_cost; /**< Memory registration cost */
 } uct_gdr_copy_md_t;
 
 /**
@@ -27,6 +30,17 @@ typedef struct uct_gdr_copy_md {
  */
 typedef struct uct_gdr_copy_md_config {
     uct_md_config_t super;
+    struct {
+        ucs_ternary_value_t enable;     /**< Enable registration cache */
+        unsigned            alignment;  /**< Force address alignment */
+        unsigned            event_prio; /**< Memory events priority */
+        double              overhead;   /**< Lookup overhead estimation */
+    } rcache;
+
+    uct_linear_growth_t uc_reg_cost;    /**< Memory registration cost estimation
+                                             without using the cache */
+
+
 } uct_gdr_copy_md_config_t;
 
 
@@ -37,8 +51,15 @@ typedef struct uct_gdr_copy_mem {
     gdr_mh_t mh;
     void *bar_ptr;
     size_t reg_size;
-} uct_gdr_copy_mem_h;
+} uct_gdr_copy_mem_t;
 
+/**
+ * cuda memory region in the registration cache.
+ */
+typedef struct uct_gdr_copy_rcache_region {
+    ucs_rcache_region_t super;
+    uct_gdr_copy_mem_t memh; /**< mr exposed to the user as the memh */
+} uct_gdr_copy_rcache_region_t;
 
 
 #endif