Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/UGNI: Start using spinlocks to protect critical structures #1494

Merged
merged 4 commits into from
May 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 101 additions & 2 deletions src/uct/ugni/base/ugni_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,33 @@
#endif

#include "ugni_device.h"
#include "ugni_md.h"
#include "ugni_iface.h"
#include <uct/base/uct_md.h>
#include <ucs/sys/string.h>

/*
 * Device lock helpers.
 *
 * With ENABLE_MT the device spinlock is taken only for command domains whose
 * thread_mode is UCS_THREAD_MODE_MULTI. Without ENABLE_MT the lock/unlock
 * helpers expand to nothing and the init/destroy helpers evaluate to UCS_OK.
 *
 * Every macro argument is parenthesized and referenced through its own
 * parameter name, so the helpers work for any argument expression and not
 * only for a variable that happens to be called "cdm". The lock/unlock
 * helpers are wrapped in do { } while (0) so that they behave as a single
 * statement (safe inside an unbraced if/else) and must be followed by ';'.
 */
#if ENABLE_MT
#define uct_ugni_check_lock_needed(_cdm) \
    (UCS_THREAD_MODE_MULTI == (_cdm)->thread_mode)
#define uct_ugni_device_init_lock(_dev)    ucs_spinlock_init(&(_dev)->lock)
#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&(_dev)->lock)
#define uct_ugni_device_lock(_cdm) \
    do { \
        if (uct_ugni_check_lock_needed(_cdm)) { \
            ucs_spin_lock(&(_cdm)->dev->lock); \
        } \
    } while (0)
#define uct_ugni_device_unlock(_cdm) \
    do { \
        if (uct_ugni_check_lock_needed(_cdm)) { \
            ucs_spin_unlock(&(_cdm)->dev->lock); \
        } \
    } while (0)
#else
#define uct_ugni_device_init_lock(x)    UCS_OK
#define uct_ugni_device_destroy_lock(x) UCS_OK
#define uct_ugni_device_lock(x)
#define uct_ugni_device_unlock(x)
#define uct_ugni_check_lock_needed(x)   0
#endif

/* Counter consumed by uct_ugni_create_cdm(): each call post-increments it
 * while computing the id of the new command domain. */
uint16_t ugni_domain_counter = 0;

void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev,
uct_tl_resource_desc_t *resource)
{
Expand Down Expand Up @@ -106,21 +129,97 @@ ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *de
ucs_snprintf_zero(dev_p->fname, sizeof(dev_p->fname), "%s:%d",
dev_p->type_name, dev_p->device_index);

status = uct_ugni_device_init_lock(dev_p);
if (UCS_OK != status) {
ucs_error("Couldn't initalize device lock.");
return status;
}
dev_p->attached = false;
return UCS_OK;
}

/**
 * Release the resources held by a uGNI device.
 *
 * Destroys the device lock; a failure is reported through ucs_error() but is
 * not propagated, since the function returns void.
 *
 * @param dev  Device whose lock is destroyed.
 */
void uct_ugni_device_destroy(uct_ugni_device_t *dev)
{
    ucs_status_t rc = uct_ugni_device_destroy_lock(dev);

    if (rc != UCS_OK) {
        ucs_error("Couldn't destroy device lock.");
    }
}

/**
 * Write the NIC address of the interface's device into a device address.
 *
 * @param tl_iface  Transport interface; treated as a uct_ugni_iface_t.
 * @param addr      Output buffer; treated as a uct_devaddr_ugni_t whose
 *                  nic_addr field is set to the device address.
 *
 * @return Always UCS_OK.
 */
ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr)
{
    uct_ugni_iface_t *ugni_iface = ucs_derived_of(tl_iface, uct_ugni_iface_t);
    uct_ugni_device_t *device    = uct_ugni_iface_device(ugni_iface);
    uct_devaddr_ugni_t *out_addr = (uct_devaddr_ugni_t *)addr;

    out_addr->nic_addr = device->address;
    return UCS_OK;
}

/**
 * Create a uGNI communication domain (CDM) and attach it to a device.
 *
 * Stores @a thread_mode and @a device in @a cdm, derives a domain id from the
 * job's PMI rank, the number of ranks and the global domain counter, then
 * calls GNI_CdmCreate() followed by GNI_CdmAttach(). The id allocation and
 * both GNI calls are bracketed by uct_ugni_device_lock()/unlock().
 *
 * @param cdm          Command domain structure to fill in.
 * @param device       Device the domain is attached to.
 * @param thread_mode  Thread mode recorded in the domain.
 *
 * @return UCS_OK on success; UCS_ERR_IO_ERROR if uct_ugni_get_job_info()
 *         returns NULL; UCS_ERR_NO_DEVICE if GNI_CdmCreate() or
 *         GNI_CdmAttach() fails. When the attach fails, the domain that was
 *         just created is destroyed before returning.
 */
ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode)
{
    uct_ugni_job_info_t *job_info;
    uint16_t domain_index;
    int modes;
    gni_return_t ugni_rc;
    ucs_status_t status = UCS_OK;

    job_info = uct_ugni_get_job_info();
    if (NULL == job_info) {
        return UCS_ERR_IO_ERROR;
    }

    cdm->thread_mode = thread_mode;
    cdm->dev = device;
    uct_ugni_device_lock(cdm);

    /* Remember the counter value that was actually used for this domain, so
     * the log messages below show the same number the id was computed from
     * rather than the already-incremented counter. */
    domain_index = ugni_domain_counter++;
    cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * domain_index;
    ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
              cdm->domain_id, job_info->pmi_rank_id,
              job_info->pmi_num_of_ranks, domain_index);
    modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED |
            GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL;
    ugni_rc = GNI_CdmCreate(cdm->domain_id, job_info->ptag, job_info->cookie,
                            modes, &cdm->cdm_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        status = UCS_ERR_NO_DEVICE;
        goto out_unlock;
    }

    ugni_rc = GNI_CdmAttach(cdm->cdm_handle, device->device_id,
                            &cdm->address, &cdm->nic_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d",
                  cdm->domain_id, domain_index, gni_err_str[ugni_rc], ugni_rc);
        /* Do not leak the domain created above. */
        ugni_rc = GNI_CdmDestroy(cdm->cdm_handle);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_error("GNI_CdmDestroy error status: %s (%d)",
                      gni_err_str[ugni_rc], ugni_rc);
        }
        status = UCS_ERR_NO_DEVICE;
    }

out_unlock:
    uct_ugni_device_unlock(cdm);
    if (UCS_OK == status) {
        ucs_debug("Made ugni cdm. nic_addr = %i domain_id = %i", device->address, cdm->domain_id);
    }
    return status;
}

/**
 * Destroy the uGNI communication domain held by @a cdm.
 *
 * @param cdm  Command domain whose cdm_handle is passed to GNI_CdmDestroy().
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR (after logging the GNI error)
 *         if GNI_CdmDestroy() fails.
 */
ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm)
{
    gni_return_t rc;

    ucs_debug("MD GNI_CdmDestroy");
    rc = GNI_CdmDestroy(cdm->cdm_handle);
    if (rc == GNI_RC_SUCCESS) {
        return UCS_OK;
    }

    ucs_error("GNI_CdmDestroy error status: %s (%d)",
              gni_err_str[rc], rc);
    return UCS_ERR_IO_ERROR;
}
2 changes: 2 additions & 0 deletions src/uct/ugni/base/ugni_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ void uct_ugni_device_destroy(uct_ugni_device_t *dev);
void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev,
uct_tl_resource_desc_t *resource);
ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr);
ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode);
ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm);
#endif
7 changes: 3 additions & 4 deletions src/uct/ugni/base/ugni_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
const uct_devaddr_ugni_t *ugni_dev_addr = (const uct_devaddr_ugni_t *)dev_addr;
ucs_status_t rc = UCS_OK;
gni_return_t ugni_rc;
uint32_t *big_hash;

self->arb_sched = 0;
UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super);
Expand All @@ -196,7 +197,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
self->flush_group->parent = NULL;
#endif

ugni_rc = GNI_EpCreate(iface->nic_handle, iface->local_cq, &self->ep);
ugni_rc = GNI_EpCreate(uct_ugni_iface_nic_handle(iface), iface->local_cq, &self->ep);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -208,11 +209,9 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
}

ucs_arbiter_group_init(&self->arb_group);

uint32_t *big_hash;
big_hash = (void *)&self->ep;
self->hash_key = big_hash[0];
if (GNI_DEVICE_ARIES == iface->dev->type) {
if (uct_ugni_check_device_type(iface, GNI_DEVICE_ARIES)) {
self->hash_key &= 0x00FFFFFF;
}
ucs_debug("Adding ep hash %x to iface %p", self->hash_key, iface);
Expand Down
151 changes: 36 additions & 115 deletions src/uct/ugni/base/ugni_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
#include "ugni_iface.h"
#include <pmi.h>

static uint16_t ugni_domain_global_counter = 0;

void uct_ugni_base_desc_init(ucs_mpool_t *mp, void *obj, void *chunk)
{
uct_ugni_base_desc_t *base = (uct_ugni_base_desc_t *) obj;
Expand Down Expand Up @@ -132,7 +130,7 @@ ucs_status_t uct_ugni_iface_get_address(uct_iface_h tl_iface,
uct_ugni_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_iface_t);
uct_sockaddr_ugni_t *iface_addr = (uct_sockaddr_ugni_t*)addr;

iface_addr->domain_id = iface->domain_id;
iface_addr->domain_id = iface->cdm.domain_id;
return UCS_OK;
}

Expand Down Expand Up @@ -183,7 +181,7 @@ static ucs_status_t get_ptag(uint8_t *ptag)
return UCS_OK;
}

static ucs_status_t uct_ugni_fetch_pmi()
ucs_status_t uct_ugni_fetch_pmi()
{
int spawned = 0,
rc;
Expand Down Expand Up @@ -234,136 +232,54 @@ static ucs_status_t uct_ugni_fetch_pmi()
return UCS_OK;
}

/**
 * Create a uGNI communication domain and attach it to a NIC.
 *
 * Fetches the PMI job information, derives a domain id from the PMI rank,
 * the number of ranks and the global domain counter, then calls
 * GNI_CdmCreate() and GNI_CdmAttach(). The global counter is incremented
 * only when both calls succeed.
 *
 * @param device_index  Index into job_info.devices of the device to attach.
 * @param domain_id     Output: id of the created domain.
 * @param cdm_handle    Output: handle of the created domain.
 * @param nic_handle    Output: handle of the attached NIC.
 * @param address       Output: address filled in by GNI_CdmAttach().
 *
 * @return UCS_OK on success; the status of uct_ugni_fetch_pmi() if it fails;
 *         UCS_ERR_NO_DEVICE if GNI_CdmCreate() or GNI_CdmAttach() fails.
 *         When the attach fails, the domain that was just created is
 *         destroyed so it is not leaked.
 */
ucs_status_t uct_ugni_init_nic(int device_index,
                               uint16_t *domain_id,
                               gni_cdm_handle_t *cdm_handle,
                               gni_nic_handle_t *nic_handle,
                               uint32_t *address)
{
    int modes;
    ucs_status_t status;
    gni_return_t ugni_rc = GNI_RC_SUCCESS;

    status = uct_ugni_fetch_pmi();
    if (UCS_OK != status) {
        ucs_error("Failed to activate context, Error status: %d", status);
        return status;
    }

    *domain_id = job_info.pmi_rank_id + job_info.pmi_num_of_ranks * ugni_domain_global_counter;
    modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED |
            GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL;
    ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
              *domain_id, job_info.pmi_rank_id,
              job_info.pmi_num_of_ranks, ugni_domain_global_counter);
    ugni_rc = GNI_CdmCreate(*domain_id, job_info.ptag, job_info.cookie,
                            modes, cdm_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_NO_DEVICE;
    }

    /* For now we use the first device for allocation of the domain */
    ugni_rc = GNI_CdmAttach(*cdm_handle, job_info.devices[device_index].device_id,
                            address, nic_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d",
                  *domain_id, ugni_domain_global_counter, gni_err_str[ugni_rc], ugni_rc);
        /* The domain was created successfully above; release it instead of
         * leaking it on this error path. */
        ugni_rc = GNI_CdmDestroy(*cdm_handle);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_error("GNI_CdmDestroy error status: %s (%d)",
                      gni_err_str[ugni_rc], ugni_rc);
        }
        return UCS_ERR_NO_DEVICE;
    }

    ++ugni_domain_global_counter;
    return UCS_OK;
}

/**
 * Activate a uGNI interface: create its communication domain and local
 * completion queue.
 *
 * Does nothing if the interface is already activated.
 *
 * @param iface  Interface to activate. On success its domain_id, cdm_handle,
 *               nic_handle and local_cq are initialized and activated is set.
 *
 * @return UCS_OK on success or if already activated; the status of
 *         uct_ugni_init_nic() if it fails; UCS_ERR_NO_DEVICE if
 *         GNI_CqCreate() fails. When the queue cannot be created, the domain
 *         created by uct_ugni_init_nic() is destroyed so it is not leaked.
 */
ucs_status_t ugni_activate_iface(uct_ugni_iface_t *iface)
{
    ucs_status_t status;
    gni_return_t ugni_rc;
    uint32_t pe_address;

    if(iface->activated) {
        return UCS_OK;
    }

    status = uct_ugni_init_nic(0, &iface->domain_id,
                               &iface->cdm_handle, &iface->nic_handle,
                               &pe_address);
    if (UCS_OK != status) {
        ucs_error("Failed to UGNI NIC, Error status: %d", status);
        return status;
    }

    ucs_debug("Made ugni interface. iface->dev->nic_addr = %i iface->domain_id = %i", iface->dev->address, iface->domain_id);

    ugni_rc = GNI_CqCreate(iface->nic_handle, UCT_UGNI_LOCAL_CQ, 0,
                           GNI_CQ_NOBLOCK,
                           NULL, NULL, &iface->local_cq);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CqCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        /* The interface stays deactivated, so ugni_deactivate_iface() will
         * never release the domain; destroy it here. */
        ugni_rc = GNI_CdmDestroy(iface->cdm_handle);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_warn("GNI_CdmDestroy error status: %s (%d)",
                     gni_err_str[ugni_rc], ugni_rc);
        }
        return UCS_ERR_NO_DEVICE;
    }
    iface->activated = true;

    /* iface is activated */
    return UCS_OK;
}

/**
 * Deactivate a uGNI interface: destroy its local completion queue and then
 * its communication domain.
 *
 * Does nothing if the interface is not activated. If destroying the queue
 * fails, the domain is left untouched and the interface stays activated.
 *
 * @param iface  Interface to deactivate.
 *
 * @return UCS_OK on success or if not activated; UCS_ERR_IO_ERROR (after a
 *         warning) if GNI_CqDestroy() or GNI_CdmDestroy() fails.
 */
ucs_status_t ugni_deactivate_iface(uct_ugni_iface_t *iface)
{
    ucs_status_t result = UCS_OK;
    gni_return_t rc;

    if (iface->activated) {
        rc = GNI_CqDestroy(iface->local_cq);
        if (rc != GNI_RC_SUCCESS) {
            ucs_warn("GNI_CqDestroy failed, Error status: %s %d",
                     gni_err_str[rc], rc);
            result = UCS_ERR_IO_ERROR;
        } else {
            rc = GNI_CdmDestroy(iface->cdm_handle);
            if (rc != GNI_RC_SUCCESS) {
                ucs_warn("GNI_CdmDestroy error status: %s (%d)",
                         gni_err_str[rc], rc);
                result = UCS_ERR_IO_ERROR;
            } else {
                iface->activated = false;
            }
        }
    }

    return result;
}

/* Memory pool callbacks: chunks are obtained with ucs_mpool_chunk_malloc and
 * returned with ucs_mpool_chunk_free; no per-object init or cleanup hook. */
static ucs_mpool_ops_t uct_ugni_flush_mpool_ops = {
    .obj_init      = NULL,
    .obj_cleanup   = NULL,
    .chunk_alloc   = ucs_mpool_chunk_malloc,
    .chunk_release = ucs_mpool_chunk_free
};

void uct_ugni_cleanup_base_iface(uct_ugni_iface_t *iface)
{
ucs_arbiter_cleanup(&iface->arbiter);
ucs_mpool_cleanup(&iface->flush_pool, 1);
GNI_CqDestroy(iface->local_cq);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

error log

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shamisp Thing is, the next PR pulls out all of the GNI_Cq* calls and has common spin locked code paths with error logging. I'd like to fix these problems in the next PR instead.

uct_ugni_destroy_cdm(&iface->cdm);
}

UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
const uct_iface_params_t *params,
uct_iface_ops_t *uct_ugni_iface_ops,
const uct_iface_config_t *tl_config
UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
{
uct_ugni_device_t *dev;
gni_return_t ugni_rc;
ucs_status_t status;
uct_ugni_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_iface_config_t);
unsigned grow = (config->mpool.bufs_grow == 0) ? 128 : config->mpool.bufs_grow;

UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker,
params, tl_config UCS_STATS_ARG(params->stats_root)
UCS_STATS_ARG(UCT_UGNI_MD_NAME));
dev = uct_ugni_device_by_name(params->dev_name);
if (NULL == dev) {
ucs_error("No device was found: %s", params->dev_name);
return UCS_ERR_NO_DEVICE;
}
UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker,
params, tl_config UCS_STATS_ARG(params->stats_root)
UCS_STATS_ARG(UCT_UGNI_MD_NAME));
self->dev = dev;
self->activated = false;
status = uct_ugni_create_cdm(&self->cdm, dev, worker->thread_mode);
if (UCS_OK != status) {
ucs_error("Failed to UGNI NIC, Error status: %d", status);
return status;
}
ugni_rc = GNI_CqCreate(uct_ugni_iface_nic_handle(self), UCT_UGNI_LOCAL_CQ, 0,
GNI_CQ_NOBLOCK,
NULL, NULL, &self->local_cq);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CqCreate failed, Error status: %s %d",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't somebody have to clean up the CDM?

gni_err_str[ugni_rc], ugni_rc);
goto clean_cdm;
}
self->outstanding = 0;
sglib_hashed_uct_ugni_ep_t_init(self->eps);
ucs_arbiter_init(&self->arbiter);
Expand All @@ -378,18 +294,23 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
"UGNI-DESC-ONLY");
if (UCS_OK != status) {
ucs_error("Could not init iface");
goto clean_cq;
}
return status;
clean_cq:
GNI_CqDestroy(self->local_cq);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Log error code

clean_cdm:
uct_ugni_destroy_cdm(&self->cdm);
return status;
}

UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_iface_t, uct_iface_t, uct_md_h, uct_worker_h,
const uct_iface_params_t*, uct_iface_ops_t *,
const uct_iface_config_t * UCS_STATS_ARG(ucs_stats_node_t *));

static UCS_CLASS_CLEANUP_FUNC(uct_ugni_iface_t){

ugni_deactivate_iface(self);
ucs_arbiter_cleanup(&self->arbiter);
static UCS_CLASS_CLEANUP_FUNC(uct_ugni_iface_t)
{
uct_ugni_cleanup_base_iface(self);
}

UCS_CLASS_DEFINE(uct_ugni_iface_t, uct_base_iface_t);
Loading