Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into feature/vos_on_blob_p2
Browse files Browse the repository at this point in the history
Required-githooks: true
  • Loading branch information
tanabarr committed Sep 8, 2024
2 parents aa2cba4 + 6a59b26 commit d0a295c
Show file tree
Hide file tree
Showing 106 changed files with 4,022 additions and 934 deletions.
1 change: 1 addition & 0 deletions docs/admin/env_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Environment variables in this section only apply to the server side.
|DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.|
|DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.|
|DAOS\_FORWARD\_NEIGHBOR|Set to enable I/O forwarding on neighbor xstream in the absence of helper threads.|
|DAOS\_POOL\_RF|Redundancy factor for the pool. The valid range is [1, 4]. The default value is 2.|

## Server and Client environment variables

Expand Down
24 changes: 24 additions & 0 deletions docs/admin/pool_operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,30 @@ and possibly repair a pmemobj file. As discussed in the previous section, the
rebuild status can be consulted via the pool query and will be expanded
with more information.

## Pool Redundancy Factor

If the DAOS system experiences cascading failures, where the number of failed
fault domains exceeds a pool's redundancy factor, there could be unrecoverable
errors and applications could suffer from data loss. This can happen when power
or network outages cause node/engine failures. In most cases, those failures
are recoverable: the DAOS engines can be restarted and the system can function
again.

The administrator can set the default pool redundancy factor with the environment
variable "DAOS_POOL_RF" in the server yaml file. If SWIM detects and reports that an
engine is dead and the number of failed fault domains exceeds, or is about to exceed,
the pool redundancy factor, the pool map is not changed immediately. Instead, a
critical log message is emitted:
intolerable unavailability: engine rank x
In this case, the system administrator should investigate and try to recover the
failed engines, bringing them back one at a time with:
dmg system start --ranks=x
A reintegrate call is not needed.

For truly unrecoverable failures, the administrator can still exclude the affected
engines. However, data loss is expected because the number of unrecoverable failures
exceeds the pool redundancy factor.

## Recovering Container Ownership

Typically users are expected to manage their containers. However, in the event
Expand Down
2 changes: 1 addition & 1 deletion src/cart/README.env
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ This file lists the environment variables used in CaRT.

. CRT_CTX_NUM
If set, specifies the limit of number of allowed CaRT contexts to be created.
Valid range is [1, 64], with default being 64 if unset.
Valid range is [1, 128], with default being 128 if unset.

. D_FI_CONFIG
Specifies the fault injection configuration file. If this variable is not set
Expand Down
2 changes: 1 addition & 1 deletion src/cart/crt_internal_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#define CRT_CONTEXT_NULL (NULL)

#ifndef CRT_SRV_CONTEXT_NUM
#define CRT_SRV_CONTEXT_NUM (64) /* Maximum number of contexts */
#define CRT_SRV_CONTEXT_NUM (128) /* Maximum number of contexts */
#endif


Expand Down
4 changes: 2 additions & 2 deletions src/chk/chk_engine.c
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ chk_engine_pool_mbs_one(struct chk_pool_rec *cpr, struct pool_map *map, struct c
int rc = 0;
bool unknown;

dom = pool_map_find_node_by_rank(map, mbs->cpm_rank);
dom = pool_map_find_dom_by_rank(map, mbs->cpm_rank);
if (dom == NULL) {
D_ASSERT(mbs->cpm_rank != dss_self_rank());

Expand Down Expand Up @@ -777,7 +777,7 @@ chk_engine_find_dangling_pm(struct chk_pool_rec *cpr, struct pool_map *map)
int j;
bool down;

rank_nr = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms);
rank_nr = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms);
if (rank_nr <= 0)
D_GOTO(out, rc = rank_nr);

Expand Down
2 changes: 1 addition & 1 deletion src/client/api/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def scons():

if prereqs.client_requested():
libdaos = env.d_library('daos', libdaos_tgts, SHLIBVERSION=API_VERSION,
LIBS=['daos_common'])
LIBS=['daos_common', 'numa'])
if hasattr(env, 'InstallVersionedLib'):
env.InstallVersionedLib('$PREFIX/lib64/', libdaos, SHLIBVERSION=API_VERSION)
else:
Expand Down
39 changes: 34 additions & 5 deletions src/client/api/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,24 @@ daos_eq_lib_init(crt_init_options_t *crt_info)
D_GOTO(unlock, rc);
}

/* use a global shared context for all eq for now */
rc = crt_context_create(&daos_eq_ctx);
if (d_dynamic_ctx_g) {
char iface[DAOS_SYS_INFO_STRING_MAX];

rc = dc_mgmt_get_iface(&iface[0]);
if (rc && rc != -DER_NONEXIST) {
D_ERROR("failed to get iface: " DF_RC "\n", DP_RC(rc));
D_GOTO(crt, rc);
}
/** if no interface returned, use the default */
if (rc == -DER_NONEXIST)
rc = crt_context_create(&daos_eq_ctx);
else
rc = crt_context_create_on_iface(iface, &daos_eq_ctx);
} else {
rc = crt_context_create(&daos_eq_ctx);
}
if (rc != 0) {
D_ERROR("failed to create client context: "DF_RC"\n",
DP_RC(rc));
D_ERROR("failed to create client context: " DF_RC "\n", DP_RC(rc));
D_GOTO(crt, rc);
}

Expand Down Expand Up @@ -656,7 +669,23 @@ daos_eq_create(daos_handle_t *eqh)

eqx = daos_eq2eqx(eq);

rc = crt_context_create(&eqx->eqx_ctx);
if (d_dynamic_ctx_g) {
char iface[DAOS_SYS_INFO_STRING_MAX];

rc = dc_mgmt_get_iface(&iface[0]);
if (rc && rc != -DER_NONEXIST) {
D_ERROR("failed to get iface: " DF_RC "\n", DP_RC(rc));
return rc;
}

/** if no interface returned, use the default */
if (rc == -DER_NONEXIST)
rc = crt_context_create(&eqx->eqx_ctx);
else
rc = crt_context_create_on_iface(iface, &eqx->eqx_ctx);
} else {
rc = crt_context_create(&eqx->eqx_ctx);
}
if (rc) {
D_WARN("Failed to create CART context; using the global one, "DF_RC"\n", DP_RC(rc));
eqx->eqx_ctx = daos_eq_ctx;
Expand Down
15 changes: 9 additions & 6 deletions src/client/dfs/dfs_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,6 @@
/** Max recursion depth for symlinks */
#define DFS_MAX_RECURSION 40

/** MAX value for the HI OID */
#define MAX_OID_HI ((1UL << 32) - 1)

typedef uint64_t dfs_magic_t;
typedef uint16_t dfs_sb_ver_t;
typedef uint16_t dfs_layout_ver_t;
Expand Down Expand Up @@ -164,6 +161,8 @@ struct dfs {
daos_handle_t coh;
/** refcount on cont handle that through the DFS API */
uint32_t coh_refcount;
/** The last oid.hi in the sequence */
uint32_t last_hi;
/** Transaction handle epoch. DAOS_EPOCH_MAX for DAOS_TX_NONE */
daos_epoch_t th_epoch;
/** Transaction handle */
Expand Down Expand Up @@ -343,20 +342,24 @@ oid_gen(dfs_t *dfs, daos_oclass_id_t oclass, bool file, daos_obj_id_t *oid)

D_MUTEX_LOCK(&dfs->lock);
/** If we ran out of local OIDs, alloc one from the container */
if (dfs->oid.hi >= MAX_OID_HI) {
if (dfs->oid.hi == dfs->last_hi) {
/** Allocate an OID for the namespace */
rc = daos_cont_alloc_oids(dfs->coh, 1, &dfs->oid.lo, NULL);
if (rc) {
D_ERROR("daos_cont_alloc_oids() Failed (%d)\n", rc);
D_MUTEX_UNLOCK(&dfs->lock);
return daos_der2errno(rc);
}
dfs->oid.hi = 0;
/** Start such that dfs->last_hi will be final value */
dfs->oid.hi = dfs->last_hi;
}

/** set oid and lo, bump the current hi value */
oid->lo = dfs->oid.lo;
oid->hi = dfs->oid.hi++;
daos_obj_oid_cycle(&dfs->oid);
if (unlikely(dfs->oid.lo == RESERVED_LO && dfs->oid.hi <= 1))
daos_obj_oid_cycle(&dfs->oid); /* Avoid reserved oids */
oid->hi = dfs->oid.hi;
D_MUTEX_UNLOCK(&dfs->lock);

/** if a regular file, use UINT64 typed dkeys for the array object */
Expand Down
18 changes: 9 additions & 9 deletions src/client/dfs/mnt.c
Original file line number Diff line number Diff line change
Expand Up @@ -685,20 +685,20 @@ dfs_mount(daos_handle_t poh, daos_handle_t coh, int flags, dfs_t **_dfs)

/** if RW, allocate an OID for the namespace */
if (amode == O_RDWR) {
dfs->last_hi = (unsigned int)d_rand();
/** Avoid potential conflict with SB or ROOT */
if (dfs->last_hi <= 1)
dfs->last_hi = 2;

rc = daos_cont_alloc_oids(coh, 1, &dfs->oid.lo, NULL);
if (rc) {
D_ERROR("daos_cont_alloc_oids() Failed, " DF_RC "\n", DP_RC(rc));
D_GOTO(err_root, rc = daos_der2errno(rc));
}

/*
* if this is the first time we allocate on this container,
* account 0 for SB, 1 for root obj.
*/
if (dfs->oid.lo == RESERVED_LO)
dfs->oid.hi = ROOT_HI + 1;
else
dfs->oid.hi = 0;
dfs->oid.hi = dfs->last_hi;
/** Increment so that dfs->last_hi is the last value */
daos_obj_oid_cycle(&dfs->oid);
}

dfs->mounted = DFS_MOUNT;
Expand Down Expand Up @@ -1023,7 +1023,7 @@ dfs_global2local(daos_handle_t poh, daos_handle_t coh, int flags, d_iov_t glob,

/** allocate a new oid on the next file or dir creation */
dfs->oid.lo = 0;
dfs->oid.hi = MAX_OID_HI;
dfs->oid.hi = dfs->last_hi;

rc = D_MUTEX_INIT(&dfs->lock, NULL);
if (rc != 0) {
Expand Down
5 changes: 2 additions & 3 deletions src/client/dfs/obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,14 @@ dfs_obj_get_info(dfs_t *dfs, dfs_obj_t *obj, dfs_obj_info_t *info)
if (dfs->attr.da_dir_oclass_id)
info->doi_dir_oclass_id = dfs->attr.da_dir_oclass_id;
else
rc = daos_obj_get_oclass(dfs->coh, 0, 0, 0,
rc = daos_obj_get_oclass(dfs->coh, DAOS_OT_MULTI_HASHED, 0, 0,
&info->doi_dir_oclass_id);

if (dfs->attr.da_file_oclass_id)
info->doi_file_oclass_id = dfs->attr.da_file_oclass_id;
else
rc = daos_obj_get_oclass(dfs->coh, 0, 0, 0,
rc = daos_obj_get_oclass(dfs->coh, DAOS_OT_ARRAY_BYTE, 0, 0,
&info->doi_file_oclass_id);

if (rc) {
D_ERROR("daos_obj_get_oclass() failed " DF_RC "\n", DP_RC(rc));
return daos_der2errno(rc);
Expand Down
54 changes: 23 additions & 31 deletions src/common/lru.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ lru_hop_rec_decref(struct d_hash_table *htable, d_list_t *link)

D_ASSERT(llink->ll_ref > 0);
llink->ll_ref--;
if (llink->ll_ref == 1 && llink->ll_ops->lop_wakeup)

/* eviction waiter is the last one holds refcount */
if (llink->ll_wait_evict &&
llink->ll_ops->lop_wakeup && daos_lru_is_last_user(llink))
llink->ll_ops->lop_wakeup(llink);

/* Delete from hash only if no more references */
Expand Down Expand Up @@ -215,15 +218,6 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
if (link != NULL) {
llink = link2llink(link);
D_ASSERT(llink->ll_evicted == 0);
if (llink->ll_evicting) {
/**
* Avoid calling `lru_hop_rec_decref()` at this point
* to prevent `wakeup()` from being invoked twice.
*/
D_ASSERT(llink->ll_ref > 1);
llink->ll_ref--;
D_GOTO(out, rc = -DER_SHUTDOWN);
}
/* remove busy item from LRU */
if (!d_list_empty(&llink->ll_qlink))
d_list_del_init(&llink->ll_qlink);
Expand Down Expand Up @@ -257,24 +251,17 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
return rc;
}

static void
lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink, bool wait)
void
daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1);
D_ASSERT(d_list_empty(&llink->ll_qlink));

lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link);

if (wait && llink->ll_ref > 1) {
D_ASSERT(llink->ll_evicting == 0);
llink->ll_evicting = 1;
lcache->dlc_ops->lop_wait(llink);
llink->ll_evicting = 0;
llink->ll_evicted = 1;
}

if (llink->ll_ref == 1) { /* the last refcount */
if (lcache->dlc_csize == 0)
/* zero-sized cache always evicts unused item */
if (lcache->dlc_csize == 0 && !llink->ll_evicted)
llink->ll_evicted = 1;

if (llink->ll_evicted) {
Expand All @@ -297,15 +284,20 @@ lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink
}

void
daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
lru_ref_release_internal(lcache, llink, false);
}

void
daos_lru_ref_wait_evict(struct daos_lru_cache *lcache, struct daos_llink *llink)
daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
D_ASSERT(lcache->dlc_ops->lop_wait);

lru_ref_release_internal(lcache, llink, true);
if (!llink->ll_evicted)
daos_lru_ref_evict(lcache, llink);

if (lcache->dlc_ops->lop_wait && !daos_lru_is_last_user(llink)) {
/* Wait until I'm the last one.
* XXX: the implementation can only support one waiter for now, if there
* is a secondary ULT calls this function on the same item, it will hit
* the assertion.
*/
D_ASSERT(!llink->ll_wait_evict);
llink->ll_wait_evict = 1;
lcache->dlc_ops->lop_wait(llink);
llink->ll_wait_evict = 0;
}
}
Loading

0 comments on commit d0a295c

Please sign in to comment.