Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-15120 crt: Misc d_rand-related fixes #13738

Merged
merged 1 commit into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
*/
#include <malloc.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "crt_internal.h"

Expand Down Expand Up @@ -68,6 +67,7 @@ crt_lib_init(void)
{
int rc;
uint64_t start_rpcid;
struct timespec now;

rc = D_RWLOCK_INIT(&crt_gdata.cg_rwlock, NULL);
D_ASSERT(rc == 0);
Expand All @@ -82,7 +82,9 @@ crt_lib_init(void)
crt_gdata.cg_inited = 0;
crt_gdata.cg_primary_prov = CRT_PROV_OFI_TCP_RXM;

d_srand(d_timeus_secdiff(0) + getpid());
rc = d_gettime(&now);
D_ASSERTF(rc == 0, "d_gettime: " DF_RC "\n", DP_RC(rc));
d_srand(now.tv_sec * 1000 * 1000 * 1000 + now.tv_nsec + getpid());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: i feel now.tv_nsec + getpid() is sufficient for seed. wont now.tv_sec * 1000 * 1000 * 1000 cause overflows? coverity might complain if so:)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't make this seeding change until I noticed a couple of identical seeds (printed by a temporary fprintf that I've deleted from this PR) being used during a few dmg system start commands of 6 engines. The nanosecond-based seeding seemed to be absent of such collisions. (Actually I would prefer calling getrandom to obtain truly random bytes, but let's see how the current approach works first.)

I think overflows in this case is okay, for d_srand uses the lower 32 bits only. Didn't think of Coverity though; if Coverity complains, that's my bad and feel free to assign such issue to me. :)

start_rpcid = ((uint64_t)d_rand()) << 32;

crt_gdata.cg_rpcid = start_rpcid;
Expand Down Expand Up @@ -610,8 +612,6 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt)
char *domain_env = NULL;
char *auth_key;
char *auth_key_env = NULL;
struct timeval now;
unsigned int seed;
char *path;
bool server = flags & CRT_FLAG_BIT_SERVER;
int rc = 0;
Expand Down Expand Up @@ -673,11 +673,6 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt)

D_RWLOCK_WRLOCK(&crt_gdata.cg_rwlock);
if (crt_gdata.cg_inited == 0) {
/* feed a seed for pseudo-random number generator */
gettimeofday(&now, NULL);
seed = (unsigned int)(now.tv_sec * 1000000 + now.tv_usec);
d_srand(seed);

crt_gdata.cg_server = server;
crt_gdata.cg_auto_swim_disable =
(flags & CRT_FLAG_BIT_AUTO_SWIM_DISABLE) ? 1 : 0;
Expand Down
4 changes: 2 additions & 2 deletions src/common/rsvc.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ rsvc_client_choose(struct rsvc_client *client, crt_endpoint_t *ep)
chosen = client->sc_leader_index;
} else {
if (client->sc_next < 0)
client->sc_next = d_randn(client->sc_ranks->rl_nr);
client->sc_next = d_rand() % client->sc_ranks->rl_nr;
chosen = client->sc_next;
/* The hintless search is a round robin of all replicas. */
client->sc_next++;
Expand Down Expand Up @@ -148,7 +148,7 @@ rsvc_client_process_error(struct rsvc_client *client, int rc,
* search.
*/
D_DEBUG(DB_MD, "give up leader rank %u\n", ep->ep_rank);
client->sc_next = d_randn(client->sc_ranks->rl_nr);
client->sc_next = d_rand() % client->sc_ranks->rl_nr;
if (client->sc_next == leader_index) {
client->sc_next++;
client->sc_next %= client->sc_ranks->rl_nr;
Expand Down
15 changes: 1 addition & 14 deletions src/gurt/misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,6 @@ d_rand()
return result;
}

/* Return a random integer in [0, n), where n must be positive. */
long int
d_randn(long int n)
{
long int i;

D_ASSERT(n > 0);
i = ((double)d_rand() / D_RAND_MAX) * n;
if (i >= n)
i = 0;
return i;
}

/* Developer/debug version, poison memory on free.
* This tries several ways to access the buffer size however none of them are perfect so for now
* this is no in release builds.
Expand Down Expand Up @@ -544,7 +531,7 @@ d_rank_list_shuffle(d_rank_list_t *rank_list)
return;

for (i = 0; i < rank_list->rl_nr; i++) {
j = rand() % rank_list->rl_nr;
j = d_rand() % rank_list->rl_nr;
tmp = rank_list->rl_ranks[i];
rank_list->rl_ranks[i] = rank_list->rl_ranks[j];
rank_list->rl_ranks[j] = tmp;
Expand Down
1 change: 0 additions & 1 deletion src/include/gurt/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ extern "C" {

void d_srand(long int);
long int d_rand(void);
long int d_randn(long int n);

/* Instruct the compiler these are allocation functions that return a pointer, and if possible
* which function needs to be used to free them.
Expand Down
4 changes: 1 addition & 3 deletions src/pool/cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -1631,9 +1631,7 @@ choose_map_refresh_rank(struct map_refresh_arg *arg)

if (arg->mra_i == -1) {
/* Let i be a random integer in [0, n). */
i = ((double)rand() / RAND_MAX) * n;
if (i == n)
i = 0;
i = d_rand() % n;
} else {
/* Continue the round robin. */
i = arg->mra_i;
Expand Down
6 changes: 3 additions & 3 deletions src/pool/srv_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ init_reconf_map(struct pool_map *map, d_rank_list_t *replicas, d_rank_t self,

/* Shuffle rmap.rcm_domains for randomness in replica placement. */
for (i = 0; i < rmap.rcm_domains_len; i++) {
int j = i + d_randn(rmap.rcm_domains_len - i);
int j = i + d_rand() % (rmap.rcm_domains_len - i);

D_ASSERTF(i <= j && j < rmap.rcm_domains_len, "i=%d j=%d len=%d\n", i, j,
rmap.rcm_domains_len);
Expand Down Expand Up @@ -441,7 +441,7 @@ find_vacancy_in_domain(struct reconf_domain *rdomain, d_rank_list_t *replicas)
if ((engine->do_comp.co_status & POOL_SVC_MAP_STATES) &&
!d_rank_list_find(replicas, engine->do_comp.co_rank, NULL /* idx */)) {
/* Pick this vacant engine with a probability of 1/n. */
if (d_randn(n) == 0)
if (d_rand() % n == 0)
return engine->do_comp.co_rank;
n--;
}
Expand Down Expand Up @@ -478,7 +478,7 @@ find_replica_in_domain(struct reconf_domain *rdomain, d_rank_list_t *replicas, d
if ((engine->do_comp.co_status & POOL_SVC_MAP_STATES) &&
engine->do_comp.co_rank != self &&
d_rank_list_find(replicas, engine->do_comp.co_rank, NULL /* idx */)) {
if (d_randn(n) == 0)
if (d_rand() % n == 0)
return engine->do_comp.co_rank;
n--;
}
Expand Down
Loading