Skip to content

Commit

Permalink
test
Browse files Browse the repository at this point in the history
test
  • Loading branch information
yyyshi committed Mar 19, 2024
1 parent c8151d4 commit c27d7ba
Show file tree
Hide file tree
Showing 44 changed files with 595 additions and 38 deletions.
120 changes: 107 additions & 13 deletions src/bio/bio_buffer.c

Large diffs are not rendered by default.

16 changes: 15 additions & 1 deletion src/bio/bio_bulk.c
Original file line number Diff line number Diff line change
Expand Up @@ -608,43 +608,53 @@ roundup_pgs(unsigned int pgs)
int
bulk_map_one(struct bio_desc *biod, struct bio_iov *biov, void *data)
{
// data carries the bulk arguments (struct bio_bulk_args); its ba_bulk_ctxt must be set
// bulk map one 里面可能会执行 dma map one
struct bio_bulk_args *arg = data;
struct bio_bulk_hdl *hdl = NULL;
uint64_t off, end;
unsigned int pg_cnt, pg_off;
int rc = 0;

D_ASSERT(bulk_create_fn != NULL && bulk_free_fn != NULL);
// 必不为空且内部的bulk ctx 必不为空
D_ASSERT(arg != NULL && arg->ba_bulk_ctxt != NULL);

D_ASSERT(biod && biod->bd_chk_type == BIO_CHK_TYPE_IO);
D_ASSERT(biod->bd_rdma);
D_ASSERT(biov);

// 如果当前还没有bulk hdl数组,先初始化一下
if (biod->bd_bulk_hdls == NULL) {
rc = bulk_iod_init(biod);
if (rc)
return rc;
}

/* Zero length IOV */
// 跳过数据长度为 0 的biov
if (bio_iov2req_len(biov) == 0) {
D_ASSERT(bio_iov2raw_len(biov) == 0);
bio_iov_set_raw_buf(biov, NULL);
goto done;
}

// biov 转化为page cnt 和page off。就是size 单元不同了
// 最后调用spdk bio 接口时还要将pg 转化 为io unit
// todo: 一个数据最终存储到硬盘中,是怎么决定存储在哪里的?一定是有特殊的规划和管理的,这个是在哪里确定的
// todo: 为啥这个object 要存到硬盘的这里,而那个object 要存到硬盘的那里?
// todo: 数据放在哪里了
dma_biov2pg(biov, &off, &end, &pg_cnt, &pg_off);

// todo: confirm the exact bypass conditions -- presumably the bulk cache is skipped when
// the iov is too large for it, is a hole, or lives on an SCM device
if (bypass_bulk_cache(biod, biov, pg_cnt)) {
// dma_map_one 这里还会执行biov (biov 是从biod 中取出的)到pg 转化: dma_biov2pg
rc = dma_map_one(biod, biov, NULL);
goto done;
}
D_ASSERT(!BIO_ADDR_IS_DEDUP(&biov->bi_addr));

// 获取bulk 的hdl
hdl = bulk_get_hdl(biod, biov, roundup_pgs(pg_cnt), pg_off, arg);
if (hdl == NULL) {
if (biod->bd_retry)
Expand All @@ -654,7 +664,9 @@ bulk_map_one(struct bio_desc *biod, struct bio_iov *biov, void *data)
return -DER_NOMEM;
}

// 更新dma buffer 的地址
bio_iov_set_raw_buf(biov, bulk_hdl2addr(hdl, pg_off));
// 添加region 到biod 的rsrvd_dma 里面的region 数组里
rc = iod_add_region(biod, hdl->bbh_chunk, hdl->bbh_pg_idx, hdl->bbh_used_bytes,
off, end, bio_iov2media(biov));
if (rc) {
Expand All @@ -663,6 +675,7 @@ bulk_map_one(struct bio_desc *biod, struct bio_iov *biov, void *data)
}

/* Update the used bytes for shared handle */
// Advance the used-byte count of the shared handle by the length of this biov
if (hdl->bbh_shareable) {
D_ASSERT(hdl->bbh_bulk_off == 0);
hdl->bbh_used_bytes += bio_iov2len(biov);
Expand All @@ -671,6 +684,7 @@ bulk_map_one(struct bio_desc *biod, struct bio_iov *biov, void *data)
D_ASSERT(biod->bd_bulk_hdls != NULL);
D_ASSERT(biod->bd_bulk_cnt < biod->bd_bulk_max);

// 添加当前biod 保存的bulk hdl
biod->bd_bulk_hdls[biod->bd_bulk_cnt] = hdl;
biod->bd_bulk_cnt++;

Expand Down
28 changes: 26 additions & 2 deletions src/bio/bio_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ struct bio_dma_chunk {
/* Link to edb_idle_list or edb_used_list or bbg_dma_chks */
d_list_t bdc_link;
/* Base pointer of the chunk address */
// chunk 的base 地址
void *bdc_ptr;
/* Page offset (4K page) to unused fraction */
unsigned int bdc_pg_idx;
Expand Down Expand Up @@ -391,11 +392,17 @@ struct bio_xs_context {
/* Per VOS instance I/O context */
struct bio_io_context {
d_list_t bic_link; /* link to bxb_io_ctxts */
// 当前bio ctx 的blob 和blobstore
struct spdk_blob *bic_blob;
struct bio_xs_blobstore *bic_xs_blobstore;
// xs 的ctx
struct bio_xs_context *bic_xs_ctxt;
uint32_t bic_inflight_dmas;
// xs 的io unit
// 根据bs 可以获取对应的io unit 个数
// 根据blob 可以获取对应的cluster 个数
uint32_t bic_io_unit;
// 当前bio 对应的pool 的uuid
uuid_t bic_pool_id;
unsigned int bic_opening:1,
bic_closing:1,
Expand All @@ -405,12 +412,16 @@ struct bio_io_context {
/* A contiguous DMA buffer region reserved by certain io descriptor */
struct bio_rsrvd_region {
/* The DMA chunk where the region is located */
// region 所在的chunk
struct bio_dma_chunk *brr_chk;
/* Start page idx within the DMA chunk */
// dma chunk 的起始页索引
unsigned int brr_pg_idx;
/* Payload offset (from brr_pg_idx) in bytes, used for SCM only */
// 这个是单独给scm 用的
unsigned int brr_chk_off;
/* Offset within the SPDK blob in bytes */
// spdk blob 的offset 字节
uint64_t brr_off;
/* End (not included) in bytes */
uint64_t brr_end;
Expand All @@ -419,27 +430,36 @@ struct bio_rsrvd_region {
};

/* Reserved DMA buffer for certain io descriptor */
// Bookkeeping for two arrays: the reserved DMA regions and the DMA chunks they reference
struct bio_rsrvd_dma {
/* DMA regions reserved by the io descriptor */
// Array of DMA regions
struct bio_rsrvd_region *brd_regions;
/* Capacity of the region array */
// Number of slots allocated in brd_regions
unsigned int brd_rg_max;
/* Total number of reserved regions */
// Number of entries currently held in brd_regions
unsigned int brd_rg_cnt;
/* Pointer array for all referenced DMA chunks */
// Array of pointers, one per referenced DMA chunk
struct bio_dma_chunk **brd_dma_chks;
/* Capacity of the pointer array */
// Upper limit on the number of DMA chunk pointers in brd_dma_chks
unsigned int brd_chk_max;
/* Total number of chunks being referenced */
// Number of DMA chunk pointers currently stored in brd_dma_chks
unsigned int brd_chk_cnt;
};

/* I/O descriptor */
struct bio_desc {
struct umem_instance *bd_umem;
// bio ctx,里面有xs 的ctx
struct bio_io_context *bd_ctxt;
/* DMA buffers reserved by this io descriptor */
// 当前biod 拥有的dma buffers,又会按照region 来拆分
// todo: 这里存储的是当前已经预留的所有资源,包括scm 和nvme 的资源,这个将决定哪个object 将被写到哪个硬盘上
struct bio_rsrvd_dma bd_rsrvd;
/* Report blob i/o completion */
ABT_eventual bd_dma_done;
Expand All @@ -460,6 +480,7 @@ struct bio_desc {
bd_async_post:1,
bd_non_blocking:1;
/* Cached bulk handles being used by this IOD */
// 当前iod 缓存的bulk hdls
struct bio_bulk_hdl **bd_bulk_hdls;
unsigned int bd_bulk_max;
unsigned int bd_bulk_cnt;
Expand Down Expand Up @@ -589,6 +610,7 @@ iod_dma_buf(struct bio_desc *biod)
D_ASSERT(biod->bd_ctxt->bic_xs_ctxt);
D_ASSERT(biod->bd_ctxt->bic_xs_ctxt->bxc_dma_buf);

// 返回xs ctx 的dma buffer
return biod->bd_ctxt->bic_xs_ctxt->bxc_dma_buf;
}

Expand All @@ -597,16 +619,18 @@ dma_biov2pg(struct bio_iov *biov, uint64_t *off, uint64_t *end,
unsigned int *pg_cnt, unsigned int *pg_off)
{
// biov 的头尾。中间包含的是当前biov 这些数据
// todo: 这里内部又是根据什么转化的,即 bi_addr.ba_off 是怎么来的
*off = bio_iov2raw_off(biov);
*end = bio_iov2raw_off(biov) + bio_iov2raw_len(biov);

// 不同设备的转换方式不同
// todo: 为啥不同设备的转换方式不同
if (bio_iov2media(biov) == DAOS_MEDIA_SCM) {
// 1. pmem 场景。每页4k大小
*pg_cnt = (*end - *off + BIO_DMA_PAGE_SZ - 1) >>
BIO_DMA_PAGE_SHIFT;
*pg_off = 0;
} else {
// 转换成page cnt 和page off
// 2. nvme :转换成page cnt 和page off
*pg_cnt = ((*end + BIO_DMA_PAGE_SZ - 1) >> BIO_DMA_PAGE_SHIFT) -
(*off >> BIO_DMA_PAGE_SHIFT);
*pg_off = *off & ((uint64_t)BIO_DMA_PAGE_SZ - 1);
Expand Down
1 change: 1 addition & 0 deletions src/common/debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ io_bypass_init(void)
tok = daos_str_trimwhite(tok);
for (iob = &io_bypass_dict[0]; iob->iob_str; iob++) {
if (strcasecmp(tok, iob->iob_str) == 0) {
// 用于debug 的io bypass 参数
daos_io_bypass |= iob->iob_bit;
D_PRINT("debugging mode: %s is disabled\n",
iob->iob_str);
Expand Down
4 changes: 4 additions & 0 deletions src/common/lru.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,20 +203,23 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
if (lcache->dlc_ops->lop_print_key)
lcache->dlc_ops->lop_print_key(key, key_size);

// 查询
link = d_hash_rec_find(&lcache->dlc_htable, key, key_size);
if (link != NULL) {
llink = link2llink(link);
D_ASSERT(llink->ll_evicted == 0);
/* remove busy item from LRU */
if (!d_list_empty(&llink->ll_qlink))
d_list_del_init(&llink->ll_qlink);
// 如果查到了,返回
D_GOTO(found, rc = 0);
}

if (create_args == NULL)
D_GOTO(out, rc = -DER_NONEXIST);

/* llink does not exist create one */
// 如果没查到,那么创建一个并insert 进去
rc = lcache->dlc_ops->lop_alloc_ref(key, key_size, create_args, &llink);
if (rc)
D_GOTO(out, rc);
Expand All @@ -227,6 +230,7 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
llink->ll_ops = lcache->dlc_ops;
D_INIT_LIST_HEAD(&llink->ll_qlink);

// Insert the newly allocated llink into the LRU cache's hash table (dlc_htable)
rc = d_hash_rec_insert(&lcache->dlc_htable, key, key_size,
&llink->ll_link, true);
if (rc) {
Expand Down
2 changes: 2 additions & 0 deletions src/common/misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,7 @@ struct d_hlink *
daos_hhash_link_lookup(uint64_t key)
{
D_ASSERT(daos_ht.dht_hhash != NULL);
// todo: 啥时候插入的
return d_hhash_link_lookup(daos_ht.dht_hhash, key);
}

Expand Down Expand Up @@ -721,6 +722,7 @@ daos_dti_gen(struct dtx_id *dti, bool zero)
uuid_generate(dti_uuid);

uuid_copy(dti->dti_uuid, dti_uuid);
// todo: 为啥这里不是epoch
dti->dti_hlc = d_hlc_get();
}
}
Expand Down
6 changes: 6 additions & 0 deletions src/common/pool_map.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ struct pool_comp_sorter {
};

/** In memory data structure for pool map */
// pool map 的内存信息
struct pool_map {
/** protect the refcount */
pthread_mutex_t po_lock;
Expand All @@ -62,18 +63,23 @@ struct pool_map {
/** refcount on the pool map */
int po_ref;
/** # domain layers */
// pool 的容错域信息
unsigned int po_domain_layers;
/**
* Sorters for the binary search of different domain types.
* These sorters are in ascending order for binary search of sorters.
*/
// 排好序的容错域信息
struct pool_comp_sorter *po_domain_sorters;
/** sorter for binary search of target */
// 排好序的target 们
struct pool_comp_sorter po_target_sorter;
/**
* Tree root of all components.
* NB: All components must be stored in contiguous buffer.
*/
// 当前pool 的components 的tree 的root
// todo: 怎么构建和维护的tree
struct pool_domain *po_tree;
/**
* number of currently failed pool components of each type
Expand Down
10 changes: 10 additions & 0 deletions src/dtx/dtx_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -1309,6 +1309,8 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul

D_ASSERT(dth->dth_mbs != NULL);

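// Allocate a dtx entry (dte) sized to hold the dte itself, an mbs header and
// dm_data_size bytes of dth_mbs data
// todo: clarify why the dte and its mbs are allocated as a single buffer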
size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size;
D_ALLOC(dte, size);
if (dte == NULL) {
Expand All @@ -1335,6 +1337,8 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
flags = DCF_SHARED;
else
flags = 0;

// 在一次dtx 要结束的时候,向cos 缓存中添加dtx entry
rc = dtx_add_cos(cont, dte, &dth->dth_leader_oid,
dth->dth_dkey_hash, dth->dth_epoch, flags);
dtx_entry_put(dte);
Expand Down Expand Up @@ -1505,7 +1509,9 @@ dtx_end(struct dtx_handle *dth, struct ds_cont_child *cont, int result)
if (daos_is_zero_dti(&dth->dth_xid))
goto out;

// 说明tx 执行出错了
if (result < 0) {
// 虽然本次执行错误,但是因为cos 缓存中还没done,所以需要commit
if (dth->dth_dti_cos_count > 0 && !dth->dth_cos_done) {
int rc;

Expand All @@ -1517,6 +1523,10 @@ dtx_end(struct dtx_handle *dth, struct ds_cont_child *cont, int result)
* to commit them, because they are still in CoS cache,
* and can be committed next time.
*/
// todo: 什么意思
// 1. 如果当前节点为非leader 节点,即使由于某种原因导致修改失败,我们依然需要
// 提交piggyback dtxs,因为这些可能已经在其他节点被提交了
// 2. 如果当前节点是leader 节点,是否提交失败不重要,因为他们已经在cos 缓存中了,下一次一定会被提交
rc = vos_dtx_commit(cont->sc_hdl, dth->dth_dti_cos,
dth->dth_dti_cos_count, NULL);
if (rc < 0)
Expand Down
6 changes: 6 additions & 0 deletions src/dtx/dtx_cos.c
Original file line number Diff line number Diff line change
Expand Up @@ -333,11 +333,14 @@ dtx_list_cos(struct ds_cont_child *cont, daos_unit_oid_t *oid,
int rc;
int i = 0;

// key 是oid 和dkey 的hash
key.oid = *oid;
key.dkey_hash = dkey_hash;
d_iov_set(&kiov, &key, sizeof(key));
d_iov_set(&riov, NULL, 0);

// todo: 这些东西是存储在哪里的
// 通过dtx cos hdl 和key 来查询cos 信息
rc = dbtree_lookup(cont->sc_dtx_cos_hdl, &kiov, &riov);
if (rc != 0)
return rc == -DER_NONEXIST ? 0 : rc;
Expand All @@ -360,6 +363,7 @@ dtx_list_cos(struct ds_cont_child *cont, daos_unit_oid_t *oid,
if (dti == NULL)
return -DER_NOMEM;

// 将查到的cos dtx 都返回
d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
dti[i] = dcrc->dcrc_dte->dte_xid;
if (++i >= count)
Expand Down Expand Up @@ -389,6 +393,7 @@ dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte,
D_ASSERT(dte->dte_mbs != NULL);
D_ASSERT(epoch != DAOS_EPOCH_MAX);

// key 是oid 和dkey 的hash
key.oid = *oid;
key.dkey_hash = dkey_hash;
d_iov_set(&kiov, &key, sizeof(key));
Expand All @@ -398,6 +403,7 @@ dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte,
rbund.flags = flags;
d_iov_set(&riov, &rbund, sizeof(rbund));

// 插入cos dtx item
rc = dbtree_upsert(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ,
DAOS_INTENT_UPDATE, &kiov, &riov, NULL);

Expand Down
2 changes: 2 additions & 0 deletions src/gurt/hash.c
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ d_hash_murmur64(const unsigned char *key, unsigned int key_len,
* num_buckets representing the bucket
* the given key hashes to.
*/
// jump 一致性hash 算法为给定的key 提供一个桶idx。这个算法在扩展桶时,将会
// hash 最小数量(1/n)的key 到新的桶
uint32_t
d_hash_jump(uint64_t key, uint32_t num_buckets)
{
Expand Down
5 changes: 5 additions & 0 deletions src/include/cart/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,17 @@ typedef struct crt_group {
} crt_group_t;

/** transport endpoint identifier */
// Unique identifier of the remote transport endpoint.
// NOTE(review): reads as "target/context <ep_tag> under rank <ep_rank> within group <ep_grp>" -- confirm
typedef struct {
/** group handle, NULL means the primary group */
// todo: the group this endpoint belongs to; how are groups partitioned and maintained?
crt_group_t *ep_grp;
/** rank number within the group */
// Rank index inside the group referenced by ep_grp
d_rank_t ep_rank;
/** tag, now used as the context ID of the target rank */
// NOTE(review): presumably the index of the target under the rank selected by ep_rank -- confirm
uint32_t ep_tag;
} crt_endpoint_t;

Expand Down
1 change: 1 addition & 0 deletions src/include/daos/dtx.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ struct dtx_id {
/** The uuid of the transaction */
uuid_t dti_uuid;
/** The HLC timestamp (not epoch) of the transaction */
// todo: 这个为什么不是epoch ?
uint64_t dti_hlc;
};

Expand Down
Loading

0 comments on commit c27d7ba

Please sign in to comment.