Skip to content

Commit

Permalink
Merge branch 'master' into samirrav/DAOS-9825-Final
Browse files Browse the repository at this point in the history
Features: control telemetry

Required-githooks: true
Signed-off-by: Samir Raval <[email protected]>
  • Loading branch information
Samir Raval committed Dec 19, 2024
2 parents 5b5f7a7 + 9474a51 commit 662baa9
Show file tree
Hide file tree
Showing 77 changed files with 1,358 additions and 322 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/bullseye-coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ env:
# the organizational defaults values for these variables
# TODO: we really need to define a list of supported versions (ideally it's no more than 2)
# build is done on the lowest version and test on the highest with a "sanity test"
# stage done on all versions in the list ecept the highest
# stage done on all versions in the list except the highest
EL8_BUILD_VERSION: ${{ vars.EL8_BUILD_VERSION_MASTER }}
EL8_VERSION: ${{ vars.EL8_VERSION_MASTER }}
EL9_BUILD_VERSION: ${{ vars.EL9_BUILD_VERSION_MASTER }}
Expand Down Expand Up @@ -365,7 +365,8 @@ jobs:
- name: Publish test results
if: (!cancelled()) && (success() || failure()) &&
steps.run-test.outcome != 'skipped'
uses: EnricoMi/publish-unit-test-result-action@v2
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
check_name: ${{ env.STAGE_NAME }} Test Results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down Expand Up @@ -632,7 +633,8 @@ jobs:
- name: Publish test results
if: (!cancelled()) && (success() || failure()) &&
steps.run-test.outcome != 'skipped'
uses: EnricoMi/publish-unit-test-result-action@v2
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
check_name: ${{ env.STAGE_NAME }} Test Results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/ci2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ jobs:
run: docker cp build-post:/home/daos/daos/nlt-junit.xml ./
- name: Publish NLT test results
if: always()
uses: EnricoMi/[email protected]
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
files: nlt-junit.xml
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/landing-builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ jobs:
run: docker cp build-post:/home/daos/daos/nlt-junit.xml ./
- name: Publish NLT test results
if: always()
uses: EnricoMi/[email protected]
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
files: nlt-junit.xml
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ jobs:
- codespell
# - clang-format # not required
- yaml-lint
- copyright
if: (!cancelled())
steps:
- name: Check if any job failed
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/rpm-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ env:
# the organizational defaults values for these variables
# TODO: we really need to define a list of supported versions (ideally it's no more than 2)
# build is done on the lowest version and test on the highest with a "sanity test"
# stage done on all versions in the list ecept the highest
# stage done on all versions in the list except the highest
EL8_BUILD_VERSION: ${{ vars.EL8_BUILD_VERSION_MASTER }}
EL8_VERSION: ${{ vars.EL8_VERSION_MASTER }}
EL9_BUILD_VERSION: ${{ vars.EL9_BUILD_VERSION_MASTER }}
Expand Down Expand Up @@ -373,7 +373,8 @@ jobs:
- name: Publish test results
if: (!cancelled()) && (success() || failure()) &&
steps.run-test.outcome != 'skipped'
uses: EnricoMi/publish-unit-test-result-action@v2
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
check_name: ${{ env.STAGE_NAME }} Test Results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down Expand Up @@ -640,7 +641,8 @@ jobs:
- name: Publish test results
if: (!cancelled()) && (success() || failure()) &&
steps.run-test.outcome != 'skipped'
uses: EnricoMi/publish-unit-test-result-action@v2
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
check_name: ${{ env.STAGE_NAME }} Test Results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down
2 changes: 1 addition & 1 deletion docs/admin/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ Alternately, the administrator may erase and re-format the DAOS system to start

### Engines become unavailable

Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("suspect ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain.
Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("Disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("Dead ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain.

To restore availability as well as capacity and performance, try to start all "excluded" or "errored" engines. Starting all of them at the same time minimizes the chance of triggering rebuild jobs. In many cases, the following command suffices:
```
Expand Down
2 changes: 1 addition & 1 deletion src/client/dfs/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def scons():
libraries = ['daos_common', 'daos', 'uuid', 'gurt']

dfs_src = ['common.c', 'cont.c', 'dir.c', 'file.c', 'io.c', 'lookup.c', 'mnt.c', 'obj.c',
'pipeline.c', 'readdir.c', 'rename.c', 'xattr.c', 'dfs_sys.c']
'pipeline.c', 'readdir.c', 'rename.c', 'xattr.c', 'dfs_sys.c', 'metrics.c']
dfs = denv.d_library('dfs', dfs_src, LIBS=libraries)
denv.Install('$PREFIX/lib64/', dfs)

Expand Down
4 changes: 4 additions & 0 deletions src/client/dfs/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,8 @@ entry_stat(dfs_t *dfs, daos_handle_t th, daos_handle_t oh, const char *name, siz
stbuf->st_atim.tv_sec = stbuf->st_mtim.tv_sec;
stbuf->st_atim.tv_nsec = stbuf->st_mtim.tv_nsec;
}

DFS_OP_STAT_INCR(dfs, DOS_STAT);
return 0;
}

Expand Down Expand Up @@ -710,6 +712,7 @@ open_dir(dfs_t *dfs, dfs_obj_t *parent, int flags, daos_oclass_id_t cid, struct
D_ASSERT(rc == 0);
dir->d.chunk_size = entry->chunk_size;
dir->d.oclass = entry->oclass;
DFS_OP_STAT_INCR(dfs, DOS_MKDIR);
return 0;
}
}
Expand Down Expand Up @@ -742,6 +745,7 @@ open_dir(dfs_t *dfs, dfs_obj_t *parent, int flags, daos_oclass_id_t cid, struct
oid_cp(&dir->oid, entry->oid);
dir->d.chunk_size = entry->chunk_size;
dir->d.oclass = entry->oclass;
DFS_OP_STAT_INCR(dfs, DOS_OPENDIR);
return 0;
}

Expand Down
4 changes: 4 additions & 0 deletions src/client/dfs/dfs_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include <daos.h>
#include <daos_fs.h>

#include "metrics.h"

/** D-key name of SB metadata */
#define SB_DKEY "DFS_SB_METADATA"

Expand Down Expand Up @@ -190,6 +192,8 @@ struct dfs {
struct dfs_mnt_hdls *cont_hdl;
/** the root dir stat buf */
struct stat root_stbuf;
/** DFS top-level metrics */
struct dfs_metrics *metrics;
};

struct dfs_entry {
Expand Down
2 changes: 2 additions & 0 deletions src/client/dfs/dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ dfs_mkdir(dfs_t *dfs, dfs_obj_t *parent, const char *name, mode_t mode, daos_ocl
if (rc != 0)
return daos_der2errno(rc);

DFS_OP_STAT_INCR(dfs, DOS_MKDIR);
return rc;
}

Expand Down Expand Up @@ -220,6 +221,7 @@ dfs_remove(dfs_t *dfs, dfs_obj_t *parent, const char *name, bool force, daos_obj
if (oid)
oid_cp(oid, entry.oid);

DFS_OP_STAT_INCR(dfs, DOS_UNLINK);
out:
rc = check_tx(th, rc);
if (rc == ERESTART)
Expand Down
44 changes: 42 additions & 2 deletions src/client/dfs/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,20 @@

#include "dfs_internal.h"

static void
dfs_update_file_metrics(dfs_t *dfs, daos_size_t read_bytes, daos_size_t write_bytes)
{
if (dfs == NULL || dfs->metrics == NULL)
return;

if (read_bytes > 0)
d_tm_inc_gauge(dfs->metrics->dm_read_bytes, read_bytes);
if (write_bytes > 0)
d_tm_inc_gauge(dfs->metrics->dm_write_bytes, write_bytes);
}

struct dfs_read_params {
dfs_t *dfs;
daos_size_t *read_size;
daos_array_iod_t arr_iod;
daos_range_t rg;
Expand All @@ -35,6 +48,8 @@ read_cb(tse_task_t *task, void *data)
D_GOTO(out, rc);
}

DFS_OP_STAT_INCR(params->dfs, DOS_READ);
dfs_update_file_metrics(params->dfs, params->arr_iod.arr_nr_read, 0);
*params->read_size = params->arr_iod.arr_nr_read;
out:
D_FREE(params);
Expand All @@ -61,6 +76,7 @@ dfs_read_int(dfs_t *dfs, dfs_obj_t *obj, daos_off_t off, dfs_iod_t *iod, d_sg_li
if (params == NULL)
D_GOTO(err_task, rc = -DER_NOMEM);

params->dfs = dfs;
params->read_size = read_size;

/** set array location */
Expand Down Expand Up @@ -90,6 +106,7 @@ dfs_read_int(dfs_t *dfs, dfs_obj_t *obj, daos_off_t off, dfs_iod_t *iod, d_sg_li
* completion cb that frees params in this case, so we can just ignore the rc here.
*/
dc_task_schedule(task, true);

return 0;

err_params:
Expand Down Expand Up @@ -125,6 +142,7 @@ dfs_read(dfs_t *dfs, dfs_obj_t *obj, d_sg_list_t *sgl, daos_off_t off, daos_size
daos_event_launch(ev);
daos_event_complete(ev, 0);
}
DFS_OP_STAT_INCR(dfs, DOS_READ);
return 0;
}

Expand All @@ -146,7 +164,9 @@ dfs_read(dfs_t *dfs, dfs_obj_t *obj, d_sg_list_t *sgl, daos_off_t off, daos_size
return daos_der2errno(rc);
}

DFS_OP_STAT_INCR(dfs, DOS_READ);
*read_size = iod.arr_nr_read;
dfs_update_file_metrics(dfs, iod.arr_nr_read, 0);
return 0;
}

Expand All @@ -173,6 +193,7 @@ dfs_readx(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_siz
daos_event_launch(ev);
daos_event_complete(ev, 0);
}
DFS_OP_STAT_INCR(dfs, DOS_READ);
return 0;
}

Expand All @@ -189,7 +210,9 @@ dfs_readx(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_siz
return daos_der2errno(rc);
}

DFS_OP_STAT_INCR(dfs, DOS_READ);
*read_size = arr_iod.arr_nr_read;
dfs_update_file_metrics(dfs, arr_iod.arr_nr_read, 0);
return 0;
}

Expand Down Expand Up @@ -223,6 +246,7 @@ dfs_write(dfs_t *dfs, dfs_obj_t *obj, d_sg_list_t *sgl, daos_off_t off, daos_eve
daos_event_launch(ev);
daos_event_complete(ev, 0);
}
DFS_OP_STAT_INCR(dfs, DOS_WRITE);
return 0;
}

Expand All @@ -238,8 +262,12 @@ dfs_write(dfs_t *dfs, dfs_obj_t *obj, d_sg_list_t *sgl, daos_off_t off, daos_eve
daos_event_errno_rc(ev);

rc = daos_array_write(obj->oh, DAOS_TX_NONE, &iod, sgl, ev);
if (rc)
if (rc == 0) {
DFS_OP_STAT_INCR(dfs, DOS_WRITE);
dfs_update_file_metrics(dfs, 0, buf_size);
} else {
D_ERROR("daos_array_write() failed, " DF_RC "\n", DP_RC(rc));
}

return daos_der2errno(rc);
}
Expand All @@ -248,6 +276,8 @@ int
dfs_writex(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_event_t *ev)
{
daos_array_iod_t arr_iod;
daos_size_t buf_size;
int i;
int rc;

if (dfs == NULL || !dfs->mounted)
Expand All @@ -266,6 +296,7 @@ dfs_writex(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_ev
daos_event_launch(ev);
daos_event_complete(ev, 0);
}
DFS_OP_STAT_INCR(dfs, DOS_WRITE);
return 0;
}

Expand All @@ -276,9 +307,18 @@ dfs_writex(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_ev
if (ev)
daos_event_errno_rc(ev);

buf_size = 0;
if (dfs->metrics != NULL && sgl != NULL)
for (i = 0; i < sgl->sg_nr; i++)
buf_size += sgl->sg_iovs[i].iov_len;

rc = daos_array_write(obj->oh, DAOS_TX_NONE, &arr_iod, sgl, ev);
if (rc)
if (rc == 0) {
DFS_OP_STAT_INCR(dfs, DOS_WRITE);
dfs_update_file_metrics(dfs, 0, buf_size);
} else {
D_ERROR("daos_array_write() failed (%d)\n", rc);
}

return daos_der2errno(rc);
}
Loading

0 comments on commit 662baa9

Please sign in to comment.