Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into grom72/ndctl-valida…
Browse files Browse the repository at this point in the history
…tion

Skip-list: test_dfuse_daos_build_wt_pil4dfs:DAOS-16556

Priority: 2
Cancel-prev-build: false
Allow-unstable-test: true

Required-githooks: true
Signed-off-by: Tomasz Gromadzki <[email protected]>
  • Loading branch information
grom72 committed Oct 1, 2024
2 parents 8d7da45 + abf9c8c commit d13a99b
Show file tree
Hide file tree
Showing 591 changed files with 22,075 additions and 15,377 deletions.
22 changes: 21 additions & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,9 @@ pipeline {
booleanParam(name: 'CI_medium_md_on_ssd_TEST',
defaultValue: true,
description: 'Run the Functional Hardware Medium MD on SSD test stage')
booleanParam(name: 'CI_medium_vmd_TEST',
defaultValue: true,
description: 'Run the Functional Hardware Medium VMD test stage')
booleanParam(name: 'CI_medium_verbs_provider_TEST',
defaultValue: true,
description: 'Run the Functional Hardware Medium Verbs Provider test stage')
Expand Down Expand Up @@ -309,6 +312,9 @@ pipeline {
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL',
defaultValue: 'ci_nvme5',
description: 'Label to use for 5 node Functional Hardware Medium Verbs Provider (MD on SSD) stages')
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL',
defaultValue: 'ci_vmd5',
description: 'Label to use for the Functional Hardware Medium VMD stage')
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_UCX_PROVIDER_LABEL',
defaultValue: 'ci_ofed5',
description: 'Label to use for 5 node Functional Hardware Medium UCX Provider stage')
Expand Down Expand Up @@ -1050,7 +1056,8 @@ pipeline {
stash name: 'fault-inject-valgrind',
includes: '*.memcheck.xml',
allowEmpty: true
archiveArtifacts artifacts: 'nlt_logs/el8.fault-injection/'
archiveArtifacts artifacts: 'nlt_logs/el8.fault-injection/',
allowEmptyArchive: true
job_status_update()
}
}
Expand Down Expand Up @@ -1182,6 +1189,19 @@ pipeline {
run_if_landing: false,
job_status: job_status_internal
),
'Functional Hardware Medium VMD': getFunctionalTestStage(
name: 'Functional Hardware Medium VMD',
pragma_suffix: '-hw-medium-vmd',
label: params.FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL,
next_version: next_version,
stage_tags: 'hw_vmd,medium',
/* groovylint-disable-next-line UnnecessaryGetter */
default_tags: startedByTimer() ? 'pr daily_regression' : 'pr',
nvme: 'auto',
run_if_pr: false,
run_if_landing: false,
job_status: job_status_internal
),
'Functional Hardware Medium Verbs Provider': getFunctionalTestStage(
name: 'Functional Hardware Medium Verbs Provider',
pragma_suffix: '-hw-medium-verbs-provider',
Expand Down
2 changes: 1 addition & 1 deletion ci/gha_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ provision_cluster() {
while [ $((SECONDS-START)) -lt $wait_seconds ]; do
if clush -B -S -l root -w "$nodestring" '[ -d /var/chef/reports ]'; then
# shellcheck disable=SC2016
clush -B -S -l root -w "$nodestring" --connect_timeout 30 --command_timeout 600 "if [ -e /root/job_info ]; then
clush -B -S -l root -w "$nodestring" --connect_timeout 30 --command_timeout 900 "if [ -e /root/job_info ]; then
cat /root/job_info
fi
echo \"Last provisioning run info:
Expand Down
10 changes: 8 additions & 2 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
daos (2.7.100-6) unstable; urgency=medium
daos (2.7.100-7) unstable; urgency=medium
[ Tomasz Gromadzki ]
* Add support of the PMDK package 2.1.0 with NDCTL enabled.
* Increase the default ULT stack size to 20KiB if the engine uses
Expand All @@ -12,7 +12,13 @@ daos (2.7.100-6) unstable; urgency=medium
Otherwise, a user is supposed to be stopped by an error
like: "Unsafe shutdown count is not supported for this source".

-- Tomasz Gromadzki <[email protected]> Thu, 26 Sep 2024 12:00:00 +0200
-- Tomasz Gromadzki <[email protected]> Wed, 02 Oct 2024 12:00:00 +0200

daos (2.7.100-6) unstable; urgency=medium
[ Kris Jacque ]
* Bump minimum golang-go version to 1.21

-- Kris Jacque <[email protected]> Mon, 23 Sep 2024 11:06:00 -0700

daos (2.7.100-5) unstable; urgency=medium
[ Michael MacDonald ]
Expand Down
4 changes: 2 additions & 2 deletions debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Build-Depends: debhelper (>= 10),
dpdk-dev (>= 21.11.2),
libisal-crypto-dev,
libcunit1-dev,
golang-go (>= 1.18),
golang-go (>= 2:1.21),
libboost-dev,
libspdk-dev (>= 22.01.2),
libipmctl-dev,
Expand Down Expand Up @@ -117,7 +117,7 @@ Depends: python (>=3.8), python3, python-yaml, python3-yaml,
${shlibs:Depends}, ${misc:Depends},
daos-client (= ${binary:Version}),
daos-admin (= ${binary:Version}),
golang-go (>=1.18),
golang-go (>= 2:1.21),
libcapstone-dev,
libndctl-dev,
libdaxctl-dev
Expand Down
30 changes: 8 additions & 22 deletions docs/admin/administration.md
Original file line number Diff line number Diff line change
Expand Up @@ -620,21 +620,17 @@ Usage:
[nvme-faulty command options]
-u, --uuid= Device UUID to set
-f, --force Do not require confirmation
-l, --host= Single host address <ipv4addr/hostname> to connect to
```
To manually evict an NVMe SSD (auto eviction is covered later in this section),
the device state needs to be set faulty by running the following command:
```bash
$ dmg -l boro-11 storage set nvme-faulty --uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
$ dmg storage set nvme-faulty --host=boro-11 --uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
NOTICE: This command will permanently mark the device as unusable!
Are you sure you want to continue? (yes/no)
yes
-------
boro-11
-------
Devices
UUID:5bd91603-d3c7-4fb7-9a71-76bc25690c19 [TrAddr:]
Targets:[] Rank:0 State:EVICTED LED:ON
set-faulty operation performed successfully on the following host: boro-11:10001
```
The device state will transition from "NORMAL" to "EVICTED" (shown above), during which time the
faulty device reaction will have been triggered (all targets on the SSD will be rebuilt).
Expand Down Expand Up @@ -693,19 +689,14 @@ Usage:
[nvme command options]
--old-uuid= Device UUID of hot-removed SSD
--new-uuid= Device UUID of new device
--no-reint Bypass reintegration of device and just bring back online.
-l, --host= Single host address <ipv4addr/hostname> to connect to
```
To replace an NVMe SSD with an evicted device and reintegrate it into use with
DAOS, run the following command:
```bash
$ dmg -l boro-11 storage replace nvme --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=80c9f1be-84b9-4318-a1be-c416c96ca48b
-------
boro-11
-------
Devices
UUID:80c9f1be-84b9-4318-a1be-c416c96ca48b [TrAddr:]
Targets:[] Rank:1 State:NORMAL LED:OFF
$ dmg storage replace nvme --host=boro-11 --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=80c9f1be-84b9-4318-a1be-c416c96ca48b
dev-replace operation performed successfully on the following host: boro-11:10001
```
The old, now replaced device will remain in an "EVICTED" state until it is unplugged.
The new device will transition from a "NEW" state to a "NORMAL" state (shown above).
Expand All @@ -716,14 +707,9 @@ In order to reuse a device that was previously set as FAULTY and evicted from th
system, an admin can run the following command (setting the old device UUID to be the
new device UUID):
```bash
$ dmg -l boro-11 storage replace nvme --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
$ dmg storage replace nvme --host=boro-11 --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
NOTICE: Attempting to reuse a previously set FAULTY device!
-------
boro-11
-------
Devices
UUID:5bd91603-d3c7-4fb7-9a71-76bc25690c19 [TrAddr:]
Targets:[] Rank:1 State:NORMAL LED:OFF
dev-replace operation performed successfully on the following host: boro-11:10001
```
The FAULTY device will transition from an "EVICTED" state back to a "NORMAL" state,
and will again be available for use with DAOS. The use case of this command will mainly
Expand Down
21 changes: 14 additions & 7 deletions docs/user/filesystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -1021,16 +1021,23 @@ libpil4dfs intercepting summary for ops on DFS:
[op_sum ] 5003
```

### Turn on compatible mode in libpil4dfs
A fake file descriptor (FD) is used in regular mode in libpil4dfs.so for efficiency. open() returns a fake fd to applications. In cases where some APIs are not intercepted, applications could crash with the error "Bad File Descriptor". Compatible mode is provided to work around such situations.
Setting env "D_IL_COMPATIBLE=1" turns on compatible mode. Kernel fd allocated by dfuse instead of fake fd will be returned to applications. This mode provides better compatibility with degraded performance in open, openat, and opendir, etc. Please start dfuse with "--disable-caching" to disable caching before using compatible mode.
### Bypassing function interception in libpil4dfs
libpil4dfs enhances I/O performance by bypassing the fuse kernel when going over dfuse for I/O intensive workloads. In some scenarios, however, for short-running applications (e.g., simple Linux commands like cat, mkdir, chmod, etc.), there is not enough incentive to justify initializing the DAOS environment in user space with libpil4dfs, since this is relatively expensive. Such overhead is particularly noticeable for processes that complete within tens or hundreds of milliseconds and run frequently.
To address this issue, DAOS can disable function interception by libpil4dfs for specific executables/commands listed below:

### Child Process Inheritance
"arch", "as", "awk", "basename", "bc", "cal", "cat", "chmod", "chown", "clang", "clear", "cmake", "cmake3", "cp", "cpp", "daos", "daos_agent", "daos_engine", "daos_server", "df", "dfuse", "dmg", "expr", "f77", "f90", "f95", "file", "gawk", "gcc", "gfortran", "gmake", "go", "gofmt", "grep", "g++", "head", "link", "ln", "ls", "kill", "m4", "make", "mkdir", "mktemp", "mv", "nasm", "yasm", "nm", "numactl", "patchelf", "ping", "pkg-config", "ps", "pwd", "ranlib", "readelf", "readlink", "rename", "rm", "rmdir", "rpm", "sed", "seq", "size", "sleep", "sort", "ssh", "stat", "strace", "strip", "su", "sudo", "tail", "tee", "telnet", "time", "top", "touch", "tr", "truncate", "uname", "vi", "vim", "whoami", "yes"

Normally child processes inherit environmental variables from parent processes. In rare cases, e.g.
scons, envs are stripped off when calling execve(). It might be useful to force pil4dfs related env
set in child processes by setting env "D_IL_ENFORCE_EXEC_ENV=1". This flag is 0 if not set.
Also some scripting tools for package management, configuration and compiling,
"autoconf", "configure", "dnf", "dnf-3", "libtool", "libtoolize", "lsb_release", "meson", "scons", "scons-3"

In addition, DAOS provides an environment variable (D_IL_BYPASS_LIST) to disable function interception by libpil4dfs for specific applications that are set in that env with the following syntax:
```
$ export D_IL_BYPASS_LIST="app_a:app_b:app_c:app_d"
```

### Turn on compatible mode in libpil4dfs
A fake file descriptor (FD) is used in regular mode in libpil4dfs.so for efficiency. open() returns a fake fd to applications. In cases where some APIs are not intercepted, applications could crash with the error "Bad File Descriptor". Compatible mode is provided to work around such situations.
Setting env "D_IL_COMPATIBLE=1" turns on compatible mode. Kernel fd allocated by dfuse instead of fake fd will be returned to applications. This mode provides better compatibility with degraded performance in open, openat, and opendir, etc. Please start dfuse with "--disable-caching" to disable caching before using compatible mode.

### Directory caching

Expand Down
10 changes: 8 additions & 2 deletions site_scons/prereq_tools/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,19 +504,25 @@ def __init__(self, env, opts):
self._build_targets = []

build_dir = self.__env['BUILD_DIR']
targets = ['test', 'server', 'client']
main_targets = ['client', 'server']
targets = ['test'] + main_targets
self.__env.Alias('client', build_dir)
self.__env.Alias('server', build_dir)
self.__env.Alias('test', build_dir)
self._build_targets = []
check = any(item in BUILD_TARGETS for item in targets)
if not check or 'test' in BUILD_TARGETS:
if not check:
self._build_targets.extend(['client', 'server', 'test'])
else:
if 'client' in BUILD_TARGETS:
self._build_targets.append('client')
if 'server' in BUILD_TARGETS:
self._build_targets.append('server')
if 'test' in BUILD_TARGETS:
if not any(item in BUILD_TARGETS for item in main_targets):
print("test target requires client or server")
sys.exit(1)
self._build_targets.append('test')
BUILD_TARGETS.append(build_dir)

env.AddMethod(self.require, 'require')
Expand Down
27 changes: 22 additions & 5 deletions site_scons/site_tools/go_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
import re
import subprocess # nosec B404

from SCons.Script import Configure, Exit, File, GetOption, Glob, Scanner
from SCons.Script import Configure, Dir, Exit, File, GetOption, Glob, Scanner

GO_COMPILER = 'go'
MIN_GO_VERSION = '1.18.0'
include_re = re.compile(r'\#include [<"](\S+[>"])', re.M)


Expand Down Expand Up @@ -49,6 +48,17 @@ def _scan_go_file(node, env, _path):
return includes


def get_min_go_version():
    """Look up the minimum go version declared in src/control/go.mod.

    The go.mod path is built from Dir('#').abspath joined with
    "src", "control", "go.mod". The file is read line by line; for the
    first line that starts with 'go ' the second whitespace-separated
    token is prefixed with "go" and handed to get_go_version().

    Returns:
        Whatever get_go_version() returns for that line, or None when no
        line in go.mod starts with 'go '.
    """
    mod_file = os.path.join(Dir('#').abspath, "src", "control", "go.mod")
    with open(mod_file, 'r') as handle:
        for entry in handle:
            if not entry.startswith('go '):
                continue
            # Directive line, e.g. "go 1.21"; the version is the second field.
            fields = entry.split()
            return get_go_version("go" + fields[1])
    return None


def get_go_version(output):
"""Capture only the version after 'go'"""
ver_re = re.compile(r'go([0-9\.]+)')
Expand Down Expand Up @@ -81,6 +91,13 @@ def _check_go_version(context):
context.Result(0)
return 0

context.Display('Getting minimum go version... ')
min_go_version = get_min_go_version()
if min_go_version is None:
context.Result('no minimum go version found in go.mod')
return 0
context.Display(min_go_version + '\n')

context.Display(f'Checking {env.d_go_bin} version... ')
cmd_rc = subprocess.run([env.d_go_bin, 'version'], check=True, stdout=subprocess.PIPE)
out = cmd_rc.stdout.decode('utf-8').strip()
Expand All @@ -93,11 +110,11 @@ def _check_go_version(context):
if go_version is None:
context.Result(f'failed to get version from "{out}"')
return 0
if len([x for x, y in
zip(go_version.split('.'), MIN_GO_VERSION.split('.'))
if len([x for x, y in zip(go_version.split('.'), min_go_version.split('.'))
if int(x) < int(y)]) > 0:
context.Result(f'{out} is too old (min supported: {MIN_GO_VERSION}) ')
context.Result(f'{out} is too old (min supported: {min_go_version}) ')
return 0

context.Result(go_version)
return 1

Expand Down
6 changes: 3 additions & 3 deletions src/bio/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ Devices:
<a id="82"></a>
- Manually Set Device State to FAULTY: **$dmg storage set nvme-faulty**
```
$ dmg storage set nvme-faulty --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
$ dmg storage set nvme-faulty --host=localhost --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
Devices
UUID:9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 [TrAddr:0000:8d:00.0]
Targets:[0] Rank:0 State:EVICTED
Expand All @@ -219,7 +219,7 @@ Devices
<a id="83"></a>
- Replace an evicted device with a new device: **$dmg storage replace nvme**
```
$ dmg storage replace nvme --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=8131fc39-4b1c-4662-bea1-734e728c434e
$ dmg storage replace nvme --host=localhost --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=8131fc39-4b1c-4662-bea1-734e728c434e
Devices
UUID:8131fc39-4b1c-4662-bea1-734e728c434e [TrAddr:0000:8d:00.0]
Targets:[0] Rank:0 State:NORMAL
Expand All @@ -229,7 +229,7 @@ Devices
<a id="84"></a>
- Reuse a previously evicted device: **$dmg storage replace nvme**
```
$ dmg storage replace nvme --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
$ dmg storage replace nvme --host=localhost --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
Devices
UUID:9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 [TrAddr:0000:8a:00.0]
Targets:[0] Rank:0 State:NORMAL
Expand Down
Loading

0 comments on commit d13a99b

Please sign in to comment.