From 12da677f85e99e3bd5992620228104960e62857a Mon Sep 17 00:00:00 2001
From: Laura Flores
Date: Tue, 13 Dec 2022 19:48:25 +0000
Subject: [PATCH 0001/2492] qa/workunits: update telemetry quincy workunits
 with basic_pool_options_bluestore collection

Signed-off-by: Laura Flores
---
 qa/workunits/test_telemetry_quincy.sh   | 3 ++-
 qa/workunits/test_telemetry_quincy_x.sh | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/qa/workunits/test_telemetry_quincy.sh b/qa/workunits/test_telemetry_quincy.sh
index e8b07ec13032..2ce268eadbbc 100755
--- a/qa/workunits/test_telemetry_quincy.sh
+++ b/qa/workunits/test_telemetry_quincy.sh
@@ -13,7 +13,8 @@ ceph telemetry preview-all
 # Assert that new collections are available
 COLLECTIONS=$(ceph telemetry collection ls)
-NEW_COLLECTIONS=("perf_perf" "basic_mds_metadata" "basic_pool_usage" "basic_rook_v01" "perf_memory_metrics")
+NEW_COLLECTIONS=("perf_perf" "basic_mds_metadata" "basic_pool_usage"
+                 "basic_rook_v01" "perf_memory_metrics" "basic_pool_options_bluestore")
 for col in ${NEW_COLLECTIONS[@]}; do
     if ! [[ $COLLECTIONS == *$col* ]]; then
diff --git a/qa/workunits/test_telemetry_quincy_x.sh b/qa/workunits/test_telemetry_quincy_x.sh
index 4734132d024c..bfb050cfa59b 100755
--- a/qa/workunits/test_telemetry_quincy_x.sh
+++ b/qa/workunits/test_telemetry_quincy_x.sh
@@ -12,7 +12,8 @@ fi
 ceph -s
 COLLECTIONS=$(ceph telemetry collection ls)
-NEW_COLLECTIONS=("perf_perf" "basic_mds_metadata" "basic_pool_usage" "basic_rook_v01" "perf_memory_metrics")
+NEW_COLLECTIONS=("perf_perf" "basic_mds_metadata" "basic_pool_usage"
+                 "basic_rook_v01" "perf_memory_metrics" "basic_pool_options_bluestore")
 for col in ${NEW_COLLECTIONS[@]}; do
     if ! [[ $COLLECTIONS == *$col* ]]; then

From 73ddc4d202113c3dd4b09d602287f09358e2495a Mon Sep 17 00:00:00 2001
From: Radoslaw Zarzynski
Date: Thu, 15 Dec 2022 16:58:51 +0000
Subject: [PATCH 0002/2492] osdc: fix the ENOTCONN normalization in
 Objecter::_linger_reconnect()

Problem description: https://tracker.ceph.com/issues/53789#note-12

Fixes: https://tracker.ceph.com/issues/53789
Signed-off-by: Radoslaw Zarzynski
---
 src/osdc/Objecter.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index eff79c5e9061..5d3c4e88b123 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -670,8 +670,8 @@ void Objecter::_linger_reconnect(LingerOp *info, bs::error_code ec)
 << " (last_error " << info->last_error << ")" << dendl;
 std::unique_lock wl(info->watch_lock);
 if (ec) {
+ ec = _normalize_watch_error(ec);
 if (!info->last_error) {
- ec = _normalize_watch_error(ec);
 if (info->handle) {
 boost::asio::defer(finish_strand, CB_DoWatchError(this, info, ec));
 }

From 1c64c6be303f5ed4110468101a01576508468a74 Mon Sep 17 00:00:00 2001
From: Igor Fedotov
Date: Fri, 16 Dec 2022 19:33:07 +0300
Subject: [PATCH 0003/2492] pybind/rados: fix missed changes for PEP484 style
 type annotations

originally brought by https://github.com/ceph/ceph/pull/36918

Fixes: https://tracker.ceph.com/issues/58304
Signed-off-by: Igor Fedotov
---
 src/pybind/rados/rados.pyx    | 16 ++++++++--------
 src/test/pybind/test_rados.py |  5 +++++
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/pybind/rados/rados.pyx b/src/pybind/rados/rados.pyx
index b8ae8165c6c2..e0cf6d9422a6 100644
--- a/src/pybind/rados/rados.pyx
+++ b/src/pybind/rados/rados.pyx
@@ -1047,10 +1047,10 @@ Rados object in state %s."
% self.state)
 # NOTE(sileht): looks weird but test_monmap_dump pass int
 target = str(target)
- target = cstr(target, 'target', opt=True)
+ target_raw = cstr(target, 'target', opt=True)

 cdef:
- char *_target = opt_str(target)
+ char *_target = opt_str(target_raw)

 char **_cmd = to_bytes_array(cmds)
 size_t _cmdlen = len(cmds)

@@ -1063,7 +1063,7 @@ Rados object in state %s." % self.state)
 size_t _outs_len

 try:
- if target:
+ if target_raw:
 with nogil:
 ret = rados_mon_command_target(self.cluster, _target,
 _cmd, _cmdlen,
@@ -1148,10 +1148,10 @@ Rados object in state %s." % self.state)
 self.require_state("connected")

 cmds = [cstr(cmd, 'cmd')]
- target = cstr(target, 'target', opt=True)
+ target_raw = cstr(target, 'target', opt=True)

 cdef:
- char *_target = opt_str(target)
+ char *_target = opt_str(target_raw)

 char **_cmd = to_bytes_array(cmds)
 size_t _cmdlen = len(cmds)

@@ -1165,7 +1165,7 @@ Rados object in state %s." % self.state)
 size_t _outs_len

 try:
- if target is not None:
+ if target_raw is not None:
 with nogil:
 ret = rados_mgr_command_target(self.cluster,
 _target,
@@ -3779,9 +3779,9 @@ returned %d, but should return zero on success." % (self.name, ret))
 :param max_return: list no more than max_return key/value pairs
 :returns: an iterator over the requested omap values, return value from this action
 """
- start_after = cstr(start_after, 'start_after') if start_after else None
+ start_after_raw = cstr(start_after, 'start_after') if start_after else None
 cdef:
- char *_start_after = opt_str(start_after)
+ char *_start_after = opt_str(start_after_raw)
 ReadOp _read_op = read_op
 rados_omap_iter_t iter_addr = NULL
 int _max_return = max_return
diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py
index e68269ff01ed..f6954e69a81c 100644
--- a/src/test/pybind/test_rados.py
+++ b/src/test/pybind/test_rados.py
@@ -556,6 +556,11 @@ def test_get_omap_keys(self):
 eq(ret, 0)
 with assert_raises(ObjectNotFound):
 self.ioctx.operate_read_op(read_op, "no_such")
+ with ReadOpCtx() as read_op:
+ iter, ret = self.ioctx.get_omap_keys(read_op, "2", 2)
+ eq(ret, 0)
+ self.ioctx.operate_read_op(read_op, "hw")
+ eq(list(iter), [("3", None)])

 def test_clear_omap(self):
 keys = ("1", "2", "3")

From 3c922133eee7466ed2169d0ff5c83e94c3a03cff Mon Sep 17 00:00:00 2001
From: Igor Fedotov
Date: Mon, 22 Aug 2022 17:29:27 +0300
Subject: [PATCH 0004/2492] os/bluestore: assert on improper releases in
 AvlAllocator

Let's assert on an unexpected unit release before we damage internal data
structures. This also makes some logging output of the avl/hybrid
allocators uniform with the bitmap one, to enable replay tool usage.
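For illustration, a minimal sketch of the kind of double release the new
checks catch before the tree is corrupted (hypothetical offsets and a
hypothetical `alloc` instance, not part of this patch):

```cpp
// Hypothetical repro sketch: release the same extent twice.
alloc.init_add_free(0x1000, 0x1000);  // [0x1000, 0x2000) enters the tree
alloc.init_add_free(0x1000, 0x1000);  // overlaps the existing range_seg_t:
                                      // derr logs "inconsistent tree state
                                      // 0x1000~0x1000", _dump() runs, then
                                      // the new ceph_assert fires
```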
Signed-off-by: Igor Fedotov --- src/os/bluestore/AvlAllocator.cc | 60 +++++++++++++++++++++-------- src/os/bluestore/HybridAllocator.cc | 8 ++-- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc index 4584bfae713f..9d828ee313ce 100644 --- a/src/os/bluestore/AvlAllocator.cc +++ b/src/os/bluestore/AvlAllocator.cc @@ -105,6 +105,16 @@ void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size) rs_before = std::prev(rs_after); } + if ((rs_before != range_tree.end() && rs_before->end > start) || + (rs_after != range_tree.end() && rs_after->start < end)) { + derr << __func__ << " inconsistent tree state " << std::hex + << " 0x" << start << "~" << end - start + << std::dec << dendl; + _dump(); + } + ceph_assert(rs_before == range_tree.end() || rs_before->end <= start); + ceph_assert(rs_after == range_tree.end() || rs_after->start >= end); + bool merge_before = (rs_before != range_tree.end() && rs_before->end == start); bool merge_after = (rs_after != range_tree.end() && rs_after->start == end); @@ -169,6 +179,14 @@ void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size) auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp()); /* Make sure we completely overlap with someone */ + if (rs == range_tree.end() || + rs->start > start || + rs->end < end) { + derr << __func__ << " inconsistent tree state " << std::hex + << " 0x" << rs->start << "~" << rs->end - rs->start + << std::dec << dendl; + _dump(); + } ceph_assert(rs != range_tree.end()); ceph_assert(rs->start <= start); ceph_assert(rs->end >= end); @@ -292,7 +310,9 @@ int AvlAllocator::_allocate( if (start == -1ULL) { return -ENOSPC; } - + dout(20) << __func__ << " allocated 0x" << std::hex + << start << "~" << size + << std::dec << dendl; _remove_from_tree(start, size); *offset = start; @@ -306,9 +326,9 @@ void AvlAllocator::_release(const interval_set& release_set) const auto offset = p.get_start(); const auto length = p.get_len(); ceph_assert(offset + length <= uint64_t(device_size)); - ldout(cct, 10) << __func__ << std::hex - << " offset 0x" << offset - << " length 0x" << length + ldout(cct, 20) << __func__ << std::hex + << " 0x" << offset + << "~" << length << std::dec << dendl; _add_to_tree(offset, length); } @@ -316,9 +336,9 @@ void AvlAllocator::_release(const interval_set& release_set) void AvlAllocator::_release(const PExtentVector& release_set) { for (auto& e : release_set) { - ldout(cct, 10) << __func__ << std::hex - << " offset 0x" << e.offset - << " length 0x" << e.length + ldout(cct, 20) << __func__ << std::hex + << " 0x" << e.offset + << "~" << e.length << std::dec << dendl; _add_to_tree(e.offset, e.length); } @@ -346,14 +366,20 @@ AvlAllocator::AvlAllocator(CephContext* cct, cct->_conf.get_val("bluestore_avl_alloc_ff_max_search_bytes")), range_count_cap(max_mem / sizeof(range_seg_t)), cct(cct) -{} +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << get_capacity() << "/" + << get_block_size() << std::dec << dendl; +} AvlAllocator::AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size, std::string_view name) : AvlAllocator(cct, device_size, block_size, 0 /* max_mem */, name) -{} +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << get_capacity() << "/" + << get_block_size() << std::dec << dendl; +} AvlAllocator::~AvlAllocator() { @@ -368,10 +394,10 @@ int64_t AvlAllocator::allocate( PExtentVector* extents) { ldout(cct, 10) << __func__ << std::hex - << " want 0x" << want - << " unit 0x" << unit - << " 
max_alloc_size 0x" << max_alloc_size - << " hint 0x" << hint + << " 0x" << want + << "/" << unit + << "," << max_alloc_size + << "," << hint << std::dec << dendl; ceph_assert(std::has_single_bit(unit)); ceph_assert(want % unit == 0); @@ -450,8 +476,8 @@ void AvlAllocator::init_add_free(uint64_t offset, uint64_t length) std::lock_guard l(lock); ceph_assert(offset + length <= uint64_t(device_size)); ldout(cct, 10) << __func__ << std::hex - << " offset 0x" << offset - << " length 0x" << length + << " 0x" << offset + << "~" << length << std::dec << dendl; _add_to_tree(offset, length); } @@ -463,8 +489,8 @@ void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length) std::lock_guard l(lock); ceph_assert(offset + length <= uint64_t(device_size)); ldout(cct, 10) << __func__ << std::hex - << " offset 0x" << offset - << " length 0x" << length + << " 0x" << offset + << "~" << length << std::dec << dendl; _remove_from_tree(offset, length); } diff --git a/src/os/bluestore/HybridAllocator.cc b/src/os/bluestore/HybridAllocator.cc index 2201d5958246..cfb5858fd11d 100644 --- a/src/os/bluestore/HybridAllocator.cc +++ b/src/os/bluestore/HybridAllocator.cc @@ -23,10 +23,10 @@ int64_t HybridAllocator::allocate( PExtentVector* extents) { ldout(cct, 10) << __func__ << std::hex - << " want 0x" << want - << " unit 0x" << unit - << " max_alloc_size 0x" << max_alloc_size - << " hint 0x" << hint + << " 0x" << want + << "/" << unit + << "," << max_alloc_size + << "," << hint << std::dec << dendl; ceph_assert(std::has_single_bit(unit)); ceph_assert(want % unit == 0); From 5b6be4565f5f2eae2b31a53b3d62837c5c213228 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Tue, 23 Aug 2022 20:18:31 +0300 Subject: [PATCH 0005/2492] os/bluestore: log values in hex in AvlAllocator Signed-off-by: Igor Fedotov --- src/os/bluestore/AvlAllocator.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc index 9d828ee313ce..317291e65f49 100644 --- a/src/os/bluestore/AvlAllocator.cc +++ b/src/os/bluestore/AvlAllocator.cc @@ -293,12 +293,16 @@ int AvlAllocator::_allocate( ceph_assert(align != 0); uint64_t* cursor = &lbas[cbits(align) - 1]; start = _pick_block_after(cursor, size, unit); - dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl; + dout(20) << __func__ + << std::hex << " first fit params: 0x" << start << "~" << size + << std::dec << dendl; } if (start == -1ULL) { do { start = _pick_block_fits(size, unit); - dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl; + dout(20) << __func__ + << std::hex << " best fit params: 0x" << start << "~" << size + << std::dec << dendl; if (start != uint64_t(-1ULL)) { break; } From 36961d644ce11e89c789e9112e9f3702f8580829 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Tue, 23 Aug 2022 20:21:48 +0300 Subject: [PATCH 0006/2492] os/bluestore: do not call allocator's release on empty set Signed-off-by: Igor Fedotov --- src/os/bluestore/BlueStore.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 33d1d9983087..33eb0943c0a2 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -13055,7 +13055,8 @@ void BlueStore::_txc_release_alloc(TransContext *txc) { bool discard_queued = false; // it's expected we're called with lazy_release_lock already taken! 
- if (unlikely(cct->_conf->bluestore_debug_no_reuse_blocks)) {
+ if (unlikely(cct->_conf->bluestore_debug_no_reuse_blocks ||
+              txc->released.size() == 0)) {
 goto out;
 }
 discard_queued = bdev->try_discard(txc->released);

From 08d80006d07d98031125a1d854cb4f778eaee75b Mon Sep 17 00:00:00 2001
From: Nitzan Mordechai
Date: Sun, 12 Feb 2023 07:14:23 +0000
Subject: [PATCH 0007/2492] qa/*/test_envlibrados_for_rocksdb.sh: subscribe
 repo

subscription-manager repos will fail with 'does not match a valid
repository ID'. We should use dnf config-manager --set-enabled instead.

Fixes: https://tracker.ceph.com/issues/58560
Signed-off-by: Nitzan Mordechai
---
 qa/workunits/rados/test_envlibrados_for_rocksdb.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qa/workunits/rados/test_envlibrados_for_rocksdb.sh b/qa/workunits/rados/test_envlibrados_for_rocksdb.sh
index 371452f40429..e072b8299844 100755
--- a/qa/workunits/rados/test_envlibrados_for_rocksdb.sh
+++ b/qa/workunits/rados/test_envlibrados_for_rocksdb.sh
@@ -26,7 +26,7 @@ case $(distro_id) in
 case $(distro_id) in
 rhel)
 # RHEL needs CRB repo for snappy-devel
- sudo subscription-manager repos --enable "codeready-builder-for-rhel-8-x86_64-rpms"
+ sudo dnf config-manager --set-enabled "codeready-builder-for-rhel-8-x86_64-rpms"
 ;;
 esac
 install git gcc-c++.x86_64 snappy-devel zlib zlib-devel bzip2 bzip2-devel libradospp-devel.x86_64 cmake libarchive-3.3.3

From bf52c18043117cccb68c5d9dabe8672dd2b943b3 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Mon, 30 Jan 2023 14:38:41 -0500
Subject: [PATCH 0008/2492] qa/cephadm: basic test for monitoring stack

Testing that the monitoring stack daemons are all basically functioning
by checking their HTTP APIs are responsive and that putting down a mon
daemon, which should cause an alert, actually triggers an alert that is
viewable in the prometheus and alertmanager API

Signed-off-by: Adam King
---
 .../task/test_monitoring_stack_basic.yaml | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml

diff --git a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml
new file mode 100644
index 000000000000..62947ef65d9c
--- /dev/null
+++ b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml
@@ -0,0 +1,55 @@
+roles:
+- - host.a
+  - mon.a
+  - mgr.a
+  - osd.0
+- - host.b
+  - mon.b
+  - mgr.b
+  - osd.1
+- - host.c
+  - mon.c
+  - osd.2
+tasks:
+- install:
+- cephadm:
+- cephadm.shell:
+    host.a:
+      - |
+        set -e
+        set -x
+        ceph orch apply node-exporter
+        ceph orch apply grafana
+        ceph orch apply alertmanager
+        ceph orch apply prometheus
+        sleep 240
+        ceph orch ls
+        ceph orch ps
+        ceph orch host ls
+        MON_DAEMON=$(ceph orch ps --daemon-type mon -f json | jq -r 'last | .daemon_name')
+        GRAFANA_HOST=$(ceph orch ps --daemon-type grafana -f json | jq -e '.[]' | jq -r '.hostname')
+        PROM_HOST=$(ceph orch ps --daemon-type prometheus -f json | jq -e '.[]' | jq -r '.hostname')
+        ALERTM_HOST=$(ceph orch ps --daemon-type alertmanager -f json | jq -e '.[]' | jq -r '.hostname')
+        GRAFANA_IP=$(ceph orch host ls -f json | jq -r --arg GRAFANA_HOST "$GRAFANA_HOST" '.[] | select(.hostname==$GRAFANA_HOST) | .addr')
+        PROM_IP=$(ceph orch host ls -f json | jq -r --arg PROM_HOST "$PROM_HOST" '.[] | select(.hostname==$PROM_HOST) | .addr')
+        ALERTM_IP=$(ceph orch host ls -f json | jq -r --arg ALERTM_HOST "$ALERTM_HOST" '.[] |
select(.hostname==$ALERTM_HOST) | .addr') + # check each host node-exporter metrics endpoint is responsive + ALL_HOST_IPS=$(ceph orch host ls -f json | jq -r '.[] | .addr') + for ip in $ALL_HOST_IPS; do + curl -s http://${ip}:9100/metric + done + # check grafana endpoints are responsive and database health is okay + curl -k -s https://${GRAFANA_IP}:3000/api/health + curl -k -s https://${GRAFANA_IP}:3000/api/health | jq -e '.database == "ok"' + # stop mon daemon in order to trigger an alert + ceph orch daemon stop $MON_DAEMON + sleep 120 + # check prometheus endpoints are responsive and mon down alert is firing + curl -s http://${PROM_IP}:9095/api/v1/status/config + curl -s http://${PROM_IP}:9095/api/v1/status/config | jq -e '.status == "success"' + curl -s http://${PROM_IP}:9095/api/v1/alerts + curl -s http://${PROM_IP}:9095/api/v1/alerts | jq -e '.data | .alerts | .[] | select(.labels | .alertname == "CephMonDown") | .state == "firing"' + # check alertmanager endpoints are responsive and mon down alert is active + curl -s http://${ALERTM_IP}:9093/api/v1/status + curl -s http://${ALERTM_IP}:9093/api/v1/alerts + curl -s http://${ALERTM_IP}:9093/api/v1/alerts | jq -e '.data | .[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"' From 5b56098f17dd9abe4c15cbc7f487c0e94841beaf Mon Sep 17 00:00:00 2001 From: Neeraj Pratap Singh Date: Thu, 17 Nov 2022 23:33:15 +0530 Subject: [PATCH 0009/2492] mds: scrub repair does not clear earlier damage health status Fixes: https://tracker.ceph.com/issues/54557 Signed-off-by: Neeraj Pratap Singh --- src/mds/CDir.cc | 1 + src/mds/CInode.cc | 4 ++++ src/mds/DamageTable.cc | 28 ++++++++++++++++++++++++++++ src/mds/DamageTable.h | 7 +++++++ 4 files changed, 40 insertions(+) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index e6844cb7a4bd..f9aed746051f 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -3750,6 +3750,7 @@ bool CDir::scrub_local() mdcache->repair_dirfrag_stats(this); scrub_infop->header->set_repaired(); good = true; + mdcache->mds->damage_table.remove_dentry_damage_entry(this); } return good; } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 4ac963166e98..9aa3a8c67a89 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -4783,6 +4783,7 @@ void CInode::validate_disk_state(CInode::validated_data *results, false); // Flag that we repaired this BT so that it won't go into damagetable results->backtrace.repaired = true; + in->mdcache->mds->damage_table.remove_backtrace_damage_entry(in->ino()); if (in->mdcache->mds->logger) in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired); } @@ -4921,6 +4922,9 @@ void CInode::validate_disk_state(CInode::validated_data *results, << "freshly-calculated rstats don't match existing ones (will be fixed)"; in->mdcache->repair_inode_stats(in); results->raw_stats.repaired = true; + for (const auto &p : in->dirfrags){ + in->mdcache->mds->damage_table.remove_dirfrag_damage_entry(p.second); + } } else { results->raw_stats.error_str << "freshly-calculated rstats don't match existing ones"; diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc index 22802079d85d..2079d23333a8 100644 --- a/src/mds/DamageTable.cc +++ b/src/mds/DamageTable.cc @@ -15,6 +15,7 @@ #include "common/debug.h" #include "mds/CDir.h" +#include "mds/CInode.h" #include "DamageTable.h" @@ -200,6 +201,33 @@ bool DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path) return false; } +void DamageTable::remove_dentry_damage_entry(CDir *dir) +{ + if (dentries.count( + 
DirFragIdent(dir->inode->ino(), dir->frag)
+ ) > 0){
+ const auto frag_dentries =
+ dentries.at(DirFragIdent(dir->inode->ino(), dir->frag));
+ for(const auto &i : frag_dentries) {
+ erase(i.second->id);
+ }
+ }
+}
+
+void DamageTable::remove_dirfrag_damage_entry(CDir *dir)
+{
+ if (is_dirfrag_damaged(dir)){
+ erase(dirfrags.find(DirFragIdent(dir->inode->ino(), dir->frag))->second->id);
+ }
+}
+
+void DamageTable::remove_backtrace_damage_entry(inodeno_t ino)
+{
+ if (is_remote_damaged(ino)){
+ erase(remotes.find(ino)->second->id);
+ }
+}
+
 bool DamageTable::oversized() const
 {
 return by_id.size() > (size_t)(g_conf()->mds_damage_table_max_entries);
diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h
index 18a61e08b122..a1b96fe22186 100644
--- a/src/mds/DamageTable.h
+++ b/src/mds/DamageTable.h
@@ -22,6 +22,7 @@
 #include "include/random.h"

 class CDir;
+class CInode;

 typedef uint64_t damage_entry_id_t;

@@ -155,6 +156,12 @@ class DamageTable
 */
 bool notify_remote_damaged(inodeno_t ino, std::string_view path);

+ void remove_dentry_damage_entry(CDir *dir);
+
+ void remove_dirfrag_damage_entry(CDir *dir);
+
+ void remove_backtrace_damage_entry(inodeno_t ino);
+
 bool is_dentry_damaged(
 const CDir *dir_frag,
 std::string_view dname,

From 81944f60478abe2d4253caaeb4165da5ccffaca1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Zarzy=C5=84ski?=
Date: Tue, 7 Mar 2023 13:30:11 +0100
Subject: [PATCH 0010/2492] osd: don't send stale hb msgr's addresses in
 MOSDBoot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See comments in the ticket for the RCA.

NOTE: we can't just hold a reference to what `get_myaddrs()` returns
as the `safe_item_history` is involved:

```cpp
template <typename T>
class safe_item_history {
//...
  T *current = nullptr;
  // ...
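  // (annotation, not part of the original excerpt:) operator= below
  // pushes the new value onto `history` and repoints `current`, so a
  // const T& obtained from an earlier get_myaddrs() call keeps referring
  // to the old history entry and silently goes stale -- hence this patch
  // re-reads the addrs right before building MOSDBoot.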
const T& operator=(const T& other) { std::lock_guard l(lock); history.push_back(other); current = &history.back(); return *current; } ``` Fixes: https://tracker.ceph.com/issues/58915 Signed-off-by: Radosław Zarzyński --- src/osd/OSD.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7ab35bf0e080..f26e87fa00ee 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6742,13 +6742,11 @@ void OSD::_send_boot() cluster_messenger->get_loopback_connection().get(); entity_addrvec_t client_addrs = client_messenger->get_myaddrs(); entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs(); - entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs(); - entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs(); dout(20) << " initial client_addrs " << client_addrs << ", cluster_addrs " << cluster_addrs - << ", hb_back_addrs " << hb_back_addrs - << ", hb_front_addrs " << hb_front_addrs + << ", hb_back_addrs " << hb_back_server_messenger->get_myaddrs() + << ", hb_front_addrs " << hb_front_server_messenger->get_myaddrs() << dendl; if (cluster_messenger->set_addr_unknowns(client_addrs)) { dout(10) << " assuming cluster_addrs match client_addrs " @@ -6763,7 +6761,6 @@ void OSD::_send_boot() if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) { dout(10) << " assuming hb_back_addrs match cluster_addrs " << cluster_addrs << dendl; - hb_back_addrs = hb_back_server_messenger->get_myaddrs(); } if (auto session = local_connection->get_priv(); !session) { hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection); @@ -6773,7 +6770,6 @@ void OSD::_send_boot() if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) { dout(10) << " assuming hb_front_addrs match client_addrs " << client_addrs << dendl; - hb_front_addrs = hb_front_server_messenger->get_myaddrs(); } if (auto session = local_connection->get_priv(); !session) { hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection); @@ -6784,6 +6780,8 @@ void OSD::_send_boot() // are, so now is a good time! 
set_numa_affinity(); + entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs(); + entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs(); MOSDBoot *mboot = new MOSDBoot( superblock, get_osdmap_epoch(), service.get_boot_epoch(), hb_back_addrs, hb_front_addrs, cluster_addrs, From c7f5037293737322a20617bd7e43ab28da258d22 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Thu, 30 Mar 2023 20:36:55 +0200 Subject: [PATCH 0011/2492] ceph.spec.in: enable build on riscv64 for openSUSE Factory Signed-off-by: Andreas Schwab --- ceph.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph.spec.in b/ceph.spec.in index f0dd8e8a941a..5583c59128e3 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -189,7 +189,7 @@ URL: http://ceph.com/ Source0: %{?_remote_tarball_prefix}@TARBALL_BASENAME@.tar.bz2 %if 0%{?suse_version} # _insert_obs_source_lines_here -ExclusiveArch: x86_64 aarch64 ppc64le s390x +ExclusiveArch: x86_64 aarch64 ppc64le s390x riscv64 %endif ################################################################################# # dependencies that apply across all distro families From e2b2e8eb74c4d5cf06a5a2cb872a30b508bf75a2 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 12 Oct 2022 15:48:51 -0400 Subject: [PATCH 0012/2492] mds: trim cache during standby-replay Fixes: 138fea6a7638697acb1a9e824db7b8d04ad8d671 Signed-off-by: Patrick Donnelly --- src/mds/MDCache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 77303ddd8ada..9d8bd04d66c5 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -13595,7 +13595,7 @@ void MDCache::upkeep_main(void) if (active_with_clients) { trim_client_leases(); } - if (is_open()) { + if (is_open() || mds->is_standby_replay()) { trim(); } if (active_with_clients) { From fe35c9b200c5b5c6066a4bbec4207cf9b19957f4 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 13 Oct 2022 15:50:49 -0400 Subject: [PATCH 0013/2492] include: remove unused lru method Signed-off-by: Patrick Donnelly --- src/include/lru.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/include/lru.h b/src/include/lru.h index 3f5069ee3ed3..33f2f4e08ff9 100644 --- a/src/include/lru.h +++ b/src/include/lru.h @@ -185,10 +185,6 @@ class LRU { return NULL; } - void lru_status() { - //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl; - } - protected: // adjust top/bot balance, as necessary void adjust() { From a1ca8e8b9ab280fd0bdb36a1c3a42ea82dfeaea6 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 13 Oct 2022 15:51:06 -0400 Subject: [PATCH 0014/2492] mds: log lru stats during trim Signed-off-by: Patrick Donnelly --- src/mds/MDCache.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 9d8bd04d66c5..59a8f0739e2e 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6786,6 +6786,13 @@ std::pair MDCache::trim_lru(uint64_t count, expiremap& expiremap << " pinned=" << lru.lru_get_num_pinned() << dendl; + dout(20) << "bottom_lru: " << bottom_lru.lru_get_size() << " items" + ", " << bottom_lru.lru_get_top() << " top" + ", " << bottom_lru.lru_get_bot() << " bot" + ", " << bottom_lru.lru_get_pintail() << " pintail" + ", " << bottom_lru.lru_get_num_pinned() << " pinned" + << dendl; + const uint64_t trim_counter_start = trim_counter.get(); bool throttled = false; while (1) { @@ -6806,6 +6813,13 
@@ std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap
 }
 unexpirables.clear();

+ dout(20) << "lru: " << lru.lru_get_size() << " items"
+ ", " << lru.lru_get_top() << " top"
+ ", " << lru.lru_get_bot() << " bot"
+ ", " << lru.lru_get_pintail() << " pintail"
+ ", " << lru.lru_get_num_pinned() << " pinned"
+ << dendl;
+
 // trim dentries from the LRU until count is reached
 // if mds is in standby_replay and skip trimming the inodes
 while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {

From 589e59af11e40164695ca13f4ce4f2bc140b18b8 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Thu, 10 Nov 2022 08:22:35 -0500
Subject: [PATCH 0015/2492] mds: revert standby-replay trimming changes

Revert "mds: do not trim the inodes from the lru list in standby_replay"
Revert "mds: trim cache during standby replay"

This reverts commit 79bb44c1b9f1715378a9550a81984e949e454ff4.
This reverts commit c0fe25bb2a87856c1281eddcb4da2efe0d7fbf75.

standby-replay daemons were changed to keep minimal metadata from the
journal in cache but the original intent of standby-replay was to have
a cache that's as warm as the rank itself. This reverts the two commits
which changed that behavior.

Part of the reason for this is that the new rapid cache trimming
behavior was not correct at all. The trimming loop would break when it
runs into a dentry with non-null linkage. This would nearly always be
the case. It was thought that this was a problem introduced by [2] as
MDCache::standby_trim_segment has a different trim check [4] but the
original issue (tracker 48673) is as old as [1], indicating the problem
predates [2]. So, this commit reverts all of that.

I have lingering suspicions that the standby-replay daemon is not
pinning some dentries properly which causes [5] but this did not show
up unless the MDS was rapidly evicting some dentries. More research
needs to be done there.

[1] c0fe25bb2a87856c1281eddcb4da2efe0d7fbf75
[2] 79bb44c1b9f1715378a9550a81984e949e454ff4
[3] https://github.com/ceph/ceph/blob/84fba097049ec4f72549588eaacc64f30c7a88a8/src/mds/MDCache.cc#L6816-L6820
[4] https://github.com/ceph/ceph/blob/84fba097049ec4f72549588eaacc64f30c7a88a8/src/mds/MDCache.cc#L7476-L7481
[5] https://tracker.ceph.com/issues/50246

Fixes: https://tracker.ceph.com/issues/48673
Signed-off-by: Patrick Donnelly
---
 src/mds/MDCache.cc | 39 +++++----------------------------------
 1 file changed, 5 insertions(+), 34 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 59a8f0739e2e..d44ea8187c10 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6821,19 +6821,17 @@ std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap
 << dendl;

 // trim dentries from the LRU until count is reached
- // if mds is in standby_replay and skip trimming the inodes
- while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
+ while (!throttled && (cache_toofull() || count > 0)) {
 throttled |= trim_counter_start+trimmed >= trim_threshold;
 if (throttled) break;
 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
 if (!dn) {
 break;
 }
- if (is_standby_replay && dn->get_linkage()->inode) {
- // we move the inodes that need to be trimmed to the end of the lru queue.
- // refer to MDCache::standby_trim_segment
- lru.lru_insert_bot(dn);
- break;
+ if ((is_standby_replay && dn->get_linkage()->inode &&
+ dn->get_linkage()->inode->item_open_file.is_on_list())) {
+ dout(20) << "unexpirable: " << *dn << dendl;
+ unexpirables.push_back(dn);
 } else if (trim_dentry(dn, expiremap)) {
 unexpirables.push_back(dn);
 } else {
@@ -7479,69 +7477,42 @@ void MDCache::try_trim_non_auth_subtree(CDir *dir)

 void MDCache::standby_trim_segment(LogSegment *ls)
 {
- auto try_trim_inode = [this](CInode *in) {
- if (in->get_num_ref() == 0 &&
- !in->item_open_file.is_on_list() &&
- in->parent != NULL &&
- in->parent->get_num_ref() == 0){
- touch_dentry_bottom(in->parent);
- }
- };
-
- auto try_trim_dentry = [this](CDentry *dn) {
- if (dn->get_num_ref() > 0)
- return;
- auto in = dn->get_linkage()->inode;
- if(in && in->item_open_file.is_on_list())
- return;
- touch_dentry_bottom(dn);
- };
-
 ls->new_dirfrags.clear_list();
 ls->open_files.clear_list();

 while (!ls->dirty_dirfrags.empty()) {
 CDir *dir = ls->dirty_dirfrags.front();
 dir->mark_clean();
- if (dir->inode)
- try_trim_inode(dir->inode);
 }
 while (!ls->dirty_inodes.empty()) {
 CInode *in = ls->dirty_inodes.front();
 in->mark_clean();
- try_trim_inode(in);
 }
 while (!ls->dirty_dentries.empty()) {
 CDentry *dn = ls->dirty_dentries.front();
 dn->mark_clean();
- try_trim_dentry(dn);
 }
 while (!ls->dirty_parent_inodes.empty()) {
 CInode *in = ls->dirty_parent_inodes.front();
 in->clear_dirty_parent();
- try_trim_inode(in);
 }
 while (!ls->dirty_dirfrag_dir.empty()) {
 CInode *in = ls->dirty_dirfrag_dir.front();
 in->filelock.remove_dirty();
- try_trim_inode(in);
 }
 while (!ls->dirty_dirfrag_nest.empty()) {
 CInode *in = ls->dirty_dirfrag_nest.front();
 in->nestlock.remove_dirty();
- try_trim_inode(in);
 }
 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
 in->dirfragtreelock.remove_dirty();
- try_trim_inode(in);
 }
 while (!ls->truncating_inodes.empty()) {
 auto it = ls->truncating_inodes.begin();
 CInode *in = *it;
 ls->truncating_inodes.erase(it);
 in->put(CInode::PIN_TRUNCATING);
- try_trim_inode(in);
 }
 }

From 4e20faa5bd3ee1d6607f662a1cf9f2d3a42a540c Mon Sep 17 00:00:00 2001
From: myoungwon oh
Date: Fri, 23 Jun 2023 05:05:27 +0000
Subject: [PATCH 0016/2492] src/tools/ceph_dedup_tool: remove unused code

Signed-off-by: Myoungwon Oh
---
 src/tools/ceph_dedup_tool.cc | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc
index f3c942a97604..b8c79efa42ef 100644
--- a/src/tools/ceph_dedup_tool.cc
+++ b/src/tools/ceph_dedup_tool.cc
@@ -570,11 +570,6 @@ class SampleDedupWorkerThread : public Thread
 return cur_reference >= dedup_threshold && dedup_threshold != -1;
 }

- void init(size_t dedup_threshold_) {
- std::unique_lock lock(fingerprint_lock);
- fp_map.clear();
- dedup_threshold = dedup_threshold_;
- }
 FpStore(size_t chunk_threshold) : dedup_threshold(chunk_threshold) { }

 private:

From 094af522c7062cb91c87991ec55cbf67ee1da4dc Mon Sep 17 00:00:00 2001
From: Ronen Friedman
Date: Wed, 5 Jul 2023 06:01:10 -0500
Subject: [PATCH 0017/2492] test/rgw: annotating variables with maybe_unused

Some variables in test/rgw/rgw_cr_test.cc are only used in asserts,
while this file is also used in builds with NDEBUG set, where assert()
compiles away. Adding the [[maybe_unused]] attribute clears the
resulting compilation warnings.
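A minimal, self-contained illustration (not taken from the test file) of
why the attribute is needed:

```cpp
// With -DNDEBUG, assert() expands to nothing, so without the attribute
// `r` would be a set-but-unused variable and -Wall/-Wextra would warn.
#include <cassert>

static int do_work() { return 0; }

int main() {
  [[maybe_unused]] int r = do_work();
  assert(r == 0);  // compiled out in NDEBUG builds
  return 0;
}
```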
Signed-off-by: Ronen Friedman --- src/test/rgw/rgw_cr_test.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/test/rgw/rgw_cr_test.cc b/src/test/rgw/rgw_cr_test.cc index 37120925291a..2c250b390a98 100644 --- a/src/test/rgw/rgw_cr_test.cc +++ b/src/test/rgw/rgw_cr_test.cc @@ -55,12 +55,14 @@ struct TempPool { fmt::format("{}-{}-{}", ::time(nullptr), ::getpid(),num++); TempPool() { - auto r = store->getRados()->get_rados_handle()->pool_create(name.c_str()); + [[maybe_unused]] auto r = + store->getRados()->get_rados_handle()->pool_create(name.c_str()); assert(r == 0); } ~TempPool() { - auto r = store->getRados()->get_rados_handle()->pool_delete(name.c_str()); + [[maybe_unused]] auto r = + store->getRados()->get_rados_handle()->pool_delete(name.c_str()); assert(r == 0); } @@ -70,8 +72,9 @@ struct TempPool { operator librados::IoCtx() { librados::IoCtx ioctx; - auto r = store->getRados()->get_rados_handle()->ioctx_create(name.c_str(), - ioctx); + [[maybe_unused]] auto r = + store->getRados()->get_rados_handle()->ioctx_create(name.c_str(), + ioctx); assert(r == 0); return ioctx; } From cd323cb664a125834e41b77f711eac898e548382 Mon Sep 17 00:00:00 2001 From: Ville Ojamo <14869000+bluikko@users.noreply.github.com> Date: Fri, 7 Jul 2023 17:02:19 +0700 Subject: [PATCH 0018/2492] doc/radosgw/admin.rst: use underscores in config var names Following the current policy, config var names in `ceph.conf` etc. should use underscores instead of spaces. Signed-off-by: Ville Ojamo <14869000+bluikko@users.noreply.github.com> --- doc/radosgw/admin.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/radosgw/admin.rst b/doc/radosgw/admin.rst index fc2651ec0d3d..9ba9be3e212b 100644 --- a/doc/radosgw/admin.rst +++ b/doc/radosgw/admin.rst @@ -434,9 +434,9 @@ Default Quotas You can set default quotas in the config. These defaults are used when creating a new user and have no effect on existing users. If the relevant default quota is set in config, then that quota is set on the -new user, and that quota is enabled. See ``rgw bucket default quota max objects``, -``rgw bucket default quota max size``, ``rgw user default quota max objects``, and -``rgw user default quota max size`` in `Ceph Object Gateway Config Reference`_ +new user, and that quota is enabled. See ``rgw_bucket_default_quota_max_objects``, +``rgw_bucket_default_quota_max_size``, ``rgw_user_default_quota_max_objects``, and +``rgw_user_default_quota_max_size`` in `Ceph Object Gateway Config Reference`_ Quota Cache ----------- @@ -444,8 +444,8 @@ Quota Cache Quota statistics are cached on each RGW instance. If there are multiple instances, then the cache can keep quotas from being perfectly enforced, as each instance will have a different view of quotas. The options that control -this are ``rgw bucket quota ttl``, ``rgw user quota bucket sync interval`` and -``rgw user quota sync interval``. The higher these values are, the more +this are ``rgw_bucket_quota_ttl``, ``rgw_user_quota_bucket_sync_interval`` and +``rgw_user_quota_sync_interval``. The higher these values are, the more efficient quota operations are, but the more out-of-sync multiple instances will be. The lower these values are, the closer to perfect enforcement multiple instances will achieve. If all three are 0, then quota caching is @@ -647,7 +647,7 @@ Usage The Ceph Object Gateway logs usage for each user. You can track user usage within date ranges too. 
-- Add ``rgw enable usage log = true`` in [client.rgw] section of ceph.conf and restart the radosgw service. +- Add ``rgw_enable_usage_log = true`` in [client.rgw] section of ceph.conf and restart the radosgw service. Options include: From f2b5f0727401fd3ab975976555864c21860f3579 Mon Sep 17 00:00:00 2001 From: Vedansh Bhartia Date: Thu, 13 Jul 2023 17:17:06 +0530 Subject: [PATCH 0019/2492] rgw: Fix potential null dereference in rgw/driver/dbstore/sqlite/statement.cc Signed-off-by: Vedansh Bhartia --- src/rgw/driver/dbstore/sqlite/statement.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/rgw/driver/dbstore/sqlite/statement.cc b/src/rgw/driver/dbstore/sqlite/statement.cc index 3e44f4c0b6e5..199774f4d9ad 100644 --- a/src/rgw/driver/dbstore/sqlite/statement.cc +++ b/src/rgw/driver/dbstore/sqlite/statement.cc @@ -118,10 +118,10 @@ void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt) if (ec != sqlite::errc::done) { const char* errmsg = ::sqlite3_errmsg(db); ldpp_dout(dpp, 20) << "evaluation failed: " << errmsg - << " (" << ec << ")\nstatement: " << sql.get() << dendl; + << " (" << ec << ")\nstatement: " << (sql ? sql.get() : "") << dendl; throw sqlite::error(errmsg, ec); } - ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl; + ldpp_dout(dpp, 20) << "evaluation succeeded: " << (sql ? sql.get() : "") << dendl; } void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt) @@ -137,10 +137,10 @@ void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt) sqlite3* db = ::sqlite3_db_handle(stmt.get()); const char* errmsg = ::sqlite3_errmsg(db); ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec - << ")\nstatement: " << sql.get() << dendl; + << ")\nstatement: " << (sql ? sql.get() : "") << dendl; throw sqlite::error(errmsg, ec); } - ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl; + ldpp_dout(dpp, 20) << "evaluation succeeded: " << (sql ? sql.get() : "") << dendl; } int column_int(const stmt_execution& stmt, int column) @@ -181,14 +181,14 @@ auto read_text_rows(const DoutPrefixProvider* dpp, sqlite3* db = ::sqlite3_db_handle(stmt.get()); const char* errmsg = ::sqlite3_errmsg(db); ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec - << ")\nstatement: " << sql.get() << dendl; + << ")\nstatement: " << (sql ? sql.get() : "") << dendl; throw sqlite::error(errmsg, ec); } entries[count] = column_text(stmt, 0); ++count; } ldpp_dout(dpp, 20) << "statement evaluation produced " << count - << " results: " << sql.get() << dendl; + << " results: " << (sql ? 
sql.get() : "") << dendl; return entries.first(count); } From 1a2744282c46ca113f5f9f168d1f4d66c734e64d Mon Sep 17 00:00:00 2001 From: TomNewChao Date: Tue, 25 Jul 2023 15:08:44 +0800 Subject: [PATCH 0020/2492] Ceph dashboard supports multiple languages mgr/dashboard/frontend:Ceph dashboard supports multiple languages Signed-off-by: TomNewChao --- ceph.spec.in | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ceph.spec.in b/ceph.spec.in index 7af4123826a7..d1d11e8af451 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -24,6 +24,7 @@ %bcond_with zbd %bcond_with cmake_verbose_logging %bcond_without ceph_test_package +%bcond_without mgr_dashboard_frontend_support_multi_language %ifarch s390 %bcond_with tcmalloc %else @@ -470,6 +471,9 @@ BuildRequires: libnuma-devel %if 0%{?rhel} >= 8 BuildRequires: /usr/bin/pathfix.py %endif +%if 0%{with mgr_dashboard_frontend_support_multi_language} +BuildRequires: npm +%endif %description Ceph is a massively scalable, open-source, distributed storage system that runs @@ -1351,7 +1355,13 @@ cmake .. \ -DSYSTEMD_SYSTEM_UNIT_DIR:PATH=%{_unitdir} \ -DWITH_MANPAGE:BOOL=ON \ -DWITH_PYTHON3:STRING=%{python3_version} \ +%if 0%{with mgr_dashboard_frontend_support_multi_language} + -DWITH_MGR_DASHBOARD_FRONTEND:BOOL=ON \ + -DDASHBOARD_FRONTEND_LANGS:STRING="cs,de,es,fr,id,it,ja,ko,pl,zh-Hans,zh-Hant,pt" \ + -DWITH_SYSTEM_NPM:BOOL=ON \ +%else -DWITH_MGR_DASHBOARD_FRONTEND:BOOL=OFF \ +%endif %if 0%{?suse_version} -DWITH_RADOSGW_SELECT_PARQUET:BOOL=OFF \ %endif From 10edb2ffbdb7c4ef839f1ba7b88d7d85a682b7be Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 22 Aug 2023 14:10:46 +0800 Subject: [PATCH 0021/2492] crimson/osd/pg_recovery: avoiding duplicated object recovering UrgentRecovery and other recoveries may collide when doing `PGRecovery::add_recovering`, this is not an error. 
We should allow this to happen Signed-off-by: Xuehan Xu --- src/crimson/osd/pg_recovery.cc | 91 +++++++++++++++++++----------- src/crimson/osd/recovery_backend.h | 6 +- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index 09b45779ec87..efbbf7e4f3ee 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -266,20 +266,27 @@ PGRecovery::recover_missing( RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, const hobject_t &soid, eversion_t need) { - if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { - return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( - trigger, - pg->get_recovery_backend()->recover_delete(soid, need)); + logger().info("{} {} v {}", __func__, soid, need); + auto [recovering, added] = pg->get_recovery_backend()->add_recovering(soid); + if (added) { + logger().info("{} {} v {}, new recovery", __func__, soid, need); + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + return recovering.wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_delete(soid, need)); + } else { + return recovering.wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_object(soid, need) + .handle_exception_interruptible( + [=, this, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ); + } } else { - return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( - trigger, - pg->get_recovery_backend()->recover_object(soid, need) - .handle_exception_interruptible( - [=, this, soid = std::move(soid)] (auto e) { - on_failed_recover({ pg->get_pg_whoami() }, soid, need); - return seastar::make_ready_future<>(); - }) - ); + return recovering.wait_for_recovered(); } } @@ -288,16 +295,23 @@ RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_deletes( const hobject_t& soid, eversion_t need) { - return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( - trigger, - pg->get_recovery_backend()->push_delete(soid, need).then_interruptible( - [=, this] { - object_stat_sum_t stat_diff; - stat_diff.num_objects_recovered = 1; - on_global_recover(soid, stat_diff, true); - return seastar::make_ready_future<>(); - }) - ); + logger().info("{} {} v {}", __func__, soid, need); + auto [recovering, added] = pg->get_recovery_backend()->add_recovering(soid); + if (added) { + logger().info("{} {} v {}, new recovery", __func__, soid, need); + return recovering.wait_track_blocking( + trigger, + pg->get_recovery_backend()->push_delete(soid, need).then_interruptible( + [=, this] { + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }) + ); + } else { + return recovering.wait_for_recovered(); + } } RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_pushes( @@ -305,15 +319,22 @@ RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_pushes( const hobject_t& soid, eversion_t need) { - return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( - trigger, - pg->get_recovery_backend()->recover_object(soid, need) - .handle_exception_interruptible( - [=, this, soid = std::move(soid)] (auto e) { - on_failed_recover({ pg->get_pg_whoami() }, soid, need); - return seastar::make_ready_future<>(); - }) - ); + logger().info("{} {} v {}", __func__, soid, 
need); + auto [recovering, added] = pg->get_recovery_backend()->add_recovering(soid); + if (added) { + logger().info("{} {} v {}, new recovery", __func__, soid, need); + return recovering.wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_object(soid, need) + .handle_exception_interruptible( + [=, this, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ); + } else { + return recovering.wait_for_recovered(); + } } void PGRecovery::on_local_recover( @@ -449,9 +470,11 @@ void PGRecovery::enqueue_push( const hobject_t& obj, const eversion_t& v) { - logger().debug("{}: obj={} v={}", + logger().info("{}: obj={} v={}", __func__, obj, v); - pg->get_recovery_backend()->add_recovering(obj); + auto [recovering, added] = pg->get_recovery_backend()->add_recovering(obj); + if (!added) + return; std::ignore = pg->get_recovery_backend()->recover_object(obj, v).\ handle_exception_interruptible([] (auto) { ceph_abort_msg("got exception on backfill's push"); diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h index 65e9bb01fbda..abf695891596 100644 --- a/src/crimson/osd/recovery_backend.h +++ b/src/crimson/osd/recovery_backend.h @@ -45,10 +45,10 @@ class RecoveryBackend { coll{coll}, backend{backend} {} virtual ~RecoveryBackend() {} - WaitForObjectRecovery& add_recovering(const hobject_t& soid) { + std::pair add_recovering(const hobject_t& soid) { auto [it, added] = recovering.emplace(soid, new WaitForObjectRecovery{}); - assert(added); - return *(it->second); + assert(it->second); + return {*(it->second), added}; } WaitForObjectRecovery& get_recovering(const hobject_t& soid) { assert(is_recovering(soid)); From 094af522c7062cb91c87991ec55cbf67ee1da4dc Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 24 Aug 2023 12:14:13 +0800 Subject: [PATCH 0022/2492] crimson/osd/pg: discard watches' states after iterating all cached obcs Discarding watches' states while iterating cached obcs might have the following problem: 1. discard a watch's state 2. the corresponding object context's use_count drops to zero 3. the object context is unreferenced by obc lru 4. obc is deleted from obc lru by lru's evict() 5. obc iteration is corrupted Signed-off-by: Xuehan Xu --- src/crimson/osd/pg.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 85f6116e604b..696b6651a37f 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -1454,14 +1454,19 @@ void PG::on_change(ceph::os::Transaction &t) { } void PG::context_registry_on_change() { - obc_registry.for_each([](ObjectContextRef obc) { - assert(obc); - for (auto j = obc->watchers.begin(); - j != obc->watchers.end(); - j = obc->watchers.erase(j)) { - j->second->discard_state(); - } + std::vector> watchers; + obc_registry.for_each([&watchers](ObjectContextRef obc) { + assert(obc); + for (auto j = obc->watchers.begin(); + j != obc->watchers.end(); + j = obc->watchers.erase(j)) { + watchers.emplace_back(j->second); + } }); + + for (auto &watcher : watchers) { + watcher->discard_state(); + } } bool PG::can_discard_op(const MOSDOp& m) const { From 46f01d832487d3a3183783d41450fd8f49347097 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Fri, 25 Aug 2023 16:10:26 +0300 Subject: [PATCH 0023/2492] test/store_test: get rid off assert_death. Looks like death assertions aren't 100% reliable and might cause deadlock sometimes. 
Hence getting rid of them and enabling optional sending exception from *Store::queue_transaction() Fixes: https://tracker.ceph.com/issues/61193 Signed-off-by: Igor Fedotov --- src/common/options/global.yaml.in | 6 ++++++ src/os/bluestore/BlueStore.cc | 17 +++++++++++++++-- src/os/bluestore/BlueStore.h | 6 ++++++ src/os/kstore/KStore.cc | 16 ++++++++++++++-- src/os/kstore/KStore.h | 5 +++++ src/os/memstore/MemStore.cc | 6 +++++- src/test/objectstore/store_test.cc | 19 ++++++++++++++----- src/test/objectstore/store_test_fixture.cc | 4 ---- src/test/objectstore/store_test_fixture.h | 8 -------- 9 files changed, 65 insertions(+), 22 deletions(-) diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 3a3a4a137291..cbe895941183 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -6343,3 +6343,9 @@ options: default: 0 services: - mgr +- name: objectstore_debug_throw_on_failed_txc + type: bool + level: dev + desc: Enables exception throwing instead of process abort on transaction submission error. + default: false + with_legacy: false diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index e3e4833f1a2d..3d9f680ba6c2 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7852,6 +7852,7 @@ int BlueStore::_mount() int BlueStore::umount() { + dout(5) << __func__ << dendl; ceph_assert(_kv_only || mounted); _osr_drain_all(); @@ -14434,7 +14435,13 @@ void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t) << " not handled on operation " << op->op << " (op " << pos << ", counting from 0)" << dendl; _dump_transaction<0>(cct, t); - ceph_abort_msg("unexpected error"); + if (!g_conf().get_val("objectstore_debug_throw_on_failed_txc")) { + ceph_abort_msg("unexpected error"); + } else { + txc->osr->undo_queue(txc); + delete txc; + throw r; + } } // these operations implicity create the object @@ -14680,7 +14687,13 @@ void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t) << dendl; derr << msg << dendl; _dump_transaction<0>(cct, t); - ceph_abort_msg("unexpected error"); + if (!g_conf().get_val("objectstore_debug_throw_on_failed_txc")) { + ceph_abort_msg("unexpected error"); + } else { + txc->osr->undo_queue(txc); + delete txc; + throw r; + } } } } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index c3c53264ec1d..486c7cd4de09 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2055,6 +2055,12 @@ class BlueStore : public ObjectStore, txc->seq = ++last_seq; q.push_back(*txc); } + void undo_queue(TransContext* txc) { + std::lock_guard l(qlock); + ceph_assert(&q.back() == txc); + --last_seq; + q.pop_back(); + } void drain() { std::unique_lock l(qlock); diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc index 9526a756419c..7158486ca388 100644 --- a/src/os/kstore/KStore.cc +++ b/src/os/kstore/KStore.cc @@ -2285,7 +2285,13 @@ void KStore::_txc_add_transaction(TransContext *txc, Transaction *t) f.close_section(); f.flush(*_dout); *_dout << dendl; - ceph_abort_msg("unexpected error"); + if (!g_conf().get_val("objectstore_debug_throw_on_failed_txc")) { + ceph_abort_msg("unexpected error"); + } else { + txc->osr->undo_queue(txc); + delete txc; + throw r; + } } // object operations @@ -2534,7 +2540,13 @@ void KStore::_txc_add_transaction(TransContext *txc, Transaction *t) f.close_section(); f.flush(*_dout); *_dout << dendl; - ceph_abort_msg("unexpected error"); + if 
(!g_conf().get_val("objectstore_debug_throw_on_failed_txc")) { + ceph_abort_msg("unexpected error"); + } else { + txc->osr->undo_queue(txc); + delete txc; + throw r; + } } } } diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h index 9e3c7acd73b4..30a7606fd2fb 100644 --- a/src/os/kstore/KStore.h +++ b/src/os/kstore/KStore.h @@ -276,6 +276,11 @@ class KStore : public ObjectStore { std::lock_guard l(qlock); q.push_back(*txc); } + void undo_queue(TransContext* txc) { + std::lock_guard l(qlock); + ceph_assert(&q.back() == txc); + q.pop_back(); + } void flush() { std::unique_lock l(qlock); diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc index 99e99dcba041..8ada7524dbcd 100644 --- a/src/os/memstore/MemStore.cc +++ b/src/os/memstore/MemStore.cc @@ -1032,7 +1032,11 @@ void MemStore::_do_transaction(Transaction& t) f.close_section(); f.flush(*_dout); *_dout << dendl; - ceph_abort_msg("unexpected error"); + if (!g_conf().get_val("objectstore_debug_throw_on_failed_txc")) { + ceph_abort_msg("unexpected error"); + } else { + throw r; + } } } diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 482d30283041..d23c17a019dc 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -3245,7 +3245,8 @@ TEST_P(StoreTest, SimpleCloneTest) { int r; coll_t cid; - SetDeathTestStyle("threadsafe"); + SetVal(g_conf(), "objectstore_debug_throw_on_failed_txc", "true"); + g_conf().apply_changes(nullptr); auto ch = store->create_new_collection(cid); { @@ -3531,8 +3532,12 @@ TEST_P(StoreTest, SimpleCloneTest) { ObjectStore::Transaction t; t.remove_collection(cid); cerr << "Invalid rm coll" << std::endl; - PrCtl unset_dumpable; - EXPECT_DEATH(queue_transaction(store, ch, std::move(t)), ""); + try { + queue_transaction(store, ch, std::move(t)); + FAIL() << "remove_collection failed to return ENOTEMPTY."; + } catch (int err) { + ASSERT_EQ(err, -ENOTEMPTY); + } } { ObjectStore::Transaction t; @@ -3554,8 +3559,12 @@ TEST_P(StoreTest, SimpleCloneTest) { t.remove(cid, hoid); t.remove(cid, hoid2); t.remove_collection(cid); - PrCtl unset_dumpable; - EXPECT_DEATH(queue_transaction(store, ch, std::move(t)), ""); + try { + queue_transaction(store, ch, std::move(t)); + FAIL() << "remove_collection failed to return ENOTEMPTY."; + } catch (int err) { + ASSERT_EQ(err, -ENOTEMPTY); + } } { ObjectStore::Transaction t; diff --git a/src/test/objectstore/store_test_fixture.cc b/src/test/objectstore/store_test_fixture.cc index a3bdc7a36ac3..0cffd79a709d 100644 --- a/src/test/objectstore/store_test_fixture.cc +++ b/src/test/objectstore/store_test_fixture.cc @@ -77,10 +77,6 @@ void StoreTestFixture::TearDown() // config settings. Hence setting it to 'unsafe' here as test case is closing. 
g_conf()._clear_safe_to_start_threads(); PopSettings(0); - if (!orig_death_test_style.empty()) { - ::testing::FLAGS_gtest_death_test_style = orig_death_test_style; - orig_death_test_style.clear(); - } } void StoreTestFixture::SetVal(ConfigProxy& _conf, const char* key, const char* val) diff --git a/src/test/objectstore/store_test_fixture.h b/src/test/objectstore/store_test_fixture.h index 3f25fd493d0d..0495c21bd327 100644 --- a/src/test/objectstore/store_test_fixture.h +++ b/src/test/objectstore/store_test_fixture.h @@ -13,8 +13,6 @@ class StoreTestFixture : virtual public ::testing::Test { std::stack> saved_settings; ConfigProxy* conf = nullptr; - std::string orig_death_test_style; - public: std::unique_ptr store; ObjectStore::CollectionHandle ch; @@ -25,12 +23,6 @@ class StoreTestFixture : virtual public ::testing::Test { void SetUp() override; void TearDown() override; - void SetDeathTestStyle(const char* new_style) { - if (orig_death_test_style.empty()) { - orig_death_test_style = ::testing::FLAGS_gtest_death_test_style; - } - ::testing::FLAGS_gtest_death_test_style = new_style; - } void SetVal(ConfigProxy& conf, const char* key, const char* val); struct SettingsBookmark { From a778e1533a2e0e6edc75916c7005cfc5a4baa41d Mon Sep 17 00:00:00 2001 From: myoungwon oh Date: Fri, 23 Jun 2023 05:03:21 +0000 Subject: [PATCH 0024/2492] src/tools/ceph_dedup_tool: add prints to check the dedup progress Signed-off-by: Myoungwon Oh --- src/tools/ceph_dedup_tool.cc | 50 +++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc index b8c79efa42ef..fa6bd5e4d5dc 100644 --- a/src/tools/ceph_dedup_tool.cc +++ b/src/tools/ceph_dedup_tool.cc @@ -551,6 +551,17 @@ class SampleDedupWorkerThread : public Thread public: using dup_count_t = ssize_t; + void maybe_print_status() { + utime_t now = ceph_clock_now(); + if (next_report != utime_t() && now > next_report) { + cerr << (int)(now - start) << "s : read " + << total_bytes << " bytes so far..." 
+ << std::endl;
+ next_report = now;
+ next_report += report_period;
+ }
+ }
+
 bool find(string& fp) {
 std::shared_lock lock(fingerprint_lock);
 auto found_item = fp_map.find(fp);
@@ -562,6 +573,8 @@ class SampleDedupWorkerThread : public Thread
 std::unique_lock lock(fingerprint_lock);
 auto found_iter = fp_map.find(chunk.fingerprint);
 ssize_t cur_reference = 1;
+ total_bytes += chunk.size;
+ maybe_print_status();
 if (found_iter == fp_map.end()) {
 fp_map.insert({chunk.fingerprint, 1});
 } else {
@@ -570,12 +583,20 @@ class SampleDedupWorkerThread : public Thread
 return cur_reference >= dedup_threshold && dedup_threshold != -1;
 }
- FpStore(size_t chunk_threshold) : dedup_threshold(chunk_threshold) { }
+ FpStore(size_t chunk_threshold, uint32_t report_period) :
+ dedup_threshold(chunk_threshold), report_period(report_period) {
+ next_report = start;
+ next_report += report_period;
+ }
 private:
 ssize_t dedup_threshold = -1;
 std::unordered_map<std::string, dup_count_t> fp_map;
 std::shared_mutex fingerprint_lock;
+ const utime_t start = ceph_clock_now();
+ utime_t next_report;
+ const uint32_t report_period = default_report_period;
+ size_t total_bytes = 0;
 };
 struct SampleDedupGlobal {
@@ -583,8 +604,9 @@ class SampleDedupWorkerThread : public Thread
 const double sampling_ratio = -1;
 SampleDedupGlobal(
 int chunk_threshold,
- int sampling_ratio) :
- fp_store(chunk_threshold),
+ int sampling_ratio,
+ uint32_t report_period) :
+ fp_store(chunk_threshold, report_period),
 sampling_ratio(static_cast<double>(sampling_ratio) / 100) { }
 };
@@ -608,6 +630,14 @@ class SampleDedupWorkerThread : public Thread
 ~SampleDedupWorkerThread() { };
+ size_t get_total_duplicated_size() const {
+ return total_duplicated_size;
+ }
+
+ size_t get_total_object_size() const {
+ return total_object_size;
+ }
+
 protected:
 void* entry() override {
 crawl();
@@ -1527,6 +1557,7 @@ int make_crawling_daemon(const po::variables_map &opts)
 string base_pool_name = get_opts_pool_name(opts);
 string chunk_pool_name = get_opts_chunk_pool(opts);
 unsigned max_thread = get_opts_max_thread(opts);
+ uint32_t report_period = default_report_period;
 bool loop = false;
 if (opts.count("loop")) {
@@ -1550,6 +1581,7 @@ int make_crawling_daemon(const po::variables_map &opts)
 }
 std::string chunk_algo = get_opts_chunk_algo(opts);
+ report_period = get_opts_report_period(opts);
 Rados rados;
 int ret = rados.init_with_context(g_ceph_context);
@@ -1640,9 +1672,11 @@ int make_crawling_daemon(const po::variables_map &opts)
 }
 SampleDedupWorkerThread::SampleDedupGlobal sample_dedup_global(
- chunk_dedup_threshold, sampling_ratio);
+ chunk_dedup_threshold, sampling_ratio, report_period);
 std::list<SampleDedupWorkerThread> threads;
+ size_t total_size = 0;
+ size_t total_duplicate_size = 0;
 for (unsigned i = 0; i < max_thread; i++) {
 cout << " add thread.. " << std::endl;
 ObjectCursor shard_start;
@@ -1668,8 +1702,16 @@ int make_crawling_daemon(const po::variables_map &opts)
 }
 for (auto &p : threads) {
 p.join();
+ total_size += p.get_total_object_size();
+ total_duplicate_size += p.get_total_duplicated_size();
 }
+
+ cerr << "Summary: read "
+ << total_size << " bytes so far and found saveable space ("
+ << total_duplicate_size << " bytes)."
+ << std::endl;
+
 if (loop) {
 sleep(wakeup_period);
 } else {

From ced1627fb25b0a42cd53152b06a8cd452ba31482 Mon Sep 17 00:00:00 2001
From: myoungwon oh
Date: Fri, 23 Jun 2023 07:07:07 +0000
Subject: [PATCH 0025/2492] src/tools/ceph_dedup_tool: print the progress in
 the process of chunk scrub

Signed-off-by: Myoungwon Oh
---
 src/tools/ceph_dedup_tool.cc | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc
index fa6bd5e4d5dc..c8432f87fa26 100644
--- a/src/tools/ceph_dedup_tool.cc
+++ b/src/tools/ceph_dedup_tool.cc
@@ -433,6 +433,7 @@ void EstimateDedupRatio::estimate_dedup_ratio()
 }
 }
+static void print_chunk_scrub();
 void ChunkScrub::chunk_scrub_common()
 {
 ObjectCursor shard_start;
@@ -459,6 +460,13 @@ void ChunkScrub::chunk_scrub_common()
 &shard_start,
 &shard_end);
+ const utime_t start = ceph_clock_now();
+ utime_t next_report;
+ if (report_period) {
+ next_report = start;
+ next_report += report_period;
+ }
+
 ObjectCursor c(shard_start);
 while(c < shard_end) {
@@ -477,6 +485,17 @@ void ChunkScrub::chunk_scrub_common()
 delete formatter;
 return;
 }
+
+ utime_t now = ceph_clock_now();
+ if (n == 0 && // first thread only
+ next_report != utime_t() && now > next_report) {
+ cerr << (int)(now - start) << "s, interim findings are: "
+ << std::endl;
+ print_chunk_scrub();
+ next_report = now;
+ next_report += report_period;
+ }
+
 auto oid = i.oid;
 cout << oid << std::endl;
 chunk_refs_t refs;
 {
 bufferlist t;

From bbce7f8ac8e9c5a096e59d6b388b0294ab8bff19 Mon Sep 17 00:00:00 2001
From: myoungwon oh
Date: Fri, 23 Jun 2023 13:32:15 +0000
Subject: [PATCH 0026/2492] src/tools/ceph_dedup_tool: verbose print only if
 debug is enabled

Signed-off-by: Myoungwon Oh
---
 src/tools/ceph_dedup_tool.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc
index c8432f87fa26..b4a461aabed5 100644
--- a/src/tools/ceph_dedup_tool.cc
+++ b/src/tools/ceph_dedup_tool.cc
@@ -497,7 +497,9 @@ void ChunkScrub::chunk_scrub_common()
 }
 auto oid = i.oid;
- cout << oid << std::endl;
+ if (debug) {
+ cout << oid << std::endl;
+ }
 chunk_refs_t refs;
 {
 bufferlist t;

From 315ad2605ad15ff6bf4cfe525df65661b134844b Mon Sep 17 00:00:00 2001
From: myoungwon oh
Date: Tue, 29 Aug 2023 08:04:17 +0000
Subject: [PATCH 0027/2492] src/tools/ceph_dedup_tool: move default values to
 options_description

Signed-off-by: Myoungwon Oh
---
 src/tools/ceph_dedup_tool.cc | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc
index b4a461aabed5..c394ec829f48 100644
--- a/src/tools/ceph_dedup_tool.cc
+++ b/src/tools/ceph_dedup_tool.cc
@@ -132,8 +132,6 @@ map<uint64_t, EstimateResult> dedup_estimates; // chunk size -> result
 using namespace librados;
 unsigned default_op_size = 1 << 26;
-unsigned default_max_thread = 2;
-int32_t default_report_period = 10;
 ceph::mutex glock = ceph::make_mutex("glock");
 po::options_description make_usage() {
@@ -169,8 +167,8 @@ po::options_description make_usage() {
 ("chunk-algorithm", po::value<std::string>(), ": <fixed|fastcdc>, set chunk-algorithm")
 ("fingerprint-algorithm", po::value<std::string>(), ": <sha1|sha256|sha512>, set fingerprint-algorithm")
 ("chunk-pool", po::value<std::string>(), ": set chunk pool name")
- ("max-thread", po::value<unsigned>(), ": set max thread")
- ("report-period", po::value<int>(), ": set report-period")
+ ("max-thread", po::value<unsigned>()->default_value(2), ": set max thread")
+ ("report-period", po::value<int>()->default_value(10), ": set report-period")
 ("max-seconds", po::value<int>(), ": set
max runtime") ("max-read-size", po::value(), ": set max read size") ("pool", po::value(), ": set pool name") @@ -616,7 +614,7 @@ class SampleDedupWorkerThread : public Thread std::shared_mutex fingerprint_lock; const utime_t start = ceph_clock_now(); utime_t next_report; - const uint32_t report_period = default_report_period; + const uint32_t report_period; size_t total_bytes = 0; }; @@ -1029,8 +1027,8 @@ int estimate_dedup_ratio(const po::variables_map &opts) uint64_t chunk_size = 8192; uint64_t min_chunk_size = 8192; uint64_t max_chunk_size = 4*1024*1024; - unsigned max_thread = default_max_thread; - uint32_t report_period = default_report_period; + unsigned max_thread = get_opts_max_thread(opts); + uint32_t report_period = get_opts_report_period(opts); uint64_t max_read_size = default_op_size; uint64_t max_seconds = 0; int ret; @@ -1069,8 +1067,6 @@ int estimate_dedup_ratio(const po::variables_map &opts) } else { cout << "4MB is set as max chunk size by default" << std::endl; } - max_thread = get_opts_max_thread(opts); - report_period = get_opts_report_period(opts); if (opts.count("max-seconds")) { max_seconds = opts["max-seconds"].as(); } else { @@ -1188,9 +1184,9 @@ int chunk_scrub_common(const po::variables_map &opts) std::string object_name, target_object_name; string chunk_pool_name, op_name; int ret; - unsigned max_thread = default_max_thread; + unsigned max_thread = get_opts_max_thread(opts); std::map::const_iterator i; - uint32_t report_period = default_report_period; + uint32_t report_period = get_opts_report_period(opts); ObjectCursor begin; ObjectCursor end; librados::pool_stat_t s; @@ -1341,8 +1337,6 @@ int chunk_scrub_common(const po::variables_map &opts) return 0; } - max_thread = get_opts_max_thread(opts); - report_period = get_opts_report_period(opts); glock.lock(); begin = chunk_io_ctx.object_list_begin(); end = chunk_io_ctx.object_list_end(); @@ -1578,7 +1572,7 @@ int make_crawling_daemon(const po::variables_map &opts) string base_pool_name = get_opts_pool_name(opts); string chunk_pool_name = get_opts_chunk_pool(opts); unsigned max_thread = get_opts_max_thread(opts); - uint32_t report_period = default_report_period; + uint32_t report_period = get_opts_report_period(opts); bool loop = false; if (opts.count("loop")) { @@ -1602,7 +1596,6 @@ int make_crawling_daemon(const po::variables_map &opts) } std::string chunk_algo = get_opts_chunk_algo(opts); - report_period = get_opts_report_period(opts); Rados rados; int ret = rados.init_with_context(g_ceph_context); From 1b7a7a8df88ffac007dbafdecc131807de66c046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Zarzy=C5=84ski?= Date: Wed, 30 Aug 2023 15:19:07 +0200 Subject: [PATCH 0028/2492] common/weighted_shuffle: don't feed std::discrete_distribution with all-zero weights MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This flaw results is the assertions like the following one: ``` /usr/include/c++/11/bits/random.tcc:2667: void std::discrete_distribution<_IntType>::param_type::_M_initialize() [with _IntType = int]: Assertion '__sum > 0' failed. Aborted (core dumped) ``` The reason behind is that `std::discrete_distribution` sums the weights and uses the result as a divisor. 
Fixes: https://tracker.ceph.com/issues/62645
Signed-off-by: Radosław Zarzyński
---
 src/common/weighted_shuffle.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/common/weighted_shuffle.h b/src/common/weighted_shuffle.h
index 10def0a011a4..dd8f22da014d 100644
--- a/src/common/weighted_shuffle.h
+++ b/src/common/weighted_shuffle.h
@@ -14,6 +14,8 @@ void weighted_shuffle(RandomIt first, RandomIt last,
 {
 if (first == last) {
 return;
+ } else if (std::accumulate(weight_first, weight_last, 0) == 0) {
+ return;
 } else {
 std::discrete_distribution d{weight_first, weight_last};
 if (auto n = d(g); n > 0) {

From d02b17ff84c61123ed27d79dc177c2cfbbe6a72f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Zarzy=C5=84ski?=
Date: Wed, 30 Aug 2023 15:23:34 +0200
Subject: [PATCH 0029/2492] test/test_weighted_shuffle: verify weights
 containing zeros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Radosław Zarzyński
---
 src/test/test_weighted_shuffle.cc | 52 +++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/test/test_weighted_shuffle.cc b/src/test/test_weighted_shuffle.cc
index 9f92cbdc0951..efc1cdeb7cb1 100644
--- a/src/test/test_weighted_shuffle.cc
+++ b/src/test/test_weighted_shuffle.cc
@@ -37,3 +37,55 @@ TEST(WeightedShuffle, Basic) {
 epsilon);
 }
 }
+
+TEST(WeightedShuffle, ZeroedWeights) {
+ std::array choices{'a', 'b', 'c', 'd', 'e'};
+ std::array weights{0, 0, 0, 0, 0};
+ std::map<char, std::vector<int>> frequency {
+ {'a', {0, 0, 0, 0, 0}},
+ {'b', {0, 0, 0, 0, 0}},
+ {'c', {0, 0, 0, 0, 0}},
+ {'d', {0, 0, 0, 0, 0}},
+ {'e', {0, 0, 0, 0, 0}}
+ }; // count each element appearing in each position
+ const int samples = 10000;
+ std::random_device rd;
+ for (auto i = 0; i < samples; i++) {
+ weighted_shuffle(begin(choices), end(choices),
+ begin(weights), end(weights),
+ std::mt19937{rd()});
+ for (size_t j = 0; j < choices.size(); ++j)
+ ++frequency[choices[j]][j];
+ }
+
+ for (char ch : choices) {
+ // all samples on the diagonal
+ ASSERT_EQ(std::accumulate(begin(frequency[ch]), end(frequency[ch]), 0),
+ samples);
+ ASSERT_EQ(frequency[ch][ch-'a'], samples);
+ }
+}
+
+TEST(WeightedShuffle, SingleNonZeroWeight) {
+ std::array choices{'a', 'b', 'c', 'd', 'e'};
+ std::array weights{0, 42, 0, 0, 0};
+ std::map<char, std::vector<int>> frequency {
+ {'a', {0, 0, 0, 0, 0}},
+ {'b', {0, 0, 0, 0, 0}},
+ {'c', {0, 0, 0, 0, 0}},
+ {'d', {0, 0, 0, 0, 0}},
+ {'e', {0, 0, 0, 0, 0}}
+ }; // count each element appearing in each position
+ const int samples = 10000;
+ std::random_device rd;
+ for (auto i = 0; i < samples; i++) {
+ weighted_shuffle(begin(choices), end(choices),
+ begin(weights), end(weights),
+ std::mt19937{rd()});
+ for (size_t j = 0; j < choices.size(); ++j)
+ ++frequency[choices[j]][j];
+ }
+
+ // 'b' is always first
+ ASSERT_EQ(frequency['b'][0], samples);
+}

From f4e2c3351f6d871ffe38c66e95ac99688f1d28c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Zarzy=C5=84ski?=
Date: Tue, 5 Sep 2023 15:21:21 +0200
Subject: [PATCH 0030/2492] crimson: drop store from ECBackend to not shadow
 PGBackend::store
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This problem was leading to crashes like this one:

```
../src/crimson/os/futurized_store.h:113:41: runtime error: member access within misaligned address 0xbebebebebebebebe for type 'struct Shard', which requires 8 byte alignment
0xbebebebebebebebe: note: pointer points here
--Type <RET> for more, q to quit, c to continue without paging--

Thread 1 "crimson-osd"
received signal SIGSEGV, Segmentation fault.
0x000055558e7a1dca in crimson::os::FuturizedStore::Shard::do_transaction (txn=..., ch=..., this=0xbebebebebebebebe)
 at ../src/crimson/os/futurized_store.h:113
113 return do_transaction_no_callbacks(
(gdb) bt
```

Signed-off-by: Radosław Zarzyński
---
 src/crimson/osd/ec_backend.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h
index 3dbcc4def2e0..56fbb4454231 100644
--- a/src/crimson/osd/ec_backend.h
+++ b/src/crimson/osd/ec_backend.h
@@ -33,7 +33,6 @@ class ECBackend : public PGBackend
 epoch_t min_epoch, epoch_t max_epoch,
 std::vector<pg_log_entry_t>&& log_entries) final;
 CollectionRef coll;
- crimson::os::FuturizedStore::Shard* store;
 seastar::future<> request_committed(const osd_reqid_t& reqid,
 const eversion_t& version) final {
 return seastar::now();

From 44c24aaf33a6b3f552c49da7da63656f097c3914 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Zarzy=C5=84ski?=
Date: Fri, 1 Sep 2023 19:39:49 +0200
Subject: [PATCH 0031/2492] mon/OSDMonitor: fix the hint for set-allow-crimson
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Radosław Zarzyński
---
 src/mon/OSDMonitor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 4ad44d0309d5..ebb130a072cd 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -3518,7 +3518,7 @@ bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
 if (!osdmap.get_allow_crimson()) {
 mon.clog->info()
 << "Disallowing boot of crimson-osd without allow_crimson "
- << "OSDMap flag. Run ceph osd set_allow_crimson to set "
+ << "OSDMap flag. Run ceph osd set-allow-crimson to set "
 << "allow_crimson flag. Note that crimson-osd is "
 << "considered unstable and may result in crashes or "
 << "data loss. Its usage should be restricted to "

From a78e660728c6c0442cdbfa65db776b5856aee933 Mon Sep 17 00:00:00 2001
From: Kim Minjong
Date: Fri, 3 Feb 2023 11:47:47 +0900
Subject: [PATCH 0032/2492] ceph-volume: fix a bug in
 _check_generic_reject_reasons

The types of removable and ro are wrong: sysfs reports them as strings,
so comparing them against integers means neither filter works at all.
Change them from integer to string and correct the test data.

Also delete the redundant check in get_block_devs_sysfs. Given the name
of the function, _check_generic_reject_reasons is the right place for
this judgement, and in fact that is where it was made before v17.2.4.
Fixes: https://tracker.ceph.com/issues/58591 Signed-off-by: Kim Minjong --- .../ceph_volume/tests/util/test_device.py | 22 +++++++++---------- src/ceph-volume/ceph_volume/util/device.py | 4 ++-- src/ceph-volume/ceph_volume/util/disk.py | 2 -- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/ceph-volume/ceph_volume/tests/util/test_device.py b/src/ceph-volume/ceph_volume/tests/util/test_device.py index e382981d9232..e2ea026286f9 100644 --- a/src/ceph-volume/ceph_volume/tests/util/test_device.py +++ b/src/ceph-volume/ceph_volume/tests/util/test_device.py @@ -241,7 +241,7 @@ def test_is_ceph_disk_member_not_available_blkid(self, fake_call, monkeypatch, p @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_reject_removable_device(self, fake_call, device_info): - data = {"/dev/sdb": {"removable": 1}} + data = {"/dev/sdb": {"removable": "1"}} lsblk = {"TYPE": "disk", "NAME": "sdb"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/sdb") @@ -249,7 +249,7 @@ def test_reject_removable_device(self, fake_call, device_info): @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_reject_device_with_gpt_headers(self, fake_call, device_info): - data = {"/dev/sdb": {"removable": 0, "size": 5368709120}} + data = {"/dev/sdb": {"removable": "0", "size": 5368709120}} lsblk = {"TYPE": "disk", "NAME": "sdb"} blkid= {"PTTYPE": "gpt"} device_info( @@ -262,7 +262,7 @@ def test_reject_device_with_gpt_headers(self, fake_call, device_info): @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_accept_non_removable_device(self, fake_call, device_info): - data = {"/dev/sdb": {"removable": 0, "size": 5368709120}} + data = {"/dev/sdb": {"removable": "0", "size": 5368709120}} lsblk = {"TYPE": "disk", "NAME": "sdb"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/sdb") @@ -286,7 +286,7 @@ def test_accept_symlink_to_device(self, fake_call): m_os_path_islink.return_value = True m_os_path_realpath.return_value = '/dev/sdb' - data = {"/dev/sdb": {"ro": 0, "size": 5368709120}} + data = {"/dev/sdb": {"ro": "0", "size": 5368709120}} lsblk = {"TYPE": "disk"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/test_symlink") @@ -304,7 +304,7 @@ def test_reject_symlink_to_device_mapper(self, fake_call): m_os_path_islink.return_value = True m_os_readlink.return_value = '/dev/dm-0' - data = {"/dev/mapper/mpatha": {"ro": 0, "size": 5368709120}} + data = {"/dev/mapper/mpatha": {"ro": "0", "size": 5368709120}} lsblk = {"TYPE": "disk"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/mapper/mpatha") @@ -312,7 +312,7 @@ def test_reject_symlink_to_device_mapper(self, @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_reject_readonly_device(self, fake_call, device_info): - data = {"/dev/cdrom": {"ro": 1}} + data = {"/dev/cdrom": {"ro": "1"}} lsblk = {"TYPE": "disk", "NAME": "cdrom"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/cdrom") @@ -328,7 +328,7 @@ def test_reject_smaller_than_5gb(self, fake_call, device_info): @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_accept_non_readonly_device(self, fake_call, device_info): - data = {"/dev/sda": {"ro": 0, "size": 5368709120}} + data = {"/dev/sda": {"ro": "0", "size": 5368709120}} lsblk = {"TYPE": "disk", "NAME": "sda"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/sda") @@ -594,10 +594,10 @@ class TestDeviceOrdering(object): def setup_method(self): 
 self.data = {
- "/dev/sda": {"removable": 0},
- "/dev/sdb": {"removable": 1}, # invalid
- "/dev/sdc": {"removable": 0},
- "/dev/sdd": {"removable": 1}, # invalid
+ "/dev/sda": {"removable": "0"},
+ "/dev/sdb": {"removable": "1"}, # invalid
+ "/dev/sdc": {"removable": "0"},
+ "/dev/sdd": {"removable": "1"}, # invalid
 }
 @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False)
diff --git a/src/ceph-volume/ceph_volume/util/device.py b/src/ceph-volume/ceph_volume/util/device.py
index d61222afe0a0..d01d395339d4 100644
--- a/src/ceph-volume/ceph_volume/util/device.py
+++ b/src/ceph-volume/ceph_volume/util/device.py
@@ -594,8 +594,8 @@ def has_partitions(self):
 def _check_generic_reject_reasons(self):
 reasons = [
- ('removable', 1, 'removable'),
- ('ro', 1, 'read-only'),
+ ('removable', '1', 'removable'),
+ ('ro', '1', 'read-only'),
 ]
 rejected = [reason for (k, v, reason) in reasons if
 self.sys_api.get(k, '') == v]
diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py
index a96b0f0a7f84..fa900e5da0f3 100644
--- a/src/ceph-volume/ceph_volume/util/disk.py
+++ b/src/ceph-volume/ceph_volume/util/disk.py
@@ -779,8 +779,6 @@ def holder_inner_loop():
 continue
 type_ = 'disk'
 holders = os.listdir(os.path.join(_sys_block_path, dev, 'holders'))
- if get_file_contents(os.path.join(_sys_block_path, dev, 'removable')) == "1":
- continue
 if holder_inner_loop():
 continue
 dm_dir_path = os.path.join(_sys_block_path, dev, 'dm')

From bd5e1a83495e31e457827f564c56fba23f4da8c9 Mon Sep 17 00:00:00 2001
From: Kim Minjong
Date: Fri, 3 Feb 2023 13:57:05 +0900
Subject: [PATCH 0033/2492] ceph-volume: allow removable devices but exclude
 USB

Change the logic to exclude only USB devices, so that the hot-swap
devices which passed this filter before the bug are accepted again.

Fixes: https://tracker.ceph.com/issues/57907
Fixes: https://tracker.ceph.com/issues/58189
Fixes: https://tracker.ceph.com/issues/58306
Fixes: https://tracker.ceph.com/issues/58591

Signed-off-by: Kim Minjong
---
 src/ceph-volume/ceph_volume/util/device.py | 2 +-
 src/ceph-volume/ceph_volume/util/disk.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/ceph-volume/ceph_volume/util/device.py b/src/ceph-volume/ceph_volume/util/device.py
index d01d395339d4..bb806292f2c4 100644
--- a/src/ceph-volume/ceph_volume/util/device.py
+++ b/src/ceph-volume/ceph_volume/util/device.py
@@ -594,7 +594,7 @@ def has_partitions(self):
 def _check_generic_reject_reasons(self):
 reasons = [
- ('removable', '1', 'removable'),
+ ('id_bus', 'usb', 'id_bus'),
 ('ro', '1', 'read-only'),
 ]
 rejected = [reason for (k, v, reason) in reasons if
diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py
index fa900e5da0f3..dc1d9b1b7920 100644
--- a/src/ceph-volume/ceph_volume/util/disk.py
+++ b/src/ceph-volume/ceph_volume/util/disk.py
@@ -916,6 +916,10 @@ def get_devices(_sys_block_path='/sys/block', device=''):
 metadata['path'] = diskname
 metadata['type'] = block[2]
+ # some facts from udevadm
+ p = udevadm_property(sysdir)
+ metadata['id_bus'] = p.get('ID_BUS', '')
+
 device_facts[diskname] = metadata
 return device_facts

From 0e95b27402e46c34586f460d2140af48d03fa305 Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Thu, 7 Sep 2023 11:58:22 -0700
Subject: [PATCH 0034/2492] ceph-volume: Fix unbound var in disk.get_devices()

00ba00fdfab8 looks to have introduced this regression.
Signed-off-by: Zack Cerza
---
 src/ceph-volume/ceph_volume/util/disk.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py
index a96b0f0a7f84..0a59cb0ba1cf 100644
--- a/src/ceph-volume/ceph_volume/util/disk.py
+++ b/src/ceph-volume/ceph_volume/util/disk.py
@@ -878,6 +878,7 @@ def get_devices(_sys_block_path='/sys/block', device=''):
 for key, file_ in facts:
 metadata[key] = get_file_contents(os.path.join(sysdir, file_))
+ device_slaves = []
 if block[2] != 'part':
 device_slaves = os.listdir(os.path.join(sysdir, 'slaves'))
 metadata['partitions'] = get_partitions_facts(sysdir)

From 3758f6e7433c58b9e62ae35184659cffabdbd133 Mon Sep 17 00:00:00 2001
From: "David.Hall"
Date: Fri, 8 Sep 2023 15:12:49 -0500
Subject: [PATCH 0035/2492] SignatureDoesNotMatch for certain RGW Admin Ops
 endpoints when using v4 auth

https://tracker.ceph.com/issues/62105

Change from std::map<> to std::multimap<> to allow for duplicates:
rgwadmin submits duplicates in a very few cases, so we need to handle
them.

Signed-off-by: David.Hall
---
 src/rgw/rgw_auth_s3.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc
index a2def87040ef..ccbdfff0c427 100644
--- a/src/rgw/rgw_auth_s3.cc
+++ b/src/rgw/rgw_auth_s3.cc
@@ -574,7 +574,7 @@ std::string get_v4_canonical_qs(const req_info& info, const bool using_qs)
 /* Handle case when query string exists. Step 3 described in: http://docs.
 * aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html */
- std::map<std::string, std::string> canonical_qs_map;
+ std::multimap<std::string, std::string> canonical_qs_map;
 for (const auto& s : get_str_vec<5>(*params, "&")) {
 std::string_view key, val;
 const auto parsed_pair = parse_key_value(s);
@@ -595,7 +595,7 @@ std::string get_v4_canonical_qs(const req_info& info, const bool using_qs)
 // while awsv4 specs ask for all slashes to be encoded, s3 itself is relaxed
 // in its implementation allowing non-url-encoded slashes to be present in
 // presigned urls for instance
- canonical_qs_map[aws4_uri_recode(key, true)] = aws4_uri_recode(val, true);
+ canonical_qs_map.insert({{aws4_uri_recode(key, true), aws4_uri_recode(val, true)}});
 }
 /* Thanks to the early exit we have the guarantee that canonical_qs_map has

From 9215996586ba79302c07e5cf586ec2add1ac159e Mon Sep 17 00:00:00 2001
From: Matan Breizman
Date: Wed, 12 Jul 2023 12:33:58 +0000
Subject: [PATCH 0036/2492] osd/osd_types: Introduce OSDSuperblock::maps

replace oldest/newest_map members with an interval_set in order to
support the tracking of an osdmap range gap.
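For reference, `interval_set` prints its ranges as `start~length` pairs,
so a history with a trimmed gap shows up as two intervals. A small
sketch of the new member's behavior (illustration only, not part of the
change):

```cpp
#include "include/interval_set.h"
#include "include/types.h"  // epoch_t

interval_set<epoch_t> make_gapped_history()
{
  interval_set<epoch_t> maps;
  maps.insert(1, 84);    // epochs 1..84
  maps.insert(208, 41);  // epochs 208..248, after the trimmed gap
  // operator<< renders this set as [1~84,208~41];
  // the newest map is maps.range_end() - 1 == 248.
  return maps;
}
```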
Map gap example: ``` 2023-08-28T18:21:05.452+0000 7f409bce2640 3 osd.4 84 handle_osd_map epochs [85,86], i have 84, src has [1,86] 2023-08-28T18:26:25.829+0000 7fcfea0c1640 3 osd.4 86 handle_osd_map epochs [208,208], i have 86, src has [208,400] 2023-08-28T18:26:25.829+0000 7fcfea0c1640 10 osd.4 86 superblock cluster_osdmap_trim_lower_bound new epoch is: 208 2023-08-28T18:26:25.829+0000 7fcfea0c1640 10 osd.4 86 handle_osd_map osd map gap [16~71,208~1] 2023-08-28T18:26:25.829+0000 7fcfea0c1640 3 osd.4 86 handle_osd_map epochs [209,248], i have 208, src has [208,400] 2023-08-28T18:26:25.833+0000 7fcfea0c1640 10 osd.4 86 handle_osd_map osd map gap [31~56,208~4] 2023-08-28T18:26:25.941+0000 7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [212,248], i have 211, src has [208,400] 2023-08-28T18:26:25.945+0000 7fcfea0c1640 10 osd.4 211 handle_osd_map osd map gap [46~41,208~41] 2023-08-28T18:26:25.949+0000 7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [209,248], i have 248, src has [208,400] 2023-08-28T18:26:25.949+0000 7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [212,251], i have 248, src has [208,400] 2023-08-28T18:26:25.953+0000 7fcfea0c1640 10 osd.4 211 handle_osd_map osd map gap [61~26,208~44] 2023-08-28T18:26:26.073+0000 7fcfea0c1640 3 osd.4 251 handle_osd_map epochs [249,288], i have 251, src has [208,400] 2023-08-28T18:26:26.081+0000 7fcfea0c1640 10 osd.4 251 handle_osd_map osd map gap [76~11,208~48] 2023-08-28T18:26:26.081+0000 7fcfea0c1640 3 osd.4 251 handle_osd_map epochs [252,291], i have 255, src has [208,400] ``` Full example: https://gist.github.com/Matan-B/9b0eed8daee3bd6c3216bd3b6d11e8fb Fixes: https://tracker.ceph.com/issues/61962 Signed-off-by: Matan Breizman --- src/crimson/osd/osd.cc | 23 +++++------ src/crimson/osd/shard_services.cc | 12 +++--- src/mon/OSDMonitor.cc | 2 +- src/osd/OSD.cc | 63 ++++++++++++++---------------- src/osd/osd_types.cc | 21 ++++++---- src/osd/osd_types.h | 28 ++++++++++++- src/tools/ceph_objectstore_tool.cc | 4 +- src/tools/rebuild_mondb.cc | 8 ++-- 8 files changed, 93 insertions(+), 68 deletions(-) diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index cfe4f54ab2e5..ccb7435332b3 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -708,19 +708,17 @@ void OSD::dump_status(Formatter* f) const f->dump_stream("osd_fsid") << superblock.osd_fsid; f->dump_unsigned("whoami", superblock.whoami); f->dump_string("state", pg_shard_manager.get_osd_state_string()); - f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_stream("maps") << superblock.maps; f->dump_unsigned("cluster_osdmap_trim_lower_bound", superblock.cluster_osdmap_trim_lower_bound); - f->dump_unsigned("newest_map", superblock.newest_map); f->dump_unsigned("num_pgs", pg_shard_manager.get_num_pgs()); } void OSD::print(std::ostream& out) const { out << "{osd." 
 << superblock.whoami << " "
- << superblock.osd_fsid << " [" << superblock.oldest_map
- << "," << superblock.newest_map << "] "
- << "tlb:" << superblock.cluster_osdmap_trim_lower_bound
+ << superblock.osd_fsid << " maps " << superblock.maps
+ << " tlb:" << superblock.cluster_osdmap_trim_lower_bound
 << " pgs:" << pg_shard_manager.get_num_pgs()
 << "}";
 }
@@ -934,16 +932,16 @@ seastar::future<> OSD::_handle_osd_map(Ref<MOSDMap> m)
 const auto first = m->get_first();
 const auto last = m->get_last();
 logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]",
- first, last, superblock.newest_map,
+ first, last, superblock.get_newest_map(),
 m->cluster_osdmap_trim_lower_bound, m->newest_map);
 // make sure there is something new, here, before we bother flushing
 // the queues and such
- if (last <= superblock.newest_map) {
+ if (last <= superblock.get_newest_map()) {
 return seastar::now();
 }
 // missing some?
 bool skip_maps = false;
- epoch_t start = superblock.newest_map + 1;
+ epoch_t start = superblock.get_newest_map() + 1;
 if (first > start) {
 logger().info("handle_osd_map message skips epochs {}..{}",
 start, first - 1);
@@ -967,10 +965,13 @@ seastar::future<> OSD::_handle_osd_map(Ref<MOSDMap> m)
 return pg_shard_manager.store_maps(t, start, m).then([=, this, &t] {
 // even if this map isn't from a mon, we may have satisfied our subscription
 monc->sub_got("osdmap", last);
- if (!superblock.oldest_map || skip_maps) {
- superblock.oldest_map = first;
+
+ if (!superblock.maps.empty()) {
+ // TODO: support osdmap trimming
+ // See:
 }
- superblock.newest_map = last;
+
+ superblock.insert_osdmap_epochs(first, last);
 superblock.current_epoch = last;
 // note in the superblock that we were clean thru the prior epoch
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
index a6431305d806..404f28d7d7f3 100644
--- a/src/crimson/osd/shard_services.cc
+++ b/src/crimson/osd/shard_services.cc
@@ -711,16 +711,16 @@ seastar::future<> OSDSingletonState::send_incremental_map(
 {
 logger().info("{}: first osdmap: {} "
 "superblock's oldest map: {}",
- __func__, first, superblock.oldest_map);
- if (first >= superblock.oldest_map) {
+ __func__, first, superblock.get_oldest_map());
+ if (first >= superblock.get_oldest_map()) {
 return load_map_bls(
- first, superblock.newest_map
+ first, superblock.get_newest_map()
 ).then([this, &conn, first](auto&& bls) {
 auto m = crimson::make_message<MOSDMap>(
 monc.get_fsid(),
 osdmap->get_encoding_features());
 m->cluster_osdmap_trim_lower_bound = first;
- m->newest_map = superblock.newest_map;
+ m->newest_map = superblock.get_newest_map();
 m->maps = std::move(bls);
 return conn.send(std::move(m));
 });
@@ -736,8 +736,8 @@ seastar::future<> OSDSingletonState::send_incremental_map(
 * See: OSD::handle_osd_map for how classic updates the
 * cluster's trim lower bound.
 */
- m->cluster_osdmap_trim_lower_bound = superblock.oldest_map;
- m->newest_map = superblock.newest_map;
+ m->cluster_osdmap_trim_lower_bound = superblock.get_oldest_map();
+ m->newest_map = superblock.get_newest_map();
 m->maps.emplace(osdmap->get_epoch(), std::move(bl));
 return conn.send(std::move(m));
 });
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 4e3f862b77eb..116e93680c6f 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -3656,7 +3656,7 @@ bool OSDMonitor::prepare_boot(MonOpRequestRef op)
 }
 // fresh osd?
- if (m->sb.newest_map == 0 && osdmap.exists(from)) { + if (m->sb.get_newest_map() == 0 && osdmap.exists(from)) { const osd_info_t& i = osdmap.get_info(from); if (i.up_from > i.lost_at) { dout(10) << " fresh osd; marking lost_at too" << dendl; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 83c49a08a48d..93f5ca238fab 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1380,7 +1380,7 @@ MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to, MOSDMap *m = new MOSDMap(monc->get_fsid(), osdmap->get_encoding_features()); m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound; - m->newest_map = sblock.newest_map; + m->newest_map = sblock.get_newest_map(); int max = cct->_conf->osd_map_message_max; ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes; @@ -1459,12 +1459,12 @@ void OSDService::send_incremental_map(epoch_t since, Connection *con, MOSDMap *m = NULL; while (!m) { OSDSuperblock sblock(get_superblock()); - if (since < sblock.oldest_map) { + if (since < sblock.get_oldest_map()) { // just send latest full map MOSDMap *m = new MOSDMap(monc->get_fsid(), osdmap->get_encoding_features()); m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound; - m->newest_map = sblock.newest_map; + m->newest_map = sblock.get_newest_map(); get_map_bl(to, m->maps[to]); send_map(m, con); return; @@ -1650,7 +1650,7 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op) * splitting. The simplest thing is to detect such cases here and drop * them without an error (the client will resend anyway). */ - ceph_assert(m->get_map_epoch() <= superblock.newest_map); + ceph_assert(m->get_map_epoch() <= superblock.get_newest_map()); OSDMapRef opmap = try_get_map(m->get_map_epoch()); if (!opmap) { dout(7) << __func__ << ": " << *pg << " no longer have map for " @@ -2705,10 +2705,9 @@ void OSD::asok_command( f->dump_stream("osd_fsid") << superblock.osd_fsid; f->dump_unsigned("whoami", superblock.whoami); f->dump_string("state", get_state_name(get_state())); - f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_stream("maps") << superblock.maps; f->dump_unsigned("cluster_osdmap_trim_lower_bound", superblock.cluster_osdmap_trim_lower_bound); - f->dump_unsigned("newest_map", superblock.newest_map); f->dump_unsigned("num_pgs", num_pgs); f->close_section(); } else if (prefix == "flush_journal") { @@ -3763,7 +3762,7 @@ int OSD::init() dout(5) << "Upgrading superblock adding: " << diff << dendl; if (!superblock.cluster_osdmap_trim_lower_bound) { - superblock.cluster_osdmap_trim_lower_bound = superblock.oldest_map; + superblock.cluster_osdmap_trim_lower_bound = superblock.get_oldest_map(); } ObjectStore::Transaction t; @@ -6277,7 +6276,7 @@ void OSD::tick_without_osd_lock() if (max_waiting_epoch > get_osdmap()->get_epoch()) { dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch << ", requesting new map" << dendl; - osdmap_subscribe(superblock.newest_map + 1, false); + osdmap_subscribe(superblock.get_newest_map() + 1, false); } } @@ -6638,8 +6637,7 @@ void OSD::start_boot() } dout(1) << __func__ << dendl; set_state(STATE_PREBOOT); - dout(10) << "start_boot - have maps " << superblock.oldest_map - << ".." 
<< superblock.newest_map << dendl;
+ dout(10) << "start_boot - have maps " << superblock.maps << dendl;
 monc->get_version("osdmap", CB_OSD_GetVersion(this));
 }
@@ -7952,20 +7950,20 @@ void OSD::trim_maps(epoch_t oldest, bool skip_maps)
 */
 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
 dout(20) << __func__ << ": min=" << min << " oldest_map="
- << superblock.oldest_map << " skip_maps=" << skip_maps
+ << superblock.get_oldest_map() << " skip_maps=" << skip_maps
 << dendl;
- if (min <= superblock.oldest_map)
+ if (min <= superblock.get_oldest_map())
 return;
 // Trim from the superblock's oldest_map up to `min`.
 // Break if we have exceeded the txn target size.
 // If skip_maps is true, we will trim up `min` unconditionally.
 ObjectStore::Transaction t;
- while (superblock.oldest_map < min) {
- dout(20) << " removing old osdmap epoch " << superblock.oldest_map << dendl;
- t.remove(coll_t::meta(), get_osdmap_pobject_name(superblock.oldest_map));
- t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(superblock.oldest_map));
- ++superblock.oldest_map;
+ while (superblock.get_oldest_map() < min) {
+ dout(20) << " removing old osdmap epoch " << superblock.get_oldest_map() << dendl;
+ t.remove(coll_t::meta(), get_osdmap_pobject_name(superblock.get_oldest_map()));
+ t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(superblock.get_oldest_map()));
+ superblock.maps.erase(superblock.get_oldest_map());
 if (t.get_num_ops() > cct->_conf->osd_target_transaction_size) {
 service.publish_superblock(superblock);
 write_superblock(cct, superblock, t);
@@ -8057,15 +8055,15 @@ void OSD::handle_osd_map(MOSDMap *m)
 epoch_t first = m->get_first();
 epoch_t last = m->get_last();
 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
- << superblock.newest_map
+ << superblock.get_newest_map()
 << ", src has [" << m->cluster_osdmap_trim_lower_bound
 << "," << m->newest_map << "]"
 << dendl;
 logger->inc(l_osd_map);
 logger->inc(l_osd_mape, last - first + 1);
- if (first <= superblock.newest_map)
- logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
+ if (first <= superblock.get_newest_map())
+ logger->inc(l_osd_mape_dup, superblock.get_newest_map() - first + 1);
 if (superblock.cluster_osdmap_trim_lower_bound <
 m->cluster_osdmap_trim_lower_bound) {
@@ -8074,12 +8072,12 @@ void OSD::handle_osd_map(MOSDMap *m)
 dout(10) << " superblock cluster_osdmap_trim_lower_bound new epoch is: "
 << superblock.cluster_osdmap_trim_lower_bound << dendl;
 ceph_assert(
- superblock.cluster_osdmap_trim_lower_bound >= superblock.oldest_map);
+ superblock.cluster_osdmap_trim_lower_bound >= superblock.get_oldest_map());
 // make sure there is something new, here, before we bother flushing
 // the queues and such
- if (last <= superblock.newest_map) {
+ if (last <= superblock.get_newest_map()) {
 dout(10) << " no new maps here, dropping" << dendl;
 m->put();
 return;
@@ -8087,11 +8085,11 @@ void OSD::handle_osd_map(MOSDMap *m)
 // missing some?
 bool skip_maps = false;
- if (first > superblock.newest_map + 1) {
+ if (first > superblock.get_newest_map() + 1) {
 dout(10) << "handle_osd_map message skips epochs "
- << superblock.newest_map + 1 << ".." << (first-1) << dendl;
- if (m->cluster_osdmap_trim_lower_bound <= superblock.newest_map + 1) {
- osdmap_subscribe(superblock.newest_map + 1, false);
+ << superblock.get_newest_map() + 1 << ".."
<< (first-1) << dendl;
+ if (m->cluster_osdmap_trim_lower_bound <= superblock.get_newest_map() + 1) {
+ osdmap_subscribe(superblock.get_newest_map() + 1, false);
 m->put();
 return;
 }
 ObjectStore::Transaction t;
 map<epoch_t, mempool::osdmap::map<int64_t, snap_interval_set_t>> purged_snaps;
 // store new maps: queue for disk and put in the osdmap cache
- epoch_t start = std::max(superblock.newest_map + 1, first);
+ epoch_t start = std::max(superblock.get_newest_map() + 1, first);
 for (epoch_t e = start; e <= last; e++) {
 if (txn_size >= t.get_num_bytes()) {
 derr << __func__ << " transaction size overflowed" << dendl;
@@ -8227,14 +8225,11 @@ void OSD::handle_osd_map(MOSDMap *m)
 rerequest_full_maps();
 }
- if (superblock.oldest_map) {
+ if (!superblock.maps.empty()) {
 trim_maps(m->cluster_osdmap_trim_lower_bound, skip_maps);
- pg_num_history.prune(superblock.oldest_map);
+ pg_num_history.prune(superblock.get_oldest_map());
 }
-
- if (!superblock.oldest_map || skip_maps)
- superblock.oldest_map = first;
- superblock.newest_map = last;
+ superblock.insert_osdmap_epochs(first, last);
 superblock.current_epoch = last;
 // note in the superblock that we were clean thru the prior epoch
@@ -8360,7 +8355,7 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
 for (epoch_t cur = first; cur <= last; cur++) {
 dout(10) << " advance to epoch " << cur
 << " (<= last " << last
- << " <= newest_map " << superblock.newest_map
+ << " <= newest_map " << superblock.get_newest_map()
 << ")" << dendl;
 OSDMapRef newmap = get_map(cur);
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 664d8a287406..948abeaafc8a 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -5705,12 +5705,12 @@ void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>&
 void OSDSuperblock::encode(ceph::buffer::list &bl) const
 {
- ENCODE_START(10, 5, bl);
+ ENCODE_START(11, 5, bl);
 encode(cluster_fsid, bl);
 encode(whoami, bl);
 encode(current_epoch, bl);
- encode(oldest_map, bl);
- encode(newest_map, bl);
+ encode((epoch_t)0, bl); // oldest_map
+ encode((epoch_t)0, bl); // newest_map
 encode(weight, bl);
 compat_features.encode(bl);
 encode(clean_thru, bl);
@@ -5721,12 +5721,13 @@ void OSDSuperblock::encode(ceph::buffer::list &bl) const
 encode(purged_snaps_last, bl);
 encode(last_purged_snaps_scrub, bl);
 encode(cluster_osdmap_trim_lower_bound, bl);
+ encode(maps, bl);
 ENCODE_FINISH(bl);
 }
 void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
 {
- DECODE_START_LEGACY_COMPAT_LEN(10, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(11, 5, 5, bl);
 if (struct_v < 3) {
 string magic;
 decode(magic, bl);
@@ -5734,6 +5735,7 @@ void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
 decode(cluster_fsid, bl);
 decode(whoami, bl);
 decode(current_epoch, bl);
+ epoch_t oldest_map, newest_map;
 decode(oldest_map, bl);
 decode(newest_map, bl);
 decode(weight, bl);
@@ -5765,6 +5767,11 @@ void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
 } else {
 cluster_osdmap_trim_lower_bound = 0;
 }
+ if (struct_v >= 11) {
+ decode(maps, bl);
+ } else {
+ insert_osdmap_epochs(oldest_map, newest_map);
+ }
 DECODE_FINISH(bl);
 }
@@ -5774,8 +5781,6 @@ void OSDSuperblock::dump(Formatter *f) const
 f->dump_stream("osd_fsid") << osd_fsid;
 f->dump_int("whoami", whoami);
 f->dump_int("current_epoch", current_epoch);
- f->dump_int("oldest_map", oldest_map);
- f->dump_int("newest_map", newest_map);
 f->dump_float("weight", weight);
 f->open_object_section("compat");
 compat_features.dump(f);
 f->close_section();
f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub; f->dump_int("cluster_osdmap_trim_lower_bound", cluster_osdmap_trim_lower_bound); + f->dump_stream("maps") << maps; } void OSDSuperblock::generate_test_instances(list& o) @@ -5796,8 +5802,7 @@ void OSDSuperblock::generate_test_instances(list& o) z.osd_fsid.parse("02020202-0202-0202-0202-020202020202"); z.whoami = 3; z.current_epoch = 4; - z.oldest_map = 5; - z.newest_map = 9; + z.insert_osdmap_epochs(5, 9); z.mounted = 8; z.clean_thru = 7; o.push_back(new OSDSuperblock(z)); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 16955ef5ef4d..8b86b0a36356 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -5454,7 +5454,31 @@ class OSDSuperblock { uuid_d cluster_fsid, osd_fsid; int32_t whoami = -1; // my role in this fs. epoch_t current_epoch = 0; // most recent epoch - epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have. + interval_set maps; // oldest/newest maps we have. + + epoch_t get_oldest_map() const { + if (!maps.empty()) { + return maps.range_start(); + } + return 0; + } + + epoch_t get_newest_map() const { + if (!maps.empty()) { + // maps stores [oldest_map, newest_map) (exclusive) + return maps.range_end() - 1; + } + return 0; + } + + void insert_osdmap_epochs(epoch_t first, epoch_t last) { + ceph_assert(std::cmp_less_equal(first, last)); + interval_set message_epochs; + message_epochs.insert(first, last - first + 1); + maps.union_of(message_epochs); + ceph_assert(last == get_newest_map()); + } + double weight = 0.0; CompatSet compat_features; @@ -5481,7 +5505,7 @@ inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb) << " osd." << sb.whoami << " " << sb.osd_fsid << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map << "]" + << " maps " << sb.maps << " lci=[" << sb.mounted << "," << sb.clean_thru << "]" << " tlb=" << sb.cluster_osdmap_trim_lower_bound << ")"; diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index df6d1f85c39f..19a445824834 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -1630,9 +1630,9 @@ int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms, return -EINVAL; } - if (ms.osdmap.get_epoch() < sb.oldest_map) { + if (ms.osdmap.get_epoch() < sb.get_oldest_map()) { cerr << "PG export's map " << ms.osdmap.get_epoch() - << " is older than OSD's oldest_map " << sb.oldest_map << std::endl; + << " is older than OSD's oldest_map " << sb.get_oldest_map() << std::endl; if (!force) { cerr << " pass --force to proceed anyway (with incomplete PastIntervals)" << std::endl; diff --git a/src/tools/rebuild_mondb.cc b/src/tools/rebuild_mondb.cc index 17e4dadcfdd4..033f63aad22a 100644 --- a/src/tools/rebuild_mondb.cc +++ b/src/tools/rebuild_mondb.cc @@ -216,7 +216,7 @@ int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms) // osdmap starts at 1. if we have a "0" first_committed, then there is nothing // to trim. and "1 osdmaps trimmed" in the output message is misleading. so // let's make it an exception. 
- for (auto e = first_committed; first_committed && e < sb.oldest_map; e++) {
+ for (auto e = first_committed; first_committed && e < sb.get_oldest_map(); e++) {
 t->erase(prefix, e);
 t->erase(prefix, ms.combine_strings("full", e));
 ntrimmed++;
@@ -225,7 +225,7 @@ int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms)
 // because PaxosService::put_last_committed() set it to last_committed, if it
 // is zero. which breaks OSDMonitor::update_from_paxos(), in which we believe
 // that latest_full should always be greater than last_committed.
- if (first_committed == 0 && sb.oldest_map < sb.newest_map) {
+ if (first_committed == 0 && sb.get_oldest_map() < sb.get_newest_map()) {
 first_committed = 1;
 } else if (ntrimmed) {
 first_committed += ntrimmed;
@@ -240,8 +240,8 @@ int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms)
 auto ch = fs.open_collection(coll_t::meta());
 OSDMap osdmap;
- for (auto e = std::max(last_committed+1, sb.oldest_map);
- e <= sb.newest_map; e++) {
+ for (auto e = std::max(last_committed+1, sb.get_oldest_map());
+ e <= sb.get_newest_map(); e++) {
 bool have_crc = false;
 uint32_t crc = -1;
 uint64_t features = 0;

From 05aeeeebe634213c882fb25842afb4679e6fd61d Mon Sep 17 00:00:00 2001
From: Matan Breizman
Date: Mon, 17 Jul 2023 13:12:27 +0000
Subject: [PATCH 0037/2492] osd/OSD: remove `skip_maps` comment

Now that oldest/newest maps are stored as an interval set we no longer
move the oldest_map forward to `first` epoch.
We simply add the new osdmap range from `first` to `last` regardless of
`skip_maps`.
trim_maps now erases the oldest_map on each iteration and supports epoch
gaps.

Signed-off-by: Matan Breizman
---
 src/osd/OSD.cc | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 93f5ca238fab..2c1d2441b769 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -7932,22 +7932,6 @@ void OSD::trim_maps(epoch_t oldest, bool skip_maps)
 {
- /* There's a possible leak here. skip_maps is set to true if the received
- * MOSDMap message indicates that there's a discontinuity between
- * the Monitor cluster's stored set of maps and our set of stored
- * maps such that there is a "gap". This happens generally when an OSD
- * is down for a while and the cluster has trimmed maps in the mean time.
- *
- * Because the superblock cannot represent two discontinuous sets of maps,
- * OSD::handle_osd_map unconditionally sets superblock.oldest_map to the first
- * map in the message. OSD::trim_maps here, however, will only trim up to
- * service.map_cache.cached_key_lower_bound() resulting in the maps between
- * service.map_cache.cached_key_lower_bound() and MOSDMap::get_first() being
- * leaked. Note, trimming past service.map_cache.cached_key_lower_bound()
- * - * Fixing this is future work: https://tracker.ceph.com/issues/61962 - */ epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound()); dout(20) << __func__ << ": min=" << min << " oldest_map=" << superblock.get_oldest_map() << " skip_maps=" << skip_maps @@ -7980,7 +7964,8 @@ void OSD::trim_maps(epoch_t oldest, bool skip_maps) int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); ceph_assert(tr == 0); } - // we should not remove the cached maps + // we should not trim past service.map_cache.cached_key_lower_bound() + // as there may still be PGs with those map epochs recorded. ceph_assert(min <= service.map_cache.cached_key_lower_bound()); } From 9897164265726895eeee2348ee679aed03e75ea1 Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Tue, 18 Jul 2023 16:08:23 +0000 Subject: [PATCH 0038/2492] osd/OSD: remove `skip_maps` the superblock now stores an interval_set which supports non-contiguous osdmap history. Signed-off-by: Matan Breizman --- src/crimson/osd/osd.cc | 3 --- src/osd/OSD.cc | 44 ++++++++++++++---------------------------- src/osd/OSD.h | 2 +- 3 files changed, 15 insertions(+), 34 deletions(-) diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index ccb7435332b3..157881ccbe40 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -940,7 +940,6 @@ seastar::future<> OSD::_handle_osd_map(Ref m) return seastar::now(); } // missing some? - bool skip_maps = false; epoch_t start = superblock.get_newest_map() + 1; if (first > start) { logger().info("handle_osd_map message skips epochs {}..{}", @@ -956,8 +955,6 @@ seastar::future<> OSD::_handle_osd_map(Ref m) return get_shard_services().osdmap_subscribe( m->cluster_osdmap_trim_lower_bound - 1, true); } - skip_maps = true; - start = first; } return seastar::do_with(ceph::os::Transaction{}, diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2c1d2441b769..1fd6b8eec3a8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -7930,40 +7930,30 @@ void OSD::osdmap_subscribe(version_t epoch, bool force_request) } } -void OSD::trim_maps(epoch_t oldest, bool skip_maps) +void OSD::trim_maps(epoch_t oldest) { epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound()); dout(20) << __func__ << ": min=" << min << " oldest_map=" - << superblock.get_oldest_map() << " skip_maps=" << skip_maps - << dendl; + << superblock.get_oldest_map() << dendl; if (min <= superblock.get_oldest_map()) return; // Trim from the superblock's oldest_map up to `min`. // Break if we have exceeded the txn target size. - // If skip_maps is true, we will trim up `min` unconditionally. 
 ObjectStore::Transaction t;
- while (superblock.get_oldest_map() < min) {
- dout(20) << " removing old osdmap epoch " << superblock.get_oldest_map() << dendl;
- t.remove(coll_t::meta(), get_osdmap_pobject_name(superblock.get_oldest_map()));
- t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(superblock.get_oldest_map()));
+ while (superblock.get_oldest_map() < min &&
+ t.get_num_ops() < cct->_conf->osd_target_transaction_size) {
+ dout(20) << " removing old osdmap epoch " << superblock.get_oldest_map() << dendl;
+ t.remove(coll_t::meta(), get_osdmap_pobject_name(superblock.get_oldest_map()));
+ t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(superblock.get_oldest_map()));
 superblock.maps.erase(superblock.get_oldest_map());
- if (t.get_num_ops() > cct->_conf->osd_target_transaction_size) {
- service.publish_superblock(superblock);
- write_superblock(cct, superblock, t);
- int tr = store->queue_transaction(service.meta_ch, t.claim_and_reset(), nullptr);
- ceph_assert(tr == 0);
- if (skip_maps == false) {
- break;
- }
- }
- }
- if (t.get_num_ops() > 0) {
- service.publish_superblock(superblock);
- write_superblock(cct, superblock, t);
- int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
- ceph_assert(tr == 0);
 }
+
+ service.publish_superblock(superblock);
+ write_superblock(cct, superblock, t);
+ int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+
 // we should not trim past service.map_cache.cached_key_lower_bound()
 // as there may still be PGs with those map epochs recorded.
 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
 }
@@ -8068,8 +8058,6 @@ void OSD::handle_osd_map(MOSDMap *m)
 return;
 }
- // missing some?
- bool skip_maps = false;
 if (first > superblock.get_newest_map() + 1) {
 dout(10) << "handle_osd_map message skips epochs "
 << superblock.get_newest_map() + 1 << ".." << (first-1) << dendl;
 if (m->cluster_osdmap_trim_lower_bound <= superblock.get_newest_map() + 1) {
 osdmap_subscribe(superblock.get_newest_map() + 1, false);
 m->put();
 return;
 }
@@ -8087,11 +8075,6 @@ void OSD::handle_osd_map(MOSDMap *m)
 m->put();
 return;
 }
- // The superblock's oldest_map should be moved forward (skipped)
- skip_maps = true; } ObjectStore::Transaction t; @@ -8211,7 +8195,7 @@ void OSD::handle_osd_map(MOSDMap *m) } if (!superblock.maps.empty()) { - trim_maps(m->cluster_osdmap_trim_lower_bound, skip_maps); + trim_maps(m->cluster_osdmap_trim_lower_bound); pg_num_history.prune(superblock.get_oldest_map()); } superblock.insert_osdmap_epochs(first, last); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 8ed960d96055..231050d8cc2f 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1685,7 +1685,7 @@ class OSD : public Dispatcher, void handle_osd_map(class MOSDMap *m); void _committed_osd_maps(epoch_t first, epoch_t last, class MOSDMap *m); - void trim_maps(epoch_t oldest, bool skip_maps); + void trim_maps(epoch_t oldest); void note_down_osd(int osd); void note_up_osd(int osd); friend struct C_OnMapCommit; From 95a44bf08794d73f0c9cdd7caaa866e2d7ce5913 Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Wed, 30 Aug 2023 08:17:06 +0000 Subject: [PATCH 0039/2492] osd/OSD: add log line on mapgaps Signed-off-by: Matan Breizman --- src/osd/OSD.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 1fd6b8eec3a8..d7b60e79cf82 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -8199,6 +8199,9 @@ void OSD::handle_osd_map(MOSDMap *m) pg_num_history.prune(superblock.get_oldest_map()); } superblock.insert_osdmap_epochs(first, last); + if (superblock.maps.num_intervals() > 1) { + dout(10) << __func__ << " osd map gap " << superblock.maps << dendl; + } superblock.current_epoch = last; // note in the superblock that we were clean thru the prior epoch From e7779fefc69d33cc129c792862667aa7b43a4dba Mon Sep 17 00:00:00 2001 From: Matt Benjamin Date: Fri, 8 Sep 2023 17:48:39 -0400 Subject: [PATCH 0040/2492] cmake: BuildFIO.cmake should not introduce -std=gnu++17 Not correct in general, and a build bug because fio-objectstore includes c++20 headers. 
Fixes: https://tracker.ceph.com/issues/62778

Add CXX_EXTENSIONS ON, and cleanup INTERFACE_COMPILE_OPTIONS per Kefu
review

Signed-off-by: Matt Benjamin
---
 cmake/modules/BuildFIO.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/modules/BuildFIO.cmake b/cmake/modules/BuildFIO.cmake
index 3a0694b543ee..49fcfb31d973 100644
--- a/cmake/modules/BuildFIO.cmake
+++ b/cmake/modules/BuildFIO.cmake
@@ -37,6 +37,7 @@ function(build_fio)
 add_library(fio INTERFACE IMPORTED)
 add_dependencies(fio fio_ext)
 set_target_properties(fio PROPERTIES
+ CXX_EXTENSIONS ON
 INTERFACE_INCLUDE_DIRECTORIES ${source_dir}
- INTERFACE_COMPILE_OPTIONS "-include;${source_dir}/config-host.h;$<$<COMPILE_LANGUAGE:C>:-std=gnu99>$<$<COMPILE_LANGUAGE:CXX>:-std=gnu++17>")
+ INTERFACE_COMPILE_OPTIONS "-include;${source_dir}/config-host.h;$<$<COMPILE_LANGUAGE:C>:-std=gnu99>")
 endfunction()

From e80d11b3903c52d1a35112b822b7680b6d018ee3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Zarzy=C5=84ski?=
Date: Fri, 8 Sep 2023 13:50:01 +0200
Subject: [PATCH 0041/2492] ceph-run: fix string comparison around
 --no-restart
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`man test` says the string equality operator is `=`:

```
STRING1 = STRING2
 the strings are equal
```

Also, we should handle the argument-less case which currently fails:

```
$ [ $1 == "--no-restart" ]
bash: [: ==: unary operator expected
```

Signed-off-by: Radosław Zarzyński
---
 src/ceph-run | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ceph-run b/src/ceph-run
index 764101c564f8..c28244d90f83 100755
--- a/src/ceph-run
+++ b/src/ceph-run
@@ -3,7 +3,7 @@
 sleep=5
 no_restart=0
-if [ $1 == "--no-restart" ]; then
+if [ "$1" = "--no-restart" ]; then
 no_restart=1
 shift
 fi

From 622ef1af43a2bd635921f802f63147a9651d4961 Mon Sep 17 00:00:00 2001
From: Xuehan Xu
Date: Tue, 12 Sep 2023 14:44:03 +0800
Subject: [PATCH 0042/2492] crimson/osd/pg_recovery: recover head before
 recovering clones

Signed-off-by: Xuehan Xu
---
 src/crimson/osd/pg_recovery.cc | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index 09b45779ec87..18c00b68831f 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -140,13 +140,14 @@ size_t PGRecovery::start_primary_recovery_ops(
 hobject_t head = soid.get_head();
+ bool head_missing = missing.is_missing(head);
 logger().info(
 "{} {} item.need {} {} {} {} {}",
 __func__,
 soid,
 item.need,
 missing.is_missing(soid) ? " (missing)":"",
- missing.is_missing(head) ? " (missing head)":"",
+ head_missing ? " (missing head)":"",
 pg->get_recovery_backend()->is_recovering(soid) ? " (recovering)":"",
 pg->get_recovery_backend()->is_recovering(head) ?
" (recovering head)":""); @@ -158,7 +159,15 @@ size_t PGRecovery::start_primary_recovery_ops( } else if (pg->get_recovery_backend()->is_recovering(head)) { ++skipped; } else { - out->emplace_back(recover_missing(trigger, soid, item.need)); + if (head_missing) { + auto it = missing.get_items().find(head); + assert(it != missing.get_items().end()); + auto head_need = it->second.need; + out->emplace_back(recover_missing(trigger, head, head_need)); + ++skipped; + } else { + out->emplace_back(recover_missing(trigger, soid, item.need)); + } ++started; } From 1d7cabf3db8b8d1863aab9c403e2935ed01a5e5d Mon Sep 17 00:00:00 2001 From: NitzanMordhai Date: Tue, 27 Jun 2023 06:31:22 +0000 Subject: [PATCH 0043/2492] ceph-dencoder: Add missing common types to ceph-dencoder for accurate encode-decode comparison Currently, ceph-dencoder lacks certain common types, preventing us from accurately checking the ceph corpus for encode-decode mismatches. This pull request aims to address this issue by adding the missing types to ceph-dencoder. To successfully incorporate these types into ceph-dencoder, we need to introduce the necessary dump and generate_test_instances functions that was missing in some types. These functions are essential for proper encode and decode of the added types. This PR will enhance the functionality of ceph-dencoder by including the missing types, enabling a comprehensive analysis of encode-decode consistency. With the addition of these types, we can ensure the robustness and correctness of the ceph corpus. This update will significantly contribute to improving the overall reliability and accuracy of ceph-dencoder. It allows for a more comprehensive assessment of the encode-decode behavior, leading to enhanced data integrity and stability within the ceph ecosystem. 
Fixes: https://tracker.ceph.com/issues/61788 Signed-off-by: Nitzan Mordechai --- src/auth/Auth.h | 60 +++++++++ src/auth/Crypto.cc | 17 +++ src/auth/Crypto.h | 2 + src/auth/cephx/CephxKeyServer.cc | 10 ++ src/auth/cephx/CephxKeyServer.h | 40 +++++- src/auth/cephx/CephxProtocol.h | 124 +++++++++++++++++ src/cls/2pc_queue/cls_2pc_queue_ops.h | 70 +++++++++- src/cls/2pc_queue/cls_2pc_queue_types.h | 36 +++++ src/cls/cas/cls_cas_internal.h | 11 ++ src/cls/fifo/cls_fifo_ops.h | 48 +++++++ src/cls/fifo/cls_fifo_types.h | 62 ++++++++- src/cls/log/cls_log_ops.h | 56 ++++++++ src/cls/log/cls_log_types.h | 10 ++ src/cls/queue/cls_queue_ops.h | 74 ++++++++++- src/cls/queue/cls_queue_types.h | 43 +++++- src/cls/timeindex/cls_timeindex_ops.h | 38 ++++++ src/cls/timeindex/cls_timeindex_types.h | 2 +- src/cls/version/cls_version_ops.h | 55 ++++++++ src/cls/version/cls_version_types.cc | 6 - src/cls/version/cls_version_types.h | 18 ++- src/common/ceph_json.h | 55 ++++++++ src/common/entity_name.cc | 63 ++++----- src/common/entity_name.h | 3 +- src/include/cephfs/types.h | 2 + src/include/frag.h | 14 ++ src/include/fs_types.h | 7 + src/include/object.h | 18 +++ src/include/types.h | 33 ++++- src/mds/mdstypes.cc | 15 +++ src/messages/MClientReply.h | 12 +- src/messages/MClientRequest.h | 31 +++++ src/messages/MMDSCacheRejoin.h | 32 +++++ src/messages/MMgrReport.h | 21 +++ src/os/bluestore/bluestore_types.cc | 9 ++ src/os/bluestore/bluestore_types.h | 1 + src/tools/ceph-dencoder/common_types.h | 170 +++++++++++++++++++++++- 36 files changed, 1205 insertions(+), 63 deletions(-) diff --git a/src/auth/Auth.h b/src/auth/Auth.h index 5521c8d3fcf0..83e23b34dbe3 100644 --- a/src/auth/Auth.h +++ b/src/auth/Auth.h @@ -16,6 +16,7 @@ #define CEPH_AUTHTYPES_H #include "Crypto.h" +#include "common/ceph_json.h" #include "common/entity_name.h" // The _MAX values are a bit wonky here because we are overloading the first @@ -59,6 +60,14 @@ struct EntityAuth { decode(pending_key, bl); } } + void dump(ceph::Formatter *f) const { + f->dump_object("key", key); + encode_json("caps", caps, f); + f->dump_object("pending_key", pending_key); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new EntityAuth); + } }; WRITE_CLASS_ENCODER(EntityAuth) @@ -95,6 +104,19 @@ struct AuthCapsInfo { allow_all = (bool)a; decode(caps, bl); } + void dump(ceph::Formatter *f) const { + f->dump_bool("allow_all", allow_all); + encode_json("caps", caps, f); + f->dump_unsigned("caps_len", caps.length()); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new AuthCapsInfo); + ls.push_back(new AuthCapsInfo); + ls.back()->allow_all = true; + ls.push_back(new AuthCapsInfo); + ls.back()->caps.append("foo"); + ls.back()->caps.append("bar"); + } }; WRITE_CLASS_ENCODER(AuthCapsInfo) @@ -147,6 +169,25 @@ struct AuthTicket { decode(caps, bl); decode(flags, bl); } + void dump(ceph::Formatter *f) const { + f->dump_object("name", name); + f->dump_unsigned("global_id", global_id); + f->dump_stream("created") << created; + f->dump_stream("renew_after") << renew_after; + f->dump_stream("expires") << expires; + f->dump_object("caps", caps); + f->dump_unsigned("flags", flags); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new AuthTicket); + ls.push_back(new AuthTicket); + ls.back()->name.set_id("client.123"); + ls.back()->global_id = 123; + ls.back()->init_timestamps(utime_t(123, 456), 7); + ls.back()->caps.caps.append("foo"); + ls.back()->caps.caps.append("bar"); + ls.back()->flags = 0x12345678; + 
} }; WRITE_CLASS_ENCODER(AuthTicket) @@ -231,6 +272,16 @@ struct ExpiringCryptoKey { decode(key, bl); decode(expiration, bl); } + void dump(ceph::Formatter *f) const { + f->dump_object("key", key); + f->dump_stream("expiration") << expiration; + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new ExpiringCryptoKey); + ls.push_back(new ExpiringCryptoKey); + ls.back()->key.set_secret( + CEPH_CRYPTO_AES, bufferptr("1234567890123456", 16), utime_t(123, 456)); + } }; WRITE_CLASS_ENCODER(ExpiringCryptoKey) @@ -295,6 +346,15 @@ struct RotatingSecrets { } void dump(); + void dump(ceph::Formatter *f) const { + encode_json("secrets", secrets, f); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new RotatingSecrets); + ls.push_back(new RotatingSecrets); + auto eck = new ExpiringCryptoKey; + ls.back()->add(*eck); + } }; WRITE_CLASS_ENCODER(RotatingSecrets) diff --git a/src/auth/Crypto.cc b/src/auth/Crypto.cc index ce666e8bdc8c..5d68d3470bc0 100644 --- a/src/auth/Crypto.cc +++ b/src/auth/Crypto.cc @@ -511,6 +511,23 @@ void CryptoKey::decode(bufferlist::const_iterator& bl) throw ceph::buffer::malformed_input("malformed secret"); } +void CryptoKey::dump(Formatter *f) const +{ + f->dump_int("type", type); + f->dump_stream("created") << created; + f->dump_int("secret.length", secret.length()); +} + +void CryptoKey::generate_test_instances(std::list& ls) +{ + ls.push_back(new CryptoKey); + ls.push_back(new CryptoKey); + ls.back()->type = CEPH_CRYPTO_AES; + ls.back()->set_secret( + CEPH_CRYPTO_AES, bufferptr("1234567890123456", 16), utime_t(123, 456)); + ls.back()->created = utime_t(123, 456); +} + int CryptoKey::set_secret(int type, const bufferptr& s, utime_t c) { int r = _set_secret(type, s); diff --git a/src/auth/Crypto.h b/src/auth/Crypto.h index a29ac1abd811..3ce655a12562 100644 --- a/src/auth/Crypto.h +++ b/src/auth/Crypto.h @@ -111,6 +111,8 @@ class CryptoKey { void encode(ceph::buffer::list& bl) const; void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); void clear() { *this = CryptoKey(); diff --git a/src/auth/cephx/CephxKeyServer.cc b/src/auth/cephx/CephxKeyServer.cc index 236ac451add9..a5ad1f2b7973 100644 --- a/src/auth/cephx/CephxKeyServer.cc +++ b/src/auth/cephx/CephxKeyServer.cc @@ -257,6 +257,16 @@ std::map KeyServer::get_used_pending_keys() return ret; } +void KeyServer::dump(Formatter *f) const +{ + f->dump_object("data", data); +} + +void KeyServer::generate_test_instances(std::list& ls) +{ + ls.push_back(new KeyServer(nullptr, nullptr)); +} + bool KeyServer::generate_secret(CryptoKey& secret) { bufferptr bp; diff --git a/src/auth/cephx/CephxKeyServer.h b/src/auth/cephx/CephxKeyServer.h index 64915c8ce4aa..d147dd441ad2 100644 --- a/src/auth/cephx/CephxKeyServer.h +++ b/src/auth/cephx/CephxKeyServer.h @@ -21,15 +21,16 @@ #include "include/common_fwd.h" struct KeyServerData { - version_t version; + version_t version{0}; /* for each entity */ std::map secrets; - KeyRing *extra_secrets; + KeyRing *extra_secrets = nullptr; /* for each service type */ - version_t rotating_ver; + version_t rotating_ver{0}; std::map rotating_secrets; + KeyServerData() {} explicit KeyServerData(KeyRing *extra) : version(0), @@ -70,7 +71,17 @@ struct KeyServerData { decode(rotating_ver, iter); decode(rotating_secrets, iter); } - + void dump(ceph::Formatter *f) const { + f->dump_unsigned("version", version); + f->dump_unsigned("rotating_version", rotating_ver); + 
encode_json("secrets", secrets, f); + encode_json("rotating_secrets", rotating_secrets, f); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new KeyServerData); + ls.push_back(new KeyServerData); + ls.back()->version = 1; + } bool contains(const EntityName& name) const { return (secrets.find(name) != secrets.end()); } @@ -159,8 +170,21 @@ struct KeyServerData { decode(auth, bl); } } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("op", op); + f->dump_object("name", name); + f->dump_object("auth", auth); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new Incremental); + ls.back()->op = AUTH_INC_DEL; + ls.push_back(new Incremental); + ls.back()->op = AUTH_INC_ADD; + ls.push_back(new Incremental); + ls.back()->op = AUTH_INC_SET_ROTATING; + } }; - + void apply_incremental(Incremental& inc) { switch (inc.op) { case AUTH_INC_ADD: @@ -188,8 +212,6 @@ WRITE_CLASS_ENCODER(KeyServerData) WRITE_CLASS_ENCODER(KeyServerData::Incremental) - - class KeyServer : public KeyStore { CephContext *cct; KeyServerData data; @@ -205,7 +227,9 @@ class KeyServer : public KeyStore { bool _get_service_caps(const EntityName& name, uint32_t service_id, AuthCapsInfo& caps) const; public: + KeyServer() : lock{ceph::make_mutex("KeyServer::lock")} {} KeyServer(CephContext *cct_, KeyRing *extra_secrets); + KeyServer& operator=(const KeyServer&) = delete; bool generate_secret(CryptoKey& secret); bool get_secret(const EntityName& name, CryptoKey& secret) const override; @@ -248,6 +272,8 @@ class KeyServer : public KeyStore { using ceph::decode; decode(data, bl); } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); bool contains(const EntityName& name) const; int encode_secrets(ceph::Formatter *f, std::stringstream *ds) const; void encode_formatted(std::string label, ceph::Formatter *f, ceph::buffer::list &bl); diff --git a/src/auth/cephx/CephxProtocol.h b/src/auth/cephx/CephxProtocol.h index aabfaaad10c9..260cb13ff5ab 100644 --- a/src/auth/cephx/CephxProtocol.h +++ b/src/auth/cephx/CephxProtocol.h @@ -55,6 +55,13 @@ struct CephXServerChallenge { decode(struct_v, bl); decode(server_challenge, bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("server_challenge", server_challenge); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXServerChallenge); + ls.back()->server_challenge = 1; + } }; WRITE_CLASS_ENCODER(CephXServerChallenge) @@ -72,6 +79,13 @@ struct CephXRequestHeader { using ceph::decode; decode(request_type, bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("request_type", request_type); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXRequestHeader); + ls.back()->request_type = 1; + } }; WRITE_CLASS_ENCODER(CephXRequestHeader) @@ -89,6 +103,15 @@ struct CephXResponseHeader { decode(request_type, bl); decode(status, bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("request_type", request_type); + f->dump_int("status", status); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXResponseHeader); + ls.back()->request_type = 1; + ls.back()->status = 0; + } }; WRITE_CLASS_ENCODER(CephXResponseHeader) @@ -113,6 +136,17 @@ struct CephXTicketBlob { decode(secret_id, bl); decode(blob, bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("secret_id", secret_id); + f->dump_unsigned("blob_len", blob.length()); + } + + static void generate_test_instances(std::list& ls) 
{ + ls.push_back(new CephXTicketBlob); + ls.back()->secret_id = 123; + ls.back()->blob.append(std::string_view("this is a blob")); + } }; WRITE_CLASS_ENCODER(CephXTicketBlob) @@ -152,6 +186,25 @@ struct CephXAuthenticate { // old_ticket both on reconnects and renewals old_ticket_may_be_omitted = struct_v < 3; } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("client_challenge", client_challenge); + f->dump_unsigned("key", key); + f->open_object_section("old_ticket"); + old_ticket.dump(f); + f->close_section(); + f->dump_unsigned("other_keys", other_keys); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXAuthenticate); + ls.back()->client_challenge = 0; + ls.back()->key = 0; + ls.push_back(new CephXAuthenticate); + ls.back()->client_challenge = 1; + ls.back()->key = 2; + ls.back()->old_ticket.secret_id = 3; + ls.back()->old_ticket.blob.append(std::string_view("this is a blob")); + ls.back()->other_keys = 4; + } }; WRITE_CLASS_ENCODER(CephXAuthenticate) @@ -168,6 +221,15 @@ struct CephXChallengeBlob { decode(server_challenge, bl); decode(client_challenge, bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("server_challenge", server_challenge); + f->dump_unsigned("client_challenge", client_challenge); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXChallengeBlob); + ls.back()->server_challenge = 123; + ls.back()->client_challenge = 456; + } }; WRITE_CLASS_ENCODER(CephXChallengeBlob) @@ -218,6 +280,15 @@ struct CephXServiceTicketRequest { decode(struct_v, bl); decode(keys, bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("keys", keys); + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXServiceTicketRequest); + ls.back()->keys = 123; + } }; WRITE_CLASS_ENCODER(CephXServiceTicketRequest) @@ -251,6 +322,17 @@ struct CephXAuthorizeReply { decode(connection_secret, bl); } } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("nonce_plus_one", nonce_plus_one); + f->dump_string("connection_secret", connection_secret); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXAuthorizeReply); + ls.back()->nonce_plus_one = 0; + ls.push_back(new CephXAuthorizeReply); + ls.back()->nonce_plus_one = 123; + ls.back()->connection_secret = "secret"; + } }; WRITE_CLASS_ENCODER(CephXAuthorizeReply) @@ -353,6 +435,17 @@ struct CephXServiceTicket { decode(session_key, bl); decode(validity, bl); } + void dump(ceph::Formatter *f) const { + session_key.dump(f); + validity.dump(f); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXServiceTicket); + ls.push_back(new CephXServiceTicket); + ls.back()->session_key.set_secret( + CEPH_CRYPTO_AES, bufferptr("1234567890123456", 16), utime_t(123, 456)); + ls.back()->validity = utime_t(123, 456); + } }; WRITE_CLASS_ENCODER(CephXServiceTicket) @@ -375,6 +468,18 @@ struct CephXServiceTicketInfo { decode(ticket, bl); decode(session_key, bl); } + void dump(ceph::Formatter *f) const { + ticket.dump(f); + session_key.dump(f); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXServiceTicketInfo); + ls.push_back(new CephXServiceTicketInfo); + ls.back()->ticket.global_id = 1234; + ls.back()->ticket.init_timestamps(utime_t(123, 456), utime_t(123, 456)); + ls.back()->session_key.set_secret( + CEPH_CRYPTO_AES, bufferptr("1234567890123456", 16), utime_t(123, 456)); + } }; WRITE_CLASS_ENCODER(CephXServiceTicketInfo) @@ -392,6 +497,13 @@ struct 
CephXAuthorizeChallenge : public AuthAuthorizerChallenge { decode(struct_v, bl); decode(server_challenge, bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("server_challenge", server_challenge); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXAuthorizeChallenge); + ls.back()->server_challenge = 1234; + } }; WRITE_CLASS_ENCODER(CephXAuthorizeChallenge) @@ -417,6 +529,18 @@ struct CephXAuthorize { decode(server_challenge_plus_one, bl); } } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("nonce", nonce); + f->dump_unsigned("have_challenge", have_challenge); + f->dump_unsigned("server_challenge_plus_one", server_challenge_plus_one); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new CephXAuthorize); + ls.push_back(new CephXAuthorize); + ls.back()->nonce = 1234; + ls.back()->have_challenge = true; + ls.back()->server_challenge_plus_one = 1234; + } }; WRITE_CLASS_ENCODER(CephXAuthorize) diff --git a/src/cls/2pc_queue/cls_2pc_queue_ops.h b/src/cls/2pc_queue/cls_2pc_queue_ops.h index bb61ef341ac1..3cdfd9663e8d 100644 --- a/src/cls/2pc_queue/cls_2pc_queue_ops.h +++ b/src/cls/2pc_queue/cls_2pc_queue_ops.h @@ -3,12 +3,13 @@ #pragma once +#include "common/ceph_json.h" #include "include/types.h" #include "cls_2pc_queue_types.h" struct cls_2pc_queue_reserve_op { uint64_t size; - uint32_t entries; + uint32_t entries{0}; void encode(ceph::buffer::list& bl) const { ENCODE_START(1, 1, bl); @@ -23,6 +24,19 @@ struct cls_2pc_queue_reserve_op { decode(entries, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("size", size); + f->dump_unsigned("entries", entries); + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_2pc_queue_reserve_op); + ls.back()->size = 0; + ls.push_back(new cls_2pc_queue_reserve_op); + ls.back()->size = 123; + ls.back()->entries = 456; + } }; WRITE_CLASS_ENCODER(cls_2pc_queue_reserve_op) @@ -40,6 +54,15 @@ struct cls_2pc_queue_reserve_ret { decode(id, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("id", id); + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_2pc_queue_reserve_ret); + ls.back()->id = 123; + } }; WRITE_CLASS_ENCODER(cls_2pc_queue_reserve_ret) @@ -61,6 +84,19 @@ struct cls_2pc_queue_commit_op { DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("id", id); + encode_json("bl_data_vec", bl_data_vec, f); + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_2pc_queue_commit_op); + ls.back()->id = 123; + ls.back()->bl_data_vec.push_back(ceph::buffer::list()); + ls.back()->bl_data_vec.back().append("foo"); + ls.back()->bl_data_vec.push_back(ceph::buffer::list()); + ls.back()->bl_data_vec.back().append("bar"); + } }; WRITE_CLASS_ENCODER(cls_2pc_queue_commit_op) @@ -78,6 +114,13 @@ struct cls_2pc_queue_abort_op { decode(id, bl); DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("id", id); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_2pc_queue_abort_op); + ls.back()->id = 1; + } }; WRITE_CLASS_ENCODER(cls_2pc_queue_abort_op) @@ -96,6 +139,14 @@ struct cls_2pc_queue_expire_op { decode(stale_time, bl); DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_stream("stale_time") << stale_time; + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_2pc_queue_expire_op); + ls.push_back(new 
cls_2pc_queue_expire_op); + ls.back()->stale_time = ceph::coarse_real_time::min(); + } }; WRITE_CLASS_ENCODER(cls_2pc_queue_expire_op) @@ -113,6 +164,23 @@ struct cls_2pc_queue_reservations_ret { decode(reservations, bl); DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->open_array_section("reservations"); + for (const auto& i : reservations) { + f->open_object_section("reservation"); + f->dump_unsigned("id", i.first); + i.second.dump(f); + f->close_section(); + } + f->close_section(); + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_2pc_queue_reservations_ret); + ls.push_back(new cls_2pc_queue_reservations_ret); + ls.back()->reservations[1] = cls_2pc_reservation(); + ls.back()->reservations[2] = cls_2pc_reservation(); + } }; WRITE_CLASS_ENCODER(cls_2pc_queue_reservations_ret) diff --git a/src/cls/2pc_queue/cls_2pc_queue_types.h b/src/cls/2pc_queue/cls_2pc_queue_types.h index 2413fd7043da..b270c9d6e790 100644 --- a/src/cls/2pc_queue/cls_2pc_queue_types.h +++ b/src/cls/2pc_queue/cls_2pc_queue_types.h @@ -34,6 +34,19 @@ struct cls_2pc_reservation } DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("size", size); + f->dump_stream("timestamp") << timestamp; + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_2pc_reservation); + ls.back()->size = 0; + ls.push_back(new cls_2pc_reservation); + ls.back()->size = 123; + ls.back()->timestamp = ceph::coarse_real_clock::zero(); + } }; WRITE_CLASS_ENCODER(cls_2pc_reservation) @@ -68,5 +81,28 @@ struct cls_2pc_urgent_data } DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("reserved_size", reserved_size); + f->dump_unsigned("last_id", last_id); + f->open_array_section("reservations"); + for (const auto& [id, res] : reservations) { + f->open_object_section("reservation"); + f->dump_unsigned("id", id); + res.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_bool("has_xattrs", has_xattrs); + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_2pc_urgent_data); + ls.push_back(new cls_2pc_urgent_data); + ls.back()->reserved_size = 123; + ls.back()->last_id = 456; + ls.back()->reservations.emplace(789, cls_2pc_reservation(1, ceph::coarse_real_clock::zero(), 2)); + ls.back()->has_xattrs = true; + } }; WRITE_CLASS_ENCODER(cls_2pc_urgent_data) diff --git a/src/cls/cas/cls_cas_internal.h b/src/cls/cas/cls_cas_internal.h index 09e7f9f1f69d..037e60fffd9d 100644 --- a/src/cls/cas/cls_cas_internal.h +++ b/src/cls/cas/cls_cas_internal.h @@ -145,6 +145,12 @@ struct chunk_refs_by_object_t : public chunk_refs_t::refs_t { } f->close_section(); } + static void generate_test_instances(std::list& ls) { + ls.push_back(new chunk_refs_by_object_t()); + ls.push_back(new chunk_refs_by_object_t()); + ls.back()->by_object.insert(hobject_t(sobject_t("foo", CEPH_NOSNAP))); + ls.back()->by_object.insert(hobject_t(sobject_t("bar", CEPH_NOSNAP))); + } }; WRITE_CLASS_ENCODER(chunk_refs_by_object_t) @@ -386,6 +392,11 @@ struct chunk_refs_count_t : public chunk_refs_t::refs_t { f->dump_string("type", "count"); f->dump_unsigned("count", total); } + static void generate_test_instances(std::list& o) { + o.push_back(new chunk_refs_count_t); + o.push_back(new chunk_refs_count_t); + o.back()->total = 123; + } }; WRITE_CLASS_ENCODER(chunk_refs_count_t) diff --git a/src/cls/fifo/cls_fifo_ops.h b/src/cls/fifo/cls_fifo_ops.h index e850c635c0b8..d466122a9527 100644 --- a/src/cls/fifo/cls_fifo_ops.h +++ 
b/src/cls/fifo/cls_fifo_ops.h @@ -67,6 +67,31 @@ struct create_meta decode(exclusive, bl); DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_string("id", id); + f->dump_object("version", version.value_or(objv())); + f->dump_string("pool_name", pool.name); + f->dump_string("pool_ns", pool.ns); + f->dump_string("oid_prefix", oid_prefix.value_or("")); + f->dump_unsigned("max_part_size", max_part_size); + f->dump_unsigned("max_entry_size", max_entry_size); + f->dump_bool("exclusive", exclusive); + } + static void generate_test_instances(std::list& o) { + o.push_back(new create_meta); + o.push_back(new create_meta); + o.back()->id = "id"; + objv v1; + v1.instance = "inst1"; + v1.ver = 1; + o.back()->version = v1; + o.back()->pool.name = "pool"; + o.back()->pool.ns = "ns"; + o.back()->oid_prefix = "prefix"; + o.back()->max_part_size = 1024; + o.back()->max_entry_size = 1024; + o.back()->exclusive = true; + } }; WRITE_CLASS_ENCODER(create_meta) @@ -84,6 +109,17 @@ struct get_meta decode(version, bl); DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_object("version", version.value_or(objv())); + } + static void generate_test_instances(std::list& o) { + o.push_back(new get_meta); + o.push_back(new get_meta); + objv v1; + v1.instance = "inst1"; + v1.ver = 1; + o.back()->version = v1; + } }; WRITE_CLASS_ENCODER(get_meta) @@ -108,6 +144,18 @@ struct get_meta_reply decode(part_entry_overhead, bl); DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_object("info", info); + f->dump_unsigned("part_header_size", part_header_size); + f->dump_unsigned("part_entry_overhead", part_entry_overhead); + } + static void generate_test_instances(std::list& o) { + o.push_back(new get_meta_reply); + o.push_back(new get_meta_reply); + o.back()->info = fifo::info(); + o.back()->part_header_size = 1024; + o.back()->part_entry_overhead = 1024; + } }; WRITE_CLASS_ENCODER(get_meta_reply) diff --git a/src/cls/fifo/cls_fifo_types.h b/src/cls/fifo/cls_fifo_types.h index 1c69c1f08718..2ae601e4aafa 100644 --- a/src/cls/fifo/cls_fifo_types.h +++ b/src/cls/fifo/cls_fifo_types.h @@ -54,7 +54,16 @@ struct objv { decode(ver, bl); DECODE_FINISH(bl); } - void dump(ceph::Formatter* f) const; + void dump(ceph::Formatter* f) const { + f->dump_string("instance", instance); + f->dump_unsigned("ver", ver); + } + static void generate_test_instances(std::list& o) { + o.push_back(new objv); + o.push_back(new objv); + o.back()->instance = "instance"; + o.back()->ver = 1; + } void decode_json(JSONObj* obj); bool operator ==(const objv& rhs) const { @@ -103,7 +112,18 @@ struct data_params { decode(full_size_threshold, bl); DECODE_FINISH(bl); } - void dump(ceph::Formatter* f) const; + void dump(ceph::Formatter* f) const { + f->dump_unsigned("max_part_size", max_part_size); + f->dump_unsigned("max_entry_size", max_entry_size); + f->dump_unsigned("full_size_threshold", full_size_threshold); + } + static void generate_test_instances(std::list& o) { + o.push_back(new data_params); + o.push_back(new data_params); + o.back()->max_part_size = 1; + o.back()->max_entry_size = 2; + o.back()->full_size_threshold = 3; + } void decode_json(JSONObj* obj); auto operator <=>(const data_params&) const = default; @@ -161,7 +181,10 @@ struct journal_entry { decode(part_tag, bl); DECODE_FINISH(bl); } - void dump(ceph::Formatter* f) const; + void dump(ceph::Formatter* f) const { + f->dump_int("op", (int)op); + f->dump_int("part_num", part_num); + } auto operator <=>(const journal_entry&) const = default; }; 
@@ -397,7 +420,38 @@ struct info { decode_journal(bl); DECODE_FINISH(bl); } - void dump(ceph::Formatter* f) const; + void dump(ceph::Formatter* f) const { + f->dump_string("id", id); + f->dump_object("version", version); + f->dump_string("oid_prefix", oid_prefix); + f->dump_object("params", params); + f->dump_int("tail_part_num", tail_part_num); + f->dump_int("head_part_num", head_part_num); + f->dump_int("min_push_part_num", min_push_part_num); + f->dump_int("max_push_part_num", max_push_part_num); + f->open_array_section("journal"); + for (const auto& entry : journal) { + f->open_object_section("entry"); + f->dump_object("entry", entry); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list& o) { + o.push_back(new info); + o.push_back(new info); + o.back()->id = "myid"; + o.back()->version = objv(); + o.back()->oid_prefix = "myprefix"; + o.back()->params = data_params(); + o.back()->tail_part_num = 123; + o.back()->head_part_num = 456; + o.back()->min_push_part_num = 789; + o.back()->max_push_part_num = 101112; + o.back()->journal.insert(journal_entry(journal_entry::Op::create, 1)); + o.back()->journal.insert(journal_entry(journal_entry::Op::create, 2)); + o.back()->journal.insert(journal_entry(journal_entry::Op::create, 3)); + } void decode_json(JSONObj* obj); std::string part_oid(std::int64_t part_num) const { diff --git a/src/cls/log/cls_log_ops.h b/src/cls/log/cls_log_ops.h index 5a65892598b6..4d3b2f5d3091 100644 --- a/src/cls/log/cls_log_ops.h +++ b/src/cls/log/cls_log_ops.h @@ -4,6 +4,7 @@ #ifndef CEPH_CLS_LOG_OPS_H #define CEPH_CLS_LOG_OPS_H +#include "common/ceph_json.h" #include "cls_log_types.h" struct cls_log_add_op { @@ -73,6 +74,21 @@ struct cls_log_list_op { decode(max_entries, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter* f) const { + f->dump_stream("from_time") << from_time; + f->dump_string("marker", marker); + f->dump_stream("to_time") << to_time; + f->dump_int("max_entries", max_entries); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_log_list_op); + ls.push_back(new cls_log_list_op); + ls.back()->from_time = utime_t(1, 2); + ls.back()->marker = "marker"; + ls.back()->to_time = utime_t(3, 4); + ls.back()->max_entries = 5; + } }; WRITE_CLASS_ENCODER(cls_log_list_op) @@ -98,6 +114,25 @@ struct cls_log_list_ret { decode(truncated, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter* f) const { + encode_json("entries", entries, f); + f->dump_string("marker", marker); + f->dump_bool("truncated", truncated); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_log_list_ret); + ls.push_back(new cls_log_list_ret); + ls.back()->entries.push_back(cls_log_entry()); + ls.back()->entries.push_back(cls_log_entry()); + ls.back()->entries.back().section = "section"; + ls.back()->entries.back().name = "name"; + ls.back()->entries.back().timestamp = utime_t(1, 2); + ls.back()->entries.back().data.append("data"); + ls.back()->entries.back().id = "id"; + ls.back()->marker = "marker"; + ls.back()->truncated = true; + } }; WRITE_CLASS_ENCODER(cls_log_list_ret) @@ -133,6 +168,20 @@ struct cls_log_trim_op { } DECODE_FINISH(bl); } + void dump(ceph::Formatter* f) const { + f->dump_stream("from_time") << from_time; + f->dump_stream("to_time") << to_time; + f->dump_string("from_marker", from_marker); + f->dump_string("to_marker", to_marker); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_log_trim_op); + ls.push_back(new cls_log_trim_op); + 
ls.back()->from_time = utime_t(1, 2); + ls.back()->to_time = utime_t(3, 4); + ls.back()->from_marker = "from_marker"; + ls.back()->to_marker = "to_marker"; + } }; WRITE_CLASS_ENCODER(cls_log_trim_op) @@ -150,6 +199,13 @@ struct cls_log_info_op { // currently empty request DECODE_FINISH(bl); } + + void dump(ceph::Formatter* f) const { + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new cls_log_info_op); + } }; WRITE_CLASS_ENCODER(cls_log_info_op) diff --git a/src/cls/log/cls_log_types.h b/src/cls/log/cls_log_types.h index 33b8cce51e5e..29aa2bae8fe9 100644 --- a/src/cls/log/cls_log_types.h +++ b/src/cls/log/cls_log_types.h @@ -92,6 +92,16 @@ struct cls_log_header { decode(max_time, bl); DECODE_FINISH(bl); } + void dump(ceph::Formatter* f) const { + f->dump_string("max_marker", max_marker); + f->dump_stream("max_time") << max_time; + } + static void generate_test_instances(std::list& o) { + o.push_back(new cls_log_header); + o.push_back(new cls_log_header); + o.back()->max_marker = "test_marker"; + o.back()->max_time = utime_t(); + } }; inline bool operator ==(const cls_log_header& lhs, const cls_log_header& rhs) { return (lhs.max_marker == rhs.max_marker && diff --git a/src/cls/queue/cls_queue_ops.h b/src/cls/queue/cls_queue_ops.h index 8209659bda90..0b0b30b6f01c 100644 --- a/src/cls/queue/cls_queue_ops.h +++ b/src/cls/queue/cls_queue_ops.h @@ -4,6 +4,7 @@ #ifndef CEPH_CLS_QUEUE_OPS_H #define CEPH_CLS_QUEUE_OPS_H +#include "common/ceph_json.h" #include "cls/queue/cls_queue_types.h" struct cls_queue_init_op { @@ -29,6 +30,19 @@ struct cls_queue_init_op { DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("queue_size", queue_size); + f->dump_unsigned("max_urgent_data_size", max_urgent_data_size); + f->dump_unsigned("urgent_data_len", bl_urgent_data.length()); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_queue_init_op); + o.push_back(new cls_queue_init_op); + o.back()->queue_size = 1024; + o.back()->max_urgent_data_size = 1024; + o.back()->bl_urgent_data.append(std::string_view("data")); + } }; WRITE_CLASS_ENCODER(cls_queue_init_op) @@ -47,12 +61,23 @@ struct cls_queue_enqueue_op { DECODE_START(1, bl); decode(bl_data_vec, bl); DECODE_FINISH(bl); - } + } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("data_vec_len", bl_data_vec.size()); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_queue_enqueue_op); + o.push_back(new cls_queue_enqueue_op); + o.back()->bl_data_vec.push_back(ceph::buffer::list()); + o.back()->bl_data_vec.back().append(std::string_view("data")); + } }; WRITE_CLASS_ENCODER(cls_queue_enqueue_op) struct cls_queue_list_op { - uint64_t max; + uint64_t max{0}; std::string start_marker; cls_queue_list_op() {} @@ -70,6 +95,18 @@ struct cls_queue_list_op { decode(start_marker, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("max", max); + f->dump_string("start_marker", start_marker); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_queue_list_op); + o.push_back(new cls_queue_list_op); + o.back()->max = 123; + o.back()->start_marker = "foo"; + } }; WRITE_CLASS_ENCODER(cls_queue_list_op) @@ -95,6 +132,22 @@ struct cls_queue_list_ret { decode(entries, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_bool("is_truncated", is_truncated); + f->dump_string("next_marker", next_marker); + encode_json("entries", entries, f); + } + + static void 
generate_test_instances(std::list& o) { + o.push_back(new cls_queue_list_ret); + o.back()->is_truncated = true; + o.back()->next_marker = "foo"; + o.back()->entries.push_back(cls_queue_entry()); + o.back()->entries.push_back(cls_queue_entry()); + o.back()->entries.back().marker = "id"; + o.back()->entries.back().data.append(std::string_view("data")); + } }; WRITE_CLASS_ENCODER(cls_queue_list_ret) @@ -114,6 +167,15 @@ struct cls_queue_remove_op { decode(end_marker, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_string("end_marker", end_marker); + } + static void generate_test_instances(std::list& o) { + o.push_back(new cls_queue_remove_op); + o.push_back(new cls_queue_remove_op); + o.back()->end_marker = "foo"; + } }; WRITE_CLASS_ENCODER(cls_queue_remove_op) @@ -133,6 +195,14 @@ struct cls_queue_get_capacity_ret { decode(queue_capacity, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("queue_capacity", queue_capacity); + } + static void generate_test_instances(std::list& o) { + o.push_back(new cls_queue_get_capacity_ret); + o.back()->queue_capacity = 123; + } }; WRITE_CLASS_ENCODER(cls_queue_get_capacity_ret) diff --git a/src/cls/queue/cls_queue_types.h b/src/cls/queue/cls_queue_types.h index cc46df405052..3c3e828edf0a 100644 --- a/src/cls/queue/cls_queue_types.h +++ b/src/cls/queue/cls_queue_types.h @@ -34,6 +34,17 @@ struct cls_queue_entry decode(marker, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_string("marker", marker); + f->dump_unsigned("data_len", data.length()); + } + static void generate_test_instances(std::list& o) { + o.push_back(new cls_queue_entry); + o.push_back(new cls_queue_entry); + o.back()->data.append(std::string_view("data")); + o.back()->marker = "marker"; + } }; WRITE_CLASS_ENCODER(cls_queue_entry) @@ -80,7 +91,16 @@ struct cls_queue_marker } return 0; } - + void dump(ceph::Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("gen", gen); + } + static void generate_test_instances(std::list& o) { + o.push_back(new cls_queue_marker); + o.push_back(new cls_queue_marker); + o.back()->offset = 1024; + o.back()->gen = 0; + } }; WRITE_CLASS_ENCODER(cls_queue_marker) @@ -114,6 +134,27 @@ struct cls_queue_head decode(bl_urgent_data, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_unsigned("max_head_size", max_head_size); + f->dump_unsigned("queue_size", queue_size); + f->dump_unsigned("max_urgent_data_size", max_urgent_data_size); + f->dump_unsigned("front_offset", front.offset); + f->dump_unsigned("front_gen", front.gen); + f->dump_unsigned("tail_offset", tail.offset); + f->dump_unsigned("tail_gen", tail.gen); + } + static void generate_test_instances(std::list& o) { + o.push_back(new cls_queue_head); + o.push_back(new cls_queue_head); + o.back()->max_head_size = 1024; + o.back()->front.offset = 1024; + o.back()->front.gen = 0; + o.back()->tail.offset = 1024; + o.back()->tail.gen = 0; + o.back()->queue_size = 1024; + o.back()->max_urgent_data_size = 0; + } }; WRITE_CLASS_ENCODER(cls_queue_head) diff --git a/src/cls/timeindex/cls_timeindex_ops.h b/src/cls/timeindex/cls_timeindex_ops.h index f40058954dce..f0f0cc024751 100644 --- a/src/cls/timeindex/cls_timeindex_ops.h +++ b/src/cls/timeindex/cls_timeindex_ops.h @@ -4,6 +4,7 @@ #ifndef CEPH_CLS_TIMEINDEX_OPS_H #define CEPH_CLS_TIMEINDEX_OPS_H +#include "common/ceph_json.h" #include "cls_timeindex_types.h" struct cls_timeindex_add_op { @@ -51,6 +52,26 @@ struct 
cls_timeindex_list_op { decode(max_entries, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->open_object_section("from_time"); + from_time.dump(f); + f->close_section(); + f->dump_string("marker", marker); + f->open_object_section("to_time"); + to_time.dump(f); + f->close_section(); + f->dump_int("max_entries", max_entries); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_timeindex_list_op); + o.push_back(new cls_timeindex_list_op); + o.back()->from_time = utime_t(1, 2); + o.back()->marker = "marker"; + o.back()->to_time = utime_t(3, 4); + o.back()->max_entries = 5; + } }; WRITE_CLASS_ENCODER(cls_timeindex_list_op) @@ -76,6 +97,23 @@ struct cls_timeindex_list_ret { decode(truncated, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + encode_json("entries", entries, f); + f->dump_string("marker", marker); + f->dump_bool("truncated", truncated); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_timeindex_list_ret); + o.push_back(new cls_timeindex_list_ret); + o.back()->entries.push_back(cls_timeindex_entry()); + o.back()->entries.back().key_ts = utime_t(1, 2); + o.back()->entries.back().key_ext = "key_ext"; + o.back()->entries.back().value.append("value"); + o.back()->marker = "marker"; + o.back()->truncated = true; + } }; WRITE_CLASS_ENCODER(cls_timeindex_list_ret) diff --git a/src/cls/timeindex/cls_timeindex_types.h b/src/cls/timeindex/cls_timeindex_types.h index d33886881be5..ea8d6c93d2c8 100644 --- a/src/cls/timeindex/cls_timeindex_types.h +++ b/src/cls/timeindex/cls_timeindex_types.h @@ -4,9 +4,9 @@ #ifndef CEPH_CLS_TIMEINDEX_TYPES_H #define CEPH_CLS_TIMEINDEX_TYPES_H +#include "common/Formatter.h" #include "include/encoding.h" #include "include/types.h" - #include "include/utime.h" class JSONObj; diff --git a/src/cls/version/cls_version_ops.h b/src/cls/version/cls_version_ops.h index 62cd1172982a..2eff788ce52c 100644 --- a/src/cls/version/cls_version_ops.h +++ b/src/cls/version/cls_version_ops.h @@ -5,6 +5,7 @@ #define CEPH_CLS_VERSION_OPS_H #include "cls_version_types.h" +#include "common/ceph_json.h" struct cls_version_set_op { obj_version objv; @@ -22,6 +23,17 @@ struct cls_version_set_op { decode(objv, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_object("objv", objv); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_version_set_op); + o.push_back(new cls_version_set_op); + o.back()->objv.ver = 123; + o.back()->objv.tag = "foo"; + } }; WRITE_CLASS_ENCODER(cls_version_set_op) @@ -44,6 +56,22 @@ struct cls_version_inc_op { decode(conds, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_object("objv", objv); + encode_json("conds", conds, f); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_version_inc_op); + o.push_back(new cls_version_inc_op); + o.back()->objv.ver = 123; + o.back()->objv.tag = "foo"; + o.back()->conds.push_back(obj_version_cond()); + o.back()->conds.back().ver.ver = 123; + o.back()->conds.back().ver.tag = "foo"; + o.back()->conds.back().cond = VER_COND_GE; + } }; WRITE_CLASS_ENCODER(cls_version_inc_op) @@ -66,6 +94,22 @@ struct cls_version_check_op { decode(conds, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_object("objv", objv); + encode_json("conds", conds, f); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_version_check_op); + o.push_back(new cls_version_check_op); + 
o.back()->objv.ver = 123; + o.back()->objv.tag = "foo"; + o.back()->conds.push_back(obj_version_cond()); + o.back()->conds.back().ver.ver = 123; + o.back()->conds.back().ver.tag = "foo"; + o.back()->conds.back().cond = VER_COND_GE; + } }; WRITE_CLASS_ENCODER(cls_version_check_op) @@ -85,6 +129,17 @@ struct cls_version_read_ret { decode(objv, bl); DECODE_FINISH(bl); } + + void dump(ceph::Formatter *f) const { + f->dump_object("objv", objv); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new cls_version_read_ret); + o.push_back(new cls_version_read_ret); + o.back()->objv.ver = 123; + o.back()->objv.tag = "foo"; + } }; WRITE_CLASS_ENCODER(cls_version_read_ret) diff --git a/src/cls/version/cls_version_types.cc b/src/cls/version/cls_version_types.cc index b82f6aa8a5dd..735ef7c89857 100644 --- a/src/cls/version/cls_version_types.cc +++ b/src/cls/version/cls_version_types.cc @@ -6,12 +6,6 @@ #include "common/ceph_json.h" -void obj_version::dump(ceph::Formatter *f) const -{ - f->dump_int("ver", ver); - f->dump_string("tag", tag); -} - void obj_version::decode_json(JSONObj *obj) { JSONDecoder::decode_json("ver", ver, obj); diff --git a/src/cls/version/cls_version_types.h b/src/cls/version/cls_version_types.h index 62cc16e33d52..dafa866e1f35 100644 --- a/src/cls/version/cls_version_types.h +++ b/src/cls/version/cls_version_types.h @@ -53,7 +53,11 @@ struct obj_version { tag.compare(v.tag) == 0); } - void dump(ceph::Formatter *f) const; + void dump(ceph::Formatter *f) const { + f->dump_int("ver", ver); + f->dump_string("tag", tag); + } + void decode_json(JSONObj *obj); static void generate_test_instances(std::list& o); }; @@ -91,6 +95,18 @@ struct obj_version_cond { DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_object("ver", ver); + f->dump_unsigned("cond", cond); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new obj_version_cond); + o.push_back(new obj_version_cond); + o.back()->ver.ver = 1; + o.back()->ver.tag = "foo"; + o.back()->cond = VER_COND_EQ; + } }; WRITE_CLASS_ENCODER(obj_version_cond) diff --git a/src/common/ceph_json.h b/src/common/ceph_json.h index 08e8d9e46623..f7a899fd9356 100644 --- a/src/common/ceph_json.h +++ b/src/common/ceph_json.h @@ -836,6 +836,61 @@ class JSONFormattable : public ceph::JSONFormatter { DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + switch (type) { + case FMT_VALUE: + if (value.quoted) { + f->dump_string("value", value.str); + } else { + f->dump_format_unquoted("value", "%s", value.str.c_str()); + } + break; + case FMT_ARRAY: + f->open_array_section("array"); + for (auto& i : arr) { + i.dump(f); + } + f->close_section(); + break; + case FMT_OBJ: + f->open_object_section("object"); + for (auto& i : obj) { + f->dump_object(i.first.c_str(), i.second); + } + f->close_section(); + break; + default: + break; + } + } + static void generate_test_instances(std::list& o) { + o.push_back(new JSONFormattable); + o.push_back(new JSONFormattable); + o.back()->set_type(FMT_VALUE); + o.back()->value.str = "foo"; + o.back()->value.quoted = true; + o.push_back(new JSONFormattable); + o.back()->set_type(FMT_VALUE); + o.back()->value.str = "foo"; + o.back()->value.quoted = false; + o.push_back(new JSONFormattable); + o.back()->set_type(FMT_ARRAY); + o.back()->arr.push_back(JSONFormattable()); + o.back()->arr.back().set_type(FMT_VALUE); + o.back()->arr.back().value.str = "foo"; + o.back()->arr.back().value.quoted = true; + o.back()->arr.push_back(JSONFormattable()); + 
o.back()->arr.back().set_type(FMT_VALUE); + o.back()->arr.back().value.str = "bar"; + o.back()->arr.back().value.quoted = true; + o.push_back(new JSONFormattable); + o.back()->set_type(FMT_OBJ); + o.back()->obj["foo"] = JSONFormattable(); + o.back()->obj["foo"].set_type(FMT_VALUE); + o.back()->obj["foo"].value.str = "bar"; + o.back()->obj["foo"].value.quoted = true; + } + const std::string& val() const { return value.str; } diff --git a/src/common/entity_name.cc b/src/common/entity_name.cc index 5357b34eacb7..a9d6fb9c8b10 100644 --- a/src/common/entity_name.cc +++ b/src/common/entity_name.cc @@ -29,21 +29,30 @@ const std::array EntityName::STR_TO_ENTITY_ { CEPH_ENTITY_TYPE_CLIENT, "client" }, }}; -const std::string& EntityName:: -to_str() const -{ +void EntityName::dump(ceph::Formatter *f) const { + f->dump_int("type", type); + f->dump_string("id", id); +} + +void EntityName::generate_test_instances(std::list& ls) { + ls.push_back(new EntityName); + ls.push_back(new EntityName); + ls.back()->set_type(CEPH_ENTITY_TYPE_OSD); + ls.back()->set_id("0"); + ls.push_back(new EntityName); + ls.back()->set_type(CEPH_ENTITY_TYPE_MDS); + ls.back()->set_id("a"); +} + +const std::string& EntityName::to_str() const { return type_id; } -const char* EntityName:: -to_cstr() const -{ +const char* EntityName::to_cstr() const { return type_id.c_str(); } -bool EntityName:: -from_str(std::string_view s) -{ +bool EntityName::from_str(std::string_view s) { size_t pos = s.find('.'); if (pos == string::npos) @@ -56,9 +65,7 @@ from_str(std::string_view s) return true; } -void EntityName:: -set(uint32_t type_, std::string_view id_) -{ +void EntityName::set(uint32_t type_, std::string_view id_) { type = type_; id = id_; @@ -71,9 +78,7 @@ set(uint32_t type_, std::string_view id_) } } -int EntityName:: -set(std::string_view type_, std::string_view id_) -{ +int EntityName::set(std::string_view type_, std::string_view id_) { uint32_t t = str_to_ceph_entity_type(type_); if (t == CEPH_ENTITY_TYPE_ANY) return -EINVAL; @@ -81,9 +86,7 @@ set(std::string_view type_, std::string_view id_) return 0; } -void EntityName:: -set_type(uint32_t type_) -{ +void EntityName::set_type(uint32_t type_) { set(type_, id); } @@ -93,9 +96,7 @@ set_type(std::string_view type_) return set(type_, id); } -void EntityName:: -set_id(std::string_view id_) -{ +void EntityName::set_id(std::string_view id_) { set(type, id_); } @@ -106,33 +107,23 @@ void EntityName::set_name(entity_name_t n) set(n.type(), s); } -const char* EntityName:: -get_type_str() const -{ +const char* EntityName::get_type_str() const { return ceph_entity_type_name(type); } -std::string_view EntityName:: -get_type_name() const -{ +std::string_view EntityName::get_type_name() const { return ceph_entity_type_name(type); } -const std::string &EntityName:: -get_id() const -{ +const std::string &EntityName::get_id() const { return id; } -bool EntityName:: -has_default_id() const -{ +bool EntityName::has_default_id() const { return (id == "admin"); } -std::string EntityName:: -get_valid_types_as_str() -{ +std::string EntityName::get_valid_types_as_str() { std::ostringstream out; size_t i; for (i = 0; i < STR_TO_ENTITY_TYPE.size(); ++i) { diff --git a/src/common/entity_name.h b/src/common/entity_name.h index c88ebcbbabde..53f8cd4d5d09 100644 --- a/src/common/entity_name.h +++ b/src/common/entity_name.h @@ -41,7 +41,8 @@ struct EntityName decode(id_, bl); set(type_, id_); } - + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); const std::string& 
to_str() const; const char *to_cstr() const; bool from_str(std::string_view s); diff --git a/src/include/cephfs/types.h b/src/include/cephfs/types.h index 049ef8e3fcf6..54d97d9d4a2f 100644 --- a/src/include/cephfs/types.h +++ b/src/include/cephfs/types.h @@ -349,6 +349,8 @@ struct inline_data_t { } void encode(ceph::buffer::list &bl) const; void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); version_t version = 1; diff --git a/src/include/frag.h b/src/include/frag.h index ec18bddfbb1e..5ea2429aee8d 100644 --- a/src/include/frag.h +++ b/src/include/frag.h @@ -159,6 +159,15 @@ class frag_t { ceph::decode_raw(v, p); _enc = v; } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("value", value()); + f->dump_unsigned("bits", bits()); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new frag_t); + ls.push_back(new frag_t(10, 2)); + ls.push_back(new frag_t(11, 3)); + } bool operator<(const frag_t& b) const { if (value() != b.value()) @@ -525,6 +534,11 @@ class fragtree_t { } f->close_section(); // splits } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new fragtree_t); + ls.push_back(new fragtree_t); + } }; WRITE_CLASS_ENCODER(fragtree_t) diff --git a/src/include/fs_types.h b/src/include/fs_types.h index c1932bfcc30e..606b9c6503d4 100644 --- a/src/include/fs_types.h +++ b/src/include/fs_types.h @@ -75,6 +75,13 @@ struct inodeno_t { using ceph::decode; decode(val, p); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("val", val); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new inodeno_t(1)); + ls.push_back(new inodeno_t(123456789)); + } } __attribute__ ((__may_alias__)); WRITE_CLASS_ENCODER(inodeno_t) diff --git a/src/include/object.h b/src/include/object.h index 4564af86e577..2e5fb471c139 100644 --- a/src/include/object.h +++ b/src/include/object.h @@ -25,6 +25,7 @@ #include "include/rados.h" #include "include/unordered_map.h" +#include "common/Formatter.h" #include "hash.h" #include "encoding.h" @@ -58,6 +59,15 @@ struct object_t { using ceph::decode; decode(name, bl); } + + void dump(ceph::Formatter *f) const { + f->dump_string("name", name); + } + + static void generate_test_instances(std::list& o) { + o.push_back(new object_t); + o.push_back(new object_t("myobject")); + } }; WRITE_CLASS_ENCODER(object_t) @@ -170,6 +180,14 @@ struct sobject_t { decode(oid, bl); decode(snap, bl); } + void dump(ceph::Formatter *f) const { + f->dump_stream("oid") << oid; + f->dump_stream("snap") << snap; + } + static void generate_test_instances(std::list& o) { + o.push_back(new sobject_t); + o.push_back(new sobject_t(object_t("myobject"), 123)); + } }; WRITE_CLASS_ENCODER(sobject_t) diff --git a/src/include/types.h b/src/include/types.h index bdd09a53df10..a50a506eb0bc 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -371,6 +371,14 @@ struct client_t { using ceph::decode; decode(v, bl); } + void dump(ceph::Formatter *f) const { + f->dump_int("id", v); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new client_t); + ls.push_back(new client_t(1)); + ls.push_back(new client_t(123)); + } }; WRITE_CLASS_ENCODER(client_t) @@ -517,7 +525,13 @@ struct shard_id_t { using ceph::decode; decode(id, bl); } - + void dump(ceph::Formatter *f) const { + f->dump_int("id", id); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new shard_id_t(1)); + ls.push_back(new 
shard_id_t(2)); + } bool operator==(const shard_id_t&) const = default; auto operator<=>(const shard_id_t&) const = default; }; @@ -561,6 +575,13 @@ struct errorcode32_t { decode(code, bl); code = ceph_to_hostos_errno(code); } + void dump(ceph::Formatter *f) const { + f->dump_int("code", code); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new errorcode32_t(1)); + ls.push_back(new errorcode32_t(2)); + } }; WRITE_CLASS_ENCODER(errorcode32_t) @@ -602,6 +623,16 @@ struct sha_digest_t { decode(tmparr, bl); memcpy(v, tmparr.data(), SIZE); } + void dump(ceph::Formatter *f) const { + f->dump_string("sha1", to_str()); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new sha_digest_t); + ls.push_back(new sha_digest_t); + ls.back()->v[0] = 1; + ls.push_back(new sha_digest_t); + ls.back()->v[0] = 2; + } }; template diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 044c33459606..21e17ca0e5bc 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -284,6 +284,21 @@ void inline_data_t::decode(bufferlist::const_iterator &p) free_data(); } +void inline_data_t::dump(Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_unsigned("length", length()); +} + +void inline_data_t::generate_test_instances(std::list& ls) +{ + ls.push_back(new inline_data_t); + ls.push_back(new inline_data_t); + bufferlist bl; + bl.append("inline data"); + ls.back()->set_data(bl); +} + /* * fnode_t diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h index be33fad49491..028c4200c149 100644 --- a/src/messages/MClientReply.h +++ b/src/messages/MClientReply.h @@ -291,7 +291,7 @@ struct InodeStat { }; struct openc_response_t { - _inodeno_t created_ino; + _inodeno_t created_ino{0}; interval_set delegated_inos; public: @@ -309,6 +309,16 @@ struct openc_response_t { decode(delegated_inos, p); DECODE_FINISH(p); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("created_ino", created_ino); + f->dump_stream("delegated_inos") << delegated_inos; + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new openc_response_t); + ls.push_back(new openc_response_t); + ls.back()->created_ino = 1; + ls.back()->delegated_inos.insert(1, 10); + } } __attribute__ ((__may_alias__)); WRITE_CLASS_ENCODER(openc_response_t) diff --git a/src/messages/MClientRequest.h b/src/messages/MClientRequest.h index c62e183a7563..a51d73489186 100644 --- a/src/messages/MClientRequest.h +++ b/src/messages/MClientRequest.h @@ -60,6 +60,17 @@ struct SnapPayload { decode(metadata, iter); DECODE_FINISH(iter); } + void dump(ceph::Formatter *f) const { + for (const auto &i : metadata) { + f->dump_string(i.first.c_str(), i.second); + } + } + static void generate_test_instances(std::list &o) { + o.push_back(new SnapPayload); + o.push_back(new SnapPayload); + o.back()->metadata["key1"] = "val1"; + o.back()->metadata["key2"] = "val2"; + } }; WRITE_CLASS_ENCODER(SnapPayload) @@ -95,6 +106,26 @@ class MClientRequest final : public MMDSOp { decode(item, bl); ceph::decode_nohead(item.dname_len, dname, bl); } + + void dump(ceph::Formatter *f) const { + f->dump_string("dname", dname); + f->dump_unsigned("ino", item.ino); + f->dump_unsigned("cap_id", item.cap_id); + f->dump_unsigned("caps", item.caps); + f->dump_unsigned("wanted", item.wanted); + f->dump_unsigned("seq", item.seq); + f->dump_unsigned("issue_seq", item.issue_seq); + f->dump_unsigned("mseq", item.mseq); + f->dump_unsigned("dname_seq", item.dname_seq); + f->dump_unsigned("dname_len", 
item.dname_len); + } + + static void generate_test_instances(std::list& ls) { + ls.push_back(new Release); + ls.push_back(new Release); + ls.back()->item.dname_len = 4; + ls.back()->dname = "test"; + } }; mutable std::vector releases; /* XXX HACK! */ diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index a9211d53538a..9237c79b6e77 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -63,6 +63,17 @@ class MMDSCacheRejoin final : public MMDSOp { decode(nestlock, bl); decode(dftlock, bl); } + void dump(ceph::Formatter *f) const { + f->dump_int("nonce", nonce); + f->dump_int("caps_wanted", caps_wanted); + f->dump_int("filelock", filelock); + f->dump_int("nestlock", nestlock); + f->dump_int("dftlock", dftlock); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new inode_strong); + ls.push_back(new inode_strong(1, 2, 3, 4, 5)); + } }; WRITE_CLASS_ENCODER(inode_strong) @@ -81,6 +92,14 @@ class MMDSCacheRejoin final : public MMDSOp { decode(nonce, bl); decode(dir_rep, bl); } + static void generate_test_instances(std::list& ls) { + ls.push_back(new dirfrag_strong); + ls.push_back(new dirfrag_strong(1, 2)); + } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("nonce", nonce); + f->dump_unsigned("dir_rep", dir_rep); + } }; WRITE_CLASS_ENCODER(dirfrag_strong) @@ -118,6 +137,19 @@ class MMDSCacheRejoin final : public MMDSOp { decode(lock, bl); decode(alternate_name, bl); } + static void generate_test_instances(std::list& ls) { + ls.push_back(new dn_strong); + ls.push_back(new dn_strong(1, "alternate_name", 2, 3, 4, 5, 6)); + } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("first", first); + f->dump_string("alternate_name", alternate_name); + f->dump_unsigned("ino", ino); + f->dump_unsigned("remote_ino", remote_ino); + f->dump_unsigned("remote_d_type", remote_d_type); + f->dump_unsigned("nonce", nonce); + f->dump_unsigned("lock", lock); + } }; WRITE_CLASS_ENCODER(dn_strong) diff --git a/src/messages/MMgrReport.h b/src/messages/MMgrReport.h index bd3c221a847e..e81282dc1a06 100644 --- a/src/messages/MMgrReport.h +++ b/src/messages/MMgrReport.h @@ -74,6 +74,27 @@ class PerfCounterType } DECODE_FINISH(p); } + + void dump(ceph::Formatter *f) const + { + f->dump_string("path", path); + f->dump_string("description", description); + f->dump_string("nick", nick); + f->dump_int("type", type); + f->dump_int("priority", priority); + f->dump_int("unit", unit); + } + static void generate_test_instances(std::list& ls) + { + ls.push_back(new PerfCounterType); + ls.push_back(new PerfCounterType); + ls.back()->path = "mycounter"; + ls.back()->description = "mycounter description"; + ls.back()->nick = "mycounter nick"; + ls.back()->type = PERFCOUNTER_COUNTER; + ls.back()->priority = PerfCountersBuilder::PRIO_CRITICAL; + ls.back()->unit = UNIT_BYTES; + } }; WRITE_CLASS_ENCODER(PerfCounterType) diff --git a/src/os/bluestore/bluestore_types.cc b/src/os/bluestore/bluestore_types.cc index 904b6fbd3d98..80a8ea06a805 100644 --- a/src/os/bluestore/bluestore_types.cc +++ b/src/os/bluestore/bluestore_types.cc @@ -1087,6 +1087,15 @@ void bluestore_onode_t::shard_info::dump(Formatter *f) const f->dump_unsigned("bytes", bytes); } +void bluestore_onode_t::shard_info::generate_test_instances( + list& o) +{ + o.push_back(new shard_info); + o.push_back(new shard_info); + o.back()->offset = 123; + o.back()->bytes = 456; +} + ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si) { return out << std::hex << "0x" << 
si.offset << "(0x" << si.bytes << " bytes" diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index 4c96e8903260..cdedf223b055 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -954,6 +954,7 @@ struct bluestore_onode_t { denc_varint(v.bytes, p); } void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); }; std::vector extent_map_shards; ///< extent std::map shards (if any) diff --git a/src/tools/ceph-dencoder/common_types.h b/src/tools/ceph-dencoder/common_types.h index 3180e3476be2..e853321645ba 100644 --- a/src/tools/ceph-dencoder/common_types.h +++ b/src/tools/ceph-dencoder/common_types.h @@ -23,12 +23,29 @@ TYPE(filepath) #include "include/fs_types.h" TYPE_FEATUREFUL(file_layout_t) +TYPE(inodeno_t) #include "include/util.h" TYPE(ceph_data_stats) +#include "include/object.h" +TYPE(object_t) +TYPE(sobject_t) + +#include "include/frag.h" +TYPE(fragtree_t) +TYPE(frag_t) + +#include "include/types.h" +TYPE(shard_id_t) +TYPE(inline_data_t) +TYPE(sha256_digest_t) +TYPE(errorcode32_t) +TYPE(client_t) + #include "common/bit_vector.hpp" TYPE(BitVector<2>) +TYPE(ceph::BitVector<2>) #include "common/bloom_filter.hpp" TYPE(bloom_filter) @@ -37,6 +54,9 @@ TYPE(compressible_bloom_filter) #include "common/DecayCounter.h" TYPE(DecayCounter) +#include "common/entity_name.h" +TYPE(EntityName) + #include "common/histogram.h" TYPE(pow2_hist_t) @@ -71,11 +91,17 @@ TYPE(cls_cas_chunk_put_ref_op) #include "cls/cas/cls_cas_internal.h" TYPE(chunk_refs_t) +TYPE(chunk_refs_count_t) +TYPE(chunk_refs_by_object_t) #include "cls/lock/cls_lock_types.h" TYPE(rados::cls::lock::locker_id_t) TYPE_FEATUREFUL(rados::cls::lock::locker_info_t) TYPE_FEATUREFUL(rados::cls::lock::lock_info_t) +using namespace rados::cls::lock; +TYPE(locker_id_t) +TYPE_FEATUREFUL(locker_info_t) +TYPE_FEATUREFUL(lock_info_t) #include "cls/lock/cls_lock_ops.h" TYPE(cls_lock_lock_op) @@ -98,6 +124,64 @@ TYPE(obj_refcount) #include "cls/timeindex/cls_timeindex_types.h" TYPE(cls_timeindex_entry) +#include "cls/timeindex/cls_timeindex_ops.h" +TYPE(cls_timeindex_list_op) +TYPE(cls_timeindex_list_ret) + +#include "cls/queue/cls_queue_types.h" +TYPE(cls_queue_entry) +TYPE(cls_queue_marker) +TYPE(cls_queue_head) + +#include "cls/queue/cls_queue_ops.h" +TYPE(cls_queue_get_capacity_ret) +TYPE(cls_queue_remove_op) +TYPE(cls_queue_enqueue_op) +TYPE(cls_queue_list_op) +TYPE(cls_queue_list_ret) +TYPE(cls_queue_init_op) + +#include "cls/2pc_queue/cls_2pc_queue_ops.h" +TYPE(cls_2pc_queue_abort_op) +TYPE(cls_2pc_queue_commit_op) +TYPE(cls_2pc_queue_expire_op) +TYPE_NONDETERMINISTIC(cls_2pc_queue_reservations_ret) +TYPE(cls_2pc_queue_reserve_op) +TYPE(cls_2pc_queue_reserve_ret) +TYPE(cls_queue_init_op) + +#include "cls/2pc_queue/cls_2pc_queue_types.h" +TYPE(cls_2pc_reservation) +TYPE_NONDETERMINISTIC(cls_2pc_urgent_data) + +#include "cls/log/cls_log_types.h" +TYPE(cls_log_header) + +#include "cls/log/cls_log_ops.h" +TYPE(cls_log_info_op) +TYPE(cls_log_list_op) +TYPE(cls_log_list_ret) +TYPE(cls_log_trim_op) + +#include "cls/version/cls_version_ops.h" +TYPE(cls_version_check_op) +TYPE(cls_version_read_ret) +TYPE(cls_version_inc_op) +TYPE(cls_version_set_op) + + +#include "cls/fifo/cls_fifo_ops.h" +using namespace rados::cls::fifo::op; +TYPE(create_meta) +TYPE(get_meta) +TYPE(get_meta_reply) + +#include "cls/fifo/cls_fifo_types.h" +using namespace rados::cls::fifo; +TYPE(data_params) +TYPE(objv) +TYPE(info) + #include "journal/Entry.h" TYPE(journal::Entry) @@ 
-125,9 +209,12 @@ MESSAGE(MClientReconnect) #include "messages/MClientReply.h" MESSAGE(MClientReply) +TYPE(openc_response_t) #include "messages/MClientRequest.h" MESSAGE(MClientRequest) +TYPE(SnapPayload) +TYPE(MClientRequest::Release) #include "messages/MClientRequestForward.h" MESSAGE(MClientRequestForward) @@ -251,6 +338,9 @@ MESSAGE(MMDSBeacon) #include "messages/MMDSCacheRejoin.h" MESSAGE(MMDSCacheRejoin) +TYPE(MMDSCacheRejoin::dirfrag_strong) +TYPE(MMDSCacheRejoin::dn_strong) +TYPE(MMDSCacheRejoin::inode_strong) #include "messages/MMDSFindIno.h" MESSAGE(MMDSFindIno) @@ -265,10 +355,14 @@ MESSAGE(MMDSFragmentNotify) MESSAGE(MMDSLoadTargets) #include "messages/MMDSMap.h" -MESSAGE(MMDSMap) +MESSAGE(MMDSMap) + +#include "messages/MMgrBeacon.h" +MESSAGE(MMgrBeacon) #include "messages/MMgrReport.h" MESSAGE(MMgrReport) +TYPE(PerfCounterType) #include "messages/MMDSResolve.h" MESSAGE(MMDSResolve) @@ -321,6 +415,9 @@ MESSAGE(MMonGetVersionReply) #include "messages/MMonGlobalID.h" MESSAGE(MMonGlobalID) +#include "messages/MMonHealth.h" +MESSAGE(MMonHealth) + #include "messages/MMonJoin.h" MESSAGE(MMonJoin) @@ -452,3 +549,74 @@ MESSAGE(MWatchNotify) #include "messages/MMgrUpdate.h" MESSAGE(MMgrUpdate) + +#include "messages/MOSDECSubOpRead.h" +MESSAGE(MOSDECSubOpRead) + +#include "messages/MOSDECSubOpReadReply.h" +MESSAGE(MOSDECSubOpReadReply) + +#include "messages/MOSDECSubOpWrite.h" +MESSAGE(MOSDECSubOpWrite) + +#include "messages/MOSDECSubOpWriteReply.h" +MESSAGE(MOSDECSubOpWriteReply) + +#include "messages/MOSDMarkMeDown.h" +MESSAGE(MOSDMarkMeDown) + +#include "messages/MOSDPGCreated.h" +MESSAGE(MOSDPGCreated) + +#include "messages/MOSDPGPush.h" +MESSAGE(MOSDPGPush) + +#include "messages/MOSDPGPushReply.h" +MESSAGE(MOSDPGPushReply) + +#include "messages/MOSDPGUpdateLogMissing.h" +MESSAGE(MOSDPGUpdateLogMissing) + +#include "messages/MOSDPGUpdateLogMissingReply.h" +MESSAGE(MOSDPGUpdateLogMissingReply) + +#include "messages/MOSDRepOp.h" +MESSAGE(MOSDRepOp) + +#include "messages/MOSDRepOpReply.h" +MESSAGE(MOSDRepOpReply) + +#include "messages/MRecoveryReserve.h" +MESSAGE(MRecoveryReserve) + + +#include "auth/cephx/CephxProtocol.h" +TYPE(CephXAuthenticate) +TYPE(CephXAuthorize) +TYPE(CephXAuthorizeChallenge) +TYPE(CephXAuthorizeReply) +TYPE(CephXChallengeBlob) +TYPE(CephXRequestHeader) +TYPE(CephXResponseHeader) +TYPE(CephXServerChallenge) +TYPE(CephXServiceTicket) +TYPE(CephXServiceTicketInfo) +TYPE(CephXServiceTicketRequest) +TYPE(CephXTicketBlob) + +#include "auth/cephx/CephxKeyServer.h" +TYPE(KeyServerData) +TYPE(KeyServerData::Incremental) + +#include "auth/Auth.h" +TYPE(RotatingSecrets) +TYPE(ExpiringCryptoKey) +TYPE(AuthCapsInfo) +TYPE(AuthTicket) +TYPE(EntityAuth) + +#include "auth/Crypto.h" +TYPE(CryptoKey) + +#include "common/ceph_json.h" +TYPE(JSONFormattable) From c64501d07a47989454a43bb1cb2a0d74bbfa5df7 Mon Sep 17 00:00:00 2001 From: NitzanMordhai Date: Thu, 7 Sep 2023 05:56:55 +0000 Subject: [PATCH 0044/2492] submodule: update ceph-object-corpus submodule Signed-off-by: Nitzan Mordechai --- ceph-object-corpus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph-object-corpus b/ceph-object-corpus index 038c72b5acec..530602c5f31d 160000 --- a/ceph-object-corpus +++ b/ceph-object-corpus @@ -1 +1 @@ -Subproject commit 038c72b5acec667e1aca4c79a8cfcae705d766fe +Subproject commit 530602c5f31d68595495593027439838c459b1eb From cc4b75718f2f9c2e57a40fd2ed395762bbff5bd9 Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Wed, 30 Aug 2023 08:19:27 +0000 Subject: [PATCH 
0045/2492] qa/suites/rados/thrash/thrashers/mapgap: Increase trimming
 probability

Signed-off-by: Matan Breizman
---
 qa/suites/rados/thrash/thrashers/mapgap.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/qa/suites/rados/thrash/thrashers/mapgap.yaml b/qa/suites/rados/thrash/thrashers/mapgap.yaml
index 3b34f5b6b37e..ea7a0c911b5c 100644
--- a/qa/suites/rados/thrash/thrashers/mapgap.yaml
+++ b/qa/suites/rados/thrash/thrashers/mapgap.yaml
@@ -18,6 +18,7 @@ overrides:
         osd scrub max interval: 120
         osd scrub during recovery: false
         osd max backfills: 6
+        osd beacon report interval: 30
 tasks:
 - thrashosds:
     timeout: 1800
@@ -25,3 +26,4 @@ tasks:
     chance_pgnum_shrink: 0.25
     chance_pgpnum_fix: 0.25
     chance_test_map_discontinuity: 2
+    map_discontinuity_sleep_time: 200

From 527b34e6285719f163519f9562523db64012bcdf Mon Sep 17 00:00:00 2001
From: Matan Breizman
Date: Sun, 10 Sep 2023 14:38:28 +0000
Subject: [PATCH 0046/2492] doc/dev/osd_internals/past_intervals.rst: add
 OSDSuperblock::maps

Signed-off-by: Matan Breizman
---
 doc/dev/osd_internals/past_intervals.rst | 35 ++++++++++++++++++------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/doc/dev/osd_internals/past_intervals.rst b/doc/dev/osd_internals/past_intervals.rst
index 5b594df1ae0c..4d8e7d720933 100644
--- a/doc/dev/osd_internals/past_intervals.rst
+++ b/doc/dev/osd_internals/past_intervals.rst
@@ -1,9 +1,10 @@
-=============
-PastIntervals
-=============
+=================================
+OSDMap Trimming and PastIntervals
+=================================
+
-Purpose
---------
+PastIntervals
+-------------
 
 There are two situations where we need to consider the set of all
 acting-set OSDs for a PG back to some epoch ``e``:
@@ -81,13 +82,31 @@ trimmed up to epoch ``e``, we know that the PG must have been clean at some epoc
 
 This dependency also pops up in PeeringState::check_past_interval_bounds().
 PeeringState::get_required_past_interval_bounds takes as a parameter
-oldest_epoch, which comes from OSDSuperblock::cluster_osdmap_trim_lower_bound.
-We use cluster_osdmap_trim_lower_bound rather than a specific osd's oldest_map
+oldest epoch, which comes from OSDSuperblock::cluster_osdmap_trim_lower_bound.
+We use cluster_osdmap_trim_lower_bound rather than a specific osd's oldest map
 because we don't necessarily trim all MOSDMap::cluster_osdmap_trim_lower_bound.
 In order to avoid doing too much work at once we limit the amount of osdmaps
 trimmed using ``osd_target_transaction_size`` in OSD::trim_maps().
-For this reason, a specific OSD's oldest_map can lag behind
+For this reason, a specific OSD's oldest map can lag behind
 OSDSuperblock::cluster_osdmap_trim_lower_bound for a while.
 
 See https://tracker.ceph.com/issues/49689 for an example.
+
+OSDSuperblock::maps
+-------------------
+
+The OSDSuperblock holds an epoch interval set that represents the OSDMaps
+stored by the OSD. Each OSDMap epoch range that was handled is added to
+the set; once an osdmap is trimmed, it is erased from the set. As a
+result, the set's lower bound represents the oldest stored map, while the
+upper bound represents the newest.
+
+The ``interval_set`` data structure supports the non-contiguous epoch
+intervals which may occur in "map gap" events. Before this data structure
+was used, ``oldest_map`` and ``newest_map`` epochs were stored in the
+OSDSuperblock. However, holding a single contiguous epoch range imposed
+constraints which may have resulted in an OSDMap leak.
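To make the new bookkeeping concrete, a small usage sketch (the epoch
values are invented for illustration; ``interval_set`` comes from
``include/interval_set.h``):

  #include "include/interval_set.h"
  #include "include/types.h"  // epoch_t

  void superblock_maps_sketch() {
    interval_set<epoch_t> maps;             // stands in for OSDSuperblock::maps
    maps.insert(10, 91);                    // handled epochs [10, 100]
    maps.erase(10, 41);                     // trimmed epochs [10, 50]
    epoch_t oldest = maps.range_start();    // 51: the oldest stored map
    epoch_t newest = maps.range_end() - 1;  // 100: the newest stored map
    (void)oldest; (void)newest;
  }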
+ +See: https://tracker.ceph.com/issues/61962 From 7d8db4954c2fedb4db9f0ccc8a9d304f35807b07 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Wed, 13 Sep 2023 12:12:26 +0000 Subject: [PATCH 0047/2492] dokan: simple case insensitive emulation While CephFS is case sensitive, Windows software commonly assume that the filesystem is case insensitive. In order to unblock certain use cases, a short term solution is to simply normalize paths when creating or accessing files or directories. This change adds two ceph-dokan parameters: * --case-insensitive: if set, paths are normalized * --force-lowercase: normalized paths are converted to lowercase instead of uppercase This trivial implementation has some limitations: * the original case is not preserved * we could later on use xattr to store the original name * can't access existing files that have a different case * handled at ceph-dokan level The advantage is that it's simple, shouldn't impact performance and doesn't require libcephfs or mds changes. In the future, we may conider a more robust implementation at the mds and/or libcephfs level. Signed-off-by: Lucian Petrut --- src/dokan/ceph_dokan.cc | 40 +++++++++++++++---- src/dokan/ceph_dokan.h | 8 ++++ src/dokan/options.cc | 9 +++++ src/test/dokan/dokan.cc | 86 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 136 insertions(+), 7 deletions(-) diff --git a/src/dokan/ceph_dokan.cc b/src/dokan/ceph_dokan.cc index 9e115222cab2..6459ea261bf1 100644 --- a/src/dokan/ceph_dokan.cc +++ b/src/dokan/ceph_dokan.cc @@ -77,9 +77,26 @@ typedef struct { static_assert(sizeof(fd_context) <= 8, "fd_context exceeds DOKAN_FILE_INFO.Context size."); -string get_path(LPCWSTR path_w) { +string get_path(LPCWSTR path_w, bool normalize_case=true) { string path = to_string(path_w); replace(path.begin(), path.end(), '\\', '/'); + + if (normalize_case && !g_cfg->case_sensitive) { + if (g_cfg->convert_to_uppercase) { + std::transform( + path.begin(), path.end(), path.begin(), + [](unsigned char c){ + return std::toupper(c); + }); + } else { + std::transform( + path.begin(), path.end(), path.begin(), + [](unsigned char c){ + return std::tolower(c); + }); + } + } + return path; } @@ -543,6 +560,11 @@ static NTSTATUS WinCephFindFiles( return cephfs_errno_to_ntstatus_map(ret); } + // TODO: retrieve the original case (e.g. using xattr) if configured + // to do so. + // TODO: provide aliases when case insensitive mounts cause collisions. 
+ // For example, when having test.txt and Test.txt, the latter becomes + // TEST~1.txt WIN32_FIND_DATAW findData; int count = 0; while (1) { @@ -794,14 +816,18 @@ static NTSTATUS WinCephGetVolumeInformation( { g_cfg->win_vol_name.copy(VolumeNameBuffer, VolumeNameSize); *VolumeSerialNumber = g_cfg->win_vol_serial; - *MaximumComponentLength = g_cfg->max_path_len; - *FileSystemFlags = FILE_CASE_SENSITIVE_SEARCH | - FILE_CASE_PRESERVED_NAMES | - FILE_SUPPORTS_REMOTE_STORAGE | - FILE_UNICODE_ON_DISK | - FILE_PERSISTENT_ACLS; + *FileSystemFlags = + FILE_SUPPORTS_REMOTE_STORAGE | + FILE_UNICODE_ON_DISK | + FILE_PERSISTENT_ACLS; + + if (g_cfg->case_sensitive) { + *FileSystemFlags |= + FILE_CASE_SENSITIVE_SEARCH | + FILE_CASE_PRESERVED_NAMES; + } wcscpy(FileSystemNameBuffer, L"Ceph"); return 0; diff --git a/src/dokan/ceph_dokan.h b/src/dokan/ceph_dokan.h index 5957d4dead11..fe48aa458143 100644 --- a/src/dokan/ceph_dokan.h +++ b/src/dokan/ceph_dokan.h @@ -36,6 +36,14 @@ struct Config { unsigned long max_path_len = 256; mode_t file_mode = 0755; mode_t dir_mode = 0755; + + bool case_sensitive = true; + // Convert new file paths to upper case in case of case insensitive mounts. + // Visual Studio recommends normalizing to uppercase in order to avoid + // locale issues (CA1308). + bool convert_to_uppercase = true; + // TODO: consider adding an option to preserve the original case. + // It could be stored using an extended attribute. }; extern Config *g_cfg; diff --git a/src/dokan/options.cc b/src/dokan/options.cc index 1ed90ef9d34d..705e1117ca99 100644 --- a/src/dokan/options.cc +++ b/src/dokan/options.cc @@ -45,6 +45,11 @@ Map options: --max-path-len The value of the maximum path length. Default: 256. --file-mode The access mode to be used when creating files. --dir-mode The access mode to be used when creating directories. + --case-insensitive Emulate a case insensitive filesystem by normalizing + paths. The original case is NOT preserved. Existing + paths with a different case cannot be accessed. + --force-lowercase Use lowercase when normalizing paths. Uppercase is + used by default. Unmap options: -l [ --mountpoint ] arg mountpoint (path or drive letter) (e.g -l x). 
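For reference, a case-insensitive, lowercase-normalizing mount would
presumably combine the two new flags on the map command line, along the
lines of 'ceph-dokan map -l x --case-insensitive --force-lowercase'; the
exact invocation is illustrative, following the option descriptions above.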
@@ -196,6 +201,10 @@ int parse_args( *err_msg << "ceph-dokan: Invalid argument for operation-timeout"; return -EINVAL; } + } else if (ceph_argparse_flag(args, i, "--case-insensitive", (char *)NULL)) { + cfg->case_sensitive = false; + } else if (ceph_argparse_flag(args, i, "--force-lowercase", (char *)NULL)) { + cfg->convert_to_uppercase = false; } else { ++i; } diff --git a/src/test/dokan/dokan.cc b/src/test/dokan/dokan.cc index 18f206985e8b..eaa26557fe88 100644 --- a/src/test/dokan/dokan.cc +++ b/src/test/dokan/dokan.cc @@ -39,6 +39,18 @@ std::string get_uuid() { return suffix.to_string(); } +std::string to_upper(std::string& in) { + std::string out = in; + + std::transform( + out.begin(), out.end(), out.begin(), + [](unsigned char c){ + return std::toupper(c); + }); + + return out; +} + bool move_eof(HANDLE handle, LARGE_INTEGER offset) { // Move file pointer to FILE_BEGIN + offset @@ -162,6 +174,22 @@ void map_dokan_with_maxpath( } } +void map_dokan_case_insensitive(SubProcess** mount, const char* mountpoint, + bool force_lowercase=false) { + SubProcess* new_mount = new SubProcess("ceph-dokan"); + + new_mount->add_cmd_args("map", "--win-vol-name", "TestCeph", + "--win-vol-serial", TEST_VOL_SERIAL, + "-l", mountpoint, "--case-insensitive", NULL); + if (force_lowercase) { + new_mount->add_cmd_args("--force-lowercase", NULL); + } + + *mount = new_mount; + ASSERT_EQ(new_mount->spawn(), 0); + ASSERT_EQ(wait_for_mount(mountpoint), 0); +} + void unmap_dokan(SubProcess* mount, const char* mountpoint) { std::string ret = run_cmd("ceph-dokan", "unmap", "-l", mountpoint, (char*)NULL); @@ -762,3 +790,61 @@ TEST_F(DokanTests, test_create_dispositions) { // clean-up ASSERT_TRUE(fs::remove(file_path)); } + +TEST_F(DokanTests, test_case_sensitive) { + std::string test_dir = DEFAULT_MOUNTPOINT"test_dir" + get_uuid() + "\\"; + std::string lower_file_path = test_dir + "file_" + get_uuid(); + std::string upper_file_path = to_upper(lower_file_path); + + ASSERT_TRUE(fs::create_directory(test_dir)); + std::ofstream{lower_file_path}; + + ASSERT_TRUE(fs::exists(lower_file_path)); + ASSERT_FALSE(fs::exists(upper_file_path)); + + // clean-up + fs::remove_all(test_dir); +} + +void test_case_insensitive(bool force_lowercase) { + std::string mountpoint = "Q:\\"; + std::string test_dir = mountpoint + "test_dir" + get_uuid() + "/"; + std::string file_name = "file_" + get_uuid(); + std::string lower_file_path = test_dir + file_name; + std::string upper_file_path = to_upper(lower_file_path); + + SubProcess* mount = nullptr; + map_dokan_case_insensitive(&mount, mountpoint.c_str(), force_lowercase); + + ASSERT_TRUE(fs::create_directory(test_dir)); + std::ofstream{upper_file_path}; + + ASSERT_TRUE(fs::exists(lower_file_path)); + ASSERT_TRUE(fs::exists(upper_file_path)); + + std::vector paths; + for (const auto & entry : fs::recursive_directory_iterator(test_dir)) { + paths.push_back(entry.path().filename().generic_string()); + } + + bool found_lowercase = std::find( + begin(paths), end(paths), file_name) != end(paths); + bool found_uppercase = std::find( + begin(paths), end(paths), to_upper(file_name)) != end(paths); + + ASSERT_EQ(found_lowercase, force_lowercase); + ASSERT_NE(found_uppercase, force_lowercase); + + // clean-up + fs::remove_all(test_dir); + + unmap_dokan(mount, mountpoint.c_str()); +} + +TEST_F(DokanTests, test_case_insensitive_force_lower) { + test_case_insensitive(true); +} + +TEST_F(DokanTests, test_case_insensitive_force_upper) { + test_case_insensitive(false); +} From 
0357ab2ad8b398641b1e03727c9de6241d72ad83 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Thu, 14 Sep 2023 11:08:36 +0000 Subject: [PATCH 0048/2492] test/dokan: avoid unnecessary wait One of the dokan tests expects some mount operations to fail. The issue is that it polls the mount location for 10s, which is unnecessary. We'll update the test to check the mount process exit code instead. This basically reduces the dokan test duration by about 20s. Signed-off-by: Lucian Petrut --- src/test/dokan/dokan.cc | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/test/dokan/dokan.cc b/src/test/dokan/dokan.cc index eaa26557fe88..baef44a49b65 100644 --- a/src/test/dokan/dokan.cc +++ b/src/test/dokan/dokan.cc @@ -157,7 +157,18 @@ void map_dokan_with_maxpath( const char* mountpoint, uint64_t max_path_len) { - SubProcess* new_mount = new SubProcess("ceph-dokan"); + SubProcess* new_mount = nullptr; + + bool expect_failure = max_path_len < 256 || max_path_len > 4096; + if (expect_failure) { + new_mount = new SubProcessTimed( + "ceph-dokan", + SubProcess::CLOSE, SubProcess::CLOSE, SubProcess::CLOSE, + MOUNT_POLL_ATTEMPT * MOUNT_POLL_INTERVAL_MS / 1000); + } else { + new_mount = new SubProcess("ceph-dokan"); + } + new_mount->add_cmd_args("map", "--debug", "--dokan-stderr", "--win-vol-name", "TestCeph", "--win-vol-serial", TEST_VOL_SERIAL, @@ -167,10 +178,10 @@ void map_dokan_with_maxpath( *mount = new_mount; ASSERT_EQ(new_mount->spawn(), 0); - if (256 <= max_path_len && max_path_len <= 4096) { - ASSERT_EQ(wait_for_mount(mountpoint), 0); + if (expect_failure) { + ASSERT_NE(0, new_mount->join()); } else { - ASSERT_NE(wait_for_mount(mountpoint), 0); + ASSERT_EQ(wait_for_mount(mountpoint), 0); } } From 723c1b7c9e773a68b6c2586155092ca95ab4f3e7 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 15 Sep 2023 08:41:35 +0800 Subject: [PATCH 0049/2492] mds: record the internal client request and receive client reply This will be used to avoid possible multiple reintegration issue later. Fixes: https://tracker.ceph.com/issues/62702 Signed-off-by: Xiubo Li --- src/mds/MDSMetaRequest.h | 33 +++++++++++++++++++++++++++++++++ src/mds/MDSRank.cc | 2 ++ src/mds/MDSRank.h | 3 +++ src/mds/Server.cc | 30 ++++++++++++++++++++++++++++++ src/mds/Server.h | 1 + 5 files changed, 69 insertions(+) create mode 100644 src/mds/MDSMetaRequest.h diff --git a/src/mds/MDSMetaRequest.h b/src/mds/MDSMetaRequest.h new file mode 100644 index 000000000000..ad4720410686 --- /dev/null +++ b/src/mds/MDSMetaRequest.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2023 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_META_REQUEST_H +#define CEPH_MDS_META_REQUEST_H + +#include "include/types.h" + +struct MDSMetaRequest { +private: + int op; + ceph_tid_t tid; +public: + explicit MDSMetaRequest(int o, ceph_tid_t t) : + op(o), tid(t) { } + virtual ~MDSMetaRequest() { } + + int get_op() { return op; } + ceph_tid_t get_tid() { return tid; } +}; + +#endif // !CEPH_MDS_META_REQUEST_H diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index d7d583f53274..fb78aa3c28ee 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -1191,6 +1191,7 @@ bool MDSRank::is_valid_message(const cref_t &m) { type == CEPH_MSG_CLIENT_RECONNECT || type == CEPH_MSG_CLIENT_RECLAIM || type == CEPH_MSG_CLIENT_REQUEST || + type == CEPH_MSG_CLIENT_REPLY || type == MSG_MDS_PEER_REQUEST || type == MSG_MDS_HEARTBEAT || type == MSG_MDS_TABLE_REQUEST || @@ -1244,6 +1245,7 @@ void MDSRank::handle_message(const cref_t &m) ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT); // fall-thru case CEPH_MSG_CLIENT_REQUEST: + case CEPH_MSG_CLIENT_REPLY: server->dispatch(m); break; case MSG_MDS_PEER_REQUEST: diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h index d01a5894df47..48043df4baf6 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -43,6 +43,7 @@ #include "Server.h" #include "MetricsHandler.h" #include "osdc/Journaler.h" +#include "MDSMetaRequest.h" // Full .h import instead of forward declaration for PerfCounter, for the // benefit of those including this header and using MDSRank::logger @@ -423,6 +424,8 @@ class MDSRank { PerfCounters *logger = nullptr, *mlogger = nullptr; OpTracker op_tracker; + std::map> internal_client_requests; + // The last different state I held before current MDSMap::DaemonState last_state = MDSMap::STATE_BOOT; // The state assigned to me by the MDSMap diff --git a/src/mds/Server.cc b/src/mds/Server.cc index bb5f0a30ac7c..780eafc43644 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -31,6 +31,7 @@ #include "Mutation.h" #include "MetricsHandler.h" #include "cephfs_features.h" +#include "MDSContext.h" #include "msg/Messenger.h" @@ -360,6 +361,9 @@ void Server::dispatch(const cref_t &m) case CEPH_MSG_CLIENT_REQUEST: handle_client_request(ref_cast(m)); return; + case CEPH_MSG_CLIENT_REPLY: + handle_client_reply(ref_cast(m)); + return; case CEPH_MSG_CLIENT_RECLAIM: handle_client_reclaim(ref_cast(m)); return; @@ -2319,6 +2323,10 @@ void Server::reply_client_request(MDRequestRef& mdr, const ref_t & mds->send_message_client(reply, session); } + if (client_inst.name.is_mds() && reply->get_op() == CEPH_MDS_OP_RENAME) { + mds->send_message(reply, mdr->client_request->get_connection()); + } + if (req->is_queued_for_replay() && (mdr->has_completed || reply->get_result() < 0)) { if (reply->get_result() < 0) { @@ -2551,6 +2559,28 @@ void Server::handle_client_request(const cref_t &req) return; } +void Server::handle_client_reply(const cref_t &reply) +{ + dout(4) << "handle_client_reply " << *reply << dendl; + + ceph_assert(reply->is_safe()); + ceph_tid_t tid = reply->get_tid(); + + if (mds->internal_client_requests.count(tid) == 0) { + dout(1) << " no pending request on tid " << tid << dendl; + return; + } + + switch (reply->get_op()) { + case CEPH_MDS_OP_RENAME: + break; + default: + dout(5) << " unknown client op " << reply->get_op() << dendl; + } + + mds->internal_client_requests.erase(tid); +} + void Server::handle_osd_map() { /* Note that we check the OSDMAP_FULL flag directly rather than diff --git a/src/mds/Server.h b/src/mds/Server.h index 61096a5b68a4..6b2f9c188f5a 100644 --- 
a/src/mds/Server.h +++ b/src/mds/Server.h @@ -159,6 +159,7 @@ class Server { // -- requests -- void handle_client_request(const cref_t &m); + void handle_client_reply(const cref_t &m); void journal_and_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn, LogEvent *le, MDSLogContextBase *fin); From 53d9e657e42bdf2440cd93a8273eb3ce79d8bd33 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 15 Sep 2023 08:41:35 +0800 Subject: [PATCH 0050/2492] mds: fix issuing redundant reintegrate/migrate_stray requests Just in case a CInode's nlink is 1, and then a unlink request comes and then early replies and submits to the MDLogs, but just before the MDlogs are flushed a link request comes, and the link request also succeeds and early replies to client. Later when the unlink/link requests' MDLog events are flushed and the callbacks are called, which will fire a stray denty reintegration. But it will pick the new dentry, which is from the link's request and is a remote dentry, to do the reintegration. While in the 'rename' code when traversing the path it will trigger to call the 'dn->link_remote()', which later will fire a new stray dentry reintegration. The problem is if the first 'rename' request is retried several times, and in each time it will fire a new reintegration, which makes no sense and maybe blocked for a very long time dues to some reasons and then will be reported as slow request warning. Fixes: https://tracker.ceph.com/issues/62702 Signed-off-by: Xiubo Li --- src/mds/CDentry.h | 2 ++ src/mds/StrayManager.cc | 30 ++++++++++++++++++++++++++---- src/mds/StrayManager.h | 17 ++++++++++++++++- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 4dca5816ae6f..1c2b6f892cec 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -376,6 +376,8 @@ class CDentry : public MDSCacheObject, public LRUObject, public Counter mempool::mds_co::map client_lease_map; std::map> batch_ops; + ceph_tid_t reintegration_reqid = 0; + protected: friend class Migrator; diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc index 325209da6e0e..e9ec153d3fdb 100644 --- a/src/mds/StrayManager.cc +++ b/src/mds/StrayManager.cc @@ -673,24 +673,41 @@ void StrayManager::reintegrate_stray(CDentry *straydn, CDentry *rdn) { dout(10) << __func__ << " " << *straydn << " to " << *rdn << dendl; + if (straydn->reintegration_reqid) { + dout(20) << __func__ << ": stray dentry " << *straydn + << " is already under reintegrating" << dendl; + return; + } + logger->inc(l_mdc_strays_reintegrated); - + // rename it to remote linkage . filepath src(straydn->get_name(), straydn->get_dir()->ino()); filepath dst(rdn->get_name(), rdn->get_dir()->ino()); + ceph_tid_t tid = mds->issue_tid(); + auto req = make_message(CEPH_MDS_OP_RENAME); req->set_filepath(dst); req->set_filepath2(src); - req->set_tid(mds->issue_tid()); + req->set_tid(tid); + + auto ptr = std::make_unique(CEPH_MDS_OP_RENAME, tid, straydn); + mds->internal_client_requests.emplace(tid, std::move(ptr)); mds->send_message_mds(req, rdn->authority().first); } - + void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to) { dout(10) << __func__ << " " << *dn << " to mds." << to << dendl; + if (dn->reintegration_reqid) { + dout(20) << __func__ << ": stray dentry " << *dn + << " is already under migrating" << dendl; + return; + } + logger->inc(l_mdc_strays_migrated); // rename it to another mds. 
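// The guard's intended lifecycle, read together with the MDSRank/Server
// changes above and the StrayEvalRequest type added to StrayManager.h
// below (a sketch, not verbatim code):
//
//   reintegrate_stray() / migrate_stray():
//     ceph_tid_t tid = mds->issue_tid();
//     mds->internal_client_requests.emplace(tid,
//         std::make_unique<StrayEvalRequest>(CEPH_MDS_OP_RENAME, tid, dn));
//     // the StrayEvalRequest constructor pins the dentry and sets
//     // dn->reintegration_reqid = tid, so a second reintegrate/migrate
//     // attempt on the same dentry returns early
//
//   Server::handle_client_reply(reply):
//     mds->internal_client_requests.erase(reply->get_tid());
//     // the StrayEvalRequest destructor resets reintegration_reqid to 0
//     // and unpins the dentry, re-arming future attempts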
@@ -700,10 +717,15 @@ void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to) filepath src(dn->get_name(), dirino); filepath dst(dn->get_name(), MDS_INO_STRAY(to, MDS_INO_STRAY_INDEX(dirino))); + ceph_tid_t tid = mds->issue_tid(); + auto req = make_message(CEPH_MDS_OP_RENAME); req->set_filepath(dst); req->set_filepath2(src); - req->set_tid(mds->issue_tid()); + req->set_tid(tid); + + auto ptr = std::make_unique(CEPH_MDS_OP_RENAME, tid, dn); + mds->internal_client_requests.emplace(tid, std::move(ptr)); mds->send_message_mds(req, to); } diff --git a/src/mds/StrayManager.h b/src/mds/StrayManager.h index 86b6941a5131..874fbbb9a8dc 100644 --- a/src/mds/StrayManager.h +++ b/src/mds/StrayManager.h @@ -19,15 +19,30 @@ #include #include "Mutation.h" #include "PurgeQueue.h" +#include "MDSMetaRequest.h" +#include "CDentry.h" class MDSRank; class CInode; -class CDentry; class StrayManager { // My public interface is for consumption by MDCache public: + struct StrayEvalRequest : public MDSMetaRequest { + CDentry *dentry; + public: + explicit StrayEvalRequest(int o, ceph_tid_t t, CDentry *d) : + MDSMetaRequest(o, t), dentry(d) { + dentry->get(CDentry::PIN_PURGING); + dentry->reintegration_reqid = t; + } + ~StrayEvalRequest() { + dentry->reintegration_reqid = 0; + dentry->put(CDentry::PIN_PURGING); + } + }; + explicit StrayManager(MDSRank *mds, PurgeQueue &purge_queue_); void set_logger(PerfCounters *l) {logger = l;} void activate(); From dbb4daff404c5d2da32c33f4e852e84a257c0b8d Mon Sep 17 00:00:00 2001 From: Prasanna Kumar Kalever Date: Tue, 12 Sep 2023 17:45:05 +0530 Subject: [PATCH 0051/2492] rbd-nbd: fix stuck with disable request Problem: ------- Trying to disable any feature on an rbd image mapped with nbd leads to stuck in rbd-nbd. The rbd-nbd registers a watcher callback to detect image resize in NBDWatchCtx::handle_notify(). The handle_notify calls image info method, which calls refresh_if_required and it got stuck there. It is getting stuck in ImageState::refresh_if_required() because DisableFeaturesRequest issues update notifications while still holding onto the exclusive lock with everything that has to do with it blocked. Solution: -------- Set only notify flag as part of NBDWatchCtx::handle_notify() and handle the resize detection part as part of a different thread. 
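The shape of that fix, reduced to its essentials (a standalone sketch; the
real change, with device resizing and error handling, is in the diff
below):

#include <condition_variable>
#include <mutex>
#include <thread>

struct Notifier {
  std::mutex m;
  std::condition_variable cv;
  bool notify_pending = false;
  bool terminated = false;

  void run() {
    std::unique_lock<std::mutex> l(m);
    while (true) {
      cv.wait(l, [this] { return notify_pending || terminated; });
      if (terminated)
        return;
      notify_pending = false;
      l.unlock();
      // ...slow work happens here: query the image size, resize the
      // device, invalidate caches...
      l.lock();
    }
  }

  // Called from the librbd watcher callback: it only flips a flag, so it
  // can never block on image state refresh.
  void handle_notify() {
    std::lock_guard<std::mutex> l(m);
    notify_pending = true;
    cv.notify_all();
  }

  std::thread worker{[this] { run(); }};

  ~Notifier() {
    {
      std::lock_guard<std::mutex> l(m);
      terminated = true;
      cv.notify_all();
    }
    worker.join();
  }
};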
Fixes: https://tracker.ceph.com/issues/58740 Signed-off-by: Prasanna Kumar Kalever --- qa/workunits/rbd/rbd-nbd.sh | 10 ++++ src/tools/rbd_nbd/rbd-nbd.cc | 112 +++++++++++++++++++++++++---------- 2 files changed, 91 insertions(+), 31 deletions(-) diff --git a/qa/workunits/rbd/rbd-nbd.sh b/qa/workunits/rbd/rbd-nbd.sh index 122df3d6f35a..bc89e9be5a18 100755 --- a/qa/workunits/rbd/rbd-nbd.sh +++ b/qa/workunits/rbd/rbd-nbd.sh @@ -472,6 +472,16 @@ DEV= rbd feature disable ${POOL}/${IMAGE} journaling rbd config image rm ${POOL}/${IMAGE} rbd_discard_granularity_bytes +# test that disabling a feature so that the op is proxied to rbd-nbd +# (arranged here by blkdiscard before "rbd feature disable") doesn't hang +DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}` +get_pid ${POOL} +rbd feature enable ${POOL}/${IMAGE} journaling +_sudo blkdiscard --offset 0 --length 4096 ${DEV} +rbd feature disable ${POOL}/${IMAGE} journaling +unmap_device ${DEV} ${PID} +DEV= + # test that rbd_op_threads setting takes effect EXPECTED=`ceph-conf --show-config-value librados_thread_count` DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}` diff --git a/src/tools/rbd_nbd/rbd-nbd.cc b/src/tools/rbd_nbd/rbd-nbd.cc index 3130e8bc750e..e348bd8fe431 100644 --- a/src/tools/rbd_nbd/rbd-nbd.cc +++ b/src/tools/rbd_nbd/rbd-nbd.cc @@ -738,7 +738,67 @@ class NBDWatchCtx : public librbd::UpdateWatchCtx bool use_netlink; librados::IoCtx &io_ctx; librbd::Image ℑ - unsigned long size; + uint64_t size; + std::thread handle_notify_thread; + ceph::condition_variable cond; + ceph::mutex lock = ceph::make_mutex("NBDWatchCtx::Locker"); + bool notify = false; + bool terminated = false; + + bool wait_notify() { + dout(10) << __func__ << dendl; + + std::unique_lock locker{lock}; + cond.wait(locker, [this] { return notify || terminated; }); + + if (terminated) { + return false; + } + + dout(10) << __func__ << ": got notify request" << dendl; + notify = false; + return true; + } + + void handle_notify_entry() { + dout(10) << __func__ << dendl; + + while (wait_notify()) { + uint64_t new_size; + int ret = image.size(&new_size); + if (ret < 0) { + derr << "getting image size failed: " << cpp_strerror(ret) << dendl; + continue; + } + if (new_size == size) { + continue; + } + dout(5) << "resize detected" << dendl; + if (ioctl(fd, BLKFLSBUF, NULL) < 0) { + derr << "invalidate page cache failed: " << cpp_strerror(errno) + << dendl; + } + if (use_netlink) { + ret = netlink_resize(nbd_index, new_size); + } else { + ret = ioctl(fd, NBD_SET_SIZE, new_size); + if (ret < 0) { + derr << "resize failed: " << cpp_strerror(errno) << dendl; + } + } + if (!ret) { + size = new_size; + } + if (ioctl(fd, BLKRRPART, NULL) < 0) { + derr << "rescan of partition table failed: " << cpp_strerror(errno) + << dendl; + } + if (image.invalidate_cache() < 0) { + derr << "invalidate rbd cache failed" << dendl; + } + } + } + public: NBDWatchCtx(int _fd, int _nbd_index, @@ -752,41 +812,31 @@ class NBDWatchCtx : public librbd::UpdateWatchCtx , io_ctx(_io_ctx) , image(_image) , size(_size) - { } + { + handle_notify_thread = make_named_thread("rbd_handle_notify", + &NBDWatchCtx::handle_notify_entry, + this); + } - ~NBDWatchCtx() override {} + ~NBDWatchCtx() override + { + dout(10) << __func__ << ": terminating" << dendl; + std::unique_lock locker{lock}; + terminated = true; + cond.notify_all(); + locker.unlock(); + + handle_notify_thread.join(); + dout(10) << __func__ << ": finish" << dendl; + } void handle_notify() override { - librbd::image_info_t info; - if 
(image.stat(info, sizeof(info)) == 0) { - unsigned long new_size = info.size; - int ret; - - if (new_size != size) { - dout(5) << "resize detected" << dendl; - if (ioctl(fd, BLKFLSBUF, NULL) < 0) - derr << "invalidate page cache failed: " << cpp_strerror(errno) - << dendl; - if (use_netlink) { - ret = netlink_resize(nbd_index, new_size); - } else { - ret = ioctl(fd, NBD_SET_SIZE, new_size); - if (ret < 0) - derr << "resize failed: " << cpp_strerror(errno) << dendl; - } - - if (!ret) - size = new_size; + dout(10) << __func__ << dendl; - if (ioctl(fd, BLKRRPART, NULL) < 0) { - derr << "rescan of partition table failed: " << cpp_strerror(errno) - << dendl; - } - if (image.invalidate_cache() < 0) - derr << "invalidate rbd cache failed" << dendl; - } - } + std::unique_lock locker{lock}; + notify = true; + cond.notify_all(); } }; From b30c35019d0128a0cf840b43d3059392effbb52c Mon Sep 17 00:00:00 2001 From: Patty8122 Date: Fri, 22 Sep 2023 16:33:38 -0500 Subject: [PATCH 0052/2492] doc/man/8/ceph-bluestore-tool.rst: Added valid options for fsck --deep Signed-off-by: Patty8122 --- doc/man/8/ceph-bluestore-tool.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/man/8/ceph-bluestore-tool.rst b/doc/man/8/ceph-bluestore-tool.rst index f6c88da09b24..634d3a8bb6eb 100644 --- a/doc/man/8/ceph-bluestore-tool.rst +++ b/doc/man/8/ceph-bluestore-tool.rst @@ -44,7 +44,7 @@ Commands show help -:command:`fsck` [ --deep ] +:command:`fsck` [ --deep ] *(on|off) or (yes|no) or (1|0) or (true|false)* run consistency check on BlueStore metadata. If *--deep* is specified, also read all object data and verify checksums. From 9b9eff16a885284c106a368970479b54eedb9c19 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 19 Sep 2023 13:45:16 +0800 Subject: [PATCH 0053/2492] crimson/osd/replicated_backend: don't put pg's metadata updates in repops Fixes: https://tracker.ceph.com/issues/62857 Signed-off-by: Xuehan Xu --- src/crimson/osd/pg.cc | 4 +- src/crimson/osd/pg_backend.cc | 4 +- src/crimson/osd/pg_backend.h | 1 + src/crimson/osd/replicated_backend.cc | 65 ++++++++++++++++----------- src/crimson/osd/replicated_backend.h | 3 ++ 5 files changed, 49 insertions(+), 28 deletions(-) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 7cf3b158c89c..2bd1bfc5bf31 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -115,6 +115,7 @@ PG::PG( pgid.pgid, pg_shard, pool, + *this, coll_ref, shard_services, profile, @@ -761,8 +762,7 @@ PG::submit_transaction( ceph_assert(!has_reset_since(osd_op_p.at_version.epoch)); peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version); - peering_state.append_log_with_trim_to_updated(std::move(log_entries), osd_op_p.at_version, - txn, true, false); + peering_state.update_trim_to(); auto [submitted, all_completed] = backend->mutate_object( peering_state.get_acting_recovery_backfill(), diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc index 02acb9a55d3f..06bcf7bbd036 100644 --- a/src/crimson/osd/pg_backend.cc +++ b/src/crimson/osd/pg_backend.cc @@ -23,6 +23,7 @@ #include "crimson/os/futurized_store.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/object_context_loader.h" +#include "crimson/osd/pg.h" #include "replicated_backend.h" #include "replicated_recovery_backend.h" #include "ec_backend.h" @@ -43,6 +44,7 @@ std::unique_ptr PGBackend::create(pg_t pgid, const pg_shard_t pg_shard, const pg_pool_t& pool, + crimson::osd::PG& pg, crimson::os::CollectionRef coll, crimson::osd::ShardServices& 
shard_services, const ec_profile_t& ec_profile, @@ -50,7 +52,7 @@ PGBackend::create(pg_t pgid, { switch (pool.type) { case pg_pool_t::TYPE_REPLICATED: - return std::make_unique(pgid, pg_shard, + return std::make_unique(pgid, pg_shard, pg, coll, shard_services, dpp); case pg_pool_t::TYPE_ERASURE: diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h index fbad37d4c71c..d681726576ff 100644 --- a/src/crimson/osd/pg_backend.h +++ b/src/crimson/osd/pg_backend.h @@ -70,6 +70,7 @@ class PGBackend static std::unique_ptr create(pg_t pgid, const pg_shard_t pg_shard, const pg_pool_t& pool, + crimson::osd::PG &pg, crimson::os::CollectionRef coll, crimson::osd::ShardServices& shard_services, const ec_profile_t& ec_profile, diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc index 0ff4ad5730f5..caa86d46a5b1 100644 --- a/src/crimson/osd/replicated_backend.cc +++ b/src/crimson/osd/replicated_backend.cc @@ -8,6 +8,7 @@ #include "crimson/common/exception.h" #include "crimson/common/log.h" #include "crimson/os/futurized_store.h" +#include "crimson/osd/pg.h" #include "crimson/osd/shard_services.h" #include "osd/PeeringState.h" @@ -15,12 +16,14 @@ SET_SUBSYS(osd); ReplicatedBackend::ReplicatedBackend(pg_t pgid, pg_shard_t whoami, + crimson::osd::PG& pg, ReplicatedBackend::CollectionRef coll, crimson::osd::ShardServices& shard_services, DoutPrefixProvider &dpp) : PGBackend{whoami.shard, coll, shard_services, dpp}, pgid{pgid}, - whoami{whoami} + whoami{whoami}, + pg(pg) {} ReplicatedBackend::ll_read_ierrorator::future @@ -41,6 +44,7 @@ ReplicatedBackend::_submit_transaction(std::set&& pg_shards, std::vector&& log_entries) { LOG_PREFIX(ReplicatedBackend::_submit_transaction); + DEBUGDPP("object {}, {}", dpp, hoid); const ceph_tid_t tid = shard_services.get_tid(); auto pending_txn = @@ -48,29 +52,6 @@ ReplicatedBackend::_submit_transaction(std::set&& pg_shards, bufferlist encoded_txn; encode(txn, encoded_txn); - DEBUGDPP("object {}", dpp, hoid); - auto all_completed = interruptor::make_interruptible( - shard_services.get_store().do_transaction(coll, std::move(txn)) - ).then_interruptible([FNAME, this, - peers=pending_txn->second.weak_from_this()] { - if (!peers) { - // for now, only actingset_changed can cause peers - // to be nullptr - ERRORDPP("peers is null, this should be impossible", dpp); - assert(0 == "impossible"); - } - if (--peers->pending == 0) { - peers->all_committed.set_value(); - peers->all_committed = {}; - return seastar::now(); - } - return peers->all_committed.get_shared_future(); - }).then_interruptible([pending_txn, this] { - auto acked_peers = std::move(pending_txn->second.acked_peers); - pending_trans.erase(pending_txn); - return seastar::make_ready_future(std::move(acked_peers)); - }); - auto sends = std::make_unique>>(); for (auto pg_shard : pg_shards) { if (pg_shard != whoami) { @@ -91,9 +72,43 @@ ReplicatedBackend::_submit_transaction(std::set&& pg_shards, m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk; m->set_rollback_to(osd_op_p.at_version); // TODO: set more stuff. 
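// (The substantive change here: the transaction is encoded for the
//  replicas before pg.log_operation() folds the PG's own log/metadata
//  updates into it, so those local-only updates are applied in the
//  primary's do_transaction() below without being shipped in the repops.)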
e.g., pg_states - sends->emplace_back(shard_services.send_to_osd(pg_shard.osd, std::move(m), map_epoch)); + sends->emplace_back( + shard_services.send_to_osd( + pg_shard.osd, std::move(m), map_epoch)); } } + + pg.log_operation( + std::move(log_entries), + osd_op_p.pg_trim_to, + osd_op_p.at_version, + osd_op_p.min_last_complete_ondisk, + true, + txn, + false); + + auto all_completed = interruptor::make_interruptible( + shard_services.get_store().do_transaction(coll, std::move(txn)) + ).then_interruptible([FNAME, this, + peers=pending_txn->second.weak_from_this()] { + if (!peers) { + // for now, only actingset_changed can cause peers + // to be nullptr + ERRORDPP("peers is null, this should be impossible", dpp); + assert(0 == "impossible"); + } + if (--peers->pending == 0) { + peers->all_committed.set_value(); + peers->all_committed = {}; + return seastar::now(); + } + return peers->all_committed.get_shared_future(); + }).then_interruptible([pending_txn, this] { + auto acked_peers = std::move(pending_txn->second.acked_peers); + pending_trans.erase(pending_txn); + return seastar::make_ready_future(std::move(acked_peers)); + }); + auto sends_complete = seastar::when_all_succeed( sends->begin(), sends->end() ).finally([sends=std::move(sends)] {}); diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h index f789a35eae69..78366060d894 100644 --- a/src/crimson/osd/replicated_backend.h +++ b/src/crimson/osd/replicated_backend.h @@ -14,12 +14,14 @@ namespace crimson::osd { class ShardServices; + class PG; } class ReplicatedBackend : public PGBackend { public: ReplicatedBackend(pg_t pgid, pg_shard_t whoami, + crimson::osd::PG& pg, CollectionRef coll, crimson::osd::ShardServices& shard_services, DoutPrefixProvider &dpp); @@ -55,6 +57,7 @@ class ReplicatedBackend : public PGBackend }; using pending_transactions_t = std::map; pending_transactions_t pending_trans; + crimson::osd::PG& pg; seastar::future<> request_committed( const osd_reqid_t& reqid, const eversion_t& at_version) final; From fcce984a74f0b34deea491d5b91cc6a1254f73e1 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Mon, 25 Sep 2023 17:58:53 +0300 Subject: [PATCH 0054/2492] osd/PeeringState: get rid off costly and redundant count() calls. Signed-off-by: Igor Fedotov --- src/osd/PeeringState.cc | 47 +++++++++++++++++++++++------------------ src/osd/PeeringState.h | 10 +++++---- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 5cd78afdeec8..2c41b7b71189 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -2749,8 +2749,9 @@ void PeeringState::activate( ++i) { if (*i == pg_whoami) continue; pg_shard_t peer = *i; - ceph_assert(peer_info.count(peer)); - pg_info_t& pi = peer_info[peer]; + auto pi_it = peer_info.find(peer); + ceph_assert(pi_it != peer_info.end()); + pg_info_t& pi = pi_it->second; psdout(10) << "activate peer osd." 
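// (The pattern in this patch, here and below: a single find() replaces
//  the previous count()-assert-then-operator[] sequence, e.g.
//    auto it = peer_info.find(peer);
//    ceph_assert(it != peer_info.end());
//    pg_info_t& pi = it->second;
//  so each peer map is probed once instead of two or three times.)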
<< peer << " " << pi << dendl; @@ -2759,8 +2760,9 @@ void PeeringState::activate( #else MRef m; #endif - ceph_assert(peer_missing.count(peer)); - pg_missing_t& pm = peer_missing[peer]; + auto pm_it = peer_missing.find(peer); + ceph_assert(pm_it != peer_missing.end()); + pg_missing_t& pm = pm_it->second; bool needs_past_intervals = pi.dne(); @@ -2927,21 +2929,24 @@ void PeeringState::activate( ++i) { if (*i == pg_whoami) continue; psdout(10) << ": adding " << *i << " as a source" << dendl; - ceph_assert(peer_missing.count(*i)); - ceph_assert(peer_info.count(*i)); + auto pi_it = peer_info.find(*i); + ceph_assert(pi_it != peer_info.end()); + auto pm_it = peer_missing.find(*i); + ceph_assert(pm_it != peer_missing.end()); missing_loc.add_source_info( *i, - peer_info[*i], - peer_missing[*i], + pi_it->second, + pm_it->second, ctx.handle); } } for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) { if (is_acting_recovery_backfill(i->first)) continue; - ceph_assert(peer_info.count(i->first)); + auto pi_it = peer_info.find(i->first); + ceph_assert(pi_it != peer_info.end()); search_for_missing( - peer_info[i->first], + pi_it->second, i->second, i->first, ctx); @@ -3640,8 +3645,9 @@ void PeeringState::update_calc_stats() if (is_backfill_target(peer.first)) { missing = std::max((int64_t)0, num_objects - peer_num_objects); } else { - if (peer_missing.count(peer.first)) { - missing = peer_missing[peer.first].num_missing(); + auto pm_it = peer_missing.find(peer.first); + if (pm_it != peer_missing.end()) { + missing = pm_it->second.num_missing(); } else { psdout(20) << "no peer_missing found for " << peer.first << dendl; @@ -4094,12 +4100,14 @@ void PeeringState::merge_new_log_entries( ++i) { pg_shard_t peer(*i); if (peer == pg_whoami) continue; - ceph_assert(peer_missing.count(peer)); - ceph_assert(peer_info.count(peer)); - pg_missing_t& pmissing(peer_missing[peer]); + auto pm_it = peer_missing.find(peer); + ceph_assert(pm_it != peer_missing.end()); + auto pi_it = peer_info.find(peer); + ceph_assert(pi_it != peer_info.end()); + pg_missing_t& pmissing(pm_it->second); psdout(20) << "peer_missing for " << peer << " = " << pmissing << dendl; - pg_info_t& pinfo(peer_info[peer]); + pg_info_t& pinfo = pi_it->second; bool invalidate_stats = PGLog::append_log_entries_update_missing( pinfo.last_backfill, entries, @@ -6133,10 +6141,9 @@ boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt) // may be telling us they have activated (and committed) but we can't // share that until _everyone_ does the same. if (ps->is_acting_recovery_backfill(infoevt.from) && - ps->peer_activated.count(infoevt.from) == 0) { + ps->peer_activated.insert(infoevt.from).second) { psdout(10) << " peer osd." 
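// (Same idea with sets: insert() already reports whether the element was
//  new via the .second of its return value, so the former count() == 0
//  check followed by insert() collapses into
//    if (ps->peer_activated.insert(infoevt.from).second) { ... }
//  -- one lookup, identical semantics.)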
<< infoevt.from << " activated and committed" << dendl;
     ps->blocked_by.erase(infoevt.from.shard);
     pl->publish_stats_to_osd();
     if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
@@ -6220,8 +6227,8 @@ boost::statechart::result PeeringState::Active::react(
   const ActivateCommitted &evt)
 {
   DECLARE_LOCALS;
-  ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
-  ps->peer_activated.insert(ps->pg_whoami);
+  auto p = ps->peer_activated.insert(ps->pg_whoami);
+  ceph_assert(p.second);
   psdout(10) << "_activate_committed " << evt.epoch
              << " peer_activated now " << ps->peer_activated
              << " last_interval_started "
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index cf70fa1d11bc..b9f8401b05b6 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -2333,13 +2333,15 @@ class PeeringState : public MissingLoc::MappingInfo {
     if (peer == pg_whoami) {
       return pg_log.get_missing();
     } else {
-      assert(peer_missing.count(peer));
-      return peer_missing.find(peer)->second;
+      auto it = peer_missing.find(peer);
+      assert(it != peer_missing.end());
+      return it->second;
     }
   }
   const pg_info_t&get_peer_info(pg_shard_t peer) const {
-    assert(peer_info.count(peer));
-    return peer_info.find(peer)->second;
+    auto it = peer_info.find(peer);
+    assert(it != peer_info.end());
+    return it->second;
   }
   bool has_peer_info(pg_shard_t peer) const {
     return peer_info.count(peer);

From 36870a557194dd647c03f99e033c4fc99c89dbe0 Mon Sep 17 00:00:00 2001
From: Igor Fedotov
Date: Mon, 25 Sep 2023 18:00:37 +0300
Subject: [PATCH 0055/2492] osd/PrimaryLogPG: get rid of redundant assertion.

It is also performed inside the get_peer_info() call below.

Signed-off-by: Igor Fedotov
---
 src/osd/PrimaryLogPG.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index c28184f9c9b6..f615604eaeff 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -550,7 +550,6 @@ bool PrimaryLogPG::should_send_op(
   const hobject_t &hoid)
 {
   if (peer == get_primary())
     return true;
-  ceph_assert(recovery_state.has_peer_info(peer));
   bool should_send =
     hoid.pool != (int64_t)info.pgid.pool() ||
     hoid <= last_backfill_started ||

From d14752ff1f56da93ad0d9c94fcf101945fdacadd Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Thu, 21 Sep 2023 15:02:38 +0800
Subject: [PATCH 0056/2492] client: queue a delayed cap flush if there are
 dirty caps/snapcaps

We should queue a cap release anyway if there are dirty caps/snapcaps
for the inodes when trimming caps.

Fixes: https://tracker.ceph.com/issues/62979
Signed-off-by: Xiubo Li
---
 src/client/Client.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/client/Client.cc b/src/client/Client.cc
index 5820ef90ae0c..7d40bbc9440a 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -4796,6 +4796,9 @@ void Client::trim_caps(MetaSession *s, uint64_t max)
       // is deleted inside remove_cap
       ++p;
 
+      if (in->dirty_caps || in->cap_snaps.size())
+        cap_delay_requeue(in.get());
+
       if (in->caps.size() > 1 && cap != in->auth_cap) {
         int mine = cap->issued | cap->implemented;
         int oissued = in->auth_cap ?
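// (Restating the intent of this patch: inodes holding dirty caps or cap
//  snapshots are requeued above for delayed flushing instead of being
//  released, and in the hunk below they also stop being counted as
//  trimmed.)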
in->auth_cap->issued : 0;
@@ -4833,7 +4836,8 @@ void Client::trim_caps(MetaSession *s, uint64_t max)
       }
       if (all && in->ino != CEPH_INO_ROOT) {
         ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
-        trimmed++;
+        if (!in->dirty_caps && !in->cap_snaps.size())
+          trimmed++;
       }
     }
   }
 }

From c20b4d706dfe32e08c5301676fd9144b052a4347 Mon Sep 17 00:00:00 2001
From: Milind Changire
Date: Tue, 26 Sep 2023 16:20:50 +0530
Subject: [PATCH 0057/2492] mds/scrub: enqueue all child frags for a given
 fragset

Problem:
Fragsets sent over to replicas for scrubbing are simplified, i.e. they
are a bit representation of the least common ancestors of the frags that
need scrubbing on that replica. A search for a frag in the fragset often
fails to match exactly the frags delegated to the replica, causing the
scrub item to be held in the scrub stack indefinitely.

Solution:
Accept a frag for scrubbing if the frag in the fragset sent over to the
replica contains the delegated frag as a child.

Fixes: https://tracker.ceph.com/issues/62658
Signed-off-by: Milind Changire
---
 src/mds/ScrubStack.cc | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc
index 6d799343f149..047bf3ba8220 100644
--- a/src/mds/ScrubStack.cc
+++ b/src/mds/ScrubStack.cc
@@ -892,22 +892,30 @@ void ScrubStack::handle_scrub(const cref_t<MMDSScrub> &m)
   std::vector<CDir*> dfs;
   MDSGatherBuilder gather(g_ceph_context);
 
+  frag_vec_t frags;
+  diri->dirfragtree.get_leaves(frags);
   for (const auto& fg : m->get_frags()) {
-    CDir *dir = diri->get_dirfrag(fg);
-    if (!dir) {
-      dout(10) << __func__ << " no frag " << fg << dendl;
-      continue;
-    }
-    if (!dir->is_auth()) {
-      dout(10) << __func__ << " not auth " << *dir << dendl;
-      continue;
-    }
-    if (!dir->can_auth_pin()) {
-      dout(10) << __func__ << " can't auth pin " << *dir << dendl;
-      dir->add_waiter(CDir::WAIT_UNFREEZE, gather.new_sub());
-      continue;
+    for (auto f : frags) {
+      if (!fg.contains(f)) {
+        dout(20) << __func__ << " skipping " << f << dendl;
+        continue;
+      }
+      CDir *dir = diri->get_or_open_dirfrag(mdcache, f);
+      if (!dir) {
+        dout(10) << __func__ << " no frag " << f << dendl;
+        continue;
+      }
+      if (!dir->is_auth()) {
+        dout(10) << __func__ << " not auth " << *dir << dendl;
+        continue;
+      }
+      if (!dir->can_auth_pin()) {
+        dout(10) << __func__ << " can't auth pin " << *dir << dendl;
+        dir->add_waiter(CDir::WAIT_UNFREEZE, gather.new_sub());
+        continue;
+      }
+      dfs.push_back(dir);
     }
-    dfs.push_back(dir);
   }
 
   if (gather.has_subs()) {

From d0226306f04284f99e95be51a55574f6b43e5a49 Mon Sep 17 00:00:00 2001
From: Xuehan Xu
Date: Thu, 31 Aug 2023 13:15:18 +0800
Subject: [PATCH 0058/2492] test/crimson/seastore: add journal replay to
 fltree-onode-manager unittest

Signed-off-by: Xuehan Xu
---
 .../seastore/onode_tree/test_fltree_onode_manager.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc
index 1f661cdca596..17ad975d5e87 100644
--- a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc
+++ b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc
@@ -272,13 +272,14 @@ TEST_P(fltree_onode_manager_test_t, 2_synthetic)
   run_async([this] {
     uint64_t block_size = tm->get_block_size();
     auto pool = KVPool::create_range(
-      {0, 100}, {32, 64, 128, 256, 512}, block_size);
+      {0, 10000}, {32, 64, 128, 256, 512}, block_size);
     auto start = pool.begin();
     auto
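// (The restart() calls added by this patch appear to unmount and remount
//  the transaction manager, forcing a journal replay; the validate_*
//  checks that follow each restart therefore verify the replayed,
//  on-disk state rather than the still-cached in-memory state.)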
end = pool.end(); with_onodes_write(start, end, [](auto& t, auto& onode, auto& item) { item.initialize(t, onode); }); + restart(); validate_onodes(start, end); validate_list_onodes(pool); @@ -289,6 +290,7 @@ TEST_P(fltree_onode_manager_test_t, 2_synthetic) [](auto& t, auto& onode, auto& item) { item.modify(t, onode); }); + restart(); validate_onodes(start, end); pool.shuffle(); @@ -298,6 +300,7 @@ TEST_P(fltree_onode_manager_test_t, 2_synthetic) [](auto& t, auto& onode, auto& item) { item.modify(t, onode); }); + restart(); validate_onodes(start, end); pool.shuffle(); @@ -310,6 +313,7 @@ TEST_P(fltree_onode_manager_test_t, 2_synthetic) return manager->erase_onode(t, onode_ref); }).unsafe_get0(); }); + restart(); validate_erased(rd_start, rd_end); pool.erase_from_random(rd_start, rd_end); start = pool.begin(); From 3ccd10f266cfd7ec6dd1ad930598bfe4ca422a90 Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Thu, 17 Aug 2023 20:01:38 +0000 Subject: [PATCH 0059/2492] qa/suites/rados: Added wait_for_all_active_clean_pgs flag Added flag to not allow rados suite to delete the pool unless all pgs are active+clean and all OSDs are up in the thrashosds side of the test. Fixes: https://tracker.ceph.com/issues/59172 Signed-off-by: Kamoltat --- .../thrash-erasure-code/thrashers/minsize_recovery.yaml | 5 ++++- qa/tasks/rados.py | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/qa/suites/rados/thrash-erasure-code/thrashers/minsize_recovery.yaml b/qa/suites/rados/thrash-erasure-code/thrashers/minsize_recovery.yaml index 771d9a1047a7..f7df20f313f0 100644 --- a/qa/suites/rados/thrash-erasure-code/thrashers/minsize_recovery.yaml +++ b/qa/suites/rados/thrash-erasure-code/thrashers/minsize_recovery.yaml @@ -13,7 +13,10 @@ overrides: osd scrub min interval: 60 osd scrub max interval: 120 osd max backfills: 2 + rados: + wait_for_all_active_clean_pgs: true + tasks: - thrashosds: timeout: 1200 - chance_test_min_size: 3 + chance_test_min_size: 3 diff --git a/qa/tasks/rados.py b/qa/tasks/rados.py index a730a72993c2..625ea4d9d764 100644 --- a/qa/tasks/rados.py +++ b/qa/tasks/rados.py @@ -272,6 +272,13 @@ def thread(): ) tests[id_] = proc run.wait(tests.values()) + wait_for_all_active_clean_pgs = config.get("wait_for_all_active_clean_pgs", False) + # usually set when we do min_size testing. + if wait_for_all_active_clean_pgs: + # Make sure we finish the test first before deleting the pool. 
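# (Sketch of the intent: with the flag set, teardown blocks on
#      manager.wait_for_clean()                    # all PGs active+clean
#      manager.wait_for_all_osds_up(timeout=1800)  # every OSD back up
#  before any created pool is deleted, so min_size recovery is given time
#  to finish instead of racing pool removal.)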
+ # Mainly used for test_pool_min_size + manager.wait_for_clean() + manager.wait_for_all_osds_up(timeout=1800) for pool in created_pools: manager.wait_snap_trimming_complete(pool); From 8c68a503f1d87d4efedab53dd68d7c79529d27aa Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 29 Sep 2023 13:16:42 +0200 Subject: [PATCH 0060/2492] os/bluestore: test _extend_log sequence advance Signed-off-by: Pere Diaz Bou --- src/test/objectstore/test_bluefs.cc | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index 5eac49938a68..007f47837cef 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -1559,6 +1559,30 @@ TEST(BlueFS, test_log_runway_3) { } } +TEST(BlueFS, test_log_runway_advance_seq) { + uint64_t max_log_runway = 65536; + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_alloc_size", "4096"); + conf.SetVal("bluefs_shared_alloc_size", "4096"); + conf.SetVal("bluefs_compact_log_sync", "false"); + conf.SetVal("bluefs_min_log_runway", "32768"); + conf.SetVal("bluefs_max_log_runway", std::to_string(max_log_runway).c_str()); + conf.ApplyChanges(); + + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + + std::string longdir(max_log_runway*2, 'A'); + ASSERT_EQ(fs.mkdir(longdir), 0); + fs.compact_log(); +} + int main(int argc, char **argv) { auto args = argv_to_vec(argc, argv); map defaults = { From 63f0a0df14c9d8e68be61e374438bc75cef45a1f Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 29 Sep 2023 13:17:03 +0200 Subject: [PATCH 0061/2492] os/bluestore: fix _extend_log seq advance when extending the log, the sequence was left on a bad state because it would first create a transaction to update with the current seq number but leave the "real" transaction with the same sequence number which should be `extend_log_transaction.seq + 1`. 
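Put differently, the intended ordering is (a sketch; field names as in
BlueFS):

  // before: the padding record consumed a live sequence number, but
  // log.t kept the stale one, so two transactions could share a seq:
  log.t.seq = log.seq_live;
  // after: all three counters advance together under dirty.lock:
  dirty.seq_live++;
  log.seq_live++;
  log.t.seq++;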
Signed-off-by: Pere Diaz Bou --- src/os/bluestore/BlueFS.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 53284355132f..e6ae5e70e48c 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -3120,12 +3120,13 @@ void BlueFS::_extend_log(uint64_t amount) { _pad_bl(bl, super.block_size); log.writer->append(bl); ceph_assert(allocated_before_extension >= log.writer->get_effective_write_pos()); - log.t.seq = log.seq_live; // before sync_core we advance the seq { std::unique_lock l(dirty.lock); - _log_advance_seq(); + dirty.seq_live++; + log.seq_live++; + log.t.seq++; } } From eace0b102c48a78a717fa30e58a68796c17b82b6 Mon Sep 17 00:00:00 2001 From: Patty8122 Date: Fri, 22 Sep 2023 17:37:10 -0500 Subject: [PATCH 0062/2492] mds: Updating the print statement to include filepath(root.c_str()) Signed-off-by: Patty8122 --- src/mds/MDSRank.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index f93c8d7fd60a..984d90267297 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -3183,7 +3183,7 @@ void MDSRank::command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Format std::lock_guard l(mds_lock); CInode *in = mdcache->cache_traverse(filepath(root.c_str())); if (!in) { - ss << "root inode is not in cache"; + ss << "inode for path '" << filepath(root.c_str()) << "' is not in cache"; return; } f->open_array_section("inodes"); From cf0450f8e148ea481f59a2f19149ae398230227e Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Fri, 22 Sep 2023 15:44:44 +0300 Subject: [PATCH 0063/2492] os/bluestore: add more latency tracking perf counters into BlueFS Signed-off-by: Igor Fedotov --- src/os/bluestore/BlueFS.cc | 42 +++++++++++++++++++++++++++++++++++--- src/os/bluestore/BlueFS.h | 7 ++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 53284355132f..920739dec898 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -300,6 +300,10 @@ void BlueFS::_init_logger() "auwb", PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); + b.add_time_avg (l_bluefs_read_random_lat, "read_random_lat", + "Average bluefs read_random latency", + "rdrt", + PerfCountersBuilder::PRIO_INTERESTING); b.add_u64_counter(l_bluefs_read_random_count, "read_random_count", "random read requests processed", NULL, @@ -338,6 +342,10 @@ void BlueFS::_init_logger() "Bytes read from prefetch buffer in random read mode", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_time_avg (l_bluefs_read_lat, "read_lat", + "Average bluefs read latency", + "rd_t", + PerfCountersBuilder::PRIO_INTERESTING); b.add_u64_counter(l_bluefs_read_count, "read_count", "buffered read requests processed", NULL, @@ -381,14 +389,30 @@ void BlueFS::_init_logger() b.add_u64_counter(l_bluefs_write_bytes, "write_bytes", "Bytes written", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); - b.add_time_avg (l_bluefs_compaction_lat, "compact_lat", + b.add_time_avg (l_bluefs_compaction_lat, "compact_lat", "Average bluefs log compaction latency", "c__t", PerfCountersBuilder::PRIO_INTERESTING); - b.add_time_avg (l_bluefs_compaction_lock_lat, "compact_lock_lat", + b.add_time_avg (l_bluefs_compaction_lock_lat, "compact_lock_lat", "Average lock duration while compacting bluefs log", "c_lt", PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg (l_bluefs_fsync_lat, "fsync_lat", + "Average 
bluefs fsync latency", + "fs_t", + PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg (l_bluefs_flush_lat, "flush_lat", + "Average bluefs flush latency", + "fl_t", + PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg (l_bluefs_unlink_lat, "unlink_lat", + "Average bluefs unlink latency", + "unlt", + PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg (l_bluefs_truncate_lat, "truncate_lat", + "Average bluefs truncate latency", + "trnt", + PerfCountersBuilder::PRIO_INTERESTING); b.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks, "alloc_slow_fallback", "Amount of allocations that required fallback to " " slow/shared device", @@ -2086,6 +2110,7 @@ int64_t BlueFS::_read_random( uint64_t len, ///< [in] this many bytes char *out) ///< [out] copy it here { + auto t0 = mono_clock::now(); auto* buf = &h->buf; int64_t ret = 0; @@ -2171,6 +2196,7 @@ int64_t BlueFS::_read_random( << " got 0x" << ret << std::dec << dendl; --h->file->num_reading; + logger->tinc(l_bluefs_read_random_lat, mono_clock::now() - t0); return ret; } @@ -2181,6 +2207,7 @@ int64_t BlueFS::_read( bufferlist *outbl, ///< [out] optional: reference the result here char *out) ///< [out] optional: or copy it here { + auto t0 = mono_clock::now(); FileReaderBuffer *buf = &(h->buf); bool prefetch = !outbl && !out; @@ -2302,6 +2329,7 @@ int64_t BlueFS::_read( << std::dec << dendl; ceph_assert(!outbl || (int)outbl->length() == ret); --h->file->num_reading; + logger->tinc(l_bluefs_read_lat, mono_clock::now() - t0); return ret; } @@ -3369,6 +3397,7 @@ void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) { + auto t0 = mono_clock::now(); ceph_assert(ceph_mutex_is_locked(h->lock)); ceph_assert(h->file->num_readers.load() == 0); ceph_assert(h->file->fnode.ino > 1); @@ -3424,6 +3453,7 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl; int res = _flush_data(h, offset, length, buffered); vselector->add_usage(h->file->vselector_hint, h->file->fnode); + logger->tinc(l_bluefs_flush_lat, mono_clock::now() - t0); return res; } @@ -3645,6 +3675,7 @@ uint64_t BlueFS::_flush_special(FileWriter *h) int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ { + auto t0 = mono_clock::now(); std::lock_guard hl(h->lock); dout(10) << __func__ << " 0x" << std::hex << offset << std::dec << " file " << h->file->fnode << dendl; @@ -3683,11 +3714,13 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ h->file->is_dirty = true; vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); log.t.op_file_update_inc(h->file->fnode); + logger->tinc(l_bluefs_truncate_lat, mono_clock::now() - t0); return 0; } int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/ { + auto t0 = mono_clock::now(); _maybe_check_vselector_LNF(); std::unique_lock hl(h->lock); uint64_t old_dirty_seq = 0; @@ -3715,7 +3748,7 @@ int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/ _flush_and_sync_log_LD(old_dirty_seq); } _maybe_compact_log_LNF_NF_LD_D(); - + logger->tinc(l_bluefs_fsync_lat, mono_clock::now() - t0); return 0; } @@ -4357,6 +4390,7 @@ int BlueFS::readdir(std::string_view dirname, vector *ls)/*_N*/ int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/ { + auto t0 = mono_clock::now(); std::lock_guard ll(log.lock); std::lock_guard nl(nodes.lock); dout(10) << __func__ << " " << dirname << "/" << filename << dendl; @@ 
-4381,6 +4415,8 @@ int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/ dir->file_map.erase(string{filename}); log.t.op_dir_unlink(dirname, filename); _drop_link_D(file); + logger->tinc(l_bluefs_unlink_lat, mono_clock::now() - t0); + return 0; } diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 9c5fb4981592..f0493a758dab 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -46,6 +46,7 @@ enum { l_bluefs_main_alloc_unit, l_bluefs_db_alloc_unit, l_bluefs_wal_alloc_unit, + l_bluefs_read_random_lat, l_bluefs_read_random_count, l_bluefs_read_random_bytes, l_bluefs_read_random_disk_count, @@ -55,6 +56,7 @@ enum { l_bluefs_read_random_disk_bytes_slow, l_bluefs_read_random_buffer_count, l_bluefs_read_random_buffer_bytes, + l_bluefs_read_lat, l_bluefs_read_count, l_bluefs_read_bytes, l_bluefs_read_disk_count, @@ -69,6 +71,10 @@ enum { l_bluefs_write_bytes, l_bluefs_compaction_lat, l_bluefs_compaction_lock_lat, + l_bluefs_fsync_lat, + l_bluefs_flush_lat, + l_bluefs_unlink_lat, + l_bluefs_truncate_lat, l_bluefs_alloc_shared_dev_fallbacks, l_bluefs_alloc_shared_size_fallbacks, l_bluefs_read_zeros_candidate, @@ -445,7 +451,6 @@ class BlueFS { int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered); int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr); uint64_t _flush_special(FileWriter *h); - int _fsync(FileWriter *h); #ifdef HAVE_LIBAIO void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls) From b7b38897a411bcfd1f88a8bd0564fa4b81f23052 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Fri, 22 Sep 2023 15:59:18 +0300 Subject: [PATCH 0064/2492] os/bluestore: a bit more effective file_map handling in BlueFS Signed-off-by: Igor Fedotov --- src/os/bluestore/BlueFS.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 920739dec898..337a022e7df0 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -4023,8 +4023,8 @@ int BlueFS::open_for_write( dir = p->second; } - map<string,FileRef>::iterator q = dir->file_map.find(filename); - if (q == dir->file_map.end()) { + map<string,FileRef>::iterator q = dir->file_map.lower_bound(filename); + if (q == dir->file_map.end() || q->first != filename) { if (overwrite) { dout(20) << __func__ << " dir " << dirname << " (" << dir << ") file " << filename @@ -4034,7 +4034,7 @@ int BlueFS::open_for_write( file = ceph::make_ref<File>(); file->fnode.ino = ++ino_last; nodes.file_map[ino_last] = file; - dir->file_map[string{filename}] = file; + dir->file_map.emplace_hint(q, string{filename}, file); ++file->refs; create = true; logger->set(l_bluefs_num_files, nodes.file_map.size()); @@ -4263,7 +4263,7 @@ int BlueFS::rmdir(std::string_view dirname)/*_LN*/ dout(20) << __func__ << " dir " << dirname << " not empty" << dendl; return -ENOTEMPTY; } - nodes.dir_map.erase(string{dirname}); + nodes.dir_map.erase(p); log.t.op_dir_remove(dirname); return 0; } @@ -4317,9 +4317,9 @@ int BlueFS::lock_file(std::string_view dirname, std::string_view filename, return -ENOENT; } DirRef dir = p->second; - auto q = dir->file_map.find(filename); + auto q = dir->file_map.lower_bound(filename); FileRef file; - if (q == dir->file_map.end()) { + if (q == dir->file_map.end() || q->first != filename) { dout(20) << __func__ << " dir " << dirname << " (" << dir << ") file " << filename << " not found, creating" << dendl; @@ -4327,7 +4327,7 @@ int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
file->fnode.ino = ++ino_last; file->fnode.mtime = ceph_clock_now(); nodes.file_map[ino_last] = file; - dir->file_map[string{filename}] = file; + dir->file_map.emplace_hint(q, string{filename}, file); logger->set(l_bluefs_num_files, nodes.file_map.size()); ++file->refs; log.t.op_file_update(file->fnode); @@ -4412,7 +4412,7 @@ int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/ << " is locked" << dendl; return -EBUSY; } - dir->file_map.erase(string{filename}); + dir->file_map.erase(q); log.t.op_dir_unlink(dirname, filename); _drop_link_D(file); logger->tinc(l_bluefs_unlink_lat, mono_clock::now() - t0); From acc26d7b17316d1e45bfc3a882355b46db19d053 Mon Sep 17 00:00:00 2001 From: Adam King Date: Tue, 9 May 2023 15:06:41 -0400 Subject: [PATCH 0065/2492] mgr/cephadm: make jaeger-collector urls a dep for jaeger-agent The jaeger-agents need to know the URL for the collector(s) that have been deployed. If a collector moves, or we deployed the agents before the collector, we need to reconfig the agents with updated info about the collectors. Failure to do so can leave the jaeger-agents down, reporting ``` Could not create collector proxy","error":"at least one collector hostPort address is required when resolver is not available" ``` Fixes: https://tracker.ceph.com/issues/59704 Signed-off-by: Adam King --- src/pybind/mgr/cephadm/module.py | 7 +++++++ src/pybind/mgr/cephadm/serve.py | 5 +++++ src/pybind/mgr/cephadm/services/jaeger.py | 3 +++ 3 files changed, 15 insertions(+) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 4b6f7cf7a567..70d66732f0f5 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -40,6 +40,7 @@ from mgr_module import MgrModule, HandleCommandResult, Option, NotifyType +from mgr_util import build_url import orchestrator from orchestrator.module import to_format, Format @@ -2712,6 +2713,12 @@ def get_daemon_names(daemons: List[str]) -> List[str]: deps.append(f'{hash(alertmanager_user + alertmanager_password)}') elif daemon_type == 'promtail': deps += get_daemon_names(['loki']) + elif daemon_type == JaegerAgentService.TYPE: + for dd in self.cache.get_daemons_by_type(JaegerCollectorService.TYPE): + assert dd.hostname is not None + port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT + deps.append(build_url(host=dd.hostname, port=port).lstrip('/')) + deps = sorted(deps) else: # TODO(redo): some error message! pass diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index a17ac151e3a6..116e97238691 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -1060,6 +1060,11 @@ def _check_daemons(self) -> None: diff = list(set(last_deps) - set(deps)) if any('secure_monitoring_stack' in e for e in diff): action = 'redeploy' + elif dd.daemon_type == 'jaeger-agent': + # changes to jaeger-agent deps affect the way the unit.run for + # the daemon is written, which we rewrite on redeploy, but not + # on reconfig.
+ action = 'redeploy' elif spec is not None and hasattr(spec, 'extra_container_args') and dd.extra_container_args != spec.extra_container_args: self.log.debug( diff --git a/src/pybind/mgr/cephadm/services/jaeger.py b/src/pybind/mgr/cephadm/services/jaeger.py index c136d20e612a..c83c765d0394 100644 --- a/src/pybind/mgr/cephadm/services/jaeger.py +++ b/src/pybind/mgr/cephadm/services/jaeger.py @@ -20,13 +20,16 @@ class JaegerAgentService(CephadmService): def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: assert self.TYPE == daemon_spec.daemon_type collectors = [] + deps: List[str] = [] for dd in self.mgr.cache.get_daemons_by_type(JaegerCollectorService.TYPE): # scrape jaeger-collector nodes assert dd.hostname is not None port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT url = build_url(host=dd.hostname, port=port).lstrip('/') collectors.append(url) + deps.append(url) daemon_spec.final_config = {'collector_nodes': ",".join(collectors)} + daemon_spec.deps = sorted(deps) return daemon_spec From 9a083b0935509744234082832d12ed2734bcb6e0 Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Tue, 3 Oct 2023 09:39:20 -0700 Subject: [PATCH 0066/2492] CLIENT: C_Read_Async_Finisher should assume client_lock is held Client::C_Read_Async_Finisher::finish() doesn't need to take the lock because ObjectCacher has already assured the lock is held. Of course when we immediately complete, we don't need to unlock. Signed-off-by: Frank S. Filz --- src/client/Client.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 4e7e3961e8e1..1ce33f230538 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -10999,15 +10999,11 @@ void Client::do_readahead(Fh *f, Inode *in, uint64_t off, uint64_t len) void Client::C_Read_Async_Finisher::finish(int r) { - clnt->client_lock.lock(); - // Do read ahead as long as we aren't completing with 0 bytes if (r != 0) clnt->do_readahead(f, in, off, len); onfinish->complete(r); - - clnt->client_lock.unlock(); } int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, @@ -11039,9 +11035,7 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, Context *crf = io_finish.release(); // Complete the crf immediately with 0 bytes - client_lock.unlock(); crf->complete(0); - client_lock.lock(); // Signal async completion return 0; @@ -11073,9 +11067,7 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, Context *crf = io_finish.release(); if (r != 0) { // need to do readahead, so complete the crf - client_lock.unlock(); crf->complete(r); - client_lock.lock(); } else { get_cap_ref(in, CEPH_CAP_FILE_CACHE); } From f004def5b7cb8aa31e7f428f5be9bcf1aee2f30c Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Tue, 19 Sep 2023 14:26:19 +0300 Subject: [PATCH 0067/2492] osd: do not assert on fast shutdown timeout Fixes: https://tracker.ceph.com/issues/61140 Signed-off-by: Igor Fedotov --- src/osd/OSD.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f01540c3a930..d813de2b7e20 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4496,10 +4496,12 @@ int OSD::shutdown() store->umount(); utime_t end_time = ceph_clock_now(); - if (cct->_conf->osd_fast_shutdown_timeout) { - ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout); - } dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl; + 
if (cct->_conf->osd_fast_shutdown_timeout && + end_time - start_time_func > cct->_conf->osd_fast_shutdown_timeout) { + dout(0) << "Fast Shutdown duration exceeded :" << cct->_conf->osd_fast_shutdown_timeout << " seconds" + << dendl; + } dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl; dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl; dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl; From b2c4e62afac32edda142a51eb601420a1a79bb2f Mon Sep 17 00:00:00 2001 From: pilem94 Date: Tue, 3 Oct 2023 16:09:48 -0400 Subject: [PATCH 0068/2492] src/ceph-volume/ceph_volume/devices/lvm/listing.py: lvm list filters also on vg name This commit fixes the listing of LVs with the same name across multiple VGs. Fixes: https://tracker.ceph.com/issues/62320 Signed-off-by: Pierre Lemay --- src/ceph-volume/ceph_volume/devices/lvm/listing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ceph-volume/ceph_volume/devices/lvm/listing.py b/src/ceph-volume/ceph_volume/devices/lvm/listing.py index c16afdaa7672..8fb9d8ddcf87 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/listing.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/listing.py @@ -153,7 +153,9 @@ def single_report(self, arg): elif arg[0] == '/': lv = api.get_lvs_from_path(arg) else: - lv = [api.get_single_lv(filters={'lv_name': arg.split('/')[1]})] + vg_name, lv_name = arg.split('/') + lv = [api.get_single_lv(filters={'lv_name': lv_name, + 'vg_name': vg_name})] report = self.create_report(lv) From 637613eb148157f30f8524dc99e508cf674b7b9f Mon Sep 17 00:00:00 2001 From: Oguzhan Ozmen Date: Tue, 3 Oct 2023 16:05:39 -0400 Subject: [PATCH 0069/2492] STS: when generating keys, take the trailing character into account Otherwise, the STS access key has only 19 and the secret key only 39 alphanumeric chars, since the trailing NUL terminator consumes one slot. Signed-off-by: Oguzhan Ozmen --- src/rgw/rgw_sts.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgw/rgw_sts.cc b/src/rgw/rgw_sts.cc index b552834426a9..2b31d5c5a297 100644 --- a/src/rgw/rgw_sts.cc +++ b/src/rgw/rgw_sts.cc @@ -54,7 +54,7 @@ int Credentials::generateCredentials(const DoutPrefixProvider *dpp, rgw::auth::Identity* identity) { uuid_d accessKey, secretKey; - char accessKeyId_str[MAX_ACCESS_KEY_LEN], secretAccessKey_str[MAX_SECRET_KEY_LEN]; + char accessKeyId_str[MAX_ACCESS_KEY_LEN + 1], secretAccessKey_str[MAX_SECRET_KEY_LEN + 1]; //AccessKeyId gen_rand_alphanumeric_plain(cct, accessKeyId_str, sizeof(accessKeyId_str)); From b14ff07e6344d9f097259265d468f6300818b053 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Wed, 4 Oct 2023 01:11:32 +0200 Subject: [PATCH 0070/2492] ceph-volume: fix util.get_partitions The current logic makes it report only the first partition of each device.
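For context, a minimal standalone sketch of the fixed predicate (outside ceph-volume; the helper below is invented for illustration): /sys/dev/block/<maj:min>/partition holds the partition number, so comparing against the literal string "1" matched only the first partition, while any positive number means the node is a partition:

```
# Hypothetical reduction of the check; in ceph-volume, get_file_contents
# returns the default '0' when the 'partition' file is absent (whole disks).
def is_partition(contents: str) -> bool:
    # old, buggy check: only partition #1 qualified
    # return contents == "1"
    return int(contents or '0') > 0

assert is_partition("1")
assert is_partition("2")      # previously missed
assert not is_partition("0")  # whole disk
```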
Fixes: https://tracker.ceph.com/issues/63086 Signed-off-by: Guillaume Abrioux --- src/ceph-volume/ceph_volume/util/disk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py index ee061b724007..704b9e76600c 100644 --- a/src/ceph-volume/ceph_volume/util/disk.py +++ b/src/ceph-volume/ceph_volume/util/disk.py @@ -816,7 +816,7 @@ def get_partitions(_sys_dev_block_path ='/sys/dev/block'): result = dict() for device in devices: device_path = os.path.join(_sys_dev_block_path, device) - is_partition = get_file_contents(os.path.join(device_path, 'partition')) == "1" + is_partition = int(get_file_contents(os.path.join(device_path, 'partition'), '0')) > 0 if not is_partition: continue From 6f3f58cb8e4ce100cc7186858465b4d11d5c2c49 Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Wed, 4 Oct 2023 12:24:13 +0530 Subject: [PATCH 0071/2492] mgr/dashboard: Consider null values as zero in grafana panels After upgrading from RHCS4 to RHCS5, some of the grafana charts broke. This is because in RHCS5 we do not generate a metric if its value is zero; as a result, the null value from that metric breaks the grafana charts or graphs. This PR fixes the above-mentioned issue. Fixes: https://tracker.ceph.com/issues/63088 Signed-off-by: Aashish Sharma --- .../ceph-mixin/dashboards/osd.libsonnet | 2 +- .../ceph-mixin/dashboards/rbd.libsonnet | 2 +- .../ceph-mixin/dashboards/rgw.libsonnet | 4 ++-- .../dashboards_out/osd-device-details.json | 12 +++++------ .../dashboards_out/radosgw-detail.json | 6 +++--- .../dashboards_out/radosgw-overview.json | 20 +++++++++---------- .../dashboards_out/rbd-overview.json | 6 +++--- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet index 0ea43c96ff9f..0015c7f398bd 100644 --- a/monitoring/ceph-mixin/dashboards/osd.libsonnet +++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet @@ -342,7 +342,7 @@ local g = import 'grafonnet/grafana.libsonnet'; $.graphPanelSchema({}, title, description, - 'null', + 'null as zero', false, formatY1, 'short', diff --git a/monitoring/ceph-mixin/dashboards/rbd.libsonnet b/monitoring/ceph-mixin/dashboards/rbd.libsonnet index 0eca5a877737..709d4e04f7e9 100644 --- a/monitoring/ceph-mixin/dashboards/rbd.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rbd.libsonnet @@ -133,7 +133,7 @@ local u = import 'utils.libsonnet'; $.graphPanelSchema({}, title, '', - 'null', + 'null as zero', false, formatY1, 'short', diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet index 892480d1ca0f..49dcf9156884 100644 --- a/monitoring/ceph-mixin/dashboards/rgw.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -140,7 +140,7 @@ local u = import 'utils.libsonnet'; {}, title, description, - 'null', + 'null as zero', false, formatY1, formatY2, @@ -658,7 +658,7 @@ local u = import 'utils.libsonnet'; $.graphPanelSchema(aliasColors, title, description, - 'null', + 'null as zero', false, formatY1, formatY2, diff --git a/monitoring/ceph-mixin/dashboards_out/osd-device-details.json b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json index 384516fb0195..811e6d57ef2e 100644 --- a/monitoring/ceph-mixin/dashboards_out/osd-device-details.json +++ b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json @@ -87,7 +87,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", +
"nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -185,7 +185,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -283,7 +283,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -400,7 +400,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -498,7 +498,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -596,7 +596,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json index a0f8f3537c48..4568f9a4d854 100644 --- a/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json @@ -93,7 +93,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -186,7 +186,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -285,7 +285,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json index 77d69e4f3152..a8256c1f5e16 100644 --- a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -87,7 +87,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -180,7 +180,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -266,7 +266,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -352,7 +352,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -445,7 +445,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -531,7 +531,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -636,7 +636,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -754,7 +754,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", 
"percentage": false, "pointradius": 5, "points": false, @@ -893,7 +893,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -1000,7 +1000,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, diff --git a/monitoring/ceph-mixin/dashboards_out/rbd-overview.json b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json index e017280e02bf..86b354a6089c 100644 --- a/monitoring/ceph-mixin/dashboards_out/rbd-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json @@ -80,7 +80,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -173,7 +173,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -266,7 +266,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, From 0b8b98907c7f60cfe64636c3278e474bd03f27ce Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Wed, 20 Sep 2023 19:10:59 +0300 Subject: [PATCH 0072/2492] osd: introduce dump_osd_pg_stats admin socket command. One can learn full OSD stats in a way they're reported to monitors using it. Signed-off-by: Igor Fedotov --- src/messages/MPGStats.h | 25 +++++++++++++++++++++++++ src/osd/OSD.cc | 12 ++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/messages/MPGStats.h b/src/messages/MPGStats.h index 65cec5244788..2d9c2dcb5c2c 100644 --- a/src/messages/MPGStats.h +++ b/src/messages/MPGStats.h @@ -44,6 +44,31 @@ class MPGStats final : public PaxosServiceMessage { void print(std::ostream& out) const override { out << "pg_stats(" << pg_stat.size() << " pgs seq " << osd_stat.seq << " v " << version << ")"; } + void dump_stats(ceph::Formatter *f) const { + f->open_object_section("stats"); + { + f->open_array_section("pg_stat"); + for(const auto& [_pg, _stat] : pg_stat) { + f->open_object_section("pg_stat"); + _pg.dump(f); + _stat.dump(f); + f->close_section(); + } + f->close_section(); + + f->dump_object("osd_stat", osd_stat); + + f->open_array_section("pool_stat"); + for(const auto& [_id, _stat] : pool_stat) { + f->open_object_section("pool"); + f->dump_int("poolid", _id); + _stat.dump(f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } void encode_payload(uint64_t features) override { using ceph::encode; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 52f937d7ff4d..6b3dd52786a3 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3278,6 +3278,13 @@ will start to track new ops received afterwards."; st.dump(f); f->close_section(); } + } else if (prefix == "dump_osd_pg_stats") { + lock_guard l(osd_lock); + + MPGStats* m = collect_pg_stats(); + ceph_assert(m); + m->dump_stats(f); + m->put(); } else { ceph_abort_msg("broken asok registration"); } @@ -4155,6 +4162,11 @@ void OSD::final_init() "Dump store's statistics for the given pool"); ceph_assert(r == 0); + r = admin_socket->register_command( + "dump_osd_pg_stats ", asok_hook, + "Dump OSD PGs' statistics"); + ceph_assert(r == 0); + test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get()); // Note: pools are CephString instead of CephPoolname because // 
these commands traditionally support both pool names and numbers From 7752b9019d4444a499051669539cfd67b83e0d44 Mon Sep 17 00:00:00 2001 From: Cory Snyder Date: Wed, 4 Oct 2023 05:42:30 -0400 Subject: [PATCH 0073/2492] rgw: add versioning info to radosgw-admin bucket stats output This allows admins to more easily identify whether a bucket has versioning / object lock/ mfa enabled. Fixes: https://tracker.ceph.com/issues/63092 Signed-off-by: Cory Snyder --- src/rgw/driver/rados/rgw_bucket.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc index c4b89d6cd4c6..e58b554f790b 100644 --- a/src/rgw/driver/rados/rgw_bucket.cc +++ b/src/rgw/driver/rados/rgw_bucket.cc @@ -1290,6 +1290,8 @@ static int bucket_stats(rgw::sal::Driver* driver, return ret; } + const RGWBucketInfo& bucket_info = bucket->get_info(); + const auto& index = bucket->get_info().get_current_index(); if (is_layout_indexless(index)) { cerr << "error, indexless buckets do not maintain stats; bucket=" << @@ -1320,6 +1322,10 @@ static int bucket_stats(rgw::sal::Driver* driver, formatter->dump_string("id", bucket->get_bucket_id()); formatter->dump_string("marker", bucket->get_marker()); formatter->dump_stream("index_type") << bucket->get_info().layout.current_index.layout.type; + formatter->dump_bool("versioned", bucket_info.versioned()); + formatter->dump_bool("versioning_enabled", bucket_info.versioning_enabled()); + formatter->dump_bool("object_lock_enabled", bucket_info.obj_lock_enabled()); + formatter->dump_bool("mfa_enabled", bucket_info.mfa_enabled()); ::encode_json("owner", bucket->get_info().owner, formatter); formatter->dump_string("ver", bucket_ver); formatter->dump_string("master_ver", master_ver); From ae11bbe6b2805740d3621cc47c68a5f0da493df0 Mon Sep 17 00:00:00 2001 From: Laura Flores Date: Thu, 28 Sep 2023 17:52:11 +0000 Subject: [PATCH 0074/2492] osd: fix logic in check_pg_upmaps The logic was changed in check_pg_upmaps in a Reef refactor, which results in recommendations made by the upmap balancer even when it says there are no optimizations. Fixes: https://tracker.ceph.com/issues/63029 Signed-off-by: Laura Flores --- src/osd/OSDMap.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 4a2d400b6703..ce199e8ee8ac 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -2157,8 +2157,8 @@ bool OSDMap::check_pg_upmaps( << j->first << " " << j->second << dendl; to_cancel->push_back(pg); - } else { - //Josh--check partial no-op here. + } else if (newmap != j->second) { + // check partial no-op here. ldout(cct, 10) << __func__ << " simplifying partially no-op pg_upmap_items " << j->first << " " << j->second << " -> " << newmap From 82a242672375965d7b3872b43c49285630d93402 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Thu, 5 Oct 2023 00:04:50 +0000 Subject: [PATCH 0075/2492] qa: enable test_librgw_file.sh to be run with vstart cluster Signed-off-by: Ali Maredia --- qa/workunits/rgw/test_librgw_file.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/qa/workunits/rgw/test_librgw_file.sh b/qa/workunits/rgw/test_librgw_file.sh index 1371ff711075..d56dc7b8258f 100755 --- a/qa/workunits/rgw/test_librgw_file.sh +++ b/qa/workunits/rgw/test_librgw_file.sh @@ -1,5 +1,11 @@ #!/bin/sh -e - +# +# To run this test script with a cluster created via vstart.sh: +# $PATH needs to be set for radosgw-admin and ceph_test_librgw executables. 
+# $KEYRING need to be set as the path for a vstart clusters Ceph keyring. +# +# Example when ceph source is cloned into $HOME and a vstart cluster is already running with a radosgw: +# $ PATH=~/ceph/build/bin/:$PATH KEYRING=~/ceph/build/keyring ~/ceph/qa/workunits/rgw/test_librgw_file.sh if [ -z ${AWS_ACCESS_KEY_ID} ] then @@ -13,7 +19,10 @@ then --email librgw@example.com || echo "librgw user exists" # keyring override for teuthology env - KEYRING="/etc/ceph/ceph.keyring" + if [ -z ${KEYRING} ] + then + KEYRING="/etc/ceph/ceph.keyring" + fi K="-k ${KEYRING}" fi From dd8f59a1e88313d61f16e4b43bb47247dc64e71f Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 12:50:44 -0400 Subject: [PATCH 0076/2492] cephadm: add unit test coverage for deploying keepalived Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 55 ++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index 6fd36cc6eb18..4c0b5c845f7e 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -1,3 +1,4 @@ +import os import pathlib import unittest from unittest import mock @@ -84,3 +85,57 @@ def test_deploy_snmp_container(cephadm_fs, monkeypatch): assert basedir.is_dir() assert not (basedir / 'config').exists() assert not (basedir / 'keyring').exists() + + +def test_deploy_keepalived_container(cephadm_fs, monkeypatch): + _call = mock.MagicMock(return_value=('', '', 0)) + monkeypatch.setattr('cephadmlib.container_types.call', _call) + _call_throws = mock.MagicMock(return_value=0) + monkeypatch.setattr( + 'cephadmlib.container_types.call_throws', _call_throws + ) + _firewalld = mock.MagicMock() + _firewalld().external_ports.get.return_value = [] + monkeypatch.setattr('cephadm.Firewalld', _firewalld) + _extract_uid_gid = mock.MagicMock() + _extract_uid_gid.return_value = (8765, 8765) + monkeypatch.setattr('cephadm.extract_uid_gid', _extract_uid_gid) + _install_sysctl = mock.MagicMock() + monkeypatch.setattr('cephadm.install_sysctl', _install_sysctl) + fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' + with with_cephadm_ctx([]) as ctx: + ctx.container_engine = mock_podman() + ctx.fsid = fsid + ctx.name = 'keepalived.uiop' + ctx.image = 'quay.io/eeranimated/keepalived:latest' + ctx.reconfig = False + ctx.config_blobs = { + 'destination': '192.168.100.10:8899', + 'config': 'XXXXXXX', + 'keyring': 'YYYYYY', + 'files': { + 'keepalived.conf': 'neversayneveragain', + }, + } + _cephadm._common_deploy(ctx) + + basedir = pathlib.Path(f'/var/lib/ceph/{fsid}/keepalived.uiop') + assert basedir.is_dir() + with open(basedir / 'unit.run') as f: + runfile_lines = f.read().splitlines() + assert 'podman' in runfile_lines[-1] + assert runfile_lines[-1].endswith('quay.io/eeranimated/keepalived:latest') + _firewalld().open_ports.assert_not_called() + assert not (basedir / 'config').exists() + assert not (basedir / 'keyring').exists() + with open(basedir / 'keepalived.conf') as f: + assert f.read() == 'neversayneveragain' + with open(basedir / 'keepalived.conf') as f: + assert f.read() == 'neversayneveragain' + si = os.fstat(f.fileno()) + assert (si.st_uid, si.st_gid) == (8765, 8765) + assert (basedir / 'keepalived').is_dir() + si = (basedir / 'keepalived').stat() + assert (si.st_uid, si.st_gid) == (8765, 8765) + assert _install_sysctl.call_count == 1 + assert len(_install_sysctl.call_args[0][-1].get_sysctl_settings()) > 1 From d365d605bd7fcffef66a4924e792594e08245bd2 Mon Sep 17 00:00:00 2001 From: John 
Mulligan Date: Tue, 3 Oct 2023 13:11:14 -0400 Subject: [PATCH 0077/2492] cephadm: move some common mocks to a function Reduce frequently repeated mocks in these new deployment test functions. Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 51 +++++++++++++++++--------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index 4c0b5c845f7e..fff88ba63755 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -14,12 +14,33 @@ _cephadm = import_cephadm() -def test_deploy_nfs_container(cephadm_fs, monkeypatch): +def _common_mp(monkeypatch): + mocks = {} _call = mock.MagicMock(return_value=('', '', 0)) monkeypatch.setattr('cephadmlib.container_types.call', _call) + mocks['call'] = _call + _call_throws = mock.MagicMock(return_value=0) + monkeypatch.setattr( + 'cephadmlib.container_types.call_throws', _call_throws + ) + mocks['call_throws'] = _call_throws _firewalld = mock.MagicMock() _firewalld().external_ports.get.return_value = [] monkeypatch.setattr('cephadm.Firewalld', _firewalld) + mocks['Firewalld'] = _firewalld + _extract_uid_gid = mock.MagicMock() + _extract_uid_gid.return_value = (8765, 8765) + monkeypatch.setattr('cephadm.extract_uid_gid', _extract_uid_gid) + mocks['extract_uid_gid'] = _extract_uid_gid + _install_sysctl = mock.MagicMock() + monkeypatch.setattr('cephadm.install_sysctl', _install_sysctl) + mocks['install_sysctl'] = _install_sysctl + return mocks + + +def test_deploy_nfs_container(cephadm_fs, monkeypatch): + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' with with_cephadm_ctx([]) as ctx: ctx.container_engine = mock_podman() @@ -51,15 +72,8 @@ def test_deploy_nfs_container(cephadm_fs, monkeypatch): def test_deploy_snmp_container(cephadm_fs, monkeypatch): - _call = mock.MagicMock(return_value=('', '', 0)) - monkeypatch.setattr('cephadmlib.container_types.call', _call) - _call_throws = mock.MagicMock(return_value=0) - monkeypatch.setattr( - 'cephadmlib.container_types.call_throws', _call_throws - ) - _firewalld = mock.MagicMock() - _firewalld().external_ports.get.return_value = [] - monkeypatch.setattr('cephadm.Firewalld', _firewalld) + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' with with_cephadm_ctx([]) as ctx: ctx.container_engine = mock_podman() @@ -88,20 +102,9 @@ def test_deploy_snmp_container(cephadm_fs, monkeypatch): def test_deploy_keepalived_container(cephadm_fs, monkeypatch): - _call = mock.MagicMock(return_value=('', '', 0)) - monkeypatch.setattr('cephadmlib.container_types.call', _call) - _call_throws = mock.MagicMock(return_value=0) - monkeypatch.setattr( - 'cephadmlib.container_types.call_throws', _call_throws - ) - _firewalld = mock.MagicMock() - _firewalld().external_ports.get.return_value = [] - monkeypatch.setattr('cephadm.Firewalld', _firewalld) - _extract_uid_gid = mock.MagicMock() - _extract_uid_gid.return_value = (8765, 8765) - monkeypatch.setattr('cephadm.extract_uid_gid', _extract_uid_gid) - _install_sysctl = mock.MagicMock() - monkeypatch.setattr('cephadm.install_sysctl', _install_sysctl) + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] + _install_sysctl = mocks['install_sysctl'] fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' with with_cephadm_ctx([]) as ctx: ctx.container_engine = mock_podman() From 42991a1dc69abca1aec6fe7ec358d554dc17bce3 Mon Sep 17 00:00:00 2001 
From: John Mulligan Date: Tue, 3 Oct 2023 13:19:18 -0400 Subject: [PATCH 0078/2492] cephadm: add unit test coverage for deploying haproxy Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index fff88ba63755..fdc6f582dbdc 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -142,3 +142,45 @@ def test_deploy_keepalived_container(cephadm_fs, monkeypatch): assert (si.st_uid, si.st_gid) == (8765, 8765) assert _install_sysctl.call_count == 1 assert len(_install_sysctl.call_args[0][-1].get_sysctl_settings()) > 1 + + +def test_deploy_haproxy_container(cephadm_fs, monkeypatch): + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] + _install_sysctl = mocks['install_sysctl'] + fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' + with with_cephadm_ctx([]) as ctx: + ctx.container_engine = mock_podman() + ctx.fsid = fsid + ctx.name = 'haproxy.yyz' + ctx.image = 'quay.io/lfeuwbo/haproxy:latest' + ctx.reconfig = False + ctx.config_blobs = { + 'config': 'XXXXXXX', + 'keyring': 'YYYYYY', + 'files': { + 'haproxy.cfg': 'bifrost', + }, + } + _cephadm._common_deploy(ctx) + + basedir = pathlib.Path(f'/var/lib/ceph/{fsid}/haproxy.yyz') + assert basedir.is_dir() + with open(basedir / 'unit.run') as f: + runfile_lines = f.read().splitlines() + assert 'podman' in runfile_lines[-1] + assert runfile_lines[-1].endswith( + 'quay.io/lfeuwbo/haproxy:latest haproxy -f /var/lib/haproxy/haproxy.cfg' + ) + _firewalld().open_ports.assert_not_called() + assert not (basedir / 'config').exists() + assert not (basedir / 'keyring').exists() + assert (basedir / 'haproxy').is_dir() + si = (basedir / 'haproxy').stat() + assert (si.st_uid, si.st_gid) == (8765, 8765) + with open(basedir / 'haproxy/haproxy.cfg') as f: + assert f.read() == 'bifrost' + si = os.fstat(f.fileno()) + assert (si.st_uid, si.st_gid) == (8765, 8765) + assert _install_sysctl.call_count == 1 + assert len(_install_sysctl.call_args[0][-1].get_sysctl_settings()) > 1 From 9aecda0f9437d3ca9eb59bd0a9385357e838a6ce Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 13:31:26 -0400 Subject: [PATCH 0079/2492] cephadm: add unit test coverage for deploying iscsi Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index fdc6f582dbdc..c392c9f4d798 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -184,3 +184,42 @@ def test_deploy_haproxy_container(cephadm_fs, monkeypatch): assert (si.st_uid, si.st_gid) == (8765, 8765) assert _install_sysctl.call_count == 1 assert len(_install_sysctl.call_args[0][-1].get_sysctl_settings()) > 1 + + +def test_deploy_iscsi_container(cephadm_fs, monkeypatch): + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] + fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' + with with_cephadm_ctx([]) as ctx: + ctx.container_engine = mock_podman() + ctx.fsid = fsid + ctx.name = 'iscsi.wuzzy' + ctx.image = 'quay.io/ayeaye/iscsi:latest' + ctx.reconfig = False + ctx.config_blobs = { + 'config': 'XXXXXXX', + 'keyring': 'YYYYYY', + 'files': { + 'iscsi-gateway.cfg': 'portal', + }, + } + _cephadm._common_deploy(ctx) + + basedir = pathlib.Path(f'/var/lib/ceph/{fsid}/iscsi.wuzzy') + assert basedir.is_dir() + with open(basedir / 
'unit.run') as f: + runfile_lines = f.read().splitlines() + assert 'podman' in runfile_lines[-1] + assert runfile_lines[-1].endswith('quay.io/ayeaye/iscsi:latest') + _firewalld().open_ports.assert_not_called() + with open(basedir / 'config') as f: + assert f.read() == 'XXXXXXX' + with open(basedir / 'keyring') as f: + assert f.read() == 'YYYYYY' + assert (basedir / 'configfs').is_dir() + si = (basedir / 'configfs').stat() + assert (si.st_uid, si.st_gid) == (8765, 8765) + with open(basedir / 'iscsi-gateway.cfg') as f: + assert f.read() == 'portal' + si = os.fstat(f.fileno()) + assert (si.st_uid, si.st_gid) == (8765, 8765) From 785e4a623d47bb1118914146d94411b4a1556ac1 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 13:39:12 -0400 Subject: [PATCH 0080/2492] cephadm: add unit test coverage for deploying nvmeof Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index c392c9f4d798..d11aca065fbf 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -223,3 +223,42 @@ def test_deploy_iscsi_container(cephadm_fs, monkeypatch): assert f.read() == 'portal' si = os.fstat(f.fileno()) assert (si.st_uid, si.st_gid) == (8765, 8765) + + +def test_deploy_nvmeof_container(cephadm_fs, monkeypatch): + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] + fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' + with with_cephadm_ctx([]) as ctx: + ctx.container_engine = mock_podman() + ctx.fsid = fsid + ctx.name = 'nvmeof.andu' + ctx.image = 'quay.io/ownf/nmve:latest' + ctx.reconfig = False + ctx.config_blobs = { + 'config': 'XXXXXXX', + 'keyring': 'YYYYYY', + 'files': { + 'ceph-nvmeof.conf': 'icantbeliveitsnotiscsi', + }, + } + _cephadm._common_deploy(ctx) + + basedir = pathlib.Path(f'/var/lib/ceph/{fsid}/nvmeof.andu') + assert basedir.is_dir() + with open(basedir / 'unit.run') as f: + runfile_lines = f.read().splitlines() + assert 'podman' in runfile_lines[-1] + assert runfile_lines[-1].endswith('quay.io/ownf/nmve:latest') + _firewalld().open_ports.assert_not_called() + with open(basedir / 'config') as f: + assert f.read() == 'XXXXXXX' + with open(basedir / 'keyring') as f: + assert f.read() == 'YYYYYY' + assert (basedir / 'configfs').is_dir() + si = (basedir / 'configfs').stat() + assert (si.st_uid, si.st_gid) == (167, 167) + with open(basedir / 'ceph-nvmeof.conf') as f: + assert f.read() == 'icantbeliveitsnotiscsi' + si = os.fstat(f.fileno()) + assert (si.st_uid, si.st_gid) == (167, 167) From 5c49070a058beb96a95ed53395b7ae2b87fb3dde Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 13:56:36 -0400 Subject: [PATCH 0081/2492] cephadm: add unit test coverage for deploying monitoring Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index d11aca065fbf..fad142ebbabb 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -262,3 +262,41 @@ def test_deploy_nvmeof_container(cephadm_fs, monkeypatch): assert f.read() == 'icantbeliveitsnotiscsi' si = os.fstat(f.fileno()) assert (si.st_uid, si.st_gid) == (167, 167) + + +def test_deploy_a_monitoring_container(cephadm_fs, monkeypatch): + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] + _get_ip_addresses = 
mock.MagicMock(return_value=(['10.10.10.10'], [])) + monkeypatch.setattr('cephadm.get_ip_addresses', _get_ip_addresses) + fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' + with with_cephadm_ctx([]) as ctx: + ctx.container_engine = mock_podman() + ctx.fsid = fsid + ctx.name = 'prometheus.fire' + ctx.image = 'quay.io/titans/prometheus:latest' + ctx.reconfig = False + ctx.config_blobs = { + 'config': 'XXXXXXX', + 'keyring': 'YYYYYY', + 'files': { + 'prometheus.yml': 'bettercallherc', + }, + } + _cephadm._common_deploy(ctx) + + basedir = pathlib.Path(f'/var/lib/ceph/{fsid}/prometheus.fire') + assert basedir.is_dir() + with open(basedir / 'unit.run') as f: + runfile_lines = f.read().splitlines() + assert 'podman' in runfile_lines[-1] + assert runfile_lines[-1].endswith( + 'quay.io/titans/prometheus:latest --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --web.listen-address=:9095 --storage.tsdb.retention.time=15d --storage.tsdb.retention.size=0 --web.external-url=http://10.10.10.10:9095' + ) + _firewalld().open_ports.assert_not_called() + assert not (basedir / 'config').exists() + assert not (basedir / 'keyring').exists() + with open(basedir / 'etc/prometheus/prometheus.yml') as f: + assert f.read() == 'bettercallherc' + si = os.fstat(f.fileno()) + assert (si.st_uid, si.st_gid) == (8765, 8765) From 8f82ef32c8b991ba777d5436e3cc69937805bc9f Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 14:17:02 -0400 Subject: [PATCH 0082/2492] cephadm: add unit test coverage for deploying tracing Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index fad142ebbabb..e6b4da8127fc 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -300,3 +300,33 @@ def test_deploy_a_monitoring_container(cephadm_fs, monkeypatch): assert f.read() == 'bettercallherc' si = os.fstat(f.fileno()) assert (si.st_uid, si.st_gid) == (8765, 8765) + + +def test_deploy_a_tracing_container(cephadm_fs, monkeypatch): + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] + fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' + with with_cephadm_ctx([]) as ctx: + ctx.container_engine = mock_podman() + ctx.fsid = fsid + ctx.name = 'elasticsearch.band' + ctx.image = 'quay.io/rubber/elasticsearch:latest' + ctx.reconfig = False + ctx.config_blobs = { + 'config': 'XXXXXXX', + 'keyring': 'YYYYYY', + 'files': { + 'prometheus.yml': 'bettercallherc', + }, + } + _cephadm._common_deploy(ctx) + + basedir = pathlib.Path(f'/var/lib/ceph/{fsid}/elasticsearch.band') + assert basedir.is_dir() + with open(basedir / 'unit.run') as f: + runfile_lines = f.read().splitlines() + assert 'podman' in runfile_lines[-1] + assert runfile_lines[-1].endswith('quay.io/rubber/elasticsearch:latest') + _firewalld().open_ports.assert_not_called() + assert not (basedir / 'config').exists() + assert not (basedir / 'keyring').exists() From 04e2ac04b68457824ec69bc0a712d595915e4f81 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 14:28:31 -0400 Subject: [PATCH 0083/2492] cephadm: add unit test coverage for deploying ceph container Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index e6b4da8127fc..a5511a34d1d3 100644 --- a/src/cephadm/tests/test_deploy.py 
+++ b/src/cephadm/tests/test_deploy.py @@ -330,3 +330,41 @@ def test_deploy_a_tracing_container(cephadm_fs, monkeypatch): _firewalld().open_ports.assert_not_called() assert not (basedir / 'config').exists() assert not (basedir / 'keyring').exists() + + +def test_deploy_ceph_mgr_container(cephadm_fs, monkeypatch): + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] + _make_var_run = mock.MagicMock() + monkeypatch.setattr('cephadm.make_var_run', _make_var_run) + fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' + with with_cephadm_ctx([]) as ctx: + ctx.container_engine = mock_podman() + ctx.fsid = fsid + ctx.name = 'mgr.foo' + ctx.image = 'quay.io/ceph/ceph:latest' + ctx.reconfig = False + ctx.allow_ptrace = False + ctx.osd_fsid = '00000000-0000-0000-0000-000000000000' + ctx.config_blobs = { + 'config': 'XXXXXXX', + 'keyring': 'YYYYYY', + } + _cephadm._common_deploy(ctx) + + basedir = pathlib.Path(f'/var/lib/ceph/{fsid}/mgr.foo') + assert basedir.is_dir() + with open(basedir / 'unit.run') as f: + runfile_lines = f.read().splitlines() + assert 'podman' in runfile_lines[-1] + assert runfile_lines[-1].endswith( + 'quay.io/ceph/ceph:latest -n mgr.foo -f --setuser ceph --setgroup ceph --default-log-to-file=false --default-log-to-journald=true --default-log-to-stderr=false' + ) + _firewalld().open_ports.assert_not_called() + with open(basedir / 'config') as f: + assert f.read() == 'XXXXXXX' + with open(basedir / 'keyring') as f: + assert f.read() == 'YYYYYY' + assert _make_var_run.call_count == 1 + assert _make_var_run.call_args[0][2] == 8765 + assert _make_var_run.call_args[0][3] == 8765 From 89dd3719a01323025da5ed7fd95ca27bd96af16b Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 14:43:51 -0400 Subject: [PATCH 0084/2492] cephadm: add unit test coverage for deploying ceph-exporter Signed-off-by: John Mulligan --- src/cephadm/tests/test_deploy.py | 45 ++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index a5511a34d1d3..1f4eddf3a70d 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -368,3 +368,48 @@ def test_deploy_ceph_mgr_container(cephadm_fs, monkeypatch): assert _make_var_run.call_count == 1 assert _make_var_run.call_args[0][2] == 8765 assert _make_var_run.call_args[0][3] == 8765 + + +def test_deploy_ceph_exporter_container(cephadm_fs, monkeypatch): + mocks = _common_mp(monkeypatch) + _firewalld = mocks['Firewalld'] + _get_ip_addresses = mock.MagicMock(return_value=(['10.10.10.10'], [])) + monkeypatch.setattr('cephadm.get_ip_addresses', _get_ip_addresses) + _make_var_run = mock.MagicMock() + monkeypatch.setattr('cephadm.make_var_run', _make_var_run) + fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' + with with_cephadm_ctx([]) as ctx: + ctx.container_engine = mock_podman() + ctx.fsid = fsid + ctx.name = 'ceph-exporter.zaq' + ctx.image = 'quay.io/ceph/ceph:latest' + ctx.reconfig = False + ctx.allow_ptrace = False + ctx.osd_fsid = '00000000-0000-0000-0000-000000000000' + ctx.config_blobs = { + 'config': 'XXXXXXX', + 'keyring': 'YYYYYY', + 'prio-limit': 12, + } + + # ceph-exporter is weird and special. it requires the "sock dir" + # to already exist. 
that dir defaults to /var/run/ceph + vrc = pathlib.Path('/var/run/ceph') + (vrc / fsid).mkdir(parents=True) + + _cephadm._common_deploy(ctx) + + basedir = pathlib.Path(f'/var/lib/ceph/{fsid}/ceph-exporter.zaq') + assert basedir.is_dir() + with open(basedir / 'unit.run') as f: + runfile_lines = f.read().splitlines() + assert 'podman' in runfile_lines[-1] + assert runfile_lines[-1].endswith( + 'quay.io/ceph/ceph:latest -n client.ceph-exporter.zaq -f --sock-dir=/var/run/ceph/ --addrs=0.0.0.0 --port=9926 --prio-limit=12 --stats-period=5' + ) + assert '--entrypoint /usr/bin/ceph-exporter' in runfile_lines[-1] + _firewalld().open_ports.assert_not_called() + with open(basedir / 'config') as f: + assert f.read() == 'XXXXXXX' + with open(basedir / 'keyring') as f: + assert f.read() == 'YYYYYY' From 14664264e51eec8e62119439f92dffd3cc487607 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 16:33:46 -0400 Subject: [PATCH 0085/2492] cephadm: remove gateways property from Ceph type The gateways list property of the Ceph type was used in exactly one place. In order to provide a clearer structure to the classes in cephadm, remove this property and simply list the daemon types specifically where they are needed. In the future, I hope to see these handled in a class/object based manner but this is still better for now. Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 4901abf42cd3..ed6841bbae89 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -211,7 +211,6 @@ def __eq__(self, other: Any) -> bool: class Ceph(DaemonForm): daemons = ('mon', 'mgr', 'osd', 'mds', 'rgw', 'rbd-mirror', 'crash', 'cephfs-mirror', 'ceph-exporter') - gateways = ('iscsi', 'nfs', 'nvmeof') @classmethod def for_daemon_type(cls, daemon_type: str) -> bool: @@ -2553,7 +2552,11 @@ def get_container( envs.append('TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728') if container_args is None: container_args = [] - if daemon_type in Ceph.daemons or daemon_type in Ceph.gateways: + unlimited_daemons = set(Ceph.daemons) + unlimited_daemons.add(CephIscsi.daemon_type) + unlimited_daemons.add(CephNvmeof.daemon_type) + unlimited_daemons.add(NFSGanesha.daemon_type) + if daemon_type in unlimited_daemons: set_pids_limit_unlimited(ctx, container_args) if daemon_type in ['mon', 'osd']: # mon and osd need privileged in order for libudev to query devices From 1e531f4854e742091d02d15c8bef7d7e0f70fe61 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 27 Sep 2023 18:00:03 -0400 Subject: [PATCH 0086/2492] cephadm: convert keepalived type to a ContainerDaemonForm Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 22 ++++++---------------- src/cephadm/tests/test_ingress.py | 2 +- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index ed6841bbae89..1d5c55369971 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -1240,7 +1240,7 @@ def get_sysctl_settings() -> List[str]: @register_daemon_form -class Keepalived(DaemonForm): +class Keepalived(ContainerDaemonForm): """Defines an Keepalived container""" daemon_type = 'keepalived' required_files = ['keepalived.conf'] @@ -1335,7 +1335,7 @@ def get_sysctl_settings() -> List[str]: 'net.ipv4.ip_nonlocal_bind = 1', ] - def extract_uid_gid_keepalived(self) -> Tuple[int, int]: + def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: # better directory for this? 
return extract_uid_gid(self.ctx, file_path='/var/lib') @@ -1345,6 +1345,10 @@ def get_container_mounts(data_dir: str) -> Dict[str, str]: mounts[os.path.join(data_dir, 'keepalived.conf')] = '/etc/keepalived/keepalived.conf' return mounts + def container(self, ctx: CephadmContext) -> CephContainer: + return get_deployment_container(ctx, self.identity) + + ################################## @@ -5235,20 +5239,6 @@ def _dispatch_deploy( endpoints=daemon_endpoints, ) - elif daemon_type == Keepalived.daemon_type: - keepalived = Keepalived.init(ctx, ident.fsid, ident.daemon_id) - uid, gid = keepalived.extract_uid_gid_keepalived() - c = get_deployment_container(ctx, ident) - deploy_daemon( - ctx, - ident, - c, - uid, - gid, - deployment_type=deployment_type, - endpoints=daemon_endpoints, - ) - elif daemon_type == CephadmAgent.daemon_type: # get current user gid and uid uid = os.getuid() diff --git a/src/cephadm/tests/test_ingress.py b/src/cephadm/tests/test_ingress.py index 798c73708686..51a6e113bc4a 100644 --- a/src/cephadm/tests/test_ingress.py +++ b/src/cephadm/tests/test_ingress.py @@ -331,7 +331,7 @@ def test_keepalived_extract_uid_gid_keepalived(): ) with mock.patch("cephadm.CephContainer") as cc: cc.return_value.run.return_value = "500 500" - uid, gid = kad.extract_uid_gid_keepalived() + uid, gid = kad.uid_gid(ctx) cc.return_value.run.assert_called() assert uid == 500 assert gid == 500 From aaffe62c0b224039db6c70097b7122e0ad72013a Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 27 Sep 2023 18:06:12 -0400 Subject: [PATCH 0087/2492] cephadm: convert haproxy type to a ContainerDaemonForm Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 20 +++++--------------- src/cephadm/tests/test_ingress.py | 2 +- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 1d5c55369971..bd116fd00636 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -1139,7 +1139,7 @@ def validate(self) -> None: @register_daemon_form -class HAproxy(DaemonForm): +class HAproxy(ContainerDaemonForm): """Defines an HAproxy container""" daemon_type = 'haproxy' required_files = ['haproxy.cfg'] @@ -1218,7 +1218,7 @@ def get_container_name(self, desc=None): cname = '%s-%s' % (cname, desc) return cname - def extract_uid_gid_haproxy(self) -> Tuple[int, int]: + def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: # better directory for this? 
return extract_uid_gid(self.ctx, file_path='/var/lib') @@ -1236,6 +1236,9 @@ def get_sysctl_settings() -> List[str]: 'net.ipv4.ip_nonlocal_bind = 1', ] + def container(self, ctx: CephadmContext) -> CephContainer: + return get_deployment_container(ctx, self.identity) + ################################## @@ -5225,19 +5228,6 @@ def _dispatch_deploy( deployment_type=deployment_type, endpoints=daemon_endpoints, ) - elif daemon_type == HAproxy.daemon_type: - haproxy = HAproxy.init(ctx, ident.fsid, ident.daemon_id) - uid, gid = haproxy.extract_uid_gid_haproxy() - c = get_deployment_container(ctx, ident) - deploy_daemon( - ctx, - ident, - c, - uid, - gid, - deployment_type=deployment_type, - endpoints=daemon_endpoints, - ) elif daemon_type == CephadmAgent.daemon_type: # get current user gid and uid diff --git a/src/cephadm/tests/test_ingress.py b/src/cephadm/tests/test_ingress.py index 51a6e113bc4a..08a9808ddbba 100644 --- a/src/cephadm/tests/test_ingress.py +++ b/src/cephadm/tests/test_ingress.py @@ -168,7 +168,7 @@ def test_haproxy_extract_uid_gid_haproxy(): ) with mock.patch("cephadm.CephContainer") as cc: cc.return_value.run.return_value = "500 500" - uid, gid = hap.extract_uid_gid_haproxy() + uid, gid = hap.uid_gid(ctx) cc.return_value.run.assert_called() assert uid == 500 assert gid == 500 From 04b2f4cddd8c1d132e3b6c31357b7455ac4b02da Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 27 Sep 2023 18:09:47 -0400 Subject: [PATCH 0088/2492] cephadm: convert tracing type to a ContainerDaemonForm Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index bd116fd00636..5582e1ef692f 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -1356,7 +1356,7 @@ def container(self, ctx: CephadmContext) -> CephContainer: @register_daemon_form -class Tracing(DaemonForm): +class Tracing(ContainerDaemonForm): """Define the configs for the jaeger tracing containers""" components: Dict[str, Dict[str, Any]] = { @@ -1404,6 +1404,14 @@ def create(cls, ctx: CephadmContext, ident: DaemonIdentity) -> 'Tracing': def identity(self) -> DaemonIdentity: return self._identity + def container(self, ctx: CephadmContext) -> CephContainer: + # TODO(jjm) this looks to be the only container for deployment + # not using get_deployment_container. Previous oversight? 
+ return get_container(ctx, self.identity) + + def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: + return 65534, 65534 + ################################## @@ -5216,18 +5224,6 @@ def _dispatch_deploy( deployment_type=deployment_type, endpoints=daemon_endpoints, ) - elif daemon_type in Tracing.components: - uid, gid = 65534, 65534 - c = get_container(ctx, ident) - deploy_daemon( - ctx, - ident, - c, - uid, - gid, - deployment_type=deployment_type, - endpoints=daemon_endpoints, - ) elif daemon_type == CephadmAgent.daemon_type: # get current user gid and uid From 1fcacd6b20838968438b05d4cbb832f8c2c97a07 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 27 Sep 2023 18:12:48 -0400 Subject: [PATCH 0089/2492] cephadm: convert nvmeof type to a ContainerDaemonForm Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 5582e1ef692f..7511b4cf17c2 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -928,7 +928,7 @@ def get_tcmu_runner_container(self): @register_daemon_form -class CephNvmeof(DaemonForm): +class CephNvmeof(ContainerDaemonForm): """Defines a Ceph-Nvmeof container""" daemon_type = 'nvmeof' @@ -1061,6 +1061,17 @@ def get_sysctl_settings() -> List[str]: 'vm.nr_hugepages = 4096', ] + def container(self, ctx: CephadmContext) -> CephContainer: + return get_deployment_container(ctx, self.identity) + + def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: + return 167, 167 # TODO: need to get properly the uid/gid + + def config_and_keyring( + self, ctx: CephadmContext + ) -> Tuple[Optional[str], Optional[str]]: + return get_config_and_keyring(ctx) + ################################## @@ -5209,21 +5220,6 @@ def _dispatch_deploy( deployment_type=deployment_type, endpoints=daemon_endpoints ) - elif daemon_type == CephNvmeof.daemon_type: - config, keyring = get_config_and_keyring(ctx) - uid, gid = 167, 167 # TODO: need to get properly the uid/gid - c = get_deployment_container(ctx, ident) - deploy_daemon( - ctx, - ident, - c, - uid, - gid, - config=config, - keyring=keyring, - deployment_type=deployment_type, - endpoints=daemon_endpoints, - ) elif daemon_type == CephadmAgent.daemon_type: # get current user gid and uid From abf39a669f04b130b248c2edc0e7102bbb61b2ec Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 27 Sep 2023 18:15:37 -0400 Subject: [PATCH 0090/2492] cephadm: convert iscsi type to a ContainerDaemonForm Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 7511b4cf17c2..6faf348a7fde 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -728,7 +728,7 @@ def config_and_keyring( @register_daemon_form -class CephIscsi(DaemonForm): +class CephIscsi(ContainerDaemonForm): """Defines a Ceph-Iscsi container""" daemon_type = 'iscsi' @@ -923,6 +923,17 @@ def get_tcmu_runner_container(self): tcmu_container.cname = self.get_container_name(desc='tcmu') return tcmu_container + def container(self, ctx: CephadmContext) -> CephContainer: + return get_deployment_container(ctx, self.identity) + + def config_and_keyring( + self, ctx: CephadmContext + ) -> Tuple[Optional[str], Optional[str]]: + return get_config_and_keyring(ctx) + + def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: + return extract_uid_gid(ctx) + 
################################## @@ -5205,22 +5216,6 @@ def _dispatch_deploy( endpoints=daemon_endpoints ) - elif daemon_type == CephIscsi.daemon_type: - config, keyring = get_config_and_keyring(ctx) - uid, gid = extract_uid_gid(ctx) - c = get_deployment_container(ctx, ident) - deploy_daemon( - ctx, - ident, - c, - uid, - gid, - config=config, - keyring=keyring, - deployment_type=deployment_type, - endpoints=daemon_endpoints - ) - elif daemon_type == CephadmAgent.daemon_type: # get current user gid and uid uid = os.getuid() From de6ccf0be4f08fb1fd3bdf301689d98f60295bc0 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 27 Sep 2023 18:21:51 -0400 Subject: [PATCH 0091/2492] cephadm: move extract_uid_gid_monitoring to Monitoring class Make it a method of the class handling monitoring, such as other classes already are. Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 52 ++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 6faf348a7fde..403575f0bf4b 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -541,6 +541,28 @@ def get_version(ctx, container_id, daemon_type): version = out.split(' ')[2] return version + @staticmethod + def extract_uid_gid( + ctx: CephadmContext, daemon_type: str + ) -> Tuple[int, int]: + if daemon_type == 'prometheus': + uid, gid = extract_uid_gid(ctx, file_path='/etc/prometheus') + elif daemon_type == 'node-exporter': + uid, gid = 65534, 65534 + elif daemon_type == 'grafana': + uid, gid = extract_uid_gid(ctx, file_path='/var/lib/grafana') + elif daemon_type == 'loki': + uid, gid = extract_uid_gid(ctx, file_path='/etc/loki') + elif daemon_type == 'promtail': + uid, gid = extract_uid_gid(ctx, file_path='/etc/promtail') + elif daemon_type == 'alertmanager': + uid, gid = extract_uid_gid( + ctx, file_path=['/etc/alertmanager', '/etc/prometheus'] + ) + else: + raise Error('{} not implemented yet'.format(daemon_type)) + return uid, gid + def __init__(self, ident: DaemonIdentity) -> None: self._identity = ident @@ -2654,7 +2676,7 @@ def get_container( container_args.extend(cc.get_container_args()) if daemon_type in Monitoring.components: - uid, gid = extract_uid_gid_monitoring(ctx, daemon_type) + uid, gid = Monitoring.extract_uid_gid(ctx, daemon_type) monitoring_args = [ '--user', str(uid), @@ -5013,26 +5035,6 @@ def command_registry_login(ctx: CephadmContext) -> int: ################################## -def extract_uid_gid_monitoring(ctx, daemon_type): - # type: (CephadmContext, str) -> Tuple[int, int] - - if daemon_type == 'prometheus': - uid, gid = extract_uid_gid(ctx, file_path='/etc/prometheus') - elif daemon_type == 'node-exporter': - uid, gid = 65534, 65534 - elif daemon_type == 'grafana': - uid, gid = extract_uid_gid(ctx, file_path='/var/lib/grafana') - elif daemon_type == 'loki': - uid, gid = extract_uid_gid(ctx, file_path='/etc/loki') - elif daemon_type == 'promtail': - uid, gid = extract_uid_gid(ctx, file_path='/etc/promtail') - elif daemon_type == 'alertmanager': - uid, gid = extract_uid_gid(ctx, file_path=['/etc/alertmanager', '/etc/prometheus']) - else: - raise Error('{} not implemented yet'.format(daemon_type)) - return uid, gid - - def get_deployment_container( ctx: CephadmContext, ident: 'DaemonIdentity', @@ -5204,7 +5206,7 @@ def _dispatch_deploy( raise Error('{} deployment requires config-json which must ' 'contain arg for {}'.format(daemon_type.capitalize(), ', '.join(required_args))) - uid, gid = 
extract_uid_gid_monitoring(ctx, daemon_type) + uid, gid = Monitoring.extract_uid_gid(ctx, daemon_type) c = get_deployment_container(ctx, ident) deploy_daemon( ctx, @@ -6094,7 +6096,7 @@ def command_adopt_ceph(ctx, daemon_type, daemon_id, fsid): def command_adopt_prometheus(ctx, daemon_id, fsid): # type: (CephadmContext, str, str) -> None daemon_type = 'prometheus' - (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type) + (uid, gid) = Monitoring.extract_uid_gid(ctx, daemon_type) # should try to set the ports we know cephadm defaults # to for these services in the firewall. ports = Monitoring.port_map['prometheus'] @@ -6141,7 +6143,7 @@ def command_adopt_grafana(ctx, daemon_id, fsid): # type: (CephadmContext, str, str) -> None daemon_type = 'grafana' - (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type) + (uid, gid) = Monitoring.extract_uid_gid(ctx, daemon_type) # should try to set the ports we know cephadm defaults # to for these services in the firewall. ports = Monitoring.port_map['grafana'] @@ -6212,7 +6214,7 @@ def command_adopt_alertmanager(ctx, daemon_id, fsid): # type: (CephadmContext, str, str) -> None daemon_type = 'alertmanager' - (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type) + (uid, gid) = Monitoring.extract_uid_gid(ctx, daemon_type) # should try to set the ports we know cephadm defaults # to for these services in the firewall. ports = Monitoring.port_map['alertmanager'] From 9015edc3f34bbaf1906b9e5c83be7c49ef7c42c8 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 27 Sep 2023 18:30:17 -0400 Subject: [PATCH 0092/2492] cephadm: convert monitoring type to a ContainerDaemonForm Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 66 +++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 403575f0bf4b..6c87d7983b6f 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -422,7 +422,7 @@ def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: ################################## @register_daemon_form -class Monitoring(DaemonForm): +class Monitoring(ContainerDaemonForm): """Define the configs for the monitoring containers""" port_map = { @@ -574,6 +574,42 @@ def create(cls, ctx: CephadmContext, ident: DaemonIdentity) -> 'Monitoring': def identity(self) -> DaemonIdentity: return self._identity + def container(self, ctx: CephadmContext) -> CephContainer: + self._prevalidate(ctx) + return get_deployment_container(ctx, self.identity) + + def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: + return self.extract_uid_gid(ctx, self.identity.daemon_type) + + def _prevalidate(self, ctx: CephadmContext) -> None: + # before being refactored into a ContainerDaemonForm these checks were + # done inside the deploy function. 
This was the only "family" of daemons + # that performed these checks in that location + daemon_type = self.identity.daemon_type + config = fetch_configs(ctx) # type: ignore + required_files = self.components[daemon_type].get( + 'config-json-files', list() + ) + required_args = self.components[daemon_type].get( + 'config-json-args', list() + ) + if required_files: + if not config or not all(c in config.get('files', {}).keys() for c in required_files): # type: ignore + raise Error( + '{} deployment requires config-json which must ' + 'contain file content for {}'.format( + daemon_type.capitalize(), ', '.join(required_files) + ) + ) + if required_args: + if not config or not all(c in config.keys() for c in required_args): # type: ignore + raise Error( + '{} deployment requires config-json which must ' + 'contain arg for {}'.format( + daemon_type.capitalize(), ', '.join(required_args) + ) + ) + ################################## @@ -5190,34 +5226,6 @@ def _dispatch_deploy( endpoints=daemon_endpoints, ) - elif daemon_type in Monitoring.components: - # monitoring daemon - prometheus, grafana, alertmanager, node-exporter - # Default Checks - # make sure provided config-json is sufficient - config = fetch_configs(ctx) # type: ignore - required_files = Monitoring.components[daemon_type].get('config-json-files', list()) - required_args = Monitoring.components[daemon_type].get('config-json-args', list()) - if required_files: - if not config or not all(c in config.get('files', {}).keys() for c in required_files): # type: ignore - raise Error('{} deployment requires config-json which must ' - 'contain file content for {}'.format(daemon_type.capitalize(), ', '.join(required_files))) - if required_args: - if not config or not all(c in config.keys() for c in required_args): # type: ignore - raise Error('{} deployment requires config-json which must ' - 'contain arg for {}'.format(daemon_type.capitalize(), ', '.join(required_args))) - - uid, gid = Monitoring.extract_uid_gid(ctx, daemon_type) - c = get_deployment_container(ctx, ident) - deploy_daemon( - ctx, - ident, - c, - uid, - gid, - deployment_type=deployment_type, - endpoints=daemon_endpoints - ) - elif daemon_type == CephadmAgent.daemon_type: # get current user gid and uid uid = os.getuid() From ffe1f2f8f159749905e224dbaa06f79681063b08 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Thu, 28 Sep 2023 11:02:37 -0400 Subject: [PATCH 0093/2492] cephadm: update test to avoid using exception handling as an assertion The use of an exception as an assertion mostly works but has the side effect of hiding other errors. Hiding these errors can make it hard to debug problems in this code path, as it did for me recently. Update the test to use a standard assertion as well as asserting that the assertion must have been called. 
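As an illustration of the pattern (a minimal, hypothetical sketch using unittest.mock, not the real cephadm test), a plain assert in the side effect plus a final assert_called() keeps both failure modes visible:

    from unittest import mock

    def _checker(container_args):
        # a failing assert reports the actual mismatch, instead of every
        # failure surfacing as the same generic Exception
        assert '--set-crush-location database=a' in ' '.join(container_args)

    _deploy = mock.MagicMock(side_effect=_checker)
    _deploy(['--set-crush-location', 'database=a'])  # stand-in for the deploy path
    _deploy.assert_called()  # prove the checker actually ran

Without the closing assert_called(), a code path that silently skipped the deploy call would still pass the test.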
Signed-off-by: John Mulligan --- src/cephadm/tests/test_cephadm.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py index ff474c23ccd9..7e31b26307c3 100644 --- a/src/cephadm/tests/test_cephadm.py +++ b/src/cephadm/tests/test_cephadm.py @@ -423,13 +423,12 @@ def test_mon_crush_location(self, _get_deployment_container, _migrate_sysctl, _m ) def _crush_location_checker(ctx, ident, container, uid, gid, **kwargs): - print(container.args) - raise Exception(' '.join(container.args)) + argval = ' '.join(container.args) + assert '--set-crush-location database=a' in argval _deploy_daemon.side_effect = _crush_location_checker - - with pytest.raises(Exception, match='--set-crush-location database=a'): - _cephadm.command_deploy_from(ctx) + _cephadm.command_deploy_from(ctx) + _deploy_daemon.assert_called() @mock.patch('cephadm.logger') @mock.patch('cephadm.fetch_custom_config_files') From b911cf42c058cde51ce5a57234845136e30743bd Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 21 Sep 2023 20:46:50 -0400 Subject: [PATCH 0094/2492] qa: narrow search to debug_asok To avoid matching debug_asok_assert_abort. Signed-off-by: Patrick Donnelly --- qa/workunits/mon/config.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/workunits/mon/config.sh b/qa/workunits/mon/config.sh index 1b00201ae481..9a62081c6680 100755 --- a/qa/workunits/mon/config.sh +++ b/qa/workunits/mon/config.sh @@ -98,11 +98,11 @@ ceph tell osd.0 config unset debug_asok ceph tell osd.0 config unset debug_asok ceph config rm osd.0 debug_asok -while ceph config show osd.0 | grep debug_asok | grep mon +while ceph config show osd.0 | grep '^debug_asok[[:space:]]' | grep mon do sleep 1 done -ceph config show osd.0 | grep -c debug_asok | grep 0 +ceph config show osd.0 | grep -c '^debug_asok[[:space:]]' | grep 0 ceph config set osd.0 osd_scrub_cost 123 while ! ceph config show osd.0 | grep osd_scrub_cost | grep mon From cd1833ab152d0231c1c6f3685238f4fcd3284f6e Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 21 Sep 2023 20:47:40 -0400 Subject: [PATCH 0095/2492] qa: add reproducer for obs removal deadlock And hopefully this "config set" race test may catch future bugs!
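Roughly the same hammer can be driven from outside the workunit as well; an illustrative Python sketch (it assumes a running cluster and the ceph CLI on PATH):

    import subprocess
    from concurrent.futures import ThreadPoolExecutor

    def set_timeout(i):
        # each call spawns a short-lived ceph client; its Objecter handles
        # this option, racing observer add/remove against config flushes
        subprocess.run(['ceph', 'config', 'set', 'client',
                        'rados_mon_op_timeout', str(300 + i)], check=True)

    for _ in range(10):
        with ThreadPoolExecutor(max_workers=20) as pool:
            list(pool.map(set_timeout, range(100)))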
Signed-off-by: Patrick Donnelly --- qa/workunits/mon/config.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/qa/workunits/mon/config.sh b/qa/workunits/mon/config.sh index 9a62081c6680..7d97a452bd89 100755 --- a/qa/workunits/mon/config.sh +++ b/qa/workunits/mon/config.sh @@ -130,6 +130,21 @@ rm -f $t1 $t2 expect_false ceph config reset expect_false ceph config reset -1 + + +# test parallel config set +# reproducer for https://tracker.ceph.com/issues/62832 +ceph config reset 0 +for ((try = 0; try < 10; try++)); do + set +x + for ((i = 0; i < 100; i++)); do + # Use a config that will get "handled" by the Objecter instantiated by the ceph binary + ceph config set client rados_mon_op_timeout $((i+300)) & + done 2> /dev/null + set -x + wait +done + # we are at end of testing, so it's okay to revert everything ceph config reset 0 From a8bd314bbd78dbe73371e7a8beaaa1929577b76e Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 3 Oct 2023 20:45:12 +0000 Subject: [PATCH 0096/2492] common/ceph_mutex: note whether mutex debug methods are usable So we can do checks like: ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); Without this boolean, this check: ceph_assert(!ceph_mutex_is_locked_by_me(lock)); will fail for all crimson/release builds because the method always returns true. Signed-off-by: Patrick Donnelly --- src/common/ceph_mutex.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/common/ceph_mutex.h b/src/common/ceph_mutex.h index 6ed8c56d5dad..059d81f2ac39 100644 --- a/src/common/ceph_mutex.h +++ b/src/common/ceph_mutex.h @@ -83,6 +83,7 @@ namespace ceph { return {}; } + static constexpr bool mutex_debugging = false; #define ceph_mutex_is_locked(m) true #define ceph_mutex_is_locked_by_me(m) true } @@ -130,6 +131,8 @@ namespace ceph { return {std::forward(args)...}; } + static constexpr bool mutex_debugging = true; + // debug methods #define ceph_mutex_is_locked(m) ((m).is_locked()) #define ceph_mutex_is_not_locked(m) (!(m).is_locked()) @@ -183,6 +186,8 @@ namespace ceph { return {}; } + static constexpr bool mutex_debugging = false; + // debug methods. Note that these can blindly return true // because any code that does anything other than assert these // are true is broken. From 7b5076bafeb8981cd809437ee04fc570857a05f4 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 20 Sep 2023 22:00:03 -0400 Subject: [PATCH 0097/2492] common: add missing locks in config_proxy methods It's not generally safe to access the md_config_t without these locks. Some methods are probably harmless (accessing read-only state) but best to be consistent. 
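The hazard being closed is the usual unguarded-reader one; in Python terms (an illustrative sketch only — the actual change adds std::lock_guard to the C++ accessors):

    import threading

    class Conf:
        def __init__(self):
            self.lock = threading.Lock()
            self.conf_path = ''
            self.parse_error = ''

        def parse(self, path, error):
            # writer updates two related fields together
            with self.lock:
                self.conf_path = path
                self.parse_error = error

        def has_parse_error(self):
            # reader takes the same lock, so it can never observe the
            # pair of fields mid-update
            with self.lock:
                return bool(self.parse_error)

Even where a single read is atomic on its own, taking the lock consistently means later changes to the writer side cannot quietly invalidate a reader.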
Signed-off-by: Patrick Donnelly --- src/common/config_proxy.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h index 02c670f60277..400aa4ed052d 100644 --- a/src/common/config_proxy.h +++ b/src/common/config_proxy.h @@ -150,12 +150,15 @@ class ConfigProxy { std::forward(args)...); } void config_options(ceph::Formatter *f) const { + std::lock_guard l{lock}; config.config_options(f); } const decltype(md_config_t::schema)& get_schema() const { + std::lock_guard l{lock}; return config.schema; } const Option* get_schema(const std::string_view key) const { + std::lock_guard l{lock}; auto found = config.schema.find(key); if (found == config.schema.end()) { return nullptr; @@ -164,6 +167,7 @@ class ConfigProxy { } } const Option *find_option(const std::string& name) const { + std::lock_guard l{lock}; return config.find_option(name); } void diff(ceph::Formatter *f, const std::string& name = {}) const { @@ -186,6 +190,7 @@ class ConfigProxy { sections, key, out, emeta); } unsigned get_osd_pool_default_min_size(uint8_t size) const { + std::lock_guard l{lock}; return config.get_osd_pool_default_min_size(values, size); } void early_expand_meta(std::string &val, @@ -225,9 +230,11 @@ class ConfigProxy { call_observers(locker, rev_obs); } void set_safe_to_start_threads() { + std::lock_guard l(lock); config.set_safe_to_start_threads(); } void _clear_safe_to_start_threads() { + std::lock_guard l(lock); config._clear_safe_to_start_threads(); } void show_config(std::ostream& out) { @@ -319,12 +326,15 @@ class ConfigProxy { conf_files, warnings, flags); } bool has_parse_error() const { + std::lock_guard l(lock); return !config.parse_error.empty(); } std::string get_parse_error() { + std::lock_guard l(lock); return config.parse_error; } void complain_about_parse_error(CephContext *cct) { + std::lock_guard l(lock); return config.complain_about_parse_error(cct); } void do_argv_commands() const { @@ -342,9 +352,11 @@ class ConfigProxy { config.get_defaults_bl(values, bl); } const std::string& get_conf_path() const { + std::lock_guard l(lock); return config.get_conf_path(); } std::optional get_val_default(std::string_view key) { + std::lock_guard l(lock); return config.get_val_default(key); } }; From 6fa7167732085d651285a11fd85c1bc33a2afcd7 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Aug 2023 11:11:09 -0400 Subject: [PATCH 0098/2492] qa/rgw/tempest: rearrange tasks and overrides Signed-off-by: Casey Bodley --- qa/suites/rgw/tempest/0-install.yaml | 15 +++++++++ qa/suites/rgw/tempest/overrides.yaml | 14 ++++++++ .../tasks/{rgw_tempest.yaml => tempest.yaml} | 33 ------------------- 3 files changed, 29 insertions(+), 33 deletions(-) create mode 100644 qa/suites/rgw/tempest/0-install.yaml rename qa/suites/rgw/tempest/tasks/{rgw_tempest.yaml => tempest.yaml} (67%) diff --git a/qa/suites/rgw/tempest/0-install.yaml b/qa/suites/rgw/tempest/0-install.yaml new file mode 100644 index 000000000000..fc2cfcc7b6d2 --- /dev/null +++ b/qa/suites/rgw/tempest/0-install.yaml @@ -0,0 +1,15 @@ +tasks: +- install: +- ceph: +- tox: [ client.0 ] +- keystone: + client.0: + force-branch: stable/2023.1 + services: + - name: swift + type: object-store + description: Swift Service +- rgw: + client.0: + frontend_prefix: /swift + use-keystone-role: client.0 diff --git a/qa/suites/rgw/tempest/overrides.yaml b/qa/suites/rgw/tempest/overrides.yaml index e7a292ffd1fd..9fb0e11e9493 100644 --- a/qa/suites/rgw/tempest/overrides.yaml +++ 
b/qa/suites/rgw/tempest/overrides.yaml @@ -1,7 +1,21 @@ overrides: ceph: conf: + global: + osd_min_pg_log_entries: 10 + osd_max_pg_log_entries: 10 client: setuser: ceph setgroup: ceph debug rgw: 20 + rgw keystone api version: 3 + rgw keystone accepted roles: admin,member + rgw keystone implicit tenants: true + rgw keystone accepted admin roles: admin + rgw swift enforce content length: true + rgw swift account in url: true + rgw swift versioning enabled: true + rgw keystone admin domain: Default + rgw keystone admin user: admin + rgw keystone admin password: ADMIN + rgw keystone admin project: admin diff --git a/qa/suites/rgw/tempest/tasks/rgw_tempest.yaml b/qa/suites/rgw/tempest/tasks/tempest.yaml similarity index 67% rename from qa/suites/rgw/tempest/tasks/rgw_tempest.yaml rename to qa/suites/rgw/tempest/tasks/tempest.yaml index ad9dc9dd5025..560e41de5bbb 100644 --- a/qa/suites/rgw/tempest/tasks/rgw_tempest.yaml +++ b/qa/suites/rgw/tempest/tasks/tempest.yaml @@ -1,18 +1,4 @@ tasks: -- install: -- ceph: -- tox: [ client.0 ] -- keystone: - client.0: - force-branch: stable/2023.1 - services: - - name: swift - type: object-store - description: Swift Service -- rgw: - client.0: - frontend_prefix: /swift - use-keystone-role: client.0 - tempest: client.0: sha1: 34.1.0 @@ -51,22 +37,3 @@ tasks: - .*test_object_expiry.ObjectExpiryTest.test_get_object_after_expiry_time - .*test_object_expiry.ObjectExpiryTest.test_get_object_at_expiry_time - .*test_account_services.AccountTest.test_list_no_account_metadata - -overrides: - ceph: - conf: - global: - osd_min_pg_log_entries: 10 - osd_max_pg_log_entries: 10 - client: - rgw keystone api version: 3 - rgw keystone accepted roles: admin,member - rgw keystone implicit tenants: true - rgw keystone accepted admin roles: admin - rgw swift enforce content length: true - rgw swift account in url: true - rgw swift versioning enabled: true - rgw keystone admin domain: Default - rgw keystone admin user: admin - rgw keystone admin password: ADMIN - rgw keystone admin project: admin From f19d05dcab8093424d9cd50943972fa7f1942f92 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Aug 2023 17:03:40 -0400 Subject: [PATCH 0099/2492] qa/keystone: merge overrides into config Signed-off-by: Casey Bodley --- qa/tasks/keystone.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/tasks/keystone.py b/qa/tasks/keystone.py index 7aa785055c21..f0f09842e605 100644 --- a/qa/tasks/keystone.py +++ b/qa/tasks/keystone.py @@ -463,6 +463,8 @@ def task(ctx, config): config = all_clients if isinstance(config, list): config = dict.fromkeys(config) + overrides = ctx.config.get('overrides', {}) + teuthology.deep_merge(config, overrides.get('keystone', {})) log.debug('Keystone config is %s', config) From 86b8b4b0198045b733235315c167ff1cb249a16b Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Aug 2023 12:00:28 -0400 Subject: [PATCH 0100/2492] qa/keystone: extract os_auth_args to helper function Signed-off-by: Casey Bodley --- qa/tasks/keystone.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/qa/tasks/keystone.py b/qa/tasks/keystone.py index f0f09842e605..27e808f68835 100644 --- a/qa/tasks/keystone.py +++ b/qa/tasks/keystone.py @@ -326,25 +326,26 @@ def dict_to_args(specials, items): args.extend(arg for arg in special_vals.values() if arg) return args +def os_auth_args(host, port): + return [ + '--os-username', 'admin', + '--os-password', 'ADMIN', + '--os-user-domain-id', 'default', + '--os-project-name', 'admin', + 
'--os-project-domain-id', 'default', + '--os-identity-api-version', '3', + '--os-auth-url', 'http://{host}:{port}/v3'.format(host=host, port=port), + ] + def run_section_cmds(ctx, cclient, section_cmd, specials, section_config_list): public_host, public_port = ctx.keystone.public_endpoints[cclient] - - auth_section = [ - ( 'os-username', 'admin' ), - ( 'os-password', 'ADMIN' ), - ( 'os-user-domain-id', 'default' ), - ( 'os-project-name', 'admin' ), - ( 'os-project-domain-id', 'default' ), - ( 'os-identity-api-version', '3' ), - ( 'os-auth-url', 'http://{host}:{port}/v3'.format(host=public_host, - port=public_port) ), - ] + auth_args = os_auth_args(public_host, public_port) for section_item in section_config_list: run_in_keystone_venv(ctx, cclient, - [ 'openstack' ] + section_cmd.split() + - dict_to_args(specials, auth_section + list(section_item.items())) + + [ 'openstack' ] + section_cmd.split() + auth_args + + dict_to_args(specials, list(section_item.items())) + [ '--debug' ]) def create_endpoint(ctx, cclient, service, url, adminurl=None): From 1456d12a9723f509717d706c09b252f9b810b300 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Aug 2023 12:01:15 -0400 Subject: [PATCH 0101/2492] qa/keystone: config can create ec2 credentials Signed-off-by: Casey Bodley --- qa/tasks/keystone.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/qa/tasks/keystone.py b/qa/tasks/keystone.py index 27e808f68835..fc45b9c2938d 100644 --- a/qa/tasks/keystone.py +++ b/qa/tasks/keystone.py @@ -387,6 +387,8 @@ def fill_keystone(ctx, config): cconfig.get('projects', [])) run_section_cmds(ctx, cclient, 'user create --or-show', 'name', cconfig.get('users', [])) + run_section_cmds(ctx, cclient, 'ec2 credentials create', '', + cconfig.get('ec2 credentials', [])) run_section_cmds(ctx, cclient, 'role create --or-show', 'name', cconfig.get('roles', [])) run_section_cmds(ctx, cclient, 'role add', 'name', @@ -441,6 +443,9 @@ def task(ctx, config): - name: custom password: SECRET project: custom + ec2 credentials: + - project: custom + user: custom roles: [ name: custom ] role-mappings: - name: custom From f9872e58a1e5c0e0d1ca5d56e72e2333f5d8e810 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Aug 2023 12:02:24 -0400 Subject: [PATCH 0102/2492] qa/keystone: expose function to read a user's ec2 creds Signed-off-by: Casey Bodley --- qa/tasks/keystone.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/qa/tasks/keystone.py b/qa/tasks/keystone.py index fc45b9c2938d..bffeeeae1811 100644 --- a/qa/tasks/keystone.py +++ b/qa/tasks/keystone.py @@ -3,6 +3,8 @@ """ import argparse import contextlib +from io import StringIO +import json import logging # still need this for python3.6 @@ -35,12 +37,12 @@ def toxvenv_sh(ctx, remote, args, **kwargs): activate = get_toxvenv_dir(ctx) + '/bin/activate' return remote.sh(['source', activate, run.Raw('&&')] + args, **kwargs) -def run_in_keystone_venv(ctx, client, args): - run_in_keystone_dir(ctx, client, +def run_in_keystone_venv(ctx, client, args, **kwargs): + return run_in_keystone_dir(ctx, client, [ 'source', '.tox/venv/bin/activate', run.Raw('&&') - ] + args) + ] + args, **kwargs) def get_keystone_venved_cmd(ctx, cmd, args, env=[]): kbindir = get_keystone_dir(ctx) + '/.tox/venv/bin/' @@ -420,6 +422,29 @@ def assign_ports(ctx, config, initial_port): return role_endpoints +def read_ec2_credentials(ctx, client, user): + """ + Look up EC2 credentials for the given user. 
+ + Returns a dictionary of the form: + { + "Access": "b2c9a792ff934b50b7e5c6d8f0fbbc96", + "Secret": "53b34a24a8e244ca89f1d754f089b63a", + "Project ID": "49208b6cc1864a0ea1cd7de3b456db11", + "User ID": "3276c0e0116a4a3ab1dd462ae4846416" + } + """ + public_host, public_port = ctx.keystone.public_endpoints[client] + procs = run_in_keystone_venv(ctx, client, + ['openstack', 'ec2', 'credentials', 'list', + '--user', user, '--format', 'json', '--debug'] + + os_auth_args(public_host, public_port), + stdout=StringIO()) + assert len(procs) == 1 + response = json.loads(procs[0].stdout.getvalue()) + assert len(response) + return response[0] + @contextlib.contextmanager def task(ctx, config): """ @@ -476,6 +501,7 @@ def task(ctx, config): ctx.keystone = argparse.Namespace() ctx.keystone.public_endpoints = assign_ports(ctx, config, 5000) + ctx.keystone.read_ec2_credentials = read_ec2_credentials with contextutil.nested( lambda: download(ctx=ctx, config=config), From ff590c5ae467d3fb3cff920a1c3a04cf060097ec Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Aug 2023 12:11:15 -0400 Subject: [PATCH 0103/2492] qa/s3tests: create_users() takes all yaml config Signed-off-by: Casey Bodley --- qa/tasks/s3tests.py | 60 +++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py index 70b49c086b6f..ad7d1b0fb5ac 100644 --- a/qa/tasks/s3tests.py +++ b/qa/tasks/s3tests.py @@ -90,22 +90,21 @@ def _config_user(s3tests_conf, section, user): @contextlib.contextmanager -def create_users(ctx, config): +def create_users(ctx, config, s3tests_conf): """ Create a main and an alternate s3 user. """ - assert isinstance(config, dict) log.info('Creating rgw users...') testdir = teuthology.get_testdir(ctx) users = {'s3 main': 'foo', 's3 alt': 'bar', 's3 tenant': 'testx$tenanteduser', 'iam': 'foobar'} - for client in config['clients']: - s3tests_conf = config['s3tests_conf'][client] - s3tests_conf.setdefault('fixtures', {}) - s3tests_conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-') + for client, cconfig in config.items(): + conf = s3tests_conf[client] + conf.setdefault('fixtures', {}) + conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-') for section, user in users.items(): - _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client)) - log.debug('Creating user {user} on {host}'.format(user=s3tests_conf[section]['user_id'], host=client)) + _config_user(conf, section, '{user}.{client}'.format(user=user, client=client)) + log.debug('Creating user {user} on {host}'.format(user=conf[section]['user_id'], host=client)) cluster_name, daemon_type, client_id = teuthology.split_role(client) client_with_id = daemon_type + '.' 
+ client_id # create user @@ -117,12 +116,12 @@ def create_users(ctx, config): 'radosgw-admin', '-n', client_with_id, 'user', 'create', - '--uid', s3tests_conf[section]['user_id'], - '--display-name', s3tests_conf[section]['display_name'], - '--email', s3tests_conf[section]['email'], + '--uid', conf[section]['user_id'], + '--display-name', conf[section]['display_name'], + '--email', conf[section]['email'], '--caps', 'user-policy=*', - '--access-key', s3tests_conf[section]['access_key'], - '--secret', s3tests_conf[section]['secret_key'], + '--access-key', conf[section]['access_key'], + '--secret', conf[section]['secret_key'], '--cluster', cluster_name, ], ) @@ -136,10 +135,10 @@ def create_users(ctx, config): 'radosgw-admin', '-n', client_with_id, 'mfa', 'create', - '--uid', s3tests_conf[section]['user_id'], - '--totp-serial', s3tests_conf[section]['totp_serial'], - '--totp-seed', s3tests_conf[section]['totp_seed'], - '--totp-seconds', s3tests_conf[section]['totp_seconds'], + '--uid', conf[section]['user_id'], + '--totp-serial', conf[section]['totp_serial'], + '--totp-seed', conf[section]['totp_seed'], + '--totp-seconds', conf[section]['totp_seconds'], '--totp-window', '8', '--totp-seed-type', 'base32', '--cluster', cluster_name, @@ -156,7 +155,7 @@ def create_users(ctx, config): 'radosgw-admin', '-n', client_with_id, 'caps', 'add', - '--uid', s3tests_conf[section]['user_id'], + '--uid', conf[section]['user_id'], '--caps', 'roles=*', '--cluster', cluster_name, ], @@ -169,26 +168,26 @@ def create_users(ctx, config): 'radosgw-admin', '-n', client_with_id, 'caps', 'add', - '--uid', s3tests_conf[section]['user_id'], + '--uid', conf[section]['user_id'], '--caps', 'oidc-provider=*', '--cluster', cluster_name, ], ) if "TOKEN" in os.environ: - s3tests_conf.setdefault('webidentity', {}) - s3tests_conf['webidentity'].setdefault('token',os.environ['TOKEN']) - s3tests_conf['webidentity'].setdefault('aud',os.environ['AUD']) - s3tests_conf['webidentity'].setdefault('sub',os.environ['SUB']) - s3tests_conf['webidentity'].setdefault('azp',os.environ['AZP']) - s3tests_conf['webidentity'].setdefault('user_token',os.environ['USER_TOKEN']) - s3tests_conf['webidentity'].setdefault('thumbprint',os.environ['THUMBPRINT']) - s3tests_conf['webidentity'].setdefault('KC_REALM',os.environ['KC_REALM']) + conf.setdefault('webidentity', {}) + conf['webidentity'].setdefault('token',os.environ['TOKEN']) + conf['webidentity'].setdefault('aud',os.environ['AUD']) + conf['webidentity'].setdefault('sub',os.environ['SUB']) + conf['webidentity'].setdefault('azp',os.environ['AZP']) + conf['webidentity'].setdefault('user_token',os.environ['USER_TOKEN']) + conf['webidentity'].setdefault('thumbprint',os.environ['THUMBPRINT']) + conf['webidentity'].setdefault('KC_REALM',os.environ['KC_REALM']) try: yield finally: - for client in config['clients']: + for client in config.keys(): for user in users.values(): uid = '{user}.{client}'.format(user=user, client=client) cluster_name, daemon_type, client_id = teuthology.split_role(client) @@ -638,10 +637,7 @@ def task(ctx, config): with contextutil.nested( lambda: download(ctx=ctx, config=config), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), + lambda: create_users(ctx=ctx, config=config, s3tests_conf=s3tests_conf), lambda: configure(ctx=ctx, config=dict( clients=config, s3tests_conf=s3tests_conf, From fa70c782882a541b5beb1cb8ebfdc2a5119f1d88 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Aug 2023 12:25:19 -0400 Subject: [PATCH 
0104/2492] qa/s3tests: enable testing against keystone users with ec2 Signed-off-by: Casey Bodley --- qa/tasks/s3tests.py | 51 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py index ad7d1b0fb5ac..7006602f2bea 100644 --- a/qa/tasks/s3tests.py +++ b/qa/tasks/s3tests.py @@ -102,8 +102,31 @@ def create_users(ctx, config, s3tests_conf): conf = s3tests_conf[client] conf.setdefault('fixtures', {}) conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-') + + keystone_users = cconfig.get('keystone users', {}) for section, user in users.items(): _config_user(conf, section, '{user}.{client}'.format(user=user, client=client)) + + # for keystone users, read ec2 credentials into s3tests.conf instead + # of creating a local user + keystone_user = keystone_users.get(section) + if keystone_user: + project_name = keystone_user.pop('project') + creds = ctx.keystone.read_ec2_credentials(ctx, **keystone_user) + access = creds['Access'] + secret = creds['Secret'] + project_id = creds['Project ID'] + + conf[section]['access_key'] = access + conf[section]['secret_key'] = secret + conf[section]['user_id'] = project_id + conf[section]['display_name'] = project_name + + log.debug('Using keystone user {kuser} credentials ({access} : {secret}) for {pname}:{pid} on {host}'.format( + kuser=keystone_user['user'], access=access, secret=secret, + pname=project_name, pid=project_id, host=client)) + continue + log.debug('Creating user {user} on {host}'.format(user=conf[section]['user_id'], host=client)) cluster_name, daemon_type, client_id = teuthology.split_role(client) client_with_id = daemon_type + '.' + client_id @@ -189,6 +212,9 @@ def create_users(ctx, config, s3tests_conf): finally: for client in config.keys(): for user in users.values(): + # don't need to delete keystone users + if not user in keystone_users: + continue uid = '{user}.{client}'.format(user=user, client=client) cluster_name, daemon_type, client_id = teuthology.split_role(client) client_with_id = daemon_type + '.' 
+ client_id @@ -501,6 +527,31 @@ def task(ctx, config): cloudtier_tests: True rgw_server: client.0 + To test against Keystone users with EC2 credentials:: + + tasks: + - ceph: + - rgw: [client.0 client.1] + - keystone: + client.0: + projects: + - name: myproject + description: my project + users: + - name: myuser + password: SECRET + project: myproject + ec2 credentials: + - project: myproject + user: myuser + - s3tests: + client.0: + keystone users: + s3 main: + client: client.0 + project: myproject + user: myuser + """ assert hasattr(ctx, 'rgw'), 's3tests must run after the rgw task' assert hasattr(ctx, 'tox'), 's3tests must run after the tox task' From 7b27f9dea979f6845e71dcc8eeffac7d1485eeab Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Aug 2023 12:29:47 -0400 Subject: [PATCH 0105/2492] qa/rgw/tempest: run s3tests against keystone ec2 Fixes: https://tracker.ceph.com/issues/59424 Signed-off-by: Casey Bodley --- qa/suites/rgw/tempest/s3tests-branch.yaml | 1 + qa/suites/rgw/tempest/tasks/+ | 0 qa/suites/rgw/tempest/tasks/s3tests.yaml | 35 +++++++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 120000 qa/suites/rgw/tempest/s3tests-branch.yaml create mode 100644 qa/suites/rgw/tempest/tasks/+ create mode 100644 qa/suites/rgw/tempest/tasks/s3tests.yaml diff --git a/qa/suites/rgw/tempest/s3tests-branch.yaml b/qa/suites/rgw/tempest/s3tests-branch.yaml new file mode 120000 index 000000000000..bdcaca48ae02 --- /dev/null +++ b/qa/suites/rgw/tempest/s3tests-branch.yaml @@ -0,0 +1 @@ +.qa/rgw/s3tests-branch.yaml \ No newline at end of file diff --git a/qa/suites/rgw/tempest/tasks/+ b/qa/suites/rgw/tempest/tasks/+ new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/qa/suites/rgw/tempest/tasks/s3tests.yaml b/qa/suites/rgw/tempest/tasks/s3tests.yaml new file mode 100644 index 000000000000..f1ee8a17c4ce --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3tests.yaml @@ -0,0 +1,35 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth use keystone: true + rgw crypt s3 kms backend: testing + rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo= + rgw crypt require ssl: false + keystone: + client.0: + projects: + - name: s3tests + description: s3tests project + users: + - name: s3tests-main + password: SECRET + project: s3tests + ec2 credentials: + - project: s3tests + user: s3tests-main + roles: [ name: member ] + role-mappings: + - name: member + user: s3tests-main + project: s3tests + +tasks: +- s3tests: + client.0: + rgw_server: client.0 + keystone users: + s3 main: + client: client.0 + project: s3tests + user: s3tests-main From c19aab4c31e89defca9409440463f4654b6cafd5 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Wed, 9 Aug 2023 14:58:49 -0400 Subject: [PATCH 0106/2492] qa/rgw/tempest: override frontend_prefix in tempest task tempest relies on `frontend_prefix: /swift` to serve the swift api from the root path s3tests relies on the same for s3. 
move the frontend_prefix override into the tempest task so it doesn't apply to s3tests Signed-off-by: Casey Bodley --- qa/suites/rgw/tempest/0-install.yaml | 1 - qa/suites/rgw/tempest/tasks/+ | 0 qa/suites/rgw/tempest/tasks/tempest.yaml | 6 ++++++ 3 files changed, 6 insertions(+), 1 deletion(-) delete mode 100644 qa/suites/rgw/tempest/tasks/+ diff --git a/qa/suites/rgw/tempest/0-install.yaml b/qa/suites/rgw/tempest/0-install.yaml index fc2cfcc7b6d2..f968db20c2bf 100644 --- a/qa/suites/rgw/tempest/0-install.yaml +++ b/qa/suites/rgw/tempest/0-install.yaml @@ -11,5 +11,4 @@ tasks: description: Swift Service - rgw: client.0: - frontend_prefix: /swift use-keystone-role: client.0 diff --git a/qa/suites/rgw/tempest/tasks/+ b/qa/suites/rgw/tempest/tasks/+ deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/qa/suites/rgw/tempest/tasks/tempest.yaml b/qa/suites/rgw/tempest/tasks/tempest.yaml index 560e41de5bbb..988c7515b51f 100644 --- a/qa/suites/rgw/tempest/tasks/tempest.yaml +++ b/qa/suites/rgw/tempest/tasks/tempest.yaml @@ -1,3 +1,9 @@ +overrides: + rgw: + client.0: + # tempest tests expect the swift api at the root + frontend_prefix: /swift + tasks: - tempest: client.0: From 0c70dd8e39cc3d0cdef8bbcc8a0c6f214e54c770 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 20 Sep 2023 16:57:01 -0400 Subject: [PATCH 0107/2492] common: resolve config proxy deadlock using refcounted pointers 7e8c683 introduced some gymnastics with a "CallGate" to maintain a count for each observer we may be "calling into" with a config change (namely: handle_conf_change). This was to prevent remove_observer coming in and deleting the observer in the middle of the call. More importantly, it was to avoid holding the lock while traversing the observers so that the config_proxy lock can be dropped while calling handle_conf_change. This is important as e.g. the MDS may attempt to acquire the config_proxy lock in its MDSRank::handle_conf_change method (what prompted the change). However, this introduces a new deadlock: - Thread 2 acquires the config_proxy lock and then removes an observer. It blocks waiting for the observer's CallGate to close. - Thread 1 had dropped the config_proxy lock while traversing the observers to call each observer's handle_conf_change method. Those methods may attempt to reacquire the config_proxy lock. This creates the deadlock as it's waiting for Thread 2 to drop the lock while Thread 1 cannot release the CallGate. The solution, I believe, is to properly refcount "uses" of the observers for the purposes of flushing these changes. Use std::shared_ptr to effect this. Reproducing this is fairly simply with several parallel calls to `config set`. During the course of executing `config set`, the Objecter may receive config updates that will be flushed and potentially race with cleanup of observers during shutdown. Fixes: https://tracker.ceph.com/issues/62832 Partial-revert: 7e8c683 Partial-revert: 4458a72 Signed-off-by: Patrick Donnelly --- src/common/config_obs_mgr.h | 39 ++++--- src/common/config_proxy.h | 180 ++++++++++++------------------ src/crimson/common/config_proxy.h | 25 +++-- 3 files changed, 110 insertions(+), 134 deletions(-) diff --git a/src/common/config_obs_mgr.h b/src/common/config_obs_mgr.h index 06b3cf934a53..759930df92d9 100644 --- a/src/common/config_obs_mgr.h +++ b/src/common/config_obs_mgr.h @@ -14,13 +14,11 @@ class ConfigValues; // the changes of settings at runtime. 
template class ObserverMgr : public ConfigTracker { - // Maps configuration options to the observer listening for them. - using obs_map_t = std::multimap; - obs_map_t observers; - public: - typedef std::map> rev_obs_map; - typedef std::function config_gather_cb; + using config_obs_ptr = std::shared_ptr; + using config_obs_wptr = std::weak_ptr; + typedef std::map> rev_obs_map; + typedef std::function config_gather_cb; // Adds a new observer to this configuration. You can do this at any time, // but it will only receive notifications for the changes that happen after @@ -37,15 +35,18 @@ class ObserverMgr : public ConfigTracker { // you need to delete it yourself. // This function will assert if you try to delete an observer that isn't // there. - void remove_observer(ConfigObs* observer); + config_obs_wptr remove_observer(ConfigObs* observer); // invoke callback for every observers tracking keys void for_each_observer(config_gather_cb callback); // invoke callback for observers keys tracking the provided change set - template - void for_each_change(const std::set& changes, - ConfigProxyT& proxy, + void for_each_change(const std::map& changes, config_gather_cb callback, std::ostream *oss); bool is_tracking(const std::string& name) const override; + +private: + // Maps configuration options to the observer listening for them. + using obs_map_t = std::multimap; + obs_map_t observers; }; // we could put the implementations in a .cc file, and only instantiate the @@ -60,17 +61,20 @@ template void ObserverMgr::add_observer(ConfigObs* observer) { const char **keys = observer->get_tracked_conf_keys(); + auto ptr = std::make_shared(observer); for (const char ** k = keys; *k; ++k) { - observers.emplace(*k, observer); + observers.emplace(*k, ptr); } } template -void ObserverMgr::remove_observer(ConfigObs* observer) +typename ObserverMgr::config_obs_wptr ObserverMgr::remove_observer(ConfigObs* observer) { [[maybe_unused]] bool found_obs = false; + config_obs_ptr ptr; for (auto o = observers.begin(); o != observers.end(); ) { - if (o->second == observer) { + if (*o->second == observer) { + ptr = std::move(o->second); observers.erase(o++); found_obs = true; } else { @@ -78,6 +82,7 @@ void ObserverMgr::remove_observer(ConfigObs* observer) } } ceph_assert(found_obs); + return config_obs_wptr(ptr); } template @@ -89,17 +94,15 @@ void ObserverMgr::for_each_observer(config_gather_cb callback) } template -template -void ObserverMgr::for_each_change(const std::set& changes, - ConfigProxyT& proxy, +void ObserverMgr::for_each_change(const std::map& changes, config_gather_cb callback, std::ostream *oss) { // create the reverse observer mapping, mapping observers to the set of // changed keys that they'll get. std::string val; - for (auto& key : changes) { + for (auto& [key, present] : changes) { auto [first, last] = observers.equal_range(key); - if ((oss) && !proxy.get_val(key, &val)) { + if ((oss) && present) { (*oss) << key << " = '" << val << "' "; if (first == last) { (*oss) << "(not observed, change may require restart) "; diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h index 400aa4ed052d..b9b47d9cef47 100644 --- a/src/common/config_proxy.h +++ b/src/common/config_proxy.h @@ -18,91 +18,51 @@ class ConfigProxy { */ ConfigValues values; using md_config_obs_t = ceph::md_config_obs_impl; - ObserverMgr obs_mgr; + using ObsMgr = ObserverMgr; + ObsMgr obs_mgr; md_config_t config; /** A lock that protects the md_config_t internals. It is * recursive, for simplicity. 
* It is best if this lock comes first in the lock hierarchy. We will * hold this lock when calling configuration observers. */ - mutable ceph::recursive_mutex lock = - ceph::make_recursive_mutex("ConfigProxy::lock"); + mutable ceph::mutex lock = ceph::make_mutex("ConfigProxy::lock"); + ceph::condition_variable cond; - class CallGate { - private: - uint32_t call_count = 0; - ceph::mutex lock; - ceph::condition_variable cond; - public: - CallGate() - : lock(ceph::make_mutex("call::gate::lock")) { - } + using rev_obs_map_t = ObsMgr::rev_obs_map; - void enter() { - std::lock_guard locker(lock); - ++call_count; - } - void leave() { - std::lock_guard locker(lock); - ceph_assert(call_count > 0); - if (--call_count == 0) { - cond.notify_all(); - } + void _call_observers(rev_obs_map_t& rev_obs) { + ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); + for (auto& [obs, keys] : rev_obs) { + (*obs)->handle_conf_change(*this, keys); } - void close() { - std::unique_lock locker(lock); - while (call_count != 0) { - cond.wait(locker); - } + rev_obs.clear(); // drop shared_ptrs + { + std::lock_guard l{lock}; + cond.notify_all(); } - }; - - void call_gate_enter(md_config_obs_t *obs) { - auto p = obs_call_gate.find(obs); - ceph_assert(p != obs_call_gate.end()); - p->second->enter(); - } - void call_gate_leave(md_config_obs_t *obs) { - auto p = obs_call_gate.find(obs); - ceph_assert(p != obs_call_gate.end()); - p->second->leave(); - } - void call_gate_close(md_config_obs_t *obs) { - auto p = obs_call_gate.find(obs); - ceph_assert(p != obs_call_gate.end()); - p->second->close(); } - - using rev_obs_map_t = ObserverMgr::rev_obs_map; - typedef std::unique_ptr CallGateRef; - - std::map obs_call_gate; - - void call_observers(std::unique_lock& locker, - rev_obs_map_t& rev_obs) { - // observers are notified outside of lock - locker.unlock(); - for (auto& [obs, keys] : rev_obs) { - obs->handle_conf_change(*this, keys); - } - locker.lock(); - - for (auto& rev_ob : rev_obs) { - call_gate_leave(rev_ob.first); + void _gather_changes(std::set &changes, + rev_obs_map_t *rev_obs, std::ostream* oss) { + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + std::map changes_present; + for (auto& change : changes) { + std::string dummy; + changes_present[change] = (0 == config.get_val(values, change, &dummy)); } + obs_mgr.for_each_change( + changes_present, + [this, rev_obs](auto obs, const std::string &key) { + _map_observer_changes(obs, key, rev_obs); + }, oss); + changes.clear(); } - void map_observer_changes(md_config_obs_t *obs, const std::string &key, + void _map_observer_changes(ObsMgr::config_obs_ptr obs, const std::string& key, rev_obs_map_t *rev_obs) { - ceph_assert(ceph_mutex_is_locked(lock)); + ceph_assert(ceph_mutex_is_locked_by_me(lock)); auto [it, new_entry] = rev_obs->emplace(obs, std::set{}); it->second.emplace(key); - if (new_entry) { - // this needs to be done under lock as once this lock is - // dropped (before calling observers) a remove_observer() - // can sneak in and cause havoc. 
- call_gate_enter(obs); - } } public: @@ -200,34 +160,39 @@ class ConfigProxy { } // for those want to reexpand special meta, e.g, $pid void finalize_reexpand_meta() { - std::unique_lock locker(lock); rev_obs_map_t rev_obs; - if (config.finalize_reexpand_meta(values, obs_mgr)) { - _gather_changes(values.changed, &rev_obs, nullptr); + { + std::lock_guard locker(lock); + if (config.finalize_reexpand_meta(values, obs_mgr)) { + _gather_changes(values.changed, &rev_obs, nullptr); + } } - call_observers(locker, rev_obs); + _call_observers(rev_obs); } void add_observer(md_config_obs_t* obs) { std::lock_guard l(lock); obs_mgr.add_observer(obs); - obs_call_gate.emplace(obs, std::make_unique()); + cond.notify_all(); } void remove_observer(md_config_obs_t* obs) { - std::lock_guard l(lock); - call_gate_close(obs); - obs_call_gate.erase(obs); - obs_mgr.remove_observer(obs); + std::unique_lock l(lock); + auto wptr = obs_mgr.remove_observer(obs); + while (!wptr.expired()) { + cond.wait(l); + } } void call_all_observers() { - std::unique_lock locker(lock); rev_obs_map_t rev_obs; - obs_mgr.for_each_observer( - [this, &rev_obs](md_config_obs_t *obs, const std::string &key) { - map_observer_changes(obs, key, &rev_obs); - }); + { + std::lock_guard locker(lock); + obs_mgr.for_each_observer( + [this, &rev_obs](auto obs, const std::string& key) { + _map_observer_changes(obs, key, &rev_obs); + }); + } - call_observers(locker, rev_obs); + _call_observers(rev_obs); } void set_safe_to_start_threads() { std::lock_guard l(lock); @@ -255,25 +220,18 @@ class ConfigProxy { } // Expand all metavariables. Make any pending observer callbacks. void apply_changes(std::ostream* oss) { - std::unique_lock locker(lock); rev_obs_map_t rev_obs; - // apply changes until the cluster name is assigned - if (!values.cluster.empty()) { - // meta expands could have modified anything. Copy it all out again. - _gather_changes(values.changed, &rev_obs, oss); + { + std::lock_guard locker(lock); + // apply changes until the cluster name is assigned + if (!values.cluster.empty()) { + // meta expands could have modified anything. Copy it all out again. 
+ _gather_changes(values.changed, &rev_obs, oss); + } } - call_observers(locker, rev_obs); - } - void _gather_changes(std::set &changes, - rev_obs_map_t *rev_obs, std::ostream* oss) { - obs_mgr.for_each_change( - changes, *this, - [this, rev_obs](md_config_obs_t *obs, const std::string &key) { - map_observer_changes(obs, key, rev_obs); - }, oss); - changes.clear(); + _call_observers(rev_obs); } int set_val(const std::string_view key, const std::string& s, std::stringstream* err_ss=nullptr) { @@ -291,23 +249,27 @@ class ConfigProxy { int set_mon_vals(CephContext *cct, const std::map>& kv, md_config_t::config_callback config_cb) { - std::unique_lock locker(lock); - int ret = config.set_mon_vals(cct, values, obs_mgr, kv, config_cb); - + int ret; rev_obs_map_t rev_obs; - _gather_changes(values.changed, &rev_obs, nullptr); - call_observers(locker, rev_obs); + { + std::lock_guard locker(lock); + ret = config.set_mon_vals(cct, values, obs_mgr, kv, config_cb); + _gather_changes(values.changed, &rev_obs, nullptr); + } + + _call_observers(rev_obs); return ret; } int injectargs(const std::string &s, std::ostream *oss) { - std::unique_lock locker(lock); - int ret = config.injectargs(values, obs_mgr, s, oss); - + int ret; rev_obs_map_t rev_obs; - _gather_changes(values.changed, &rev_obs, oss); - - call_observers(locker, rev_obs); + { + std::lock_guard locker(lock); + ret = config.injectargs(values, obs_mgr, s, oss); + _gather_changes(values.changed, &rev_obs, oss); + } + _call_observers(rev_obs); return ret; } void parse_env(unsigned entity_type, diff --git a/src/crimson/common/config_proxy.h b/src/crimson/common/config_proxy.h index 4c0e655075ad..822db34f61a4 100644 --- a/src/crimson/common/config_proxy.h +++ b/src/crimson/common/config_proxy.h @@ -54,13 +54,18 @@ class ConfigProxy : public seastar::peering_sharded_service // avoid racings with other do_change() calls in parallel. 
ObserverMgr::rev_obs_map rev_obs; owner.values.reset(new_values); - owner.obs_mgr.for_each_change(owner.values->changed, owner, - [&rev_obs](ConfigObserver *obs, + std::map changes_present; + for (const auto& change : owner.values->changed) { + std::string dummy; + changes_present[change] = owner.get_val(change, &dummy); + } + owner.obs_mgr.for_each_change(changes_present, + [&rev_obs](auto obs, const std::string &key) { rev_obs[obs].insert(key); }, nullptr); for (auto& [obs, keys] : rev_obs) { - obs->handle_conf_change(owner, keys); + (*obs)->handle_conf_change(owner, keys); } return seastar::parallel_for_each(boost::irange(1u, seastar::smp::count), @@ -70,13 +75,19 @@ class ConfigProxy : public seastar::peering_sharded_service proxy.values.reset(); proxy.values = std::move(foreign_values); + std::map changes_present; + for (const auto& change : proxy.values->changed) { + std::string dummy; + changes_present[change] = proxy.get_val(change, &dummy); + } + ObserverMgr::rev_obs_map rev_obs; - proxy.obs_mgr.for_each_change(proxy.values->changed, proxy, - [&rev_obs](ConfigObserver *obs, const std::string& key) { + proxy.obs_mgr.for_each_change(changes_present, + [&rev_obs](auto obs, const std::string& key) { rev_obs[obs].insert(key); }, nullptr); - for (auto& obs_keys : rev_obs) { - obs_keys.first->handle_conf_change(proxy, obs_keys.second); + for (auto& [obs, keys] : rev_obs) { + (*obs)->handle_conf_change(proxy, keys); } }); }).finally([new_values] { From bfd12365979a702bd0575acbd3d2b3eb5f103ec2 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 27 Sep 2023 17:13:44 +0800 Subject: [PATCH 0108/2492] crimson/os/seastore/onode_manager: populate delta recorders for each onode modification Signed-off-by: Xuehan Xu --- .../os/seastore/object_data_handler.cc | 7 +- src/crimson/os/seastore/onode.h | 10 +- .../staged-fltree/fltree_onode_manager.cc | 12 +-- .../staged-fltree/fltree_onode_manager.h | 68 +++++++++++++- src/crimson/os/seastore/seastore.cc | 94 ++++++++----------- src/crimson/os/seastore/seastore.h | 3 +- .../onode_tree/test_fltree_onode_manager.cc | 10 +- .../seastore/test_object_data_handler.cc | 67 ++++++++++++- 8 files changed, 192 insertions(+), 79 deletions(-) diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index 0d852696b714..025f91993efa 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -888,7 +888,7 @@ auto with_object_data( return std::invoke(f, object_data ).si_then([ctx, &object_data] { if (object_data.must_update()) { - ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data); + ctx.onode.update_object_data(ctx.t, object_data); } return seastar::now(); }); @@ -909,11 +909,10 @@ auto with_objects_data( return std::invoke(f, object_data, d_object_data ).si_then([ctx, &object_data, &d_object_data] { if (object_data.must_update()) { - ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data); + ctx.onode.update_object_data(ctx.t, object_data); } if (d_object_data.must_update()) { - ctx.d_onode->get_mutable_layout( - ctx.t).object_data.update(d_object_data); + ctx.d_onode->update_object_data(ctx.t, d_object_data); } return seastar::now(); }); diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h index 069daa3df5b5..e803a2e4e74f 100644 --- a/src/crimson/os/seastore/onode.h +++ b/src/crimson/os/seastore/onode.h @@ -64,9 +64,17 @@ class Onode : public boost::intrusive_ref_counter< virtual bool is_alive() const = 
0; virtual const onode_layout_t &get_layout() const = 0; - virtual onode_layout_t &get_mutable_layout(Transaction &t) = 0; virtual ~Onode() = default; + virtual void update_onode_size(Transaction&, uint32_t) = 0; + virtual void update_omap_root(Transaction&, omap_root_t&) = 0; + virtual void update_xattr_root(Transaction&, omap_root_t&) = 0; + virtual void update_object_data(Transaction&, object_data_t&) = 0; + virtual void update_object_info(Transaction&, ceph::bufferlist&) = 0; + virtual void update_snapset(Transaction&, ceph::bufferlist&) = 0; + virtual void clear_object_info(Transaction&) = 0; + virtual void clear_snapset(Transaction&) = 0; + laddr_t get_metadata_hint(uint64_t block_size) const { assert(default_metadata_offset); assert(default_metadata_range); diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc index bff27ab65178..a1f593889d52 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc @@ -51,17 +51,17 @@ FLTreeOnodeManager::get_or_create_onode( ).si_then([this, &trans, &hoid, FNAME](auto p) -> get_or_create_onode_ret { auto [cursor, created] = std::move(p); - auto val = OnodeRef(new FLTreeOnode( + auto onode = new FLTreeOnode( default_data_reservation, default_metadata_range, - cursor.value())); + cursor.value()); if (created) { DEBUGT("created onode for entry for {}", trans, hoid); - val->get_mutable_layout(trans) = onode_layout_t{}; + onode->with_mutable_layout(trans, [](onode_layout_t &mlayout) { + mlayout = onode_layout_t{}; + }); } - return get_or_create_onode_iertr::make_ready_future( - val - ); + return get_or_create_onode_iertr::make_ready_future(onode); }); } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h index 09998fbfaea9..619609419ad8 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h @@ -78,14 +78,16 @@ struct FLTreeOnode final : Onode, Value { return *read_payload(); } - onode_layout_t &get_mutable_layout(Transaction &t) final { + template + void with_mutable_layout(Transaction &t, Func&& f) { assert(status != status_t::DELETED); auto p = prepare_mutate_payload< onode_layout_t, Recorder>(t); status = status_t::MUTATED; - return *reinterpret_cast(p.first.get_write()); - }; + f(*reinterpret_cast(p.first.get_write())); + populate_recorder(t); + } void populate_recorder(Transaction &t) { assert(status == status_t::MUTATED); @@ -99,6 +101,66 @@ struct FLTreeOnode final : Onode, Value { status = status_t::STABLE; } + void update_onode_size(Transaction &t, uint32_t size) final { + with_mutable_layout(t, [size](onode_layout_t &mlayout) { + mlayout.size = size; + }); + } + + void update_omap_root(Transaction &t, omap_root_t &oroot) final { + with_mutable_layout(t, [&oroot](onode_layout_t &mlayout) { + mlayout.omap_root.update(oroot); + }); + } + + void update_xattr_root(Transaction &t, omap_root_t &xroot) final { + with_mutable_layout(t, [&xroot](onode_layout_t &mlayout) { + mlayout.xattr_root.update(xroot); + }); + } + + void update_object_data(Transaction &t, object_data_t &odata) final { + with_mutable_layout(t, [&odata](onode_layout_t &mlayout) { + mlayout.object_data.update(odata); + }); + } + + void 
update_object_info(Transaction &t, ceph::bufferlist &oi_bl) final { + with_mutable_layout(t, [&oi_bl](onode_layout_t &mlayout) { + maybe_inline_memcpy( + &mlayout.oi[0], + oi_bl.c_str(), + oi_bl.length(), + onode_layout_t::MAX_OI_LENGTH); + mlayout.oi_size = oi_bl.length(); + }); + } + + void clear_object_info(Transaction &t) final { + with_mutable_layout(t, [](onode_layout_t &mlayout) { + memset(&mlayout.oi[0], 0, mlayout.oi_size); + mlayout.oi_size = 0; + }); + } + + void update_snapset(Transaction &t, ceph::bufferlist &ss_bl) final { + with_mutable_layout(t, [&ss_bl](onode_layout_t &mlayout) { + maybe_inline_memcpy( + &mlayout.ss[0], + ss_bl.c_str(), + ss_bl.length(), + onode_layout_t::MAX_OI_LENGTH); + mlayout.ss_size = ss_bl.length(); + }); + } + + void clear_snapset(Transaction &t) final { + with_mutable_layout(t, [](onode_layout_t &mlayout) { + memset(&mlayout.ss[0], 0, mlayout.ss_size); + mlayout.ss_size = 0; + }); + } + void mark_delete() { assert(status != status_t::DELETED); status = status_t::DELETED; diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 897a063e0fe6..edbc9a0115ec 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -1529,10 +1529,10 @@ SeaStore::Shard::_write( LOG_PREFIX(SeaStore::_write); DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); { - auto &object_size = onode->get_mutable_layout(*ctx.transaction).size; - object_size = std::max( - offset + len, - object_size); + const auto &object_size = onode->get_layout().size; + onode->update_onode_size( + *ctx.transaction, + std::max(offset + len, object_size)); } return seastar::do_with( std::move(_bl), @@ -1563,8 +1563,7 @@ SeaStore::Shard::_clone( //TODO: currently, we only care about object data, leaving cloning // of xattr/omap for future work auto &object_size = onode->get_layout().size; - auto &d_object_size = d_onode->get_mutable_layout(*ctx.transaction).size; - d_object_size = object_size; + d_onode->update_onode_size(*ctx.transaction, object_size); return objHandler.clone( ObjectDataHandler::context_t{ *transaction_manager, @@ -1586,8 +1585,10 @@ SeaStore::Shard::_zero( if (offset + len >= max_object_size) { return crimson::ct_error::input_output_error::make(); } - auto &object_size = onode->get_mutable_layout(*ctx.transaction).size; - object_size = std::max(offset + len, object_size); + const auto &object_size = onode->get_layout().size; + onode->update_onode_size( + *ctx.transaction, + std::max(offset + len, object_size)); return seastar::do_with( ObjectDataHandler(max_object_size), [=, this, &ctx, &onode](auto &objhandler) { @@ -1607,7 +1608,6 @@ SeaStore::Shard::_omap_set_kvs( OnodeRef &onode, const omap_root_le_t& omap_root, Transaction& t, - omap_root_le_t& mutable_omap_root, std::map&& kvs) { return seastar::do_with( @@ -1627,10 +1627,6 @@ SeaStore::Shard::_omap_set_kvs( return omap_manager.omap_set_keys(root, t, std::move(keys)); }).si_then([&] { return tm_iertr::make_ready_future(std::move(root)); - }).si_then([&mutable_omap_root](auto root) { - if (root.must_update()) { - mutable_omap_root.update(root); - } }); } ); @@ -1648,8 +1644,12 @@ SeaStore::Shard::_omap_set_values( onode, onode->get_layout().omap_root, *ctx.transaction, - onode->get_mutable_layout(*ctx.transaction).omap_root, - std::move(aset)); + std::move(aset) + ).si_then([onode, &ctx](auto root) { + if (root.must_update()) { + onode->update_omap_root(*ctx.transaction, root); + } + }); } SeaStore::Shard::tm_ret @@ -1691,8 +1691,7 @@ 
SeaStore::Shard::_omap_clear( *ctx.transaction) .si_then([&] { if (omap_root.must_update()) { - onode->get_mutable_layout(*ctx.transaction - ).omap_root.update(omap_root); + onode->update_omap_root(*ctx.transaction, omap_root); } }); }); @@ -1733,8 +1732,7 @@ SeaStore::Shard::_omap_rmkeys( } ).si_then([&] { if (omap_root.must_update()) { - onode->get_mutable_layout(*ctx.transaction - ).omap_root.update(omap_root); + onode->update_omap_root(*ctx.transaction, omap_root); } }); } @@ -1782,8 +1780,7 @@ SeaStore::Shard::_omap_rmkeyrange( config ).si_then([&] { if (omap_root.must_update()) { - onode->get_mutable_layout(*ctx.transaction - ).omap_root.update(omap_root); + onode->update_omap_root(*ctx.transaction, omap_root); } }); }); @@ -1798,7 +1795,7 @@ SeaStore::Shard::_truncate( { LOG_PREFIX(SeaStore::_truncate); DEBUGT("onode={} size={}", *ctx.transaction, *onode, size); - onode->get_mutable_layout(*ctx.transaction).size = size; + onode->update_onode_size(*ctx.transaction, size); return seastar::do_with( ObjectDataHandler(max_object_size), [=, this, &ctx, &onode](auto &objhandler) { @@ -1822,45 +1819,34 @@ SeaStore::Shard::_setattrs( DEBUGT("onode={}", *ctx.transaction, *onode); auto fut = tm_iertr::now(); - auto& layout = onode->get_mutable_layout(*ctx.transaction); + auto& layout = onode->get_layout(); if (auto it = aset.find(OI_ATTR); it != aset.end()) { auto& val = it->second; if (likely(val.length() <= onode_layout_t::MAX_OI_LENGTH)) { - maybe_inline_memcpy( - &layout.oi[0], - val.c_str(), - val.length(), - onode_layout_t::MAX_OI_LENGTH); if (!layout.oi_size) { // if oi was not in the layout, it probably exists in the omap, // need to remove it first fut = _xattr_rmattr(ctx, onode, OI_ATTR); } - layout.oi_size = val.length(); + onode->update_object_info(*ctx.transaction, val); aset.erase(it); } else { - layout.oi_size = 0; + onode->clear_object_info(*ctx.transaction); } } if (auto it = aset.find(SS_ATTR); it != aset.end()) { auto& val = it->second; if (likely(val.length() <= onode_layout_t::MAX_SS_LENGTH)) { - maybe_inline_memcpy( - &layout.ss[0], - val.c_str(), - val.length(), - onode_layout_t::MAX_SS_LENGTH); if (!layout.ss_size) { fut = _xattr_rmattr(ctx, onode, SS_ATTR); } - layout.ss_size = val.length(); - + onode->update_snapset(*ctx.transaction, val); aset.erase(it); } else { - layout.ss_size = 0; + onode->clear_snapset(*ctx.transaction); } } @@ -1869,14 +1855,17 @@ SeaStore::Shard::_setattrs( } return fut.si_then( - [this, onode, &ctx, &layout, - aset=std::move(aset)]() mutable { + [this, onode, &ctx, aset=std::move(aset)]() mutable { return _omap_set_kvs( onode, onode->get_layout().xattr_root, *ctx.transaction, - layout.xattr_root, - std::move(aset)); + std::move(aset) + ).si_then([onode, &ctx](auto root) { + if (root.must_update()) { + onode->update_xattr_root(*ctx.transaction, root); + } + }); }); } @@ -1888,14 +1877,12 @@ SeaStore::Shard::_rmattr( { LOG_PREFIX(SeaStore::_rmattr); DEBUGT("onode={}", *ctx.transaction, *onode); - auto& layout = onode->get_mutable_layout(*ctx.transaction); + auto& layout = onode->get_layout(); if ((name == OI_ATTR) && (layout.oi_size > 0)) { - memset(&layout.oi[0], 0, layout.oi_size); - layout.oi_size = 0; + onode->clear_object_info(*ctx.transaction); return tm_iertr::now(); } else if ((name == SS_ATTR) && (layout.ss_size > 0)) { - memset(&layout.ss[0], 0, layout.ss_size); - layout.ss_size = 0; + onode->clear_snapset(*ctx.transaction); return tm_iertr::now(); } else { return _xattr_rmattr( @@ -1927,8 +1914,7 @@ SeaStore::Shard::_xattr_rmattr( 
return omap_manager.omap_rm_key(xattr_root, *ctx.transaction, name) .si_then([&] { if (xattr_root.must_update()) { - onode->get_mutable_layout(*ctx.transaction - ).xattr_root.update(xattr_root); + onode->update_xattr_root(*ctx.transaction, xattr_root); } }); }); @@ -1942,11 +1928,8 @@ SeaStore::Shard::_rmattrs( { LOG_PREFIX(SeaStore::_rmattrs); DEBUGT("onode={}", *ctx.transaction, *onode); - auto& layout = onode->get_mutable_layout(*ctx.transaction); - memset(&layout.oi[0], 0, layout.oi_size); - layout.oi_size = 0; - memset(&layout.ss[0], 0, layout.ss_size); - layout.ss_size = 0; + onode->clear_object_info(*ctx.transaction); + onode->clear_snapset(*ctx.transaction); return _xattr_clear(ctx, onode); } @@ -1970,8 +1953,7 @@ SeaStore::Shard::_xattr_clear( return omap_manager.omap_clear(xattr_root, *ctx.transaction) .si_then([&] { if (xattr_root.must_update()) { - onode->get_mutable_layout(*ctx.transaction - ).xattr_root.update(xattr_root); + onode->update_xattr_root(*ctx.transaction, xattr_root); } }); }); diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 876fadca8c78..def40a282208 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -408,12 +408,11 @@ class SeaStore final : public FuturizedStore { tm_ret _remove_collection( internal_context_t &ctx, const coll_t& cid); - using omap_set_kvs_ret = tm_iertr::future<>; + using omap_set_kvs_ret = tm_iertr::future; omap_set_kvs_ret _omap_set_kvs( OnodeRef &onode, const omap_root_le_t& omap_root, Transaction& t, - omap_root_le_t& mutable_omap_root, std::map&& kvs); boost::intrusive_ptr _get_collection(const coll_t& cid); diff --git a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc index 17ad975d5e87..3d98dcf5aef2 100644 --- a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc +++ b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc @@ -30,10 +30,12 @@ struct onode_item_t { uint32_t cnt_modify = 0; void initialize(Transaction& t, Onode& value) const { - auto& layout = value.get_mutable_layout(t); - layout.size = size; - layout.omap_root.update(omap_root_t(id, cnt_modify, - value.get_metadata_hint(block_size))); + auto &ftvalue = static_cast(value); + ftvalue.with_mutable_layout(t, [this, &value](auto &mlayout) { + mlayout.size = size; + mlayout.omap_root.update(omap_root_t(id, cnt_modify, + value.get_metadata_hint(block_size))); + }); validate(value); } diff --git a/src/test/crimson/seastore/test_object_data_handler.cc b/src/test/crimson/seastore/test_object_data_handler.cc index 6510cb5d93fe..47607643946e 100644 --- a/src/test/crimson/seastore/test_object_data_handler.cc +++ b/src/test/crimson/seastore/test_object_data_handler.cc @@ -30,9 +30,9 @@ class TestOnode final : public Onode { const onode_layout_t &get_layout() const final { return layout; } - onode_layout_t &get_mutable_layout(Transaction &t) final { - dirty = true; - return layout; + template + void with_mutable_layout(Transaction &t, Func&& f) { + f(layout); } bool is_alive() const { return true; @@ -40,6 +40,67 @@ class TestOnode final : public Onode { bool is_dirty() const { return dirty; } laddr_t get_hint() const final {return L_ADDR_MIN; } ~TestOnode() final = default; + + void update_onode_size(Transaction &t, uint32_t size) final { + with_mutable_layout(t, [size](onode_layout_t &mlayout) { + mlayout.size = size; + }); + } + + void update_omap_root(Transaction &t, omap_root_t &oroot) final 
{ + with_mutable_layout(t, [&oroot](onode_layout_t &mlayout) { + mlayout.omap_root.update(oroot); + }); + } + + void update_xattr_root(Transaction &t, omap_root_t &xroot) final { + with_mutable_layout(t, [&xroot](onode_layout_t &mlayout) { + mlayout.xattr_root.update(xroot); + }); + } + + void update_object_data(Transaction &t, object_data_t &odata) final { + with_mutable_layout(t, [&odata](onode_layout_t &mlayout) { + mlayout.object_data.update(odata); + }); + } + + void update_object_info(Transaction &t, ceph::bufferlist &oi_bl) final { + with_mutable_layout(t, [&oi_bl](onode_layout_t &mlayout) { + maybe_inline_memcpy( + &mlayout.oi[0], + oi_bl.c_str(), + oi_bl.length(), + onode_layout_t::MAX_OI_LENGTH); + mlayout.oi_size = oi_bl.length(); + }); + } + + void clear_object_info(Transaction &t) final { + with_mutable_layout(t, [](onode_layout_t &mlayout) { + memset(&mlayout.oi[0], 0, mlayout.oi_size); + mlayout.oi_size = 0; + }); + } + + void update_snapset(Transaction &t, ceph::bufferlist &ss_bl) final { + with_mutable_layout(t, [&ss_bl](onode_layout_t &mlayout) { + maybe_inline_memcpy( + &mlayout.ss[0], + ss_bl.c_str(), + ss_bl.length(), + onode_layout_t::MAX_OI_LENGTH); + mlayout.ss_size = ss_bl.length(); + }); + } + + void clear_snapset(Transaction &t) final { + with_mutable_layout(t, [](onode_layout_t &mlayout) { + memset(&mlayout.ss[0], 0, mlayout.ss_size); + mlayout.ss_size = 0; + }); + } + }; struct object_data_handler_test_t: From 544985f08969704e29a52b9a0005c8233c08cc92 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 27 Sep 2023 17:15:29 +0800 Subject: [PATCH 0109/2492] crimson/os/seastore/onode_manager: drop write_dirty Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/onode_manager.h | 6 ---- .../staged-fltree/fltree_onode_manager.cc | 28 ------------------- .../staged-fltree/fltree_onode_manager.h | 4 --- src/crimson/os/seastore/seastore.cc | 2 -- .../onode_tree/test_fltree_onode_manager.cc | 10 ------- 5 files changed, 50 deletions(-) diff --git a/src/crimson/os/seastore/onode_manager.h b/src/crimson/os/seastore/onode_manager.h index 123c9e4f865e..5a454906dc6f 100644 --- a/src/crimson/os/seastore/onode_manager.h +++ b/src/crimson/os/seastore/onode_manager.h @@ -58,12 +58,6 @@ class OnodeManager { Transaction &trans, const std::vector &hoids) = 0; - using write_dirty_iertr = base_iertr; - using write_dirty_ret = write_dirty_iertr::future<>; - virtual write_dirty_ret write_dirty( - Transaction &trans, - const std::vector &onodes) = 0; - using erase_onode_iertr = base_iertr; using erase_onode_ret = erase_onode_iertr::future<>; virtual erase_onode_ret erase_onode( diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc index a1f593889d52..dc6c183f5385 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc @@ -87,34 +87,6 @@ FLTreeOnodeManager::get_or_create_onodes( }); } -FLTreeOnodeManager::write_dirty_ret FLTreeOnodeManager::write_dirty( - Transaction &trans, - const std::vector &onodes) -{ - return trans_intr::do_for_each( - onodes, - [&trans](auto &onode) -> eagain_ifuture<> { - if (!onode) { - return eagain_iertr::make_ready_future<>(); - } - auto &flonode = static_cast(*onode); - if (!flonode.is_alive()) { - return eagain_iertr::make_ready_future<>(); - } - switch (flonode.status) { - case FLTreeOnode::status_t::MUTATED: { - 
flonode.populate_recorder(trans); - return eagain_iertr::make_ready_future<>(); - } - case FLTreeOnode::status_t::STABLE: { - return eagain_iertr::make_ready_future<>(); - } - default: - __builtin_unreachable(); - } - }); -} - FLTreeOnodeManager::erase_onode_ret FLTreeOnodeManager::erase_onode( Transaction &trans, OnodeRef &onode) diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h index 619609419ad8..d5a0dfc7bd5a 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h @@ -212,10 +212,6 @@ class FLTreeOnodeManager : public crimson::os::seastore::OnodeManager { Transaction &trans, const std::vector &hoids) final; - write_dirty_ret write_dirty( - Transaction &trans, - const std::vector &onodes) final; - erase_onode_ret erase_onode( Transaction &trans, OnodeRef &onode) final; diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index edbc9a0115ec..4a52fa22d998 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -1226,8 +1226,6 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( return seastar::make_ready_future( seastar::stop_iteration::yes); }; - }).si_then([this, &ctx, &d_onodes] { - return onode_manager->write_dirty(*ctx.transaction, d_onodes); }); }).si_then([this, &ctx] { return transaction_manager->submit_transaction(*ctx.transaction); diff --git a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc index 3d98dcf5aef2..cd5dd4407f80 100644 --- a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc +++ b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc @@ -121,13 +121,6 @@ struct fltree_onode_manager_test_t return manager->get_or_create_onode(t, p_kv->key); }).unsafe_get0(); std::invoke(f, t, *onode, p_kv->value); - with_trans_intr(t, [&](auto &t) { - if (onode->is_alive()) { - return manager->write_dirty(t, {onode}); - } else { - return OnodeManager::write_dirty_iertr::now(); - } - }).unsafe_get0(); }); } @@ -182,9 +175,6 @@ struct fltree_onode_manager_test_t boost::tie(onode, p_item) = tup; std::invoke(f, t, *onode, *p_item); } - with_trans_intr(t, [&](auto &t) { - return manager->write_dirty(t, onodes); - }).unsafe_get0(); }); } From db726d7a44cde04f82f2e9d2cacd5ac0edfd86ac Mon Sep 17 00:00:00 2001 From: Leonid Usov Date: Wed, 6 Sep 2023 19:29:48 +0300 Subject: [PATCH 0110/2492] vstart.sh: make sure that --localhost does only bind to 127.0.0.1 in all cases Signed-off-by: Leonid Usov --- src/vstart.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/vstart.sh b/src/vstart.sh index a33bcde3234b..99ecdf510e9b 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -174,6 +174,7 @@ pmem_enabled=0 zoned_enabled=0 io_uring_enabled=0 with_jaeger=0 +force_addr=0 with_mgr_dashboard=true if [[ "$(get_cmake_variable WITH_MGR_DASHBOARD_FRONTEND)" != "ON" ]] || @@ -344,6 +345,7 @@ case $1 in ;; -l | --localhost) ip="127.0.0.1" + force_addr=1 ;; -i) [ -z "$2" ] && usage_exit @@ -737,6 +739,12 @@ prepare_conf() { msgr_conf="ms bind msgr2 = false ms bind msgr1 = true" fi + if [ $force_addr -eq 1 ]; then + msgr_conf+=" + public bind addr = $IP + public addr = $IP + cluster addr = $IP" + fi wconf <public_bind_addr.is_blank_ip()) { + # bind_addrs = 
make_mon_addrs(g_conf()->public_bind_addr); + # } + # + if [ $force_addr -eq 1 ]; then + wconf < Date: Thu, 28 Sep 2023 13:24:35 +0530 Subject: [PATCH 0111/2492] cephfs-shell: use pkg_resources rather than packaging module `pkg_resources` is already being used by other py scripts. Fixes: https://tracker.ceph.com/issues/62739 Signed-off-by: Venky Shankar --- src/tools/cephfs/shell/cephfs-shell | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell index bdd1bb4f0b2b..d1726940ffde 100755 --- a/src/tools/cephfs/shell/cephfs-shell +++ b/src/tools/cephfs/shell/cephfs-shell @@ -16,13 +16,14 @@ import shlex import stat import errno -from packaging import version +from pkg_resources import packaging # type: ignore from cmd2 import Cmd from cmd2 import __version__ as cmd2_version # XXX: In cmd2 versions < 1.0.1, we'll get SystemExit(2) instead of # Cmd2ArgparseError -if version.parse(cmd2_version) >= version.parse("1.0.1"): +Version = packaging.version.Version +if Version(cmd2_version) >= Version("1.0.1"): from cmd2.exceptions import Cmd2ArgparseError else: # HACK: so that we don't have check for version everywhere From 5be9213738bfbde12172b3a5158667ebe9dfa13d Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Thu, 28 Sep 2023 13:25:46 +0530 Subject: [PATCH 0112/2492] doc/cephfs-shell: drop installing packaging module Signed-off-by: Venky Shankar --- doc/man/8/cephfs-shell.rst | 2 +- src/tools/cephfs/shell/cephfs-shell | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/man/8/cephfs-shell.rst b/doc/man/8/cephfs-shell.rst index 91136f123b6c..974ba37d9110 100644 --- a/doc/man/8/cephfs-shell.rst +++ b/doc/man/8/cephfs-shell.rst @@ -56,7 +56,7 @@ Options .. code:: bash - [build]$ python3 -m venv venv && source venv/bin/activate && pip3 install cmd2 colorama packaging + [build]$ python3 -m venv venv && source venv/bin/activate && pip3 install cmd2 colorama [build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/shell/cephfs-shell Commands diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell index d1726940ffde..d2c050f89316 100755 --- a/src/tools/cephfs/shell/cephfs-shell +++ b/src/tools/cephfs/shell/cephfs-shell @@ -1701,11 +1701,11 @@ def read_shell_conf(shell, shell_conf_file): sec = 'cephfs-shell' opts = [] - if version.parse(cmd2_version) >= version.parse("0.10.0"): + if Version(cmd2_version) >= Version("0.10.0"): for attr in shell.settables.keys(): opts.append(attr) else: - if version.parse(cmd2_version) <= version.parse("0.9.13"): + if Version(cmd2_version) <= Version("0.9.13"): # hardcoding options for 0.7.9 because - # 1. we use cmd2 v0.7.9 with teuthology and # 2. there's no way distinguish between a shell setting and shell @@ -1714,7 +1714,7 @@ def read_shell_conf(shell, shell_conf_file): 'continuation_prompt', 'debug', 'echo', 'editor', 'feedback_to_output', 'locals_in_py', 'prompt', 'quiet', 'timing'] - elif version.parse(cmd2_version) >= version.parse("0.9.23"): + elif Version(cmd2_version) >= Version("0.9.23"): opts.append('allow_style') # no equivalent option was defined by cmd2. else: @@ -1769,7 +1769,7 @@ def manage_args(): args.exe_and_quit = False # Execute and quit, don't launch the shell. 
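
The cmd2 version gating above only works because versions compare numerically, component by component; raw string comparison would order "0.10.0" before "0.9.13" and misgate features. A self-contained sketch of that distinction (a simplified stand-in for the Python packaging helpers; assumes purely numeric components):

    #include <cassert>
    #include <sstream>
    #include <string>
    #include <vector>

    // Split a dotted version like "0.10.1" into {0, 10, 1}. Illustration
    // only: assumes numeric components, unlike a full version parser.
    static std::vector<int> parse_version(const std::string& s) {
      std::vector<int> parts;
      std::istringstream in(s);
      std::string tok;
      while (std::getline(in, tok, '.')) {
        parts.push_back(std::stoi(tok));
      }
      return parts;
    }

    int main() {
      // As strings, "0.10.0" < "0.9.13" — lexical order is wrong for versions.
      assert(std::string("0.10.0") < std::string("0.9.13"));
      // Component-wise numeric comparison gives the ordering the gating needs.
      assert(parse_version("0.10.0") > parse_version("0.9.13"));
      assert(parse_version("1.0.1") >= parse_version("1.0.1"));
      return 0;
    }
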
if args.batch: - if version.parse(cmd2_version) <= version.parse("0.9.13"): + if Version(cmd2_version) <= Version("0.9.13"): args.commands = ['load ' + args.batch, ',quit'] else: args.commands = ['run_script ' + args.batch, ',quit'] @@ -1814,7 +1814,7 @@ def execute_cmds_and_quit(args): # value to indicate whether the execution of the commands should stop, but # since 0.9.7 it returns the return value of do_* methods only if it's # not None. When it is None it returns False instead of None. - if version.parse(cmd2_version) <= version.parse("0.9.6"): + if Version(cmd2_version) <= Version("0.9.6"): stop_exec_val = None else: stop_exec_val = False From 6a6a9ddd46e5dd2135dfd241fc0dff8ff7472a06 Mon Sep 17 00:00:00 2001 From: avanthakkar Date: Thu, 5 Oct 2023 17:48:34 +0530 Subject: [PATCH 0113/2492] mgr/cephadm is not defining haproxy tcp healthchecks for Ganesha Fixes: https://tracker.ceph.com/issues/62638 Signed-off-by: avanthakkar --- .../mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 | 2 +- src/pybind/mgr/cephadm/tests/test_services.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 b/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 index 100acce401ba..c114a8cba115 100644 --- a/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 +++ b/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 @@ -85,6 +85,6 @@ backend backend default-server {{ default_server_opts|join(" ") }} {% endif %} {% for server in servers %} - server {{ server.name }} {{ server.ip }}:{{ server.port }} + server {{ server.name }} {{ server.ip }}:{{ server.port }} check {% endfor %} {% endif %} diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 21c62ffd06fb..52a3b3619668 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -1647,7 +1647,7 @@ def fake_get_addr(hostname: str) -> str: ) if enable_haproxy_protocol: haproxy_txt += ' default-server send-proxy-v2\n' - haproxy_txt += ' server nfs.foo.0 192.168.122.111:12049\n' + haproxy_txt += ' server nfs.foo.0 192.168.122.111:12049 check\n' haproxy_expected_conf = { 'files': {'haproxy.cfg': haproxy_txt} } @@ -2425,7 +2425,7 @@ def fake_keys(): ' balance source\n' ' hash-type consistent\n' ' default-server send-proxy-v2\n' - ' server nfs.foo.0 192.168.122.111:12049\n' + ' server nfs.foo.0 192.168.122.111:12049 check\n' ) haproxy_expected_conf = { 'files': {'haproxy.cfg': haproxy_txt} From 6caf2c437128809b294506f61237f1aecf16aea8 Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Mon, 9 Oct 2023 12:06:42 +0000 Subject: [PATCH 0114/2492] tools/ceph_monstore_tool: add ceph-specific option example Signed-off-by: Matan Breizman --- src/tools/ceph_monstore_tool.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc index 9da7f5f5c40e..437b45cd6760 100644 --- a/src/tools/ceph_monstore_tool.cc +++ b/src/tools/ceph_monstore_tool.cc @@ -232,6 +232,7 @@ void usage(const char *n, po::options_description &d) << "\nPlease Note:\n" << "* Ceph-specific options should be in the format --option-name=VAL\n" << " (specifically, do not forget the '='!!)\n" + << " e.g., 'dump-keys --debug-rocksdb=0'\n" << "* Command-specific options need to be passed after a '--'\n" << " e.g., 'get monmap -- --version 10 --out /tmp/foo'" << std::endl; From 23c7d1e749022f7a15c92296f5367bc651cecc5d Mon 
Sep 17 00:00:00 2001 From: Matan Breizman Date: Sun, 6 Aug 2023 13:04:11 +0000 Subject: [PATCH 0115/2492] tools/ceph_monstore_tool: add get-key command Signed-off-by: Matan Breizman --- src/tools/ceph_monstore_tool.cc | 98 +++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc index 437b45cd6760..f6b0dc62ae55 100644 --- a/src/tools/ceph_monstore_tool.cc +++ b/src/tools/ceph_monstore_tool.cc @@ -210,6 +210,7 @@ void usage(const char *n, po::options_description &d) << " (default: last committed)\n" << " get crushmap [-- options] get crushmap (version VER if specified)\n" << " (default: last committed)\n" + << " get-key PREFIX KEY [-- options] get key\n" << " show-versions [-- options] show the first&last committed version of map\n" << " (show-versions -- --help for more info)\n" << " dump-keys dumps store keys to FILE\n" @@ -999,6 +1000,103 @@ int main(int argc, char **argv) { << " version " << v << " to " << outpath << std::endl; } +} else if (cmd == "get-key") { + string outpath; + string prefix; + string key; + + // visible options for this command + po::options_description op_desc("Allowed 'get-key' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("out,o", po::value(&outpath), + "output file (default: stdout)") + ("readable,r", "print the map information in human readable format") + ; + // this is going to be a positional argument; we don't want to show + // it as an option during --help, but we do want to have it captured + // when parsing. + po::options_description hidden_op_desc("Hidden 'get-key' options"); + hidden_op_desc.add_options() + ("prefix", po::value(&prefix),"prefix") + ("key", po::value(&key),"key") + ; + po::positional_options_description op_positional; + op_positional.add("prefix", 1); + op_positional.add("key", 1); + + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional, + subcmds, &op_vm); + if (r < 0) { + return -r; + } + + if (op_vm.count("help") || prefix.empty()) { + usage(argv[0], op_desc); + return 0; + } + + int fd = STDOUT_FILENO; + if (!outpath.empty()){ + fd = ::open(outpath.c_str(), O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0666); + if (fd < 0) { + std::cerr << "error opening output file: " + << cpp_strerror(errno) << std::endl; + return EINVAL; + } + } + + auto close_fd = make_scope_guard([&] { + ::close(fd); + if (r < 0 && fd != STDOUT_FILENO) { + ::remove(outpath.c_str()); + } + }); + bufferlist bl; + r = 0; + std::cout << prefix << " " << key << std::endl; + r = st.get(prefix, key, bl); + if (r < 0) { + std::cerr << "Error getting key: " << cpp_strerror(r) << std::endl; + return EINVAL; + } + + if (op_vm.count("readable")) { + try { + if (prefix == "osd_snap") { + auto p = bl.cbegin(); + if (key.starts_with("purged_epoch_")) { + map val; + ceph::decode(val, p); + std::cout << val << std::endl; + } else if (key.starts_with("purged_snap_")) { + snapid_t first_snap, end_snap; + epoch_t epoch; + ceph::decode(first_snap, p); + ceph::decode(end_snap, p); + ceph::decode(epoch, p); + std::cout << "first_snap:" << first_snap + << " end_snap: " << end_snap + << " epoch: " << epoch + << std::endl; + } + } else { + std::cerr << "This type of readable key does not exist: " << prefix + << std::endl << "You can only specify[osd_snap]" << std::endl; + } + } catch (const buffer::error &err) { + std::cerr << "Could not decode for human readable output (you may still" + " use non-readable mode). 
Detail: " << err.what() << std::endl; + } + } + + bl.write_fd(fd); + + if (!outpath.empty()) { + std::cout << "wrote " << prefix << " " << key << " to " << outpath << std::endl; + } } else if (cmd == "show-versions") { string map_type; //map type:osdmap,monmap... // visible options for this command From cfd2cc9449ddf291d0fc03d96eb18806afc39578 Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Sun, 6 Aug 2023 13:03:19 +0000 Subject: [PATCH 0116/2492] tools/ceph_monstore_tool: add remove-key command Signed-off-by: Matan Breizman --- src/mon/MonitorDBStore.h | 8 ++++++++ src/tools/ceph_monstore_tool.cc | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h index 11608e9e553c..b7dfc50b0a17 100644 --- a/src/mon/MonitorDBStore.h +++ b/src/mon/MonitorDBStore.h @@ -604,6 +604,14 @@ class MonitorDBStore return combine_strings(prefix, os.str()); } + int clear_key(const std::string& prefix, const std::string& key) { + ceph_assert(!prefix.empty()); + ceph_assert(!key.empty()); + KeyValueDB::Transaction dbt = db->get_transaction(); + dbt->rmkey(prefix, key); + return db->submit_transaction_sync(dbt); + } + void clear(std::set& prefixes) { KeyValueDB::Transaction dbt = db->get_transaction(); diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc index f6b0dc62ae55..8c6c9c7233b2 100644 --- a/src/tools/ceph_monstore_tool.cc +++ b/src/tools/ceph_monstore_tool.cc @@ -211,6 +211,7 @@ void usage(const char *n, po::options_description &d) << " get crushmap [-- options] get crushmap (version VER if specified)\n" << " (default: last committed)\n" << " get-key PREFIX KEY [-- options] get key\n" + << " remove-key PREFIX KEY remove key\n" << " show-versions [-- options] show the first&last committed version of map\n" << " (show-versions -- --help for more info)\n" << " dump-keys dumps store keys to FILE\n" @@ -1410,6 +1411,32 @@ int main(int argc, char **argv) { err = rewrite_crush(argv[0], subcmds, st); } else if (cmd == "rebuild") { err = rebuild_monstore(argv[0], subcmds, st); + } else if (cmd == "remove-key") { + string prefix, key; + // No visible options for this command + po::options_description op_desc("Allowed 'get' options"); + po::options_description hidden_op_desc("Hidden 'get' options"); + hidden_op_desc.add_options() + ("prefix", po::value(&prefix),"prefix") + ("key", po::value(&key),"key") + ; + po::positional_options_description op_positional; + op_positional.add("prefix", 1); + op_positional.add("key", 1); + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional, + subcmds, &op_vm); + if (r < 0) { + return -r; + } + r = st.clear_key(prefix, key); + if (r < 0) { + std::cerr << "error removing (" + << prefix << "," << key << ")" + << std::endl; + return r; + } } else { std::cerr << "Unrecognized command: " << cmd << std::endl; usage(argv[0], desc); From 42f94dd88fab7d46fb7478ab911c9882b124af86 Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Mon, 9 Oct 2023 13:43:28 +0000 Subject: [PATCH 0117/2492] doc/man/8/ceph-monstore-tool.rst: doc fixes Signed-off-by: Matan Breizman --- doc/man/8/ceph-monstore-tool.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/man/8/ceph-monstore-tool.rst b/doc/man/8/ceph-monstore-tool.rst index 9396df0b6057..02700a50b22e 100644 --- a/doc/man/8/ceph-monstore-tool.rst +++ b/doc/man/8/ceph-monstore-tool.rst @@ -18,13 +18,13 @@ Description :program:`ceph-monstore-tool` is used to manipulate 
MonitorDBStore's data (monmap, osdmap, etc.) offline. It is similar to `ceph-kvstore-tool`. -The default RocksDB debug level is `0`. This can be changed using `--debug`. - Note: Ceph-specific options take the format `--option-name=VAL` DO NOT FORGET THE EQUALS SIGN. ('=') + for example, `dump-keys --debug-rocksdb=0` + Command-specific options must be passed after a `--` - for example, `get monmap --debug -- --version 10 --out /tmp/foo` + for example, `get monmap -- --version 10 --out /tmp/foo` Commands ======== @@ -49,8 +49,11 @@ Commands :command:`get crushmap [-- options]` Get crushmap (version VER if specified) (default: last committed). -:command:`get osd_snap [-- options]` - Get osd_snap key (`purged_snap` or `purged_epoch`). +:command:`get-key [-- options]` + Get key to FILE (default: stdout). + +:command:`remove-key [-- options]` + Remove key. :command:`dump-keys` Dump store keys to FILE (default: stdout). @@ -73,9 +76,6 @@ Commands :command:`rebuild` Rebuild store. -:command:`rm ` - Remove specified key from the store. - Availability ============ From a10f6e37e4291c682e9b7dccf1d819a04f9b3cbb Mon Sep 17 00:00:00 2001 From: Vedansh Bhartia Date: Wed, 14 Jun 2023 18:14:10 +0530 Subject: [PATCH 0118/2492] rgw: Add a wrapper for librados::AioCompletion to prevent memory leaks Signed-off-by: Vedansh Bhartia --- src/rgw/driver/rados/rgw_gc.cc | 30 +++++++++++++++--------------- src/rgw/rgw_aio.cc | 1 + src/rgw/rgw_common.h | 6 ++++++ 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/rgw/driver/rados/rgw_gc.cc b/src/rgw/driver/rados/rgw_gc.cc index 4705c46fff4b..4b30219734cc 100644 --- a/src/rgw/driver/rados/rgw_gc.cc +++ b/src/rgw/driver/rados/rgw_gc.cc @@ -170,9 +170,9 @@ void RGWGC::on_defer_canceled(const cls_rgw_gc_obj_info& info) cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info); cls_rgw_gc_remove(op, {tag}); - auto c = librados::Rados::aio_create_completion(nullptr, nullptr); - store->gc_aio_operate(obj_names[i], c, &op); - c->release(); + aio_completion_ptr c{librados::Rados::aio_create_completion(nullptr, nullptr)}; + + store->gc_aio_operate(obj_names[i], c.get(), &op); } int RGWGC::async_defer_chain(const string& tag, const cls_rgw_obj_chain& chain) @@ -191,9 +191,9 @@ int RGWGC::async_defer_chain(const string& tag, const cls_rgw_obj_chain& chain) // enqueue succeeds cls_rgw_gc_remove(op, {tag}); - auto c = librados::Rados::aio_create_completion(nullptr, nullptr); - int ret = store->gc_aio_operate(obj_names[i], c, &op); - c->release(); + aio_completion_ptr c{librados::Rados::aio_create_completion(nullptr, nullptr)}; + + int ret = store->gc_aio_operate(obj_names[i], c.get(), &op); return ret; } @@ -225,12 +225,11 @@ int RGWGC::remove(int index, const std::vector& tags, AioCompletion **pc ObjectWriteOperation op; cls_rgw_gc_remove(op, tags); - auto c = librados::Rados::aio_create_completion(nullptr, nullptr); - int ret = store->gc_aio_operate(obj_names[index], c, &op); - if (ret < 0) { - c->release(); - } else { - *pc = c; + aio_completion_ptr c{librados::Rados::aio_create_completion(nullptr, nullptr)}; + int ret = store->gc_aio_operate(obj_names[index], c.get(), &op); + if (ret >= 0) { + *pc = c.get(); + c.release(); } return ret; } @@ -391,12 +390,13 @@ class RGWGCIOManager { } } - auto c = librados::Rados::aio_create_completion(nullptr, nullptr); - int ret = ioctx->aio_operate(oid, c, op); + aio_completion_ptr c{librados::Rados::aio_create_completion(nullptr, nullptr)}; + int ret = ioctx->aio_operate(oid, c.get(), op); if (ret < 0) { 
return ret; } - ios.push_back(IO{IO::TailIO, c, oid, index, tag}); + ios.push_back(IO{IO::TailIO, c.get(), oid, index, tag}); + c.release(); return 0; } diff --git a/src/rgw/rgw_aio.cc b/src/rgw/rgw_aio.cc index 02e3411858ac..cd85ea6d7fab 100644 --- a/src/rgw/rgw_aio.cc +++ b/src/rgw/rgw_aio.cc @@ -33,6 +33,7 @@ struct state { state(Aio* aio, librados::IoCtx ctx, AioResult& r) : aio(aio), ctx(std::move(ctx)), + // coverity[ctor_dtor_leak:SUPPRESS] c(librados::Rados::aio_create_completion(&r, &cb)) {} }; diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index de2f7443164a..c1d19e49d948 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -1842,3 +1842,9 @@ rgw_global_init(const std::map *defaults, std::vector < const char* >& args, uint32_t module_type, code_environment_t code_env, int flags); + + +struct AioCompletionDeleter { + void operator()(librados::AioCompletion* c) { c->release(); } +}; +using aio_completion_ptr = std::unique_ptr; From 41029edcac9b2509414c8d2e5d51e23fe6590c8a Mon Sep 17 00:00:00 2001 From: Vedansh Bhartia Date: Mon, 9 Oct 2023 21:21:23 +0530 Subject: [PATCH 0119/2492] rgw: Add coverity annotations for uncaught exceptions in standalone binaries Signed-off-by: Vedansh Bhartia --- src/rgw/driver/dbstore/dbstore_main.cc | 3 +++ src/rgw/rgw_admin.cc | 3 +++ src/rgw/rgw_es_main.cc | 3 +++ src/rgw/rgw_jsonparser.cc | 4 +++- src/rgw/rgw_main.cc | 4 ++++ src/rgw/rgw_multiparser.cc | 4 ++++ src/rgw/rgw_object_expirer.cc | 3 +++ src/rgw/rgw_polparser.cc | 3 +++ src/rgw/rgw_token.cc | 3 +++ 9 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/rgw/driver/dbstore/dbstore_main.cc b/src/rgw/driver/dbstore/dbstore_main.cc index 4fff38ced279..46d4106ca436 100644 --- a/src/rgw/driver/dbstore/dbstore_main.cc +++ b/src/rgw/driver/dbstore/dbstore_main.cc @@ -119,6 +119,9 @@ void* process(void *arg) return 0; } +// This has an uncaught exception. Even if the exception is caught, the program +// would need to be terminated, so the warning is simply suppressed. +// coverity[root_function:SUPPRESS] int main(int argc, char *argv[]) { string tenant = "Redhat"; diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index a4e17b8895ae..01a07f2c2dea 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -3305,6 +3305,9 @@ void init_realm_param(CephContext *cct, string& var, std::optional& opt_ } } +// This has an uncaught exception. Even if the exception is caught, the program +// would need to be terminated, so the warning is simply suppressed. +// coverity[root_function:SUPPRESS] int main(int argc, const char **argv) { auto args = argv_to_vec(argc, argv); diff --git a/src/rgw/rgw_es_main.cc b/src/rgw/rgw_es_main.cc index 6cfbc9352926..d84f9ecadd58 100644 --- a/src/rgw/rgw_es_main.cc +++ b/src/rgw/rgw_es_main.cc @@ -14,6 +14,9 @@ using namespace std; +// This has an uncaught exception. Even if the exception is caught, the program +// would need to be terminated, so the warning is simply suppressed. +// coverity[root_function:SUPPRESS] int main(int argc, char *argv[]) { auto args = argv_to_vec(argc, argv); diff --git a/src/rgw/rgw_jsonparser.cc b/src/rgw/rgw_jsonparser.cc index 6541630b286d..a6c99c639890 100644 --- a/src/rgw/rgw_jsonparser.cc +++ b/src/rgw/rgw_jsonparser.cc @@ -56,7 +56,9 @@ struct UserInfo { } }; - +// This has an uncaught exception. Even if the exception is caught, the program +// would need to be terminated, so the warning is simply suppressed. 
+// coverity[root_function:SUPPRESS]
 int main(int argc, char **argv)
 {
   JSONParser parser;
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 27b02f841951..4d31db8072f5 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -57,7 +57,11 @@ static int usage()
 
 /*
  * start up the RADOS connection and then handle HTTP messages as they come in
+ *
+ * This has an uncaught exception. Even if the exception is caught, the program
+ * would need to be terminated, so the warning is simply suppressed.
  */
+// coverity[root_function:SUPPRESS]
 int main(int argc, char *argv[])
 {
   int r{0};
diff --git a/src/rgw/rgw_multiparser.cc b/src/rgw/rgw_multiparser.cc
index a8778abd9a08..c7a37213c4db 100644
--- a/src/rgw/rgw_multiparser.cc
+++ b/src/rgw/rgw_multiparser.cc
@@ -14,6 +14,10 @@
 
 using namespace std;
 
+
+// This has an uncaught exception. Even if the exception is caught, the program
+// would need to be terminated, so the warning is simply suppressed.
+// coverity[root_function:SUPPRESS]
 int main(int argc, char **argv)
 {
   RGWMultiXMLParser parser;
diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc
index 7a49fc8d161e..0470b1a6e6fc 100644
--- a/src/rgw/rgw_object_expirer.cc
+++ b/src/rgw/rgw_object_expirer.cc
@@ -51,6 +51,9 @@ static void usage()
   generic_server_usage();
 }
 
+// This has an uncaught exception. Even if the exception is caught, the program
+// would need to be terminated, so the warning is simply suppressed.
+// coverity[root_function:SUPPRESS]
 int main(const int argc, const char **argv)
 {
   auto args = argv_to_vec(argc, argv);
diff --git a/src/rgw/rgw_polparser.cc b/src/rgw/rgw_polparser.cc
index eca5066b3ce3..3991181809ee 100644
--- a/src/rgw/rgw_polparser.cc
+++ b/src/rgw/rgw_polparser.cc
@@ -50,6 +50,9 @@ void usage(std::string_view cmdname)
             << std::endl;
 }
 
+// This has an uncaught exception. Even if the exception is caught, the program
+// would need to be terminated, so the warning is simply suppressed.
+// coverity[root_function:SUPPRESS]
 int main(int argc, const char** argv)
 {
   std::string_view cmdname = argv[0];
diff --git a/src/rgw/rgw_token.cc b/src/rgw/rgw_token.cc
index 999d46e0e229..8ffac69c8313 100644
--- a/src/rgw/rgw_token.cc
+++ b/src/rgw/rgw_token.cc
@@ -60,6 +60,9 @@ void usage()
   generic_client_usage();
 }
 
+// This has an uncaught exception. Even if the exception is caught, the program
+// would need to be terminated, so the warning is simply suppressed.
+// coverity[root_function:SUPPRESS]
 int main(int argc, char **argv)
 {
   auto args = argv_to_vec(argc, argv);

From 2ee29047ca564f04f1d27dc0aee09dcf42ab9c5a Mon Sep 17 00:00:00 2001
From: cuiming_yewu
Date: Fri, 22 Sep 2023 10:45:30 +0800
Subject: [PATCH 0120/2492] kv: resolve three compilation warnings

Resolve three -Woverloaded-virtual compilation warnings, emitted where
derived-class overrides hide base-class virtual overloads with
mismatched parameter lists, at lines 69, 72, and 74 in
src/kv/rocksdb_cache/ShardedCache.h.
with ../src/kv/rocksdb_cache/ShardedCache.h:74:16: warning: by 'virtual bool rocksdb_cache::ShardedCache::Release(rocksdb::Cache::Handle*, bool)' [-Woverloaded-virtual] virtual bool Release(rocksdb::Cache::Handle* handle, bool force_erase = false) override; ^~~~~~~ ../src/kv/rocksdb_cache/ShardedCache.h:72:35: warning: by 'virtual rocksdb::Cache::Handle* rocksdb_cache::ShardedCache::Lookup(const rocksdb::Slice&, rocksdb::Statistics*)' [-Woverloaded-virtual] virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, rocksdb::Statistics* stats) override; ^~~~~~ ../src/kv/rocksdb_cache/ShardedCache.h:69:27: warning: by 'virtual rocksdb::Status rocksdb_cache::ShardedCache::Insert(const rocksdb::Slice&, void*, size_t, rocksdb::Cache::DeleterFn, rocksdb::Cache::Handle**, rocksdb::Cache::Priority)' [-Woverloaded-virtual] virtual rocksdb::Status Insert(const rocksdb::Slice& key, void* value, size_t charge, ^~~~~~ Signed-off-by: cuiming --- src/kv/rocksdb_cache/ShardedCache.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/kv/rocksdb_cache/ShardedCache.h b/src/kv/rocksdb_cache/ShardedCache.h index ef4b10d8f273..63a56c4577eb 100644 --- a/src/kv/rocksdb_cache/ShardedCache.h +++ b/src/kv/rocksdb_cache/ShardedCache.h @@ -66,11 +66,14 @@ class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache { virtual ~ShardedCache() = default; // rocksdb::Cache virtual const char* Name() const override = 0; + using rocksdb::Cache::Insert; virtual rocksdb::Status Insert(const rocksdb::Slice& key, void* value, size_t charge, DeleterFn, rocksdb::Cache::Handle** handle, Priority priority) override; + using rocksdb::Cache::Lookup; virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, rocksdb::Statistics* stats) override; virtual bool Ref(rocksdb::Cache::Handle* handle) override; + using rocksdb::Cache::Release; virtual bool Release(rocksdb::Cache::Handle* handle, bool force_erase = false) override; virtual void* Value(Handle* handle) override = 0; virtual void Erase(const rocksdb::Slice& key) override; From 4ff02f53fe722c20dbf0bb51d9b786d5f94d18ab Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Sat, 7 Oct 2023 14:01:32 +0800 Subject: [PATCH 0121/2492] crimson/os/seastore/onode_manager: avoid unnecessary delta related overhead Signed-off-by: Xuehan Xu --- .../staged-fltree/fltree_onode_manager.cc | 126 ++++++++++++- .../staged-fltree/fltree_onode_manager.h | 175 ++++++++++++------ .../onode_tree/test_fltree_onode_manager.cc | 9 +- 3 files changed, 246 insertions(+), 64 deletions(-) diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc index dc6c183f5385..6243252682aa 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc @@ -9,6 +9,125 @@ SET_SUBSYS(seastore_onode); namespace crimson::os::seastore::onode { +void FLTreeOnode::Recorder::apply_value_delta( + ceph::bufferlist::const_iterator &bliter, + NodeExtentMutable &value, + laddr_t value_addr) +{ + LOG_PREFIX(FLTreeOnode::Recorder::apply_value_delta); + delta_op_t op; + try { + ceph::decode(op, bliter); + auto &mlayout = *reinterpret_cast(value.get_write()); + switch (op) { + case delta_op_t::UPDATE_ONODE_SIZE: + DEBUG("update onode size"); + bliter.copy(sizeof(mlayout.size), (char *)&mlayout.size); + break; + case delta_op_t::UPDATE_OMAP_ROOT: + DEBUG("update omap root"); + 
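      // Decode discipline (a hedged annotation, not part of the patch): each
      // case must consume exactly the bytes that the matching encode_update()
      // branch appended, in the same order. For this omap-root case the
      // encoder appends the raw fixed-width struct:
      //   encoded.append((const char *)&layout.omap_root,
      //                  sizeof(layout.omap_root));
      // so the copy below reads the same sizeof(mlayout.omap_root) bytes back.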
bliter.copy(sizeof(mlayout.omap_root), (char *)&mlayout.omap_root); + break; + case delta_op_t::UPDATE_XATTR_ROOT: + DEBUG("update xattr root"); + bliter.copy(sizeof(mlayout.xattr_root), (char *)&mlayout.xattr_root); + break; + case delta_op_t::UPDATE_OBJECT_DATA: + DEBUG("update object data"); + bliter.copy(sizeof(mlayout.object_data), (char *)&mlayout.object_data); + break; + case delta_op_t::UPDATE_OBJECT_INFO: + DEBUG("update object info"); + bliter.copy(onode_layout_t::MAX_OI_LENGTH, (char *)&mlayout.oi[0]); + ceph::decode(mlayout.oi_size, bliter); + break; + case delta_op_t::UPDATE_SNAPSET: + DEBUG("update snapset"); + bliter.copy(onode_layout_t::MAX_SS_LENGTH, (char *)&mlayout.ss[0]); + ceph::decode(mlayout.ss_size, bliter); + break; + case delta_op_t::CLEAR_OBJECT_INFO: + DEBUG("clear object info"); + memset(&mlayout.oi[0], 0, mlayout.oi_size); + mlayout.oi_size = 0; + break; + case delta_op_t::CLEAR_SNAPSET: + DEBUG("clear snapset"); + memset(&mlayout.ss[0], 0, mlayout.ss_size); + mlayout.ss_size = 0; + break; + case delta_op_t::CREATE_DEFAULT: + mlayout = onode_layout_t{}; + break; + default: + ceph_abort(); + } + } catch (buffer::error& e) { + ceph_abort(); + } +} + +void FLTreeOnode::Recorder::encode_update( + NodeExtentMutable &payload_mut, delta_op_t op) +{ + LOG_PREFIX(FLTreeOnode::Recorder::encode_update); + auto &layout = *reinterpret_cast( + payload_mut.get_read()); + auto &encoded = get_encoded(payload_mut); + ceph::encode(op, encoded); + switch(op) { + case delta_op_t::UPDATE_ONODE_SIZE: + DEBUG("update onode size"); + encoded.append( + (const char *)&layout.size, + sizeof(layout.size)); + break; + case delta_op_t::UPDATE_OMAP_ROOT: + DEBUG("update omap root"); + encoded.append( + (const char *)&layout.omap_root, + sizeof(layout.omap_root)); + break; + case delta_op_t::UPDATE_XATTR_ROOT: + DEBUG("update xattr root"); + encoded.append( + (const char *)&layout.xattr_root, + sizeof(layout.xattr_root)); + break; + case delta_op_t::UPDATE_OBJECT_DATA: + DEBUG("update object data"); + encoded.append( + (const char *)&layout.object_data, + sizeof(layout.object_data)); + break; + case delta_op_t::UPDATE_OBJECT_INFO: + DEBUG("update object info"); + encoded.append( + (const char *)&layout.oi[0], + onode_layout_t::MAX_OI_LENGTH); + ceph::encode(layout.oi_size, encoded); + break; + case delta_op_t::UPDATE_SNAPSET: + DEBUG("update snapset"); + encoded.append( + (const char *)&layout.ss[0], + onode_layout_t::MAX_SS_LENGTH); + ceph::encode(layout.ss_size, encoded); + break; + case delta_op_t::CREATE_DEFAULT: + DEBUG("create default layout"); + [[fallthrough]]; + case delta_op_t::CLEAR_OBJECT_INFO: + DEBUG("clear object info"); + [[fallthrough]]; + case delta_op_t::CLEAR_SNAPSET: + DEBUG("clear snapset"); + break; + default: + ceph_abort(); + } +} + FLTreeOnodeManager::contains_onode_ret FLTreeOnodeManager::contains_onode( Transaction &trans, const ghobject_t &hoid) @@ -57,9 +176,7 @@ FLTreeOnodeManager::get_or_create_onode( cursor.value()); if (created) { DEBUGT("created onode for entry for {}", trans, hoid); - onode->with_mutable_layout(trans, [](onode_layout_t &mlayout) { - mlayout = onode_layout_t{}; - }); + onode->create_default_layout(trans); } return get_or_create_onode_iertr::make_ready_future(onode); }); @@ -93,9 +210,6 @@ FLTreeOnodeManager::erase_onode_ret FLTreeOnodeManager::erase_onode( { auto &flonode = static_cast(*onode); assert(flonode.is_alive()); - if (flonode.status == FLTreeOnode::status_t::MUTATED) { - flonode.populate_recorder(trans); - } flonode.mark_delete(); 
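
Here mark_delete() simply flips the onode's status before the tree erase; no populate_recorder() flush is needed anymore because each mutator in this patch records its delta at the moment of change. What makes eager recording safe is the strict symmetry between encode_update() and apply_value_delta(): replay must consume exactly the bytes the encoder appended, op tag first. A self-contained sketch of that round-trip, using a hypothetical layout_t and a plain byte vector in place of onode_layout_t and the delta bufferlist:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Simplified stand-ins for onode_layout_t and its delta ops.
    enum class delta_op_t : uint8_t { UPDATE_SIZE, CLEAR_SIZE };
    struct layout_t { uint32_t size = 0; };

    // Encode side: append the op tag, then the raw bytes of the changed field.
    void encode_update(const layout_t& l, delta_op_t op,
                       std::vector<uint8_t>& out) {
      out.push_back(static_cast<uint8_t>(op));
      if (op == delta_op_t::UPDATE_SIZE) {
        const auto* p = reinterpret_cast<const uint8_t*>(&l.size);
        out.insert(out.end(), p, p + sizeof(l.size));
      }
    }

    // Apply side: read the tag, then consume exactly what the encoder
    // appended. Returns the bytes consumed so deltas can be chained.
    size_t apply_value_delta(layout_t& l, const uint8_t* in) {
      auto op = static_cast<delta_op_t>(*in++);
      switch (op) {
      case delta_op_t::UPDATE_SIZE:
        std::memcpy(&l.size, in, sizeof(l.size));
        return 1 + sizeof(l.size);
      case delta_op_t::CLEAR_SIZE:
        l.size = 0;
        return 1;
      }
      return 1;
    }

    int main() {
      layout_t src{4096}, dst{};
      std::vector<uint8_t> deltas;
      encode_update(src, delta_op_t::UPDATE_SIZE, deltas);
      apply_value_delta(dst, deltas.data());
      return dst.size == src.size ? 0 : 1;
    }
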
return tree.erase(trans, flonode); } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h index d5a0dfc7bd5a..86f5cea883b2 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h @@ -47,6 +47,17 @@ struct FLTreeOnode final : Onode, Value { Value(std::forward(args)...) {} struct Recorder : public ValueDeltaRecorder { + enum class delta_op_t : uint8_t { + UPDATE_ONODE_SIZE, + UPDATE_OMAP_ROOT, + UPDATE_XATTR_ROOT, + UPDATE_OBJECT_DATA, + UPDATE_OBJECT_INFO, + UPDATE_SNAPSET, + CLEAR_OBJECT_INFO, + CLEAR_SNAPSET, + CREATE_DEFAULT + }; Recorder(bufferlist &bl) : ValueDeltaRecorder(bl) {} value_magic_t get_header_magic() const final { @@ -56,18 +67,9 @@ struct FLTreeOnode final : Onode, Value { void apply_value_delta( ceph::bufferlist::const_iterator &bliter, NodeExtentMutable &value, - laddr_t) final { - assert(value.get_length() == sizeof(onode_layout_t)); - bliter.copy(value.get_length(), value.get_write()); - } + laddr_t value_addr) final; - void record_delta(NodeExtentMutable &value) { - // TODO: probably could use versioning, etc - assert(value.get_length() == sizeof(onode_layout_t)); - ceph::buffer::ptr bptr(value.get_length()); - memcpy(bptr.c_str(), value.get_read(), value.get_length()); - get_encoded(value).append(bptr); - } + void encode_update(NodeExtentMutable &payload_mut, delta_op_t op); }; bool is_alive() const { @@ -78,86 +80,153 @@ struct FLTreeOnode final : Onode, Value { return *read_payload(); } - template - void with_mutable_layout(Transaction &t, Func&& f) { + template + void with_mutable_layout( + Transaction &t, + layout_func_t &&layout_func) { assert(status != status_t::DELETED); auto p = prepare_mutate_payload< onode_layout_t, Recorder>(t); status = status_t::MUTATED; - f(*reinterpret_cast(p.first.get_write())); - populate_recorder(t); + layout_func(p.first, p.second); + status = status_t::STABLE; } - void populate_recorder(Transaction &t) { - assert(status == status_t::MUTATED); - auto p = prepare_mutate_payload< - onode_layout_t, - Recorder>(t); - if (p.second) { - p.second->record_delta( - p.first); - } - status = status_t::STABLE; + void create_default_layout(Transaction &t) { + with_mutable_layout( + t, + [](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + mlayout = onode_layout_t{}; + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::CREATE_DEFAULT); + } + }); } void update_onode_size(Transaction &t, uint32_t size) final { - with_mutable_layout(t, [size](onode_layout_t &mlayout) { - mlayout.size = size; + with_mutable_layout( + t, + [size](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + mlayout.size = size; + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::UPDATE_ONODE_SIZE); + } }); } void update_omap_root(Transaction &t, omap_root_t &oroot) final { - with_mutable_layout(t, [&oroot](onode_layout_t &mlayout) { - mlayout.omap_root.update(oroot); + with_mutable_layout( + t, + [&oroot](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + mlayout.omap_root.update(oroot); + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::UPDATE_OMAP_ROOT); + } 
}); } void update_xattr_root(Transaction &t, omap_root_t &xroot) final { - with_mutable_layout(t, [&xroot](onode_layout_t &mlayout) { - mlayout.xattr_root.update(xroot); + with_mutable_layout( + t, + [&xroot](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + mlayout.xattr_root.update(xroot); + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::UPDATE_XATTR_ROOT); + } }); } void update_object_data(Transaction &t, object_data_t &odata) final { - with_mutable_layout(t, [&odata](onode_layout_t &mlayout) { - mlayout.object_data.update(odata); + with_mutable_layout( + t, + [&odata](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + mlayout.object_data.update(odata); + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::UPDATE_OBJECT_DATA); + } }); } void update_object_info(Transaction &t, ceph::bufferlist &oi_bl) final { - with_mutable_layout(t, [&oi_bl](onode_layout_t &mlayout) { - maybe_inline_memcpy( - &mlayout.oi[0], - oi_bl.c_str(), - oi_bl.length(), - onode_layout_t::MAX_OI_LENGTH); - mlayout.oi_size = oi_bl.length(); + with_mutable_layout( + t, + [&oi_bl](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + maybe_inline_memcpy( + &mlayout.oi[0], + oi_bl.c_str(), + oi_bl.length(), + onode_layout_t::MAX_OI_LENGTH); + mlayout.oi_size = oi_bl.length(); + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::UPDATE_OBJECT_INFO); + } }); } void clear_object_info(Transaction &t) final { - with_mutable_layout(t, [](onode_layout_t &mlayout) { - memset(&mlayout.oi[0], 0, mlayout.oi_size); - mlayout.oi_size = 0; + with_mutable_layout( + t, [](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + memset(&mlayout.oi[0], 0, mlayout.oi_size); + mlayout.oi_size = 0; + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::CLEAR_OBJECT_INFO); + } }); } void update_snapset(Transaction &t, ceph::bufferlist &ss_bl) final { - with_mutable_layout(t, [&ss_bl](onode_layout_t &mlayout) { - maybe_inline_memcpy( - &mlayout.ss[0], - ss_bl.c_str(), - ss_bl.length(), - onode_layout_t::MAX_OI_LENGTH); - mlayout.ss_size = ss_bl.length(); + with_mutable_layout( + t, + [&ss_bl](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + maybe_inline_memcpy( + &mlayout.ss[0], + ss_bl.c_str(), + ss_bl.length(), + onode_layout_t::MAX_OI_LENGTH); + mlayout.ss_size = ss_bl.length(); + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::UPDATE_SNAPSET); + } }); } void clear_snapset(Transaction &t) final { - with_mutable_layout(t, [](onode_layout_t &mlayout) { - memset(&mlayout.ss[0], 0, mlayout.ss_size); - mlayout.ss_size = 0; + with_mutable_layout( + t, + [](NodeExtentMutable &payload_mut, Recorder *recorder) { + auto &mlayout = *reinterpret_cast( + payload_mut.get_write()); + memset(&mlayout.ss[0], 0, mlayout.ss_size); + mlayout.ss_size = 0; + if (recorder) { + recorder->encode_update( + payload_mut, Recorder::delta_op_t::CLEAR_SNAPSET); + } }); } diff --git a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc index cd5dd4407f80..92ab147ef89b 100644 --- 
a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc +++ b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc @@ -31,11 +31,10 @@ struct onode_item_t { void initialize(Transaction& t, Onode& value) const { auto &ftvalue = static_cast(value); - ftvalue.with_mutable_layout(t, [this, &value](auto &mlayout) { - mlayout.size = size; - mlayout.omap_root.update(omap_root_t(id, cnt_modify, - value.get_metadata_hint(block_size))); - }); + ftvalue.update_onode_size(t, size); + auto oroot = omap_root_t(id, cnt_modify, + value.get_metadata_hint(block_size)); + ftvalue.update_omap_root(t, oroot); validate(value); } From 23c73912c7ca4483b59558c3e1735d61e545a92e Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 10 Oct 2023 11:47:38 +0800 Subject: [PATCH 0122/2492] crimson/os/seastore/onode_manager: drop unnecessary status Signed-off-by: Xuehan Xu --- .../onode_manager/staged-fltree/fltree_onode_manager.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h index 86f5cea883b2..33109e50f6a6 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h @@ -25,10 +25,9 @@ struct FLTreeOnode final : Onode, Value { }; enum class status_t { - STABLE, - MUTATED, + ALIVE, DELETED - } status = status_t::STABLE; + } status = status_t::ALIVE; FLTreeOnode(FLTreeOnode&&) = default; FLTreeOnode& operator=(FLTreeOnode&&) = delete; @@ -88,9 +87,7 @@ struct FLTreeOnode final : Onode, Value { auto p = prepare_mutate_payload< onode_layout_t, Recorder>(t); - status = status_t::MUTATED; layout_func(p.first, p.second); - status = status_t::STABLE; } void create_default_layout(Transaction &t) { From 194dd09263b23a7c5e0a06cd59841bca5f89c7f4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 30 Sep 2023 13:34:44 +0200 Subject: [PATCH 0123/2492] qa/suites/rbd: drop cache tiering workload tests Cache tiering facets have been a constant source of job timeouts accompanied by "slow request" warnings on the OSDs for at least two years. Same workloads pass without pool/small-cache-pool.yaml or thrashers/cache.yaml. See cache tiering deprecation note added in commit 535b8db33ea0 ("doc: deprecate the cache tiering"). 
Fixes: https://tracker.ceph.com/issues/63149 Signed-off-by: Ilya Dryomov --- qa/suites/rbd/basic/cachepool/.qa | 1 - qa/suites/rbd/basic/cachepool/none.yaml | 0 qa/suites/rbd/basic/cachepool/small.yaml | 17 --------------- qa/suites/rbd/cli/pool/small-cache-pool.yaml | 17 --------------- qa/suites/rbd/cli_v1/pool/.qa | 1 - qa/suites/rbd/cli_v1/pool/none.yaml | 0 .../rbd/cli_v1/pool/small-cache-pool.yaml | 17 --------------- .../rbd/encryption/pool/ec-cache-pool.yaml | 21 ------------------- .../rbd/encryption/pool/small-cache-pool.yaml | 17 --------------- .../rbd/librbd/pool/small-cache-pool.yaml | 17 --------------- qa/suites/rbd/qemu/pool/ec-cache-pool.yaml | 21 ------------------- qa/suites/rbd/qemu/pool/small-cache-pool.yaml | 17 --------------- qa/suites/rbd/thrash/thrashers/cache.yaml | 21 ------------------- 13 files changed, 167 deletions(-) delete mode 120000 qa/suites/rbd/basic/cachepool/.qa delete mode 100644 qa/suites/rbd/basic/cachepool/none.yaml delete mode 100644 qa/suites/rbd/basic/cachepool/small.yaml delete mode 100644 qa/suites/rbd/cli/pool/small-cache-pool.yaml delete mode 120000 qa/suites/rbd/cli_v1/pool/.qa delete mode 100644 qa/suites/rbd/cli_v1/pool/none.yaml delete mode 100644 qa/suites/rbd/cli_v1/pool/small-cache-pool.yaml delete mode 100644 qa/suites/rbd/encryption/pool/ec-cache-pool.yaml delete mode 100644 qa/suites/rbd/encryption/pool/small-cache-pool.yaml delete mode 100644 qa/suites/rbd/librbd/pool/small-cache-pool.yaml delete mode 100644 qa/suites/rbd/qemu/pool/ec-cache-pool.yaml delete mode 100644 qa/suites/rbd/qemu/pool/small-cache-pool.yaml delete mode 100644 qa/suites/rbd/thrash/thrashers/cache.yaml diff --git a/qa/suites/rbd/basic/cachepool/.qa b/qa/suites/rbd/basic/cachepool/.qa deleted file mode 120000 index a602a0353e75..000000000000 --- a/qa/suites/rbd/basic/cachepool/.qa +++ /dev/null @@ -1 +0,0 @@ -../.qa/ \ No newline at end of file diff --git a/qa/suites/rbd/basic/cachepool/none.yaml b/qa/suites/rbd/basic/cachepool/none.yaml deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/qa/suites/rbd/basic/cachepool/small.yaml b/qa/suites/rbd/basic/cachepool/small.yaml deleted file mode 100644 index bad95eaddf25..000000000000 --- a/qa/suites/rbd/basic/cachepool/small.yaml +++ /dev/null @@ -1,17 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd pool set cache target_max_objects 250 diff --git a/qa/suites/rbd/cli/pool/small-cache-pool.yaml b/qa/suites/rbd/cli/pool/small-cache-pool.yaml deleted file mode 100644 index bad95eaddf25..000000000000 --- a/qa/suites/rbd/cli/pool/small-cache-pool.yaml +++ /dev/null @@ -1,17 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd 
pool set cache target_max_objects 250 diff --git a/qa/suites/rbd/cli_v1/pool/.qa b/qa/suites/rbd/cli_v1/pool/.qa deleted file mode 120000 index a602a0353e75..000000000000 --- a/qa/suites/rbd/cli_v1/pool/.qa +++ /dev/null @@ -1 +0,0 @@ -../.qa/ \ No newline at end of file diff --git a/qa/suites/rbd/cli_v1/pool/none.yaml b/qa/suites/rbd/cli_v1/pool/none.yaml deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/qa/suites/rbd/cli_v1/pool/small-cache-pool.yaml b/qa/suites/rbd/cli_v1/pool/small-cache-pool.yaml deleted file mode 100644 index bad95eaddf25..000000000000 --- a/qa/suites/rbd/cli_v1/pool/small-cache-pool.yaml +++ /dev/null @@ -1,17 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd pool set cache target_max_objects 250 diff --git a/qa/suites/rbd/encryption/pool/ec-cache-pool.yaml b/qa/suites/rbd/encryption/pool/ec-cache-pool.yaml deleted file mode 100644 index a0f88b4096d0..000000000000 --- a/qa/suites/rbd/encryption/pool/ec-cache-pool.yaml +++ /dev/null @@ -1,21 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - - sudo ceph osd pool delete rbd rbd --yes-i-really-really-mean-it - - sudo ceph osd pool create rbd 4 4 erasure teuthologyprofile - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd pool set cache target_max_objects 250 - - rbd pool init rbd diff --git a/qa/suites/rbd/encryption/pool/small-cache-pool.yaml b/qa/suites/rbd/encryption/pool/small-cache-pool.yaml deleted file mode 100644 index bad95eaddf25..000000000000 --- a/qa/suites/rbd/encryption/pool/small-cache-pool.yaml +++ /dev/null @@ -1,17 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd pool set cache target_max_objects 250 diff --git a/qa/suites/rbd/librbd/pool/small-cache-pool.yaml b/qa/suites/rbd/librbd/pool/small-cache-pool.yaml deleted file mode 100644 index bad95eaddf25..000000000000 --- a/qa/suites/rbd/librbd/pool/small-cache-pool.yaml +++ /dev/null @@ -1,17 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph 
osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd pool set cache target_max_objects 250 diff --git a/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml b/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml deleted file mode 100644 index a0f88b4096d0..000000000000 --- a/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml +++ /dev/null @@ -1,21 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - - sudo ceph osd pool delete rbd rbd --yes-i-really-really-mean-it - - sudo ceph osd pool create rbd 4 4 erasure teuthologyprofile - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd pool set cache target_max_objects 250 - - rbd pool init rbd diff --git a/qa/suites/rbd/qemu/pool/small-cache-pool.yaml b/qa/suites/rbd/qemu/pool/small-cache-pool.yaml deleted file mode 100644 index bad95eaddf25..000000000000 --- a/qa/suites/rbd/qemu/pool/small-cache-pool.yaml +++ /dev/null @@ -1,17 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd pool set cache target_max_objects 250 diff --git a/qa/suites/rbd/thrash/thrashers/cache.yaml b/qa/suites/rbd/thrash/thrashers/cache.yaml deleted file mode 100644 index b434e28be8a8..000000000000 --- a/qa/suites/rbd/thrash/thrashers/cache.yaml +++ /dev/null @@ -1,21 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - but it is still running - - objects unfound and apparently lost - - overall HEALTH_ - - \(CACHE_POOL_NEAR_FULL\) - - \(CACHE_POOL_NO_HIT_SET\) -tasks: -- exec: - client.0: - - sudo ceph osd pool create cache 4 - - sudo ceph osd tier add rbd cache - - sudo ceph osd tier cache-mode cache writeback - - sudo ceph osd tier set-overlay rbd cache - - sudo ceph osd pool set cache hit_set_type bloom - - sudo ceph osd pool set cache hit_set_count 8 - - sudo ceph osd pool set cache hit_set_period 60 - - sudo ceph osd pool set cache target_max_objects 250 -- thrashosds: - timeout: 1200 From 83880580aa15a12fb238a8da2e9355da9cf69cbf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 30 Sep 2023 11:39:32 +0200 Subject: [PATCH 0124/2492] qa/suites/rbd: deduplicate (data) pool facets With cache tiering facets gone, "pool" facets are strictly about --data-pool option now. Rename to "data-pool" and create symlinks to a common directory. 
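To make the new layout concrete, here is a minimal Python sketch of what the rename plus symlinks amount to (the paths come from the diff below; the checkout root and the sketch itself are illustrative, not part of the patch):

    import os

    root = 'ceph'  # assumed checkout root, for illustration only
    common = os.path.join(root, 'qa/rbd/data-pool')
    os.makedirs(common, exist_ok=True)

    # The shared facets now live exactly once, in the common directory.
    for facet in ('none.yaml', 'ec.yaml', 'replicated.yaml'):
        open(os.path.join(common, facet), 'a').close()

    # Each suite points at them through a relative symlink that resolves
    # via the suite's .qa link back up to the qa/ tree; rbd/migration uses
    # the name 5-data-pool to keep its facet ordering.
    for suite in ('cli', 'encryption', 'librbd', 'qemu'):
        link = os.path.join(root, 'qa/suites/rbd', suite, 'data-pool')
        if not os.path.islink(link):
            os.symlink('.qa/rbd/data-pool/', link)

With this shape, adding or fixing a data-pool facet is a single edit that every rbd suite picks up.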
Signed-off-by: Ilya Dryomov --- .../data-pool/ec.yaml} | 0 .../rbd/cli/pool => rbd/data-pool}/none.yaml | 0 .../data-pool/replicated.yaml} | 0 qa/suites/rbd/cli/data-pool | 1 + qa/suites/rbd/cli/pool/.qa | 1 - qa/suites/rbd/encryption/data-pool | 1 + qa/suites/rbd/encryption/pool/.qa | 1 - .../rbd/encryption/pool/ec-data-pool.yaml | 24 ------------------- qa/suites/rbd/encryption/pool/none.yaml | 0 .../encryption/pool/replicated-data-pool.yaml | 11 --------- qa/suites/rbd/librbd/data-pool | 1 + qa/suites/rbd/librbd/pool/.qa | 1 - qa/suites/rbd/librbd/pool/ec-data-pool.yaml | 24 ------------------- qa/suites/rbd/librbd/pool/none.yaml | 0 .../rbd/librbd/pool/replicated-data-pool.yaml | 11 --------- qa/suites/rbd/migration/5-data-pool | 1 + qa/suites/rbd/migration/5-pool/.qa | 1 - .../rbd/migration/5-pool/ec-data-pool.yaml | 24 ------------------- qa/suites/rbd/migration/5-pool/none.yaml | 0 .../5-pool/replicated-data-pool.yaml | 11 --------- qa/suites/rbd/qemu/data-pool | 1 + qa/suites/rbd/qemu/pool/.qa | 1 - qa/suites/rbd/qemu/pool/ec-data-pool.yaml | 24 ------------------- qa/suites/rbd/qemu/pool/none.yaml | 0 .../rbd/qemu/pool/replicated-data-pool.yaml | 11 --------- 25 files changed, 5 insertions(+), 145 deletions(-) rename qa/{suites/rbd/cli/pool/ec-data-pool.yaml => rbd/data-pool/ec.yaml} (100%) rename qa/{suites/rbd/cli/pool => rbd/data-pool}/none.yaml (100%) rename qa/{suites/rbd/cli/pool/replicated-data-pool.yaml => rbd/data-pool/replicated.yaml} (100%) create mode 120000 qa/suites/rbd/cli/data-pool delete mode 120000 qa/suites/rbd/cli/pool/.qa create mode 120000 qa/suites/rbd/encryption/data-pool delete mode 120000 qa/suites/rbd/encryption/pool/.qa delete mode 100644 qa/suites/rbd/encryption/pool/ec-data-pool.yaml delete mode 100644 qa/suites/rbd/encryption/pool/none.yaml delete mode 100644 qa/suites/rbd/encryption/pool/replicated-data-pool.yaml create mode 120000 qa/suites/rbd/librbd/data-pool delete mode 120000 qa/suites/rbd/librbd/pool/.qa delete mode 100644 qa/suites/rbd/librbd/pool/ec-data-pool.yaml delete mode 100644 qa/suites/rbd/librbd/pool/none.yaml delete mode 100644 qa/suites/rbd/librbd/pool/replicated-data-pool.yaml create mode 120000 qa/suites/rbd/migration/5-data-pool delete mode 120000 qa/suites/rbd/migration/5-pool/.qa delete mode 100644 qa/suites/rbd/migration/5-pool/ec-data-pool.yaml delete mode 100644 qa/suites/rbd/migration/5-pool/none.yaml delete mode 100644 qa/suites/rbd/migration/5-pool/replicated-data-pool.yaml create mode 120000 qa/suites/rbd/qemu/data-pool delete mode 120000 qa/suites/rbd/qemu/pool/.qa delete mode 100644 qa/suites/rbd/qemu/pool/ec-data-pool.yaml delete mode 100644 qa/suites/rbd/qemu/pool/none.yaml delete mode 100644 qa/suites/rbd/qemu/pool/replicated-data-pool.yaml diff --git a/qa/suites/rbd/cli/pool/ec-data-pool.yaml b/qa/rbd/data-pool/ec.yaml similarity index 100% rename from qa/suites/rbd/cli/pool/ec-data-pool.yaml rename to qa/rbd/data-pool/ec.yaml diff --git a/qa/suites/rbd/cli/pool/none.yaml b/qa/rbd/data-pool/none.yaml similarity index 100% rename from qa/suites/rbd/cli/pool/none.yaml rename to qa/rbd/data-pool/none.yaml diff --git a/qa/suites/rbd/cli/pool/replicated-data-pool.yaml b/qa/rbd/data-pool/replicated.yaml similarity index 100% rename from qa/suites/rbd/cli/pool/replicated-data-pool.yaml rename to qa/rbd/data-pool/replicated.yaml diff --git a/qa/suites/rbd/cli/data-pool b/qa/suites/rbd/cli/data-pool new file mode 120000 index 000000000000..3df827572804 --- /dev/null +++ b/qa/suites/rbd/cli/data-pool @@ -0,0 +1 @@ 
+.qa/rbd/data-pool/ \ No newline at end of file diff --git a/qa/suites/rbd/cli/pool/.qa b/qa/suites/rbd/cli/pool/.qa deleted file mode 120000 index a602a0353e75..000000000000 --- a/qa/suites/rbd/cli/pool/.qa +++ /dev/null @@ -1 +0,0 @@ -../.qa/ \ No newline at end of file diff --git a/qa/suites/rbd/encryption/data-pool b/qa/suites/rbd/encryption/data-pool new file mode 120000 index 000000000000..3df827572804 --- /dev/null +++ b/qa/suites/rbd/encryption/data-pool @@ -0,0 +1 @@ +.qa/rbd/data-pool/ \ No newline at end of file diff --git a/qa/suites/rbd/encryption/pool/.qa b/qa/suites/rbd/encryption/pool/.qa deleted file mode 120000 index a602a0353e75..000000000000 --- a/qa/suites/rbd/encryption/pool/.qa +++ /dev/null @@ -1 +0,0 @@ -../.qa/ \ No newline at end of file diff --git a/qa/suites/rbd/encryption/pool/ec-data-pool.yaml b/qa/suites/rbd/encryption/pool/ec-data-pool.yaml deleted file mode 100644 index f39a5bb4ca62..000000000000 --- a/qa/suites/rbd/encryption/pool/ec-data-pool.yaml +++ /dev/null @@ -1,24 +0,0 @@ -tasks: -- exec: - client.0: - - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - - sudo ceph osd pool set datapool allow_ec_overwrites true - - rbd pool init datapool - -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - client: - rbd default data pool: datapool - osd: # force bluestore since it's required for ec overwrites - osd objectstore: bluestore - bluestore block size: 96636764160 - enable experimental unrecoverable data corrupting features: "*" - osd debug randomize hobject sort order: false -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true diff --git a/qa/suites/rbd/encryption/pool/none.yaml b/qa/suites/rbd/encryption/pool/none.yaml deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/qa/suites/rbd/encryption/pool/replicated-data-pool.yaml b/qa/suites/rbd/encryption/pool/replicated-data-pool.yaml deleted file mode 100644 index c5647dba1c68..000000000000 --- a/qa/suites/rbd/encryption/pool/replicated-data-pool.yaml +++ /dev/null @@ -1,11 +0,0 @@ -tasks: -- exec: - client.0: - - sudo ceph osd pool create datapool 4 - - rbd pool init datapool - -overrides: - ceph: - conf: - client: - rbd default data pool: datapool diff --git a/qa/suites/rbd/librbd/data-pool b/qa/suites/rbd/librbd/data-pool new file mode 120000 index 000000000000..3df827572804 --- /dev/null +++ b/qa/suites/rbd/librbd/data-pool @@ -0,0 +1 @@ +.qa/rbd/data-pool/ \ No newline at end of file diff --git a/qa/suites/rbd/librbd/pool/.qa b/qa/suites/rbd/librbd/pool/.qa deleted file mode 120000 index a602a0353e75..000000000000 --- a/qa/suites/rbd/librbd/pool/.qa +++ /dev/null @@ -1 +0,0 @@ -../.qa/ \ No newline at end of file diff --git a/qa/suites/rbd/librbd/pool/ec-data-pool.yaml b/qa/suites/rbd/librbd/pool/ec-data-pool.yaml deleted file mode 100644 index f39a5bb4ca62..000000000000 --- a/qa/suites/rbd/librbd/pool/ec-data-pool.yaml +++ /dev/null @@ -1,24 +0,0 @@ -tasks: -- exec: - client.0: - - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - - sudo ceph osd pool set datapool allow_ec_overwrites true - - rbd pool init datapool - -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - 
client: - rbd default data pool: datapool - osd: # force bluestore since it's required for ec overwrites - osd objectstore: bluestore - bluestore block size: 96636764160 - enable experimental unrecoverable data corrupting features: "*" - osd debug randomize hobject sort order: false -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true diff --git a/qa/suites/rbd/librbd/pool/none.yaml b/qa/suites/rbd/librbd/pool/none.yaml deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/qa/suites/rbd/librbd/pool/replicated-data-pool.yaml b/qa/suites/rbd/librbd/pool/replicated-data-pool.yaml deleted file mode 100644 index c5647dba1c68..000000000000 --- a/qa/suites/rbd/librbd/pool/replicated-data-pool.yaml +++ /dev/null @@ -1,11 +0,0 @@ -tasks: -- exec: - client.0: - - sudo ceph osd pool create datapool 4 - - rbd pool init datapool - -overrides: - ceph: - conf: - client: - rbd default data pool: datapool diff --git a/qa/suites/rbd/migration/5-data-pool b/qa/suites/rbd/migration/5-data-pool new file mode 120000 index 000000000000..3df827572804 --- /dev/null +++ b/qa/suites/rbd/migration/5-data-pool @@ -0,0 +1 @@ +.qa/rbd/data-pool/ \ No newline at end of file diff --git a/qa/suites/rbd/migration/5-pool/.qa b/qa/suites/rbd/migration/5-pool/.qa deleted file mode 120000 index a602a0353e75..000000000000 --- a/qa/suites/rbd/migration/5-pool/.qa +++ /dev/null @@ -1 +0,0 @@ -../.qa/ \ No newline at end of file diff --git a/qa/suites/rbd/migration/5-pool/ec-data-pool.yaml b/qa/suites/rbd/migration/5-pool/ec-data-pool.yaml deleted file mode 100644 index f39a5bb4ca62..000000000000 --- a/qa/suites/rbd/migration/5-pool/ec-data-pool.yaml +++ /dev/null @@ -1,24 +0,0 @@ -tasks: -- exec: - client.0: - - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - - sudo ceph osd pool set datapool allow_ec_overwrites true - - rbd pool init datapool - -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - client: - rbd default data pool: datapool - osd: # force bluestore since it's required for ec overwrites - osd objectstore: bluestore - bluestore block size: 96636764160 - enable experimental unrecoverable data corrupting features: "*" - osd debug randomize hobject sort order: false -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true diff --git a/qa/suites/rbd/migration/5-pool/none.yaml b/qa/suites/rbd/migration/5-pool/none.yaml deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/qa/suites/rbd/migration/5-pool/replicated-data-pool.yaml b/qa/suites/rbd/migration/5-pool/replicated-data-pool.yaml deleted file mode 100644 index c5647dba1c68..000000000000 --- a/qa/suites/rbd/migration/5-pool/replicated-data-pool.yaml +++ /dev/null @@ -1,11 +0,0 @@ -tasks: -- exec: - client.0: - - sudo ceph osd pool create datapool 4 - - rbd pool init datapool - -overrides: - ceph: - conf: - client: - rbd default data pool: datapool diff --git a/qa/suites/rbd/qemu/data-pool b/qa/suites/rbd/qemu/data-pool new file mode 120000 index 000000000000..3df827572804 --- /dev/null +++ b/qa/suites/rbd/qemu/data-pool @@ -0,0 +1 @@ +.qa/rbd/data-pool/ \ No newline at end of file diff --git a/qa/suites/rbd/qemu/pool/.qa b/qa/suites/rbd/qemu/pool/.qa deleted file mode 120000 index a602a0353e75..000000000000 --- 
a/qa/suites/rbd/qemu/pool/.qa +++ /dev/null @@ -1 +0,0 @@ -../.qa/ \ No newline at end of file diff --git a/qa/suites/rbd/qemu/pool/ec-data-pool.yaml b/qa/suites/rbd/qemu/pool/ec-data-pool.yaml deleted file mode 100644 index f39a5bb4ca62..000000000000 --- a/qa/suites/rbd/qemu/pool/ec-data-pool.yaml +++ /dev/null @@ -1,24 +0,0 @@ -tasks: -- exec: - client.0: - - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - - sudo ceph osd pool set datapool allow_ec_overwrites true - - rbd pool init datapool - -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - client: - rbd default data pool: datapool - osd: # force bluestore since it's required for ec overwrites - osd objectstore: bluestore - bluestore block size: 96636764160 - enable experimental unrecoverable data corrupting features: "*" - osd debug randomize hobject sort order: false -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true diff --git a/qa/suites/rbd/qemu/pool/none.yaml b/qa/suites/rbd/qemu/pool/none.yaml deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/qa/suites/rbd/qemu/pool/replicated-data-pool.yaml b/qa/suites/rbd/qemu/pool/replicated-data-pool.yaml deleted file mode 100644 index c5647dba1c68..000000000000 --- a/qa/suites/rbd/qemu/pool/replicated-data-pool.yaml +++ /dev/null @@ -1,11 +0,0 @@ -tasks: -- exec: - client.0: - - sudo ceph osd pool create datapool 4 - - rbd pool init datapool - -overrides: - ceph: - conf: - client: - rbd default data pool: datapool From f987b4daa097a84ca35db4037de1985fc0acaf01 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Tue, 10 Oct 2023 09:04:39 +0000 Subject: [PATCH 0125/2492] os/kv_test: Fix estimate functions We need to use random content to estimate DB size. Otherwise, compression will cause DB to be unreasonably small. 
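The effect is easy to see outside RocksDB: a run of identical bytes compresses to almost nothing, while random bytes barely compress at all, so size estimates taken over compressed constant data are close to meaningless. A standalone Python illustration (zlib standing in for the DB's compression; not part of the patch):

    import os
    import zlib

    constant = b'1' * 1000      # what the test wrote before this change
    random_ = os.urandom(1000)  # what it writes now

    print(len(zlib.compress(constant)))  # on the order of a dozen bytes
    print(len(zlib.compress(random_)))   # ~1000 bytes; random data is incompressible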
Fixes: https://tracker.ceph.com/issues/63121 Signed-off-by: Adam Kupczyk --- src/test/objectstore/test_kv.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/test/objectstore/test_kv.cc b/src/test/objectstore/test_kv.cc index 33ffd6ab3968..95c712ceffa5 100644 --- a/src/test/objectstore/test_kv.cc +++ b/src/test/objectstore/test_kv.cc @@ -29,6 +29,14 @@ using namespace std; +std::string gen_random_string(size_t size) { + std::string s; + for (size_t i = 0; i < size; i++) { + s.push_back(rand()); + } + return s; +} + class KVTest : public ::testing::TestWithParam { public: boost::scoped_ptr db; @@ -556,10 +564,11 @@ TEST_P(KVTest, RocksDB_estimate_size) { for(int test = 0; test < 20; test++) { KeyValueDB::Transaction t = db->get_transaction(); - bufferlist v1; - v1.append(string(1000, '1')); - for (int i = 0; i < 100; i++) + for (int i = 0; i < 100; i++) { + bufferlist v1; + v1.append(gen_random_string(1000)); t->set("A", to_string(rand()%100000), v1); + } db->submit_transaction_sync(t); db->compact(); @@ -588,10 +597,11 @@ TEST_P(KVTest, RocksDB_estimate_size_column_family) { for(int test = 0; test < 20; test++) { KeyValueDB::Transaction t = db->get_transaction(); - bufferlist v1; - v1.append(string(1000, '1')); - for (int i = 0; i < 100; i++) + for (int i = 0; i < 100; i++) { + bufferlist v1; + v1.append(gen_random_string(1000)); t->set("cf1", to_string(rand()%100000), v1); + } db->submit_transaction_sync(t); db->compact(); From f3c0424bb583423db80009c29f73e484aad840b4 Mon Sep 17 00:00:00 2001 From: Milind Changire Date: Mon, 25 Sep 2023 18:19:50 +0530 Subject: [PATCH 0126/2492] mds: do not simplify fragset Problem: Frags in simplified fragset aren't found as is on the replica during scrub. fragset::simplify() computes the bit representation of the least common ancestor of the frags. When this representation is forwarded to the replicas, the ScrubStack::handle_scrub() method in the OP_QUEUEDIR case simply searches this simplified representation in the list of (unsimplified) frags ... which it fails to find. Hence we get to see the "no frag 10*" type of logs. Solution: Do not simplify fragset when forwarding fragset to replica for scrub. Fixes: https://tracker.ceph.com/issues/62658 Signed-off-by: Milind Changire --- src/mds/ScrubStack.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index 6d799343f149..742c464f4d37 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -320,7 +320,7 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done) frag_vec_t frags; in->dirfragtree.get_leaves(frags); - dout(20) << __func__ << "recursive mode, frags " << frags << dendl; + dout(20) << __func__ << " recursive mode, frags " << frags << dendl; for (auto &fg : frags) { if (queued.contains(fg)) continue; @@ -366,7 +366,6 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done) scrub_r.tag = header->get_tag(); for (auto& p : scrub_remote) { - p.second.simplify(); dout(20) << __func__ << " forward " << p.second << " to mds." 
<< p.first << dendl; auto r = make_message(MMDSScrub::OP_QUEUEDIR, in->ino(), std::move(p.second), header->get_tag(), From 90ae55f6d1ecc9c951e5dfa7d5a5d2169d3a917f Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Tue, 10 Oct 2023 11:54:44 +0200 Subject: [PATCH 0127/2492] mgr/rook: fixing rook-ceph-exporter daemon type handling Fixes: https://tracker.ceph.com/issues/63107 Signed-off-by: Redouane Kachach --- src/pybind/mgr/rook/module.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/rook/module.py b/src/pybind/mgr/rook/module.py index b67349d1bff6..32699fa0117a 100644 --- a/src/pybind/mgr/rook/module.py +++ b/src/pybind/mgr/rook/module.py @@ -457,7 +457,17 @@ def _list_daemons(self, for p in pods: sd = orchestrator.DaemonDescription() sd.hostname = p['hostname'] - sd.daemon_type = p['labels']['app'].replace('rook-ceph-', '') + + # In Rook environments, the 'ceph-exporter' daemon is named 'exporter' whereas + # in the orchestrator interface, it is named 'ceph-exporter'. The purpose of the + # following adjustment is to ensure that the 'daemon_type' is correctly set. + # Without this adjustment, the 'service_to_daemon_types' lookup would fail, as + # it would be searching for a non-existent entry called 'exporter + if p['labels']['app'] == 'rook-ceph-exporter': + sd.daemon_type = 'ceph-exporter' + else: + sd.daemon_type = p['labels']['app'].replace('rook-ceph-', '') + status = { 'Pending': orchestrator.DaemonDescriptionStatus.starting, 'Running': orchestrator.DaemonDescriptionStatus.running, From e40752ec25155ab232a68cd44093a9764ebf4091 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 10 Oct 2023 12:31:28 +0200 Subject: [PATCH 0128/2492] qa/suites/rbd: drop redundant ignorelist entries CACHE_POOL_NO_HIT_SET is retained in *api_tests*.yaml and rbd_mirror.yaml snippets for TestLibRBD.ListChildrenTiered and TestClusterWatcher.CachePools tests. 
Signed-off-by: Ilya Dryomov --- qa/rbd/data-pool/ec.yaml | 3 --- qa/suites/rbd/singleton/all/rbd_tasks.yaml | 3 --- 2 files changed, 6 deletions(-) diff --git a/qa/rbd/data-pool/ec.yaml b/qa/rbd/data-pool/ec.yaml index db289c7e7e60..f39a5bb4ca62 100644 --- a/qa/rbd/data-pool/ec.yaml +++ b/qa/rbd/data-pool/ec.yaml @@ -12,9 +12,6 @@ overrides: bdev_inject_crash_probability: .5 ceph: fs: xfs - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NO_HIT_SET\) conf: client: rbd default data pool: datapool diff --git a/qa/suites/rbd/singleton/all/rbd_tasks.yaml b/qa/suites/rbd/singleton/all/rbd_tasks.yaml index 4723eb6800ce..782b0214135e 100644 --- a/qa/suites/rbd/singleton/all/rbd_tasks.yaml +++ b/qa/suites/rbd/singleton/all/rbd_tasks.yaml @@ -4,9 +4,6 @@ tasks: - install: - ceph: fs: xfs - log-ignorelist: - - overall HEALTH_ - - \(CACHE_POOL_NO_HIT_SET\) - workunit: clients: all: [rbd/test_rbd_tasks.sh] From be78b3ea68f75319da05700276630f7f331bbd63 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 3 Oct 2023 01:17:09 +0530 Subject: [PATCH 0129/2492] mds/FSMap: add logging facility Signed-off-by: Rishabh Dave --- src/mds/FSMap.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc index 5dfaaf0e3441..fe7ee5588919 100644 --- a/src/mds/FSMap.cc +++ b/src/mds/FSMap.cc @@ -15,7 +15,7 @@ #include #include "FSMap.h" - +#include "common/debug.h" #include "common/StackStringStream.h" #ifdef WITH_SEASTAR @@ -26,6 +26,11 @@ #include "global/global_context.h" #include "mon/health_check.h" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "FSMap " + using std::list; using std::pair; using std::ostream; From 69e4c9e4729fd0d29ed43fb1f6e94cb1804a3f74 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 3 Oct 2023 01:17:42 +0530 Subject: [PATCH 0130/2492] qa/cephfs: import only sleep() from time Signed-off-by: Rishabh Dave --- qa/tasks/cephfs/test_admin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py index 923871f88e3a..b2e10e4d3cf4 100644 --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@ -1,10 +1,10 @@ import errno import json import logging -import time import uuid from io import StringIO from os.path import join as os_path_join +from time import sleep from teuthology.exceptions import CommandFailedError @@ -802,7 +802,7 @@ def test_fsmap_trim(self): self.fs.set_joinable(b) b = not b - time.sleep(10) # for tick/compaction + sleep(10) # for tick/compaction try: self.fs.status(epoch=epoch) @@ -826,7 +826,7 @@ def test_fsmap_force_trim(self): # force a new fsmap self.fs.set_joinable(False) - time.sleep(10) # for tick/compaction + sleep(10) # for tick/compaction status = self.fs.status() log.debug(f"new epoch is {status['epoch']}") From 6ac58b0a12324ea13c724cbba3107e1eae9a96c3 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 3 Oct 2023 13:41:54 +0530 Subject: [PATCH 0131/2492] qa/cephfs: minor improvement caps_helper.py When assert fails for equality of two variables and when both the variables are printed in error message, print each variable on a new line. 
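Both message formats below are taken from the diff; the sample values are made up. With multi-line values such as caps dumps, the old format put the first line of each value on the label's line, while the new one gives every value its own lines:

    first = 'caps mon = "allow r"\ncaps osd = "allow rw"'
    second = 'caps mon = "allow r"\ncaps osd = "allow r"'

    old = f'Variables are not equal.\nfirst = {first}\nsecond = {second}'
    new = f'Variables are not equal.\nfirst -\n{first}\nsecond -\n{second}'

    print(old)   # first line of each value shares the label's line
    print('---')
    print(new)   # each variable starts on a fresh line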
Signed-off-by: Rishabh Dave --- qa/tasks/cephfs/caps_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/tasks/cephfs/caps_helper.py b/qa/tasks/cephfs/caps_helper.py index f083c788337a..75a40ac83fe8 100644 --- a/qa/tasks/cephfs/caps_helper.py +++ b/qa/tasks/cephfs/caps_helper.py @@ -124,7 +124,7 @@ def get_fsnames_from_moncap(moncap): def assert_equal(first, second): - msg = f'Variables are not equal.\nfirst = {first}\nsecond = {second}' + msg = f'Variables are not equal.\nfirst -\n{first}\nsecond -\n{second}' assert first == second, msg From f8acdf01cb38c208c2ca5f32ae34fa1c0dba8cfb Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Thu, 28 Sep 2023 11:04:49 -0400 Subject: [PATCH 0132/2492] cephadm: convert ceph class to a ContainerDaemonForm Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 83 ++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 6c87d7983b6f..f312ca663436 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -208,7 +208,7 @@ def __eq__(self, other: Any) -> bool: @register_daemon_form -class Ceph(DaemonForm): +class Ceph(ContainerDaemonForm): daemons = ('mon', 'mgr', 'osd', 'mds', 'rgw', 'rbd-mirror', 'crash', 'cephfs-mirror', 'ceph-exporter') @@ -235,6 +235,37 @@ def firewall_service_name(self) -> str: return 'ceph' return '' + def container(self, ctx: CephadmContext) -> CephContainer: + # previous to being a ContainerDaemonForm, this make_var_run + # call was hard coded in the deploy path. Eventually, it would be + # good to move this somwhere cleaner and avoid needing to know the + # uid/gid here. + uid, gid = self.uid_gid(ctx) + make_var_run(ctx, ctx.fsid, uid, gid) + + ctr = get_deployment_container(ctx, self.identity) + config_json = fetch_configs(ctx) + if self.identity.daemon_type == 'mon' and config_json is not None: + if 'crush_location' in config_json: + c_loc = config_json['crush_location'] + # was originally "c.args.extend(['--set-crush-location', c_loc])" + # but that doesn't seem to persist in the object after it's passed + # in further function calls + ctr.args = ctr.args + ['--set-crush-location', c_loc] + return ctr + + _uid_gid: Optional[Tuple[int, int]] = None + + def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: + if self._uid_gid is None: + self._uid_gid = extract_uid_gid(ctx) + return self._uid_gid + + def config_and_keyring( + self, ctx: CephadmContext + ) -> Tuple[Optional[str], Optional[str]]: + return get_config_and_keyring(ctx) + ################################## @@ -245,6 +276,21 @@ def for_daemon_type(cls, daemon_type: str) -> bool: # TODO: figure out a way to un-special-case osd return daemon_type == 'osd' + def __init__( + self, ident: DaemonIdentity, osd_fsid: Optional[str] = None + ) -> None: + super().__init__(ident) + self._osd_fsid = osd_fsid + + @classmethod + def create(cls, ctx: CephadmContext, ident: DaemonIdentity) -> 'OSD': + osd_fsid = getattr(ctx, 'osd_fsid', None) + if osd_fsid is None: + logger.info( + 'Creating an OSD daemon form without an OSD FSID value' + ) + return cls(ident, osd_fsid) + @staticmethod def get_sysctl_settings() -> List[str]: return [ @@ -256,6 +302,10 @@ def get_sysctl_settings() -> List[str]: def firewall_service_name(self) -> str: return 'ceph' + @property + def osd_fsid(self) -> Optional[str]: + return self._osd_fsid + ################################## @@ -5196,37 +5246,8 @@ def _dispatch_deploy( deployment_type: DeploymentType, ) -> None: 
daemon_type = ident.daemon_type

-    if daemon_type in Ceph.daemons:
-        config, keyring = get_config_and_keyring(ctx)
-        uid, gid = extract_uid_gid(ctx)
-        make_var_run(ctx, ctx.fsid, uid, gid)
-
-        config_json = fetch_configs(ctx)
-
-        c = get_deployment_container(ctx, ident, ptrace=ctx.allow_ptrace)
-
-        if daemon_type == 'mon' and config_json is not None:
-            if 'crush_location' in config_json:
-                c_loc = config_json['crush_location']
-                # was originally "c.args.extend(['--set-crush-location', c_loc])"
-                # but that doesn't seem to persist in the object after it's passed
-                # in further function calls
-                c.args = c.args + ['--set-crush-location', c_loc]
-
-        deploy_daemon(
-            ctx,
-            ident,
-            c,
-            uid,
-            gid,
-            config=config,
-            keyring=keyring,
-            osd_fsid=ctx.osd_fsid,
-            deployment_type=deployment_type,
-            endpoints=daemon_endpoints,
-        )
-    elif daemon_type == CephadmAgent.daemon_type:
+    if daemon_type == CephadmAgent.daemon_type:
         # get current user gid and uid
         uid = os.getuid()
         gid = os.getgid()

From af2058536b49309d7408d6cb19d04dc292218419 Mon Sep 17 00:00:00 2001
From: John Mulligan
Date: Thu, 28 Sep 2023 14:15:55 -0400
Subject: [PATCH 0133/2492] cephadm: eliminate _dispatch_deploy function

Eliminate the _dispatch_deploy function, folding it into the
_common_deploy function, because the mass of if-elif lines has been
replaced and keeping it as a separate function no longer serves much of
a useful purpose.

Signed-off-by: John Mulligan
---
 src/cephadm/cephadm.py | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index f312ca663436..f9041317f4db 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -5236,18 +5236,8 @@ def _common_deploy(ctx: CephadmContext) -> None:
     # Get and check ports explicitly required to be opened
     endpoints = fetch_endpoints(ctx)

-    _dispatch_deploy(ctx, ident, endpoints, deployment_type)

-def _dispatch_deploy(
-    ctx: CephadmContext,
-    ident: 'DaemonIdentity',
-    daemon_endpoints: List[EndPoint],
-    deployment_type: DeploymentType,
-) -> None:
-    daemon_type = ident.daemon_type
-
-    if daemon_type == CephadmAgent.daemon_type:
+    if ident.daemon_type == CephadmAgent.daemon_type:
         # get current user gid and uid
         uid = os.getuid()
         gid = os.getgid()
@@ -5258,17 +5248,15 @@
             uid,
             gid,
             deployment_type=deployment_type,
-            endpoints=daemon_endpoints,
+            endpoints=endpoints,
         )
     else:
         try:
-            _deploy_daemon_container(
-                ctx, ident, daemon_endpoints, deployment_type
-            )
+            _deploy_daemon_container(ctx, ident, endpoints, deployment_type)
         except UnexpectedDaemonTypeError:
             raise Error('daemon type {} not implemented in command_deploy function'
-                        .format(daemon_type))
+                        .format(ident.daemon_type))

From 28fe98bd5949c3a6bcae4e78cdae5bbcde4e8d3b Mon Sep 17 00:00:00 2001
From: John Mulligan
Date: Tue, 3 Oct 2023 16:43:59 -0400
Subject: [PATCH 0134/2492] cephadm: stop directly using Ceph.daemons property

The Ceph.daemons property has two unfortunate behaviors: most
importantly, it includes ceph-exporter, which causes the other
CephExporter class to be over-shadowed in the DaemonForms mechanism.
Second, it couples all functions that want to know the names of ceph
daemon types to the Ceph class, preventing future refactoring of that
class.

Break the existing coupling by adding a new `ceph_daemons` function,
similar to `get_supported_daemons` but returning the same value that
Ceph.daemons used to provide. This will permit future fixes and
improvements.
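Reduced to its essentials, the decoupling looks like this (the class name and tuple contents come from the diff; the rest is a sketch): call sites stop reaching into the class attribute and go through a module-level accessor instead, so the storage can move later without touching callers.

    from typing import List

    class Ceph:
        _daemons = ('mon', 'mgr', 'osd', 'mds', 'rgw', 'rbd-mirror',
                    'crash', 'cephfs-mirror')

    def ceph_daemons() -> List[str]:
        # Only the accessor knows the list still lives on the class; a
        # follow-up patch extends the returned list (with ceph-exporter)
        # without re-coupling any caller to Ceph.
        return list(Ceph._daemons)

    # before: if daemon_type in Ceph.daemons: ...
    # after:  if daemon_type in ceph_daemons(): ...
    assert 'mon' in ceph_daemons()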
Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index f9041317f4db..a3b2c22c4813 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -209,13 +209,13 @@ def __eq__(self, other: Any) -> bool: @register_daemon_form class Ceph(ContainerDaemonForm): - daemons = ('mon', 'mgr', 'osd', 'mds', 'rgw', 'rbd-mirror', - 'crash', 'cephfs-mirror', 'ceph-exporter') + _daemons = ('mon', 'mgr', 'osd', 'mds', 'rgw', 'rbd-mirror', + 'crash', 'cephfs-mirror', 'ceph-exporter') @classmethod def for_daemon_type(cls, daemon_type: str) -> bool: # TODO: figure out a way to un-special-case osd - return daemon_type in cls.daemons and daemon_type != 'osd' + return daemon_type in cls._daemons and daemon_type != 'osd' def __init__(self, ident: DaemonIdentity) -> None: self._identity = ident @@ -1708,7 +1708,7 @@ def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: def get_supported_daemons(): # type: () -> List[str] - supported_daemons = list(Ceph.daemons) + supported_daemons = ceph_daemons() supported_daemons.extend(Monitoring.components) supported_daemons.append(NFSGanesha.daemon_type) supported_daemons.append(CephIscsi.daemon_type) @@ -1722,6 +1722,10 @@ def get_supported_daemons(): assert len(supported_daemons) == len(set(supported_daemons)) return supported_daemons + +def ceph_daemons() -> List[str]: + return list(Ceph._daemons) + ################################## @@ -2014,7 +2018,7 @@ def infer_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional container_info = None daemon_name = ctx.name if ('name' in ctx and ctx.name and '.' in ctx.name) else None - daemons_ls = [daemon_name] if daemon_name is not None else Ceph.daemons # daemon types: 'mon', 'mgr', etc + daemons_ls = [daemon_name] if daemon_name is not None else ceph_daemons() # daemon types: 'mon', 'mgr', etc for daemon in daemons_ls: container_info = get_container_info(ctx, daemon, daemon_name is not None) if container_info is not None: @@ -2190,7 +2194,7 @@ def get_daemon_args(ctx: CephadmContext, ident: 'DaemonIdentity') -> List[str]: r = list() # type: List[str] daemon_type = ident.daemon_type - if daemon_type in Ceph.daemons and daemon_type not in ['crash', 'ceph-exporter']: + if daemon_type in ceph_daemons() and daemon_type not in ['crash', 'ceph-exporter']: r += [ '--setuser', 'ceph', '--setgroup', 'ceph', @@ -2309,7 +2313,7 @@ def create_daemon_dirs( fsid, daemon_type = ident.fsid, ident.daemon_type data_dir = make_data_dir(ctx, ident, uid=uid, gid=gid) - if daemon_type in Ceph.daemons: + if daemon_type in ceph_daemons(): make_log_dir(ctx, fsid, uid=uid, gid=gid) if config: @@ -2479,7 +2483,7 @@ def _get_container_mounts_for_type( """ mounts = dict() - if daemon_type in Ceph.daemons: + if daemon_type in ceph_daemons(): if fsid: run_path = os.path.join('/var/run/ceph', fsid) if os.path.exists(run_path): @@ -2548,7 +2552,7 @@ def get_container_mounts( assert ident.fsid assert ident.daemon_id - if daemon_type in Ceph.daemons: + if daemon_type in ceph_daemons(): data_dir = ident.data_dir(ctx.data_dir) if daemon_type == 'rgw': cdata_dir = '/var/lib/ceph/radosgw/ceph-rgw.%s' % (ident.daemon_id) @@ -2693,11 +2697,11 @@ def get_container( host_network: bool = True daemon_type = ident.daemon_type - if daemon_type in Ceph.daemons: + if daemon_type in ceph_daemons(): envs.append('TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728') if container_args is None: 
container_args = [] - unlimited_daemons = set(Ceph.daemons) + unlimited_daemons = set(ceph_daemons()) unlimited_daemons.add(CephIscsi.daemon_type) unlimited_daemons.add(CephNvmeof.daemon_type) unlimited_daemons.add(NFSGanesha.daemon_type) @@ -2777,7 +2781,7 @@ def get_container( container_args.extend(['--security-opt', 'label=disable']) elif daemon_type == 'crash': ceph_args = ['-n', name] - elif daemon_type in Ceph.daemons: + elif daemon_type in ceph_daemons(): ceph_args = ['-n', name, '-f'] elif daemon_type == SNMPGateway.daemon_type: sg = SNMPGateway.init(ctx, ident.fsid, ident.daemon_id) @@ -2966,7 +2970,7 @@ def deploy_daemon( # If this was a reconfig and the daemon is not a Ceph daemon, restart it # so it can pick up potential changes to its configuration files - if deployment_type == DeploymentType.RECONFIG and daemon_type not in Ceph.daemons: + if deployment_type == DeploymentType.RECONFIG and daemon_type not in ceph_daemons(): # ceph daemons do not need a restart; others (presumably) do to pick # up the new config call_throws(ctx, ['systemctl', 'reset-failed', ident.unit_name]) @@ -3115,7 +3119,7 @@ def deploy_daemon_units( f.write('set -e\n') - if daemon_type in Ceph.daemons: + if daemon_type in ceph_daemons(): install_path = find_program('install') f.write('{install_path} -d -m0770 -o {uid} -g {gid} /var/run/ceph/{fsid}\n'.format(install_path=install_path, fsid=fsid, uid=uid, gid=gid)) @@ -5320,7 +5324,7 @@ def command_shell(ctx): daemon_type = 'osd' # get the most mounts daemon_id = None - if ctx.fsid and daemon_type in Ceph.daemons: + if ctx.fsid and daemon_type in ceph_daemons(): make_log_dir(ctx, ctx.fsid) if daemon_id and not ctx.fsid: @@ -5667,7 +5671,7 @@ def list_daemons(ctx, detail=True, legacy_dir=None): if daemon_type == CephNvmeof.daemon_type: version = CephNvmeof.get_version(ctx, container_id) elif not version: - if daemon_type in Ceph.daemons: + if daemon_type in ceph_daemons(): out, err, code = call(ctx, [container_path, 'exec', container_id, 'ceph', '-v'], @@ -5856,7 +5860,7 @@ def command_adopt(ctx): lock.acquire() # call correct adoption - if daemon_type in Ceph.daemons: + if daemon_type in ceph_daemons(): command_adopt_ceph(ctx, daemon_type, daemon_id, fsid) elif daemon_type == 'prometheus': command_adopt_prometheus(ctx, daemon_id, fsid) From d9314780a59e991afce036236d425a5ed8368d09 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 16:51:49 -0400 Subject: [PATCH 0135/2492] cephadm: mock os.path.listdir in daemon forms test Prevent classes that want to check the filesystem from breaking the simple daemon forms instantiation test case. A better future fix would be avoiding checking the file system during __init__ of the class but that is left for future improvements. 
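The isolation pattern, as a self-contained pytest sketch (the setattr line is the one from the diff; the toy test around it is illustrative): stubbing os.path.isdir lets daemon-form constructors that validate directories, such as CephExporter's sock_dir check, instantiate without a real filesystem.

    import os
    from unittest import mock

    def test_daemon_forms_without_a_real_fs(monkeypatch):  # pytest fixture
        _os_path_isdir = mock.MagicMock(return_value=True)
        monkeypatch.setattr('os.path.isdir', _os_path_isdir)
        # Any directory-existence probe now succeeds...
        assert os.path.isdir('/no/such/dir')
        # ...and the mock records that it was consulted.
        _os_path_isdir.assert_called_once_with('/no/such/dir')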
Signed-off-by: John Mulligan --- src/cephadm/tests/test_daemon_form.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cephadm/tests/test_daemon_form.py b/src/cephadm/tests/test_daemon_form.py index 428183aaa3e1..07896cc58559 100644 --- a/src/cephadm/tests/test_daemon_form.py +++ b/src/cephadm/tests/test_daemon_form.py @@ -61,7 +61,7 @@ def test_is_sysctl_daemon_form(dt, is_sdf): assert isinstance(inst, daemon_form.SysctlDaemonForm) == is_sdf -def test_can_create_all_daemon_forms(): +def test_can_create_all_daemon_forms(monkeypatch): uuid = 'daeb985e-58c7-11ee-a536-201e8814f771' ctx = mock.MagicMock() ctx.config_blobs = { @@ -69,6 +69,8 @@ def test_can_create_all_daemon_forms(): 'pool': 'swimming', 'destination': 'earth', } + _os_path_isdir = mock.MagicMock(return_value=True) + monkeypatch.setattr('os.path.isdir', _os_path_isdir) dtypes = _cephadm.get_supported_daemons() for daemon_type in dtypes: if daemon_type == 'agent': From ead4cf25c959ea6edb04e68fcc45eed1a80be4f9 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 3 Oct 2023 16:52:09 -0400 Subject: [PATCH 0136/2492] cephadm: convert ceph exporter type to a ContainerDaemonForm CephExporter was being (partially) over-shadowed by the Ceph class as the Ceph class listed 'ceph-exporter' as one of the daemon types it handled. This change updates CephExporter to a ContainerDaemonForm while simultaneously breaking the link between Ceph and 'ceph-exporter', allowing CephExporter to handle all the duty of managing ceph-exporter, continuing the process of having clearer logical responsibilities and class hierarchy in cephadm. Signed-off-by: John Mulligan --- src/cephadm/cephadm.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index a3b2c22c4813..a64b0b49404d 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -210,7 +210,7 @@ def __eq__(self, other: Any) -> bool: @register_daemon_form class Ceph(ContainerDaemonForm): _daemons = ('mon', 'mgr', 'osd', 'mds', 'rgw', 'rbd-mirror', - 'crash', 'cephfs-mirror', 'ceph-exporter') + 'crash', 'cephfs-mirror') @classmethod def for_daemon_type(cls, daemon_type: str) -> bool: @@ -1196,7 +1196,7 @@ def config_and_keyring( @register_daemon_form -class CephExporter(DaemonForm): +class CephExporter(ContainerDaemonForm): """Defines a Ceph exporter container""" daemon_type = 'ceph-exporter' @@ -1264,6 +1264,17 @@ def validate(self) -> None: if not os.path.isdir(self.sock_dir): raise Error(f'Directory does not exist. Got: {self.sock_dir}') + def container(self, ctx: CephadmContext) -> CephContainer: + return get_deployment_container(ctx, self.identity) + + def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: + return extract_uid_gid(ctx) + + def config_and_keyring( + self, ctx: CephadmContext + ) -> Tuple[Optional[str], Optional[str]]: + return get_config_and_keyring(ctx) + ################################## @@ -1724,7 +1735,9 @@ def get_supported_daemons(): def ceph_daemons() -> List[str]: - return list(Ceph._daemons) + cds = list(Ceph._daemons) + cds.append(CephExporter.daemon_type) + return cds ################################## From 2f2cd3bcff82afc3a4d251143eb462e700e7fc60 Mon Sep 17 00:00:00 2001 From: Ramana Raja Date: Sun, 17 Sep 2023 22:52:56 -0400 Subject: [PATCH 0137/2492] qa/suites/rbd: add test to check rbd_support module recovery ... on repeated blocklisting of its client. 
There were issues with the rbd_support module not being able to recover
from its RADOS client being repeatedly blocklisted. This occurred, for
example, in clusters with OSDs slow to process RBD requests while the
module's mirror_snapshot_scheduler was taking mirror snapshots by
requesting exclusive locks on the RBD images and workloads were running
on the snapshotted images via kernel clients.

Fixes: https://tracker.ceph.com/issues/62891
Signed-off-by: Ramana Raja
---
 .../rbd_support_module_recovery.yaml          | 13 ++++
 .../rbd/rbd_support_module_recovery.sh        | 77 +++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml
 create mode 100755 qa/workunits/rbd/rbd_support_module_recovery.sh

diff --git a/qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml b/qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml
new file mode 100644
index 000000000000..aa4d0001fc09
--- /dev/null
+++ b/qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml
@@ -0,0 +1,13 @@
+overrides:
+  ceph:
+    conf:
+      mgr:
+        debug rbd: 20
+tasks:
+- install:
+    extra_system_packages:
+      - fio
+- workunit:
+    clients:
+      client.0:
+        - rbd/rbd_support_module_recovery.sh
diff --git a/qa/workunits/rbd/rbd_support_module_recovery.sh b/qa/workunits/rbd/rbd_support_module_recovery.sh
new file mode 100755
index 000000000000..e9defced24a8
--- /dev/null
+++ b/qa/workunits/rbd/rbd_support_module_recovery.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+set -ex
+
+POOL=rbd
+IMAGE_PREFIX=image
+NUM_IMAGES=20
+RUN_TIME=3600
+
+rbd mirror pool enable ${POOL} image
+rbd mirror pool peer add ${POOL} dummy
+
+# Create images and schedule their mirror snapshots
+for ((i = 1; i <= ${NUM_IMAGES}; i++)); do
+    rbd create -s 1G --image-feature exclusive-lock ${POOL}/${IMAGE_PREFIX}$i
+    rbd mirror image enable ${POOL}/${IMAGE_PREFIX}$i snapshot
+    rbd mirror snapshot schedule add -p ${POOL} --image ${IMAGE_PREFIX}$i 1m
+done
+
+# Run fio workloads on images via kclient
+# Test the recovery of the rbd_support module and its scheduler from their
+# librbd client being blocklisted while an exclusive lock gets passed around
+# between their librbd client and a kclient trying to take mirror snapshots
+# and perform I/O on the same image.
+for ((i = 1; i <= ${NUM_IMAGES}; i++)); do
+    DEVS[$i]=$(sudo rbd device map ${POOL}/${IMAGE_PREFIX}$i)
+    fio --name=fiotest --filename=${DEVS[$i]} --rw=randrw --bs=4K --direct=1 \
+        --ioengine=libaio --iodepth=2 --runtime=43200 --time_based \
+        &> /dev/null &
+done
+
+# Repeatedly blocklist rbd_support module's client ~10s after the module
+# recovers from previous blocklisting
+CURRENT_TIME=$(date +%s)
+END_TIME=$((CURRENT_TIME + RUN_TIME))
+PREV_CLIENT_ADDR=""
+CLIENT_ADDR=""
+while ((CURRENT_TIME <= END_TIME)); do
+    if [[ -n "${CLIENT_ADDR}" ]] &&
+           [[ "${CLIENT_ADDR}" != "${PREV_CLIENT_ADDR}" ]]; then
+        ceph osd blocklist add ${CLIENT_ADDR}
+        # Confirm rbd_support module's client is blocklisted
+        ceph osd blocklist ls | grep -q ${CLIENT_ADDR}
+        PREV_CLIENT_ADDR=${CLIENT_ADDR}
+    fi
+    sleep 10
+    CLIENT_ADDR=$(ceph mgr dump |
+        jq .active_clients[] |
+        jq 'select(.name == "rbd_support")' |
+        jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add')
+    CURRENT_TIME=$(date +%s)
+done
+
+# Confirm that rbd_support module recovered from repeated blocklisting
+# Check that you can add a mirror snapshot schedule after a few retries
+for ((i = 1; i <= 24; i++)); do
+    rbd mirror snapshot schedule add -p ${POOL} \
+        --image ${IMAGE_PREFIX}1 2m && break
+    sleep 10
+done
+rbd mirror snapshot schedule ls -p ${POOL} --image ${IMAGE_PREFIX}1 |
+    grep 'every 2m'
+# Verify that the schedule present before client blocklisting is preserved
+rbd mirror snapshot schedule ls -p ${POOL} --image ${IMAGE_PREFIX}1 |
+    grep 'every 1m'
+rbd mirror snapshot schedule rm -p ${POOL} --image ${IMAGE_PREFIX}1 2m
+for ((i = 1; i <= ${NUM_IMAGES}; i++)); do
+    rbd mirror snapshot schedule rm -p ${POOL} --image ${IMAGE_PREFIX}$i 1m
+done
+
+# cleanup
+killall fio || true
+wait
+for ((i = 1; i <= ${NUM_IMAGES}; i++)); do
+    sudo rbd device unmap ${DEVS[$i]}
+done
+
+echo OK

From f328c2d7e75c4ce969d21f7c8b9ae2401baee5e1 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Tue, 10 Oct 2023 12:42:57 -0400
Subject: [PATCH 0138/2492] mgr/cephadm: add unit test for upgrade check with
 --ceph-version

This is actually meant to make sure we don't screw up the image base.
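The failure mode being pinned down, in isolation: composing the upgrade target by appending ':v<version>' to an image base that already carried a tag produced quay.io/ceph/ceph:v18:v18.2.0 instead of quay.io/ceph/ceph:v18.2.0. A sketch of the safe composition (illustrative only, not cephadm's actual code):

    def image_for_version(base: str, version: str) -> str:
        # Strip any existing tag from the final path component, then re-tag.
        head, sep, tail = base.rpartition('/')
        repo = head + sep + tail.split(':', 1)[0]
        return f'{repo}:v{version}'

    assert image_for_version('quay.io/ceph/ceph:v18', '18.2.0') == 'quay.io/ceph/ceph:v18.2.0'
    assert image_for_version('quay.io/ceph/ceph', '18.2.0') == 'quay.io/ceph/ceph:v18.2.0'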
See https://tracker.ceph.com/issues/63150 to see what we're trying to avoid happening again Signed-off-by: Adam King --- src/pybind/mgr/cephadm/tests/test_upgrade.py | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/pybind/mgr/cephadm/tests/test_upgrade.py b/src/pybind/mgr/cephadm/tests/test_upgrade.py index 7aa46f902769..3b5c305b5f0f 100644 --- a/src/pybind/mgr/cephadm/tests/test_upgrade.py +++ b/src/pybind/mgr/cephadm/tests/test_upgrade.py @@ -7,6 +7,7 @@ from cephadm import CephadmOrchestrator from cephadm.upgrade import CephadmUpgrade, UpgradeState from cephadm.ssh import HostConnectionError +from cephadm.utils import ContainerInspectInfo from orchestrator import OrchestratorError, DaemonDescription from .fixtures import _run_cephadm, wait, with_host, with_service, \ receive_agent_metadata, async_side_effect @@ -80,6 +81,30 @@ def test_upgrade_resume_clear_health_warnings(_rm_health_warning, cephadm_module _rm_health_warning.assert_has_calls(calls_list, any_order=True) +@mock.patch('cephadm.upgrade.CephadmUpgrade._get_current_version', lambda _: (17, 2, 6)) +@mock.patch("cephadm.serve.CephadmServe._get_container_image_info") +def test_upgrade_check_with_ceph_version(_get_img_info, cephadm_module: CephadmOrchestrator): + # This test was added to avoid screwing up the image base so that + # when the version was added to it it made an incorrect image + # The issue caused the image to come out as + # quay.io/ceph/ceph:v18:v18.2.0 + # see https://tracker.ceph.com/issues/63150 + _img = '' + + def _fake_get_img_info(img_name): + nonlocal _img + _img = img_name + return ContainerInspectInfo( + 'image_id', + '18.2.0', + 'digest' + ) + + _get_img_info.side_effect = _fake_get_img_info + cephadm_module.upgrade_check('', '18.2.0') + assert _img == 'quay.io/ceph/ceph:v18.2.0' + + @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @pytest.mark.parametrize("use_repo_digest", [ From 4fe08f167c1cf3df6a8d2a02573cfc328f1a9b77 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 3 Oct 2023 01:17:42 +0530 Subject: [PATCH 0139/2492] qa/cephfs: import only sleep() from time Signed-off-by: Rishabh Dave --- qa/tasks/cephfs/test_admin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py index 923871f88e3a..b2e10e4d3cf4 100644 --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@ -1,10 +1,10 @@ import errno import json import logging -import time import uuid from io import StringIO from os.path import join as os_path_join +from time import sleep from teuthology.exceptions import CommandFailedError @@ -802,7 +802,7 @@ def test_fsmap_trim(self): self.fs.set_joinable(b) b = not b - time.sleep(10) # for tick/compaction + sleep(10) # for tick/compaction try: self.fs.status(epoch=epoch) @@ -826,7 +826,7 @@ def test_fsmap_force_trim(self): # force a new fsmap self.fs.set_joinable(False) - time.sleep(10) # for tick/compaction + sleep(10) # for tick/compaction status = self.fs.status() log.debug(f"new epoch is {status['epoch']}") From 56cbf3f0716b556c815487d719abe86021125925 Mon Sep 17 00:00:00 2001 From: shimin Date: Sun, 8 Oct 2023 19:15:09 +0800 Subject: [PATCH 0140/2492] mon: fix mds metadata lost in one case. In most cases, peon's pending_metadata is inconsistent with mon's db. When a peon turns into leader, and at the same time a active mds stops, the new leader may flush wrong mds metadata into db. 
So we meed to update mds metadata from db at every fsmap change. This phenomenon can be reproduce like this: A Cluster with 3 mon and 3 mds (one active, other two standby), 6 osd. step 1. stop two standby mds; step 2. restart all mon; (make pending_medata consistent with db) step 3. start other two mds step 4. stop leader mon step 5. run "ceph mds metadata" command to check mds metadata step 6. stop active mds step 7. run "ceph mds metadata" command to check mds metadata again In step 7, we would find mds metadata lost. Fixes: https://tracker.ceph.com/issues/63166 Signed-off-by: shimin --- src/mon/MDSMonitor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 3042bdca30d8..e24d7388f9cd 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -136,6 +136,7 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap) << ", my e " << get_fsmap().get_epoch() << dendl; ceph_assert(version > get_fsmap().get_epoch()); + load_metadata(pending_metadata); load_health(); // read and decode From a29e6a86733fbf8554089dd486ca6c9cf748886b Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Mon, 25 Sep 2023 17:05:50 +0530 Subject: [PATCH 0141/2492] mgr/dashboard: Show the OSD's Out and Down panels as red whenever an OSD is in Out or Down state in Ceph Cluster grafana dashboard Fixes: https://tracker.ceph.com/issues/62969 Signed-off-by: Aashish Sharma --- .../dashboards_out/ceph-cluster.json | 90 ++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json b/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json index dc9e75382595..240c17677e3d 100644 --- a/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json +++ b/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json @@ -256,7 +256,93 @@ } ], "title": "OSDs", - "type": "stat" + "type": "stat", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "All" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Out" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.1 + }, + { + "value": 10, + "color": "red" + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Down" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.1 + }, + { + "value": 10, + "color": "red" + } + ] + } + } + ] + } + ] + } }, { "clusterName": "", @@ -1276,4 +1362,4 @@ "timezone": "", "title": "Ceph - Cluster", "version": 13 - } +} From fca554cee093771646a0c4a37827d7fe3fe95a4a Mon Sep 17 00:00:00 2001 From: neeraj pratap singh Date: Mon, 25 Sep 2023 16:32:31 +0530 Subject: [PATCH 0142/2492] mgr/volumes: fix `subvolume group rm` error message Currently, if we try to delete subvolumegroup using `fs subvolumegroup rm` when there's one or more subvolume(s) present under that subvolumegroup we see the error something like : `Error ENOTEMPTY: error in rmdir /volumes/group1` which causes confusion. 
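The error surfaces an internal path rather than telling the user that the group still contains subvolume(s) or retained snapshots of deleted subvolume(s).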
Make it more descriptive Fixes: https://tracker.ceph.com/issues/62968 Signed-off-by: Neeraj Pratap Singh --- src/pybind/mgr/volumes/fs/operations/group.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pybind/mgr/volumes/fs/operations/group.py b/src/pybind/mgr/volumes/fs/operations/group.py index 8b40610332dc..efc10e0797aa 100644 --- a/src/pybind/mgr/volumes/fs/operations/group.py +++ b/src/pybind/mgr/volumes/fs/operations/group.py @@ -269,6 +269,9 @@ def remove_group(fs, vol_spec, groupname): except cephfs.Error as e: if e.args[0] == errno.ENOENT: raise VolumeException(-errno.ENOENT, "subvolume group '{0}' does not exist".format(groupname)) + elif e.args[0] == errno.ENOTEMPTY: + raise VolumeException(-errno.ENOTEMPTY, f"subvolume group {groupname} contains subvolume(s) " + "or retained snapshots of deleted subvolume(s)") raise VolumeException(-e.args[0], e.args[1]) From 1245e5c1dfd7bf78f1da534753c8954fa163138e Mon Sep 17 00:00:00 2001 From: neeraj pratap singh Date: Tue, 26 Sep 2023 08:43:43 +0530 Subject: [PATCH 0143/2492] qa: add test_subvolume_group_rm_when_its_not_empty Fixes: https://tracker.ceph.com/issues/62968 Signed-off-by: neeraj pratap singh --- qa/tasks/cephfs/test_volumes.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py index 44e28937bcbd..16ed00998d36 100644 --- a/qa/tasks/cephfs/test_volumes.py +++ b/qa/tasks/cephfs/test_volumes.py @@ -1871,6 +1871,32 @@ def test_subvolume_group_exists_without_subvolumegroup_and_with_subvolume(self): ret = self._fs_cmd("subvolumegroup", "exist", self.volname) self.assertEqual(ret.strip('\n'), "no subvolumegroup exists") + def test_subvolume_group_rm_when_its_not_empty(self): + group = self._generate_random_group_name() + subvolume = self._generate_random_subvolume_name() + + # create subvolumegroup + self._fs_cmd("subvolumegroup", "create", self.volname, group) + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + # try, remove subvolume group + try: + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on deleting " + "subvolumegroup when it is not empty") + else: + self.fail("expected the 'fs subvolumegroup rm' command to fail") + + # delete subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + + # delete subvolumegroup + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + class TestSubvolumes(TestVolumesHelper): """Tests for FS subvolume operations, except snapshot and snapshot clone.""" From 3774d7319baec7a55ea98c54a205b23f52a33a7e Mon Sep 17 00:00:00 2001 From: Teng Jie Date: Wed, 23 Aug 2023 17:01:10 +0800 Subject: [PATCH 0144/2492] cephfs-shell: bump up acceptable xattr value len to 64K Fixes: https://tracker.ceph.com/issues/62545 Signed-off-by: teng jie --- src/tools/cephfs/shell/cephfs-shell | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell index 60ad79c6126c..96a606f24d44 100755 --- a/src/tools/cephfs/shell/cephfs-shell +++ b/src/tools/cephfs/shell/cephfs-shell @@ -1633,7 +1633,7 @@ class CephFSShell(Cmd): """ try: poutput('{}'.format(cephfs.getxattr(args.path, - to_bytes(args.name)).decode('utf-8'))) + to_bytes(args.name), size=65536).decode('utf-8'))) 
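+            # size bounds the xattr value length read back; 64 KiB matches
+            # the kernel's XATTR_SIZE_MAX, the largest value an xattr can hold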
except libcephfs.Error as e: set_exit_code_msg(msg=e) From 15cebf2ad9edc453f17d7d5265e9e10f0d1ff7ba Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Wed, 11 Oct 2023 10:59:33 +0200 Subject: [PATCH 0145/2492] mgr/rook: Addint UT for rook cluster Fixes: https://tracker.ceph.com/issues/63107 Signed-off-by: Redouane Kachach --- src/pybind/mgr/rook/tests/fixtures.py | 11 +++ src/pybind/mgr/rook/tests/test_rook.py | 120 +++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 src/pybind/mgr/rook/tests/fixtures.py create mode 100644 src/pybind/mgr/rook/tests/test_rook.py diff --git a/src/pybind/mgr/rook/tests/fixtures.py b/src/pybind/mgr/rook/tests/fixtures.py new file mode 100644 index 000000000000..65a5197430c4 --- /dev/null +++ b/src/pybind/mgr/rook/tests/fixtures.py @@ -0,0 +1,11 @@ +from rook.module import RookOrchestrator +from orchestrator import raise_if_exception, OrchResult + +try: + from typing import Any +except ImportError: + pass + + +def wait(m: RookOrchestrator, c: OrchResult) -> Any: + return raise_if_exception(c) diff --git a/src/pybind/mgr/rook/tests/test_rook.py b/src/pybind/mgr/rook/tests/test_rook.py new file mode 100644 index 000000000000..08028ba85502 --- /dev/null +++ b/src/pybind/mgr/rook/tests/test_rook.py @@ -0,0 +1,120 @@ +import orchestrator +from .fixtures import wait +import pytest +from unittest.mock import patch, PropertyMock + +from rook.module import RookOrchestrator +from rook.rook_cluster import RookCluster + + +# we use this intermediate class as .rook_cluster property +# is read only in the paretn class RookCluster +class FakeRookCluster(RookCluster): + def __init__(self): + pass + + +class TestRook(object): + + @pytest.mark.parametrize("pods, expected_daemon_types", [ + ( + [ + { + 'name': 'ceph-rook-exporter', + 'hostname': 'host1', + "labels": {'app': 'rook-ceph-exporter', + 'ceph_daemon_id': 'exporter'}, + 'phase': 'Pending', + 'container_image_name': 'quay.io/ceph/ceph:v18', + 'container_image_id': 'docker-pullable://quay.io/ceph/ceph@sha256:f239715e1c7756e32a202a572e2763a4ce15248e09fc6e8990985f8a09ffa784', + 'refreshed': 'pod1_ts', + 'started': 'pod1_ts', + 'created': 'pod1_1ts', + }, + { + 'name': 'rook-ceph-mgr-a-68c7b9b6d8-vjjhl', + 'hostname': 'host1', + "labels": {'app': 'rook-ceph-mgr', + 'ceph_daemon_type': 'mgr', + 'ceph_daemon_id': 'a'}, + 'phase': 'Failed', + 'container_image_name': 'quay.io/ceph/ceph:v18', + 'container_image_id': '', + 'refreshed': 'pod2_ts', + 'started': 'pod2_ts', + 'created': 'pod2_1ts', + }, + { + 'name': 'rook-ceph-mon-a-65fb8694b4-mmtl5', + 'hostname': 'host1', + "labels": {'app': 'rook-ceph-mon', + 'ceph_daemon_type': 'mon', + 'ceph_daemon_id': 'b'}, + 'phase': 'Running', + 'container_image_name': 'quay.io/ceph/ceph:v18', + 'container_image_id': '', + 'refreshed': 'pod3_ts', + 'started': 'pod3_ts', + 'created': 'pod3_1ts', + }, + { + 'name': 'rook-ceph-osd-0-58cbd7b65c-6cjnr', + 'hostname': 'host1', + "labels": {'app': 'rook-ceph-osd', + 'ceph-osd-id': '0', + 'ceph_daemon_type': 'osd', + 'ceph_daemon_id': '0'}, + 'phase': 'Succeeded', + 'container_image_name': 'quay.io/ceph/ceph:v18', + 'container_image_id': '', + 'refreshed': 'pod4_ts', + 'started': 'pod4_ts', + 'created': 'pod4_1ts', + }, + # unknown pod: has no labels are provided, it shouldn't + # be part of the output + { + 'name': 'unknown-pod', + 'hostname': '', + "labels": {'app': 'unkwon'}, + 'phase': 'Pending', + 'container_image_name': 'quay.io/ceph/ceph:v18', + 'container_image_id': '', + 'refreshed': '', + 'started': '', + 
'created': '', + } + ], + ['ceph-exporter', 'mgr', 'mon', 'osd'] + ) + ]) + def test_list_daemons(self, pods, expected_daemon_types): + + status = { + 'Pending': orchestrator.DaemonDescriptionStatus.starting, + 'Running': orchestrator.DaemonDescriptionStatus.running, + 'Succeeded': orchestrator.DaemonDescriptionStatus.stopped, + 'Failed': orchestrator.DaemonDescriptionStatus.error, + 'Unknown': orchestrator.DaemonDescriptionStatus.unknown, + } + + fake_rook_cluster = FakeRookCluster() + ro = RookOrchestrator('rook', None, self) + with patch('rook.RookOrchestrator.rook_cluster', + new_callable=PropertyMock, + return_value=fake_rook_cluster): + with patch.object(fake_rook_cluster, 'describe_pods') as mock_describe_pods: + mock_describe_pods.return_value = pods + dds = wait(ro, ro.list_daemons()) + assert len(dds) == len(expected_daemon_types) + for i in range(0, len(dds)): + assert dds[i].daemon_type == expected_daemon_types[i] + assert dds[i].hostname == pods[i]['hostname'] + assert dds[i].status == status[pods[i]['phase']] + assert dds[i].container_image_name == pods[i]['container_image_name'] + assert dds[i].container_image_id == pods[i]['container_image_id'] + assert dds[i].created == pods[i]['created'] + assert dds[i].last_configured == pods[i]['created'] + assert dds[i].last_deployed == pods[i]['created'] + assert dds[i].started == pods[i]['started'] + assert dds[i].last_refresh == pods[i]['refreshed'] From 8a5677f956d1b18ebae22c27b690b83e82db13cc Mon Sep 17 00:00:00 2001 From: Dhairya Parmar Date: Thu, 5 Oct 2023 17:41:38 +0530 Subject: [PATCH 0146/2492] mds: report clients laggy due laggy OSDs only after checking any OSD is laggy Fixes: https://tracker.ceph.com/issues/63105 Signed-off-by: Dhairya Parmar --- src/mds/Beacon.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 861cfa4378f1..4990f0b249f9 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -500,13 +500,17 @@ void Beacon::notify_health(MDSRank const *mds) // Report laggy client(s) due to laggy OSDs { + bool defer_client_eviction = + g_conf().get_val("defer_client_eviction_on_laggy_osds") + && mds->objecter->with_osdmap([](const OSDMap &map) { + return map.any_osd_laggy(); }); auto&& laggy_clients = mds->server->get_laggy_clients(); - if (!laggy_clients.empty()) { + if (defer_client_eviction && !laggy_clients.empty()) { std::vector laggy_clients_metrics; for (const auto& laggy_client: laggy_clients) { CachedStackStringStream css; *css << "Client " << laggy_client << " is laggy; not evicted" - << " because some OSD(s) is/are laggy"; + << " because some OSD(s) is/are laggy"; MDSHealthMetric m(MDS_HEALTH_CLIENTS_LAGGY, HEALTH_WARN, css->strv()); laggy_clients_metrics.emplace_back(std::move(m)); } From 754b6022fb9fda075d38dcb1d058482f75dcff4d Mon Sep 17 00:00:00 2001 From: Dhairya Parmar Date: Wed, 11 Oct 2023 12:57:04 +0530 Subject: [PATCH 0147/2492] mds: erase clients getting evicted from laggy_clients Fixes: https://tracker.ceph.com/issues/63105 Signed-off-by: Dhairya Parmar --- src/mds/Server.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index f162025ddd1c..5e3110314b53 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1263,6 +1263,8 @@ void Server::find_idle_sessions() kill_session(session, NULL); } } + // clear as there's no use to keep the evicted clients in laggy_clients + clear_laggy_clients(); } void Server::evict_cap_revoke_non_responders() { From 17bd0c6aca24a8d10b1f346462d6df2e29afd80e 
Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Wed, 11 Oct 2023 21:08:19 +0000 Subject: [PATCH 0148/2492] src/mon: Added more loggings for disallowed_leaders Added more loggings regarding the disallowed_leaders set so that it is easier to debug problems like: https://tracker.ceph.com/issues/63183 Fixes: https://tracker.ceph.com/issues/63183 Signed-off-by: Kamoltat --- src/mon/ElectionLogic.cc | 3 ++- src/mon/MonMap.cc | 1 + src/mon/Monitor.cc | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mon/ElectionLogic.cc b/src/mon/ElectionLogic.cc index e22a85bed56f..0c1b30c417c5 100644 --- a/src/mon/ElectionLogic.cc +++ b/src/mon/ElectionLogic.cc @@ -398,7 +398,8 @@ void ElectionLogic::propose_connectivity_handler(int from, epoch_t mepoch, ldout(cct, 10) << "propose from rank=" << from << ",from_score=" << from_score << "; my score=" << my_score << "; currently acked " << leader_acked - << ",leader_score=" << leader_score << dendl; + << ",leader_score=" << leader_score + << ",disallowed_leaders=" << elector->get_disallowed_leaders() << dendl; bool my_win = (my_score >= 0) && // My score is non-zero; I am allowed to lead ((my_rank < from && my_score >= from_score) || // We have same scores and I have lower rank, or diff --git a/src/mon/MonMap.cc b/src/mon/MonMap.cc index 33b9aa8fa288..bb8a4b19455d 100644 --- a/src/mon/MonMap.cc +++ b/src/mon/MonMap.cc @@ -369,6 +369,7 @@ void MonMap::print_summary(ostream& out) const has_printed = true; } out << "}" << " removed_ranks: {" << removed_ranks << "}"; + out << " disallowed_leaders: {" << disallowed_leaders << "}"; } void MonMap::print(ostream& out) const diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 349ca30a8981..6866536d0654 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2001,6 +2001,7 @@ void Monitor::handle_probe_reply(MonOpRequestRef op) dout(10) << " got newer/committed monmap epoch " << newmap->get_epoch() << ", mine was " << monmap->get_epoch() << dendl; int epoch_diff = newmap->get_epoch() - monmap->get_epoch(); + dout(20) << " new monmap is " << *newmap << dendl; delete newmap; monmap->decode(m->monmap_bl); dout(20) << "has_ever_joined: " << has_ever_joined << dendl; From 163f79a810309c92ab7002ad92ea60175d8d836a Mon Sep 17 00:00:00 2001 From: Yuri Weinstein Date: Wed, 11 Oct 2023 15:07:02 -0700 Subject: [PATCH 0149/2492] qa/tests: fixed typo and added `--force-priority` in missing places Signed-off-by: Yuri Weinstein --- qa/crontab/teuthology-cronjobs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/qa/crontab/teuthology-cronjobs b/qa/crontab/teuthology-cronjobs index f169af75ac44..2b7435de608a 100644 --- a/qa/crontab/teuthology-cronjobs +++ b/qa/crontab/teuthology-cronjobs @@ -125,15 +125,15 @@ CEPH_QA_EMAIL="ceph-qa@ceph.io" ## !!!! 
the client suites below MUST use --suite-branch octopus, pacific (see https://tracker.ceph.com/issues/24021) -20 01 * * 4 CEPH_BRANCH=quincy; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-octopus-quincy -k distro -e $CEPH_QA_EMAIL --suite-branch octopus --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 +20 01 * * 4 CEPH_BRANCH=quincy; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-octopus-quincy -k distro -e $CEPH_QA_EMAIL --suite-branch octopus --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 --force-priority -25 01 * * 4 CEPH_BRANCH=quincy; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-pacific-quincy -k distro -e $CEPH_QA_EMAIL --suite-branch pacific --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 +25 01 * * 4 CEPH_BRANCH=quincy; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-pacific-quincy -k distro -e $CEPH_QA_EMAIL --suite-branch pacific --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 --force-priority 22 14 * * 5 CEPH_BRANCH=quincy; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade:octopus-x --subset 111/120000 -e $CEPH_QA_EMAIL -k distro -p 100 --force-priority 23 14 * * 5 CEPH_BRANCH=quincy; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade:pacific-x --subset 111/120000 -e $CEPH_QA_EMAIL -k distro -p 100 --force-priority -35 01 * * 7 CEPH_BRANCH=quincy; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade/quincy-p2p -k distro -e $CEPH_QA_EMAIL -p 75 +35 01 * * 7 CEPH_BRANCH=quincy; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade/quincy-p2p -k distro -e $CEPH_QA_EMAIL -p 75 --force-priority ### upgrade runs for reef release @@ -144,13 +144,14 @@ CEPH_QA_EMAIL="ceph-qa@ceph.io" 23 14 * * 6 CEPH_BRANCH=reef; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade:quincy-x -k distro -e $CEPH_QA_EMAIL -p 100 --force-priority -20 01 * * 4 CEPH_BRANCH=reef; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-octopus-reef -k distro -e $CEPH_QA_EMAIL --suite-branch octopus --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 +20 01 * * 4 CEPH_BRANCH=reef; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-octopus-reef -k distro -e $CEPH_QA_EMAIL --suite-branch octopus --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 --force-priority -25 01 * * 4 
CEPH_BRANCH=reef; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-pacific-reef -k distro -e $CEPH_QA_EMAIL --suite-branch pacific --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50
+25 01 * * 4 CEPH_BRANCH=reef; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-pacific-reef -k distro -e $CEPH_QA_EMAIL --suite-branch pacific --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 --force-priority

-25 01 * * 4 CEPH_BRANCH=reef; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-quincy-reef -k distro -e $CEPH_QA_EMAIL --suite-branch quincy --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50
+25 01 * * 4 CEPH_BRANCH=reef; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade-clients/client-upgrade-quincy-reef -k distro -e $CEPH_QA_EMAIL --suite-branch quincy --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 --force-priority

 ### upgrade runs for squid release
 ###### on smithi

-25 02 * * 4 CEPH_BRANCH=main; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade:reef-x -k distro -e $CEPH_QA_EMAIL --suite-branch quincy --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50
+25 02 * * 4 CEPH_BRANCH=main; MACHINE_NAME=smithi; /home/teuthology/bin/cron_wrapper teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s upgrade:reef-x -k distro -e $CEPH_QA_EMAIL --suite-branch main --suite-repo https://github.com/ceph/ceph.git --ceph-repo https://github.com/ceph/ceph.git -p 50 --force-priority
+

From c992ab86d644410c4a03b48d4a7b372dbc2efd02 Mon Sep 17 00:00:00 2001
From: Yuval Lifshitz
Date: Wed, 20 Sep 2023 09:46:54 +0000
Subject: [PATCH 0150/2492] rgw: adding request context structure

this structure should be created at the frontend and trickle all the
way down to the RADOS layer, holding the dout prefix, optional yield
and trace. in this commit it was only added to the "complete()" sal
interface and to the "write_meta()" rados interface.
in the future, it should be added to more sal interfaces, replacing the
current way where dpp and optional yield are passed as separate
arguments to all functions.
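For orientation: the patch creates src/rgw/rgw_req_context.h, but the new file's hunk is not reproduced in this excerpt. Below is a self-contained sketch consistent with the construction `const req_context rctx{dpp, null_yield, nullptr}` and the `rctx.dpp` / `rctx.y` uses in the diffs that follow; the member order and the exact type of the trace member are assumptions:

    // Stand-in types so the sketch compiles on its own; the tree has real
    // DoutPrefixProvider, optional_yield, and trace span types.
    struct DoutPrefixProvider {};
    struct optional_yield {};
    inline constexpr optional_yield null_yield{};
    struct jspan_stub {};

    // One aggregate built at the frontend and threaded down to the RADOS
    // layer, replacing separate dpp/optional_yield (and later trace) params.
    struct req_context {
        const DoutPrefixProvider* dpp;  // logging prefix for ldpp_dout()
        optional_yield y;               // per-request yield context
        const jspan_stub* trace;        // assumed: request trace handle
    };

    // usage mirroring the diffs below:
    //   const req_context rctx{dpp, null_yield, nullptr};
    //   ldpp_dout(rctx.dpp, 20) << "..." << dendl;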
in addition, if more information would be needed, it should be possible to add that information to the request context struct without changing many function prototypes basic test instructions: https://gist.github.com/yuvalif/1c7f1e80126bed5fa79345efb27fe1b1 Signed-off-by: Yuval Lifshitz --- src/common/tracer.h | 19 ++- src/rgw/driver/d4n/rgw_sal_d4n.cc | 8 +- src/rgw/driver/d4n/rgw_sal_d4n.h | 2 +- src/rgw/driver/daos/rgw_sal_daos.cc | 2 +- src/rgw/driver/daos/rgw_sal_daos.h | 2 +- src/rgw/driver/motr/rgw_sal_motr.cc | 2 +- src/rgw/driver/motr/rgw_sal_motr.h | 2 +- src/rgw/driver/posix/rgw_sal_posix.cc | 18 +-- src/rgw/driver/posix/rgw_sal_posix.h | 4 +- src/rgw/driver/rados/rgw_cr_rados.cc | 5 +- src/rgw/driver/rados/rgw_putobj_processor.cc | 28 ++-- src/rgw/driver/rados/rgw_putobj_processor.h | 6 +- src/rgw/driver/rados/rgw_rados.cc | 160 ++++++++++--------- src/rgw/driver/rados/rgw_rados.h | 13 +- src/rgw/driver/rados/rgw_sal_rados.cc | 27 ++-- src/rgw/driver/rados/rgw_sal_rados.h | 8 +- src/rgw/driver/rados/rgw_tools.cc | 3 +- src/rgw/rgw_file.cc | 3 +- src/rgw/rgw_lc.cc | 4 +- src/rgw/rgw_op.cc | 9 +- src/rgw/rgw_req_context.h | 18 +++ src/rgw/rgw_sal.h | 5 +- src/rgw/rgw_sal_dbstore.cc | 8 +- src/rgw/rgw_sal_dbstore.h | 8 +- src/rgw/rgw_sal_filter.cc | 4 +- src/rgw/rgw_sal_filter.h | 2 +- src/test/rgw/test_d4n_filter.cc | 11 +- 27 files changed, 209 insertions(+), 172 deletions(-) create mode 100644 src/rgw/rgw_req_context.h diff --git a/src/common/tracer.h b/src/common/tracer.h index 8a19db39021a..94efedbed6e1 100644 --- a/src/common/tracer.h +++ b/src/common/tracer.h @@ -67,7 +67,8 @@ struct jspan_context { jspan_context(bool sampled_flag, bool is_remote) {} }; -struct span_stub { +namespace opentelemetry::trace { +struct Span { jspan_context _ctx; template void SetAttribute(std::string_view key, const T& value) const noexcept {} @@ -78,17 +79,21 @@ struct span_stub { void UpdateName(std::string_view) {} bool IsRecording() { return false; } }; +} class jspan { - span_stub span; - public: - span_stub& operator*() { return span; } - const span_stub& operator*() const { return span; } + opentelemetry::trace::Span span; +public: + opentelemetry::trace::Span& operator*() { return span; } + const opentelemetry::trace::Span& operator*() const { return span; } - span_stub* operator->() { return &span; } - const span_stub* operator->() const { return &span; } + opentelemetry::trace::Span* operator->() { return &span; } + const opentelemetry::trace::Span* operator->() const { return &span; } operator bool() const { return false; } + + opentelemetry::trace::Span* get() { return &span; } + const opentelemetry::trace::Span* get() const { return &span; } }; namespace tracing { diff --git a/src/rgw/driver/d4n/rgw_sal_d4n.cc b/src/rgw/driver/d4n/rgw_sal_d4n.cc index ff2ed7d9a204..3195d87eac42 100644 --- a/src/rgw/driver/d4n/rgw_sal_d4n.cc +++ b/src/rgw/driver/d4n/rgw_sal_d4n.cc @@ -445,7 +445,7 @@ int D4NFilterWriter::complete(size_t accounted_size, const std::string& etag, const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { cache_block* temp_cache_block = filter->get_cache_block(); RGWBlockDirectory* temp_block_dir = filter->get_block_dir(); @@ -467,9 +467,9 @@ int D4NFilterWriter::complete(size_t accounted_size, const std::string& etag, RGWObjState* astate; int ret = next->complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, 
zones_trace, - canceled, y); - obj->get_obj_attrs(y, save_dpp, NULL); - obj->get_obj_state(save_dpp, &astate, y); + canceled, rctx); + obj->get_obj_attrs(rctx.y, save_dpp, NULL); + obj->get_obj_state(save_dpp, &astate, rctx.y); /* Append additional metadata to attributes */ rgw::sal::Attrs baseAttrs = obj->get_attrs(); diff --git a/src/rgw/driver/d4n/rgw_sal_d4n.h b/src/rgw/driver/d4n/rgw_sal_d4n.h index 62c13f0abed6..5a2cd88896db 100644 --- a/src/rgw/driver/d4n/rgw_sal_d4n.h +++ b/src/rgw/driver/d4n/rgw_sal_d4n.h @@ -191,7 +191,7 @@ class D4NFilterWriter : public FilterWriter { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; bool is_atomic() { return atomic; }; const DoutPrefixProvider* dpp() { return save_dpp; } }; diff --git a/src/rgw/driver/daos/rgw_sal_daos.cc b/src/rgw/driver/daos/rgw_sal_daos.cc index 46db3dd654ca..73eec5b3e09c 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.cc +++ b/src/rgw/driver/daos/rgw_sal_daos.cc @@ -2047,7 +2047,7 @@ int DaosMultipartWriter::complete( ceph::real_time set_mtime, std::map& attrs, ceph::real_time delete_at, const char* if_match, const char* if_nomatch, const std::string* user_data, rgw_zone_set* zones_trace, bool* canceled, - optional_yield y) { + const req_context& rctx) { ldpp_dout(dpp, 20) << "DaosMultipartWriter::complete(): enter part=" << part_num_str << dendl; diff --git a/src/rgw/driver/daos/rgw_sal_daos.h b/src/rgw/driver/daos/rgw_sal_daos.h index 0eaf495d2e2b..429c6160488d 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.h +++ b/src/rgw/driver/daos/rgw_sal_daos.h @@ -748,7 +748,7 @@ class DaosAtomicWriter : public StoreWriter { ceph::real_time delete_at, const char* if_match, const char* if_nomatch, const std::string* user_data, rgw_zone_set* zones_trace, bool* canceled, - optional_yield y) override; + const req_context& rctx) override; }; class DaosMultipartWriter : public StoreWriter { diff --git a/src/rgw/driver/motr/rgw_sal_motr.cc b/src/rgw/driver/motr/rgw_sal_motr.cc index 06df127594e7..83c6153a92a5 100644 --- a/src/rgw/driver/motr/rgw_sal_motr.cc +++ b/src/rgw/driver/motr/rgw_sal_motr.cc @@ -2355,7 +2355,7 @@ int MotrAtomicWriter::complete(size_t accounted_size, const std::string& etag, const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { int rc = 0; diff --git a/src/rgw/driver/motr/rgw_sal_motr.h b/src/rgw/driver/motr/rgw_sal_motr.h index eee843d7effa..ce5fc2b95b1f 100644 --- a/src/rgw/driver/motr/rgw_sal_motr.h +++ b/src/rgw/driver/motr/rgw_sal_motr.h @@ -814,7 +814,7 @@ class MotrAtomicWriter : public StoreWriter { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; unsigned populate_bvec(unsigned len, bufferlist::iterator &bi); void cleanup(); diff --git a/src/rgw/driver/posix/rgw_sal_posix.cc b/src/rgw/driver/posix/rgw_sal_posix.cc index 052471562e43..40091d06d0f9 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.cc +++ b/src/rgw/driver/posix/rgw_sal_posix.cc @@ -2914,7 +2914,7 @@ int POSIXMultipartWriter::complete(size_t accounted_size, const std::string& eta const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { int ret; 
POSIXUploadPartInfo info; @@ -2945,16 +2945,16 @@ int POSIXMultipartWriter::complete(size_t accounted_size, const std::string& eta attrs[RGW_POSIX_ATTR_MPUPLOAD] = bl; for (auto& attr : attrs) { - ret = obj->write_attr(dpp, y, attr.first, attr.second); + ret = obj->write_attr(rctx.dpp, rctx.y, attr.first, attr.second); if (ret < 0) { - ldpp_dout(dpp, 20) << "ERROR: failed writing attr " << attr.first << dendl; + ldpp_dout(rctx.dpp, 20) << "ERROR: failed writing attr " << attr.first << dendl; return ret; } } ret = obj->close(); if (ret < 0) { - ldpp_dout(dpp, 20) << "ERROR: failed closing file" << dendl; + ldpp_dout(rctx.dpp, 20) << "ERROR: failed closing file" << dendl; return ret; } @@ -2981,7 +2981,7 @@ int POSIXAtomicWriter::complete(size_t accounted_size, const std::string& etag, const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { int ret; @@ -3023,14 +3023,14 @@ int POSIXAtomicWriter::complete(size_t accounted_size, const std::string& etag, attrs[RGW_POSIX_ATTR_OWNER] = bl; for (auto attr : attrs) { - ret = obj.write_attr(dpp, y, attr.first, attr.second); + ret = obj.write_attr(rctx.dpp, rctx.y, attr.first, attr.second); if (ret < 0) { - ldpp_dout(dpp, 20) << "ERROR: POSIXAtomicWriter failed writing attr " << attr.first << dendl; + ldpp_dout(rctx.dpp, 20) << "ERROR: POSIXAtomicWriter failed writing attr " << attr.first << dendl; return ret; } } - ret = obj.link_temp_file(dpp, y); + ret = obj.link_temp_file(rctx.dpp, rctx.y); if (ret < 0) { ldpp_dout(dpp, 20) << "ERROR: POSIXAtomicWriter failed writing temp file" << dendl; return ret; @@ -3038,7 +3038,7 @@ int POSIXAtomicWriter::complete(size_t accounted_size, const std::string& etag, ret = obj.close(); if (ret < 0) { - ldpp_dout(dpp, 20) << "ERROR: POSIXAtomicWriter failed closing file" << dendl; + ldpp_dout(rctx.dpp, 20) << "ERROR: POSIXAtomicWriter failed closing file" << dendl; return ret; } diff --git a/src/rgw/driver/posix/rgw_sal_posix.h b/src/rgw/driver/posix/rgw_sal_posix.h index a2a5e5fdda9d..739e7ef7a610 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.h +++ b/src/rgw/driver/posix/rgw_sal_posix.h @@ -625,7 +625,7 @@ class POSIXAtomicWriter : public StoreWriter { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; class POSIXMultipartWriter : public StoreWriter { @@ -664,7 +664,7 @@ class POSIXMultipartWriter : public StoreWriter { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; diff --git a/src/rgw/driver/rados/rgw_cr_rados.cc b/src/rgw/driver/rados/rgw_cr_rados.cc index 6556d116b8f4..1baff16ff42e 100644 --- a/src/rgw/driver/rados/rgw_cr_rados.cc +++ b/src/rgw/driver/rados/rgw_cr_rados.cc @@ -752,6 +752,7 @@ int RGWAsyncFetchRemoteObj::_send_request(const DoutPrefixProvider *dpp) std::string etag; std::optional bytes_transferred; + const req_context rctx{dpp, null_yield, nullptr}; int r = store->getRados()->fetch_remote_obj(obj_ctx, user_id.value_or(rgw_user()), NULL, /* req_info */ @@ -778,8 +779,8 @@ int RGWAsyncFetchRemoteObj::_send_request(const DoutPrefixProvider *dpp) &etag, /* string *petag, */ NULL, /* void (*progress_cb)(off_t, void *), */ NULL, /* void *progress_data*); */ - dpp, - filter.get(), null_yield, + rctx, + 
filter.get(), stat_follow_olh, stat_dest_obj, source_trace_entry, diff --git a/src/rgw/driver/rados/rgw_putobj_processor.cc b/src/rgw/driver/rados/rgw_putobj_processor.cc index 9eb2ef266683..65fbd5791d5e 100644 --- a/src/rgw/driver/rados/rgw_putobj_processor.cc +++ b/src/rgw/driver/rados/rgw_putobj_processor.cc @@ -341,7 +341,8 @@ int AtomicObjectProcessor::complete(size_t accounted_size, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, - bool *pcanceled, optional_yield y) + bool *pcanceled, + const req_context& rctx) { int r = writer.drain(); if (r < 0) { @@ -378,7 +379,7 @@ int AtomicObjectProcessor::complete(size_t accounted_size, read_cloudtier_info_from_attrs(attrs, obj_op.meta.category, manifest); - r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y); + r = obj_op.write_meta(actual_size, accounted_size, attrs, rctx); if (r < 0) { if (r == -ETIMEDOUT) { // The head object write may eventually succeed, clear the set of objects for deletion. if it @@ -482,7 +483,8 @@ int MultipartObjectProcessor::complete(size_t accounted_size, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, - bool *pcanceled, optional_yield y) + bool *pcanceled, + const req_context& rctx) { int r = writer.drain(); if (r < 0) { @@ -506,7 +508,7 @@ int MultipartObjectProcessor::complete(size_t accounted_size, obj_op.meta.zones_trace = zones_trace; obj_op.meta.modify_tail = true; - r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y); + r = obj_op.write_meta(actual_size, accounted_size, attrs, rctx); if (r < 0) return r; @@ -531,7 +533,7 @@ int MultipartObjectProcessor::complete(size_t accounted_size, bool compressed; r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info); if (r < 0) { - ldpp_dout(dpp, 1) << "cannot get compression info" << dendl; + ldpp_dout(rctx.dpp, 1) << "cannot get compression info" << dendl; return r; } @@ -543,16 +545,16 @@ int MultipartObjectProcessor::complete(size_t accounted_size, store->obj_to_raw(bucket_info.placement_rule, meta_obj, &meta_raw_obj); rgw_rados_ref meta_obj_ref; - r = store->get_raw_obj_ref(dpp, meta_raw_obj, &meta_obj_ref); + r = store->get_raw_obj_ref(rctx.dpp, meta_raw_obj, &meta_obj_ref); if (r < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref of meta obj with ret=" << r << dendl; + ldpp_dout(rctx.dpp, -1) << "ERROR: failed to get obj ref of meta obj with ret=" << r << dendl; return r; } librados::ObjectWriteOperation op; cls_rgw_mp_upload_part_info_update(op, p, info); - r = rgw_rados_operate(dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, y); - ldpp_dout(dpp, 20) << "Update meta: " << meta_obj_ref.obj.oid << " part " << p << " prefix " << info.manifest.get_prefix() << " return " << r << dendl; + r = rgw_rados_operate(rctx.dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, rctx.y); + ldpp_dout(rctx.dpp, 20) << "Update meta: " << meta_obj_ref.obj.oid << " part " << p << " prefix " << info.manifest.get_prefix() << " return " << r << dendl; if (r == -EOPNOTSUPP) { // New CLS call to update part info is not yet supported. Fall back to the old handling. 
@@ -565,7 +567,7 @@ int MultipartObjectProcessor::complete(size_t accounted_size, op = librados::ObjectWriteOperation{}; op.assert_exists(); // detect races with abort op.omap_set(m); - r = rgw_rados_operate(dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, y); + r = rgw_rados_operate(rctx.dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, rctx.y); } if (r < 0) { return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r; @@ -686,7 +688,7 @@ int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, c ceph::real_time set_mtime, rgw::sal::Attrs& attrs, ceph::real_time delete_at, const char *if_match, const char *if_nomatch, const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled, - optional_yield y) + const req_context& rctx) { int r = writer.drain(); if (r < 0) @@ -742,9 +744,9 @@ int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, c etag_bl.append(final_etag_str, strlen(final_etag_str) + 1); attrs[RGW_ATTR_ETAG] = etag_bl; } - r = obj_op.write_meta(dpp, actual_size + cur_size, + r = obj_op.write_meta(actual_size + cur_size, accounted_size + *cur_accounted_size, - attrs, y); + attrs, rctx); if (r < 0) { return r; } diff --git a/src/rgw/driver/rados/rgw_putobj_processor.h b/src/rgw/driver/rados/rgw_putobj_processor.h index fa9200f32dae..9a21c0c793a3 100644 --- a/src/rgw/driver/rados/rgw_putobj_processor.h +++ b/src/rgw/driver/rados/rgw_putobj_processor.h @@ -191,7 +191,7 @@ class AtomicObjectProcessor : public ManifestObjectProcessor { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; @@ -238,7 +238,7 @@ class MultipartObjectProcessor : public ManifestObjectProcessor { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; @@ -274,7 +274,7 @@ class MultipartObjectProcessor : public ManifestObjectProcessor { std::map& attrs, ceph::real_time delete_at, const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; } // namespace putobj diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc index 8814c5a1fbac..5437d12d4b76 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -2665,8 +2665,8 @@ int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWObjState *astate = nullptr; RGWObjManifest* manifest = nullptr; - RGWObjectCtx rctx(this->driver); - r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, &manifest, false, y); + RGWObjectCtx octx(this->driver); + r = get_obj_state(dpp, &octx, bucket_info, obj, &astate, &manifest, false, y); if (r < 0) return r; @@ -3047,11 +3047,10 @@ int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx, handler, y); } -int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, - uint64_t size, uint64_t accounted_size, +int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size, map& attrs, bool assume_noent, bool modify_tail, - void *_index_op, optional_yield y) + void *_index_op, const req_context& rctx) { RGWRados::Bucket::UpdateIndex *index_op = static_cast(_index_op); RGWRados *store = target->get_store(); @@ -3070,19 +3069,19 @@ int RGWRados::Object::Write::_do_write_meta(const 
DoutPrefixProvider *dpp, RGWObjState *state; RGWObjManifest *manifest = nullptr; - int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent); + int r = target->get_state(rctx.dpp, &state, &manifest, false, rctx.y, assume_noent); if (r < 0) return r; rgw_obj& obj = target->get_obj(); if (obj.get_oid().empty()) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl; return -EIO; } rgw_rados_ref ref; - r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref); + r = store->get_obj_head_ref(rctx.dpp, target->get_meta_placement_rule(), obj, &ref); if (r < 0) return r; @@ -3094,7 +3093,7 @@ int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, if (!ptag && !index_op->get_optag()->empty()) { ptag = index_op->get_optag(); } - r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y); + r = target->prepare_atomic_modification(rctx.dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, rctx.y); if (r < 0) return r; @@ -3217,7 +3216,7 @@ int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, if (!index_op->is_prepared()) { tracepoint(rgw_rados, prepare_enter, req_id.c_str()); - r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y); + r = index_op->prepare(rctx.dpp, CLS_RGW_OP_ADD, &state->write_tag, rctx.y); tracepoint(rgw_rados, prepare_exit, req_id.c_str()); if (r < 0) return r; @@ -3226,7 +3225,7 @@ int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, auto& ioctx = ref.pool.ioctx(); tracepoint(rgw_rados, operate_enter, req_id.c_str()); - r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, y); + r = rgw_rados_operate(rctx.dpp, ref.pool.ioctx(), ref.obj.oid, &op, rctx.y); tracepoint(rgw_rados, operate_exit, req_id.c_str()); if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under, or -ENOENT if was removed, or -EEXIST if it did not exist @@ -3241,16 +3240,16 @@ int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, epoch = ioctx.get_last_version(); poolid = ioctx.get_id(); - r = target->complete_atomic_modification(dpp, y); + r = target->complete_atomic_modification(rctx.dpp, rctx.y); if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl; } tracepoint(rgw_rados, complete_enter, req_id.c_str()); - r = index_op->complete(dpp, poolid, epoch, size, accounted_size, + r = index_op->complete(rctx.dpp, poolid, epoch, size, accounted_size, meta.set_mtime, etag, content_type, storage_class, &acl_bl, - meta.category, meta.remove_objs, y, + meta.category, meta.remove_objs, rctx.y, meta.user_data, meta.appendable); tracepoint(rgw_rados, complete_exit, req_id.c_str()); if (r < 0) @@ -3265,7 +3264,7 @@ int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, state = NULL; if (versioned_op && meta.olh_epoch) { - r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace); + r = store->set_olh(rctx.dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, rctx.y, meta.zones_trace); if (r < 0) { return r; } @@ -3275,10 +3274,10 @@ int 
RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, rgw_obj_index_key obj_key; obj.key.get_index_key(&obj_key); - r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name, + r = store->obj_expirer->hint_add(rctx.dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key); if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl; /* ignoring error, nothing we can do at this point */ } } @@ -3296,9 +3295,9 @@ int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, return 0; done_cancel: - int ret = index_op->cancel(dpp, meta.remove_objs, y); + int ret = index_op->cancel(rctx.dpp, meta.remove_objs, rctx.y); if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl; } meta.canceled = true; @@ -3340,8 +3339,8 @@ int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, return r; } -int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size, - map& attrs, optional_yield y) +int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size, + map& attrs, const req_context& rctx) { RGWBucketInfo& bucket_info = target->get_bucket_info(); @@ -3352,13 +3351,13 @@ int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL); int r; if (assume_noent) { - r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y); + r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, rctx); if (r == -EEXIST) { assume_noent = false; } } if (!assume_noent) { - r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y); + r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, rctx); } return r; } @@ -3624,11 +3623,11 @@ static void set_copy_attrs(map& src_attrs, int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y) { - RGWObjectCtx rctx(this->driver); + RGWObjectCtx octx(this->driver); rgw::sal::Attrs attrset; uint64_t obj_size; ceph::real_time mtime; - RGWRados::Object op_target(this, dest_bucket_info, rctx, obj); + RGWRados::Object op_target(this, dest_bucket_info, octx, obj); RGWRados::Object::Read read_op(&op_target); read_op.params.attrs = &attrset; @@ -3643,7 +3642,7 @@ int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, c attrset.erase(RGW_ATTR_TAIL_TAG); attrset.erase(RGW_ATTR_STORAGE_CLASS); - return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule, + return copy_obj_data(octx, dest_bucket_info, dest_bucket_info.placement_rule, read_op, obj_size - 1, obj, NULL, mtime, attrset, 0, real_time(), NULL, dpp, y); } @@ -4138,8 +4137,8 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, string *petag, void (*progress_cb)(off_t, void *), void *progress_data, - const DoutPrefixProvider *dpp, - RGWFetchObjFilter *filter, optional_yield y, + const req_context& rctx, + RGWFetchObjFilter *filter, bool stat_follow_olh, const rgw_obj& stat_dest_obj, const 
rgw_zone_set_entry& source_trace_entry, @@ -4160,7 +4159,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, using namespace rgw::putobj; AtomicObjectProcessor processor(&aio, this, dest_bucket_info, nullptr, user_id, obj_ctx, dest_obj, olh_epoch, - tag, dpp, y); + tag, rctx.dpp, rctx.y); RGWRESTConn *conn; auto& zone_conn_map = svc.zone->get_zone_conn_map(); auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map(); @@ -4171,7 +4170,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, } else { map::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup); if (iter == zonegroup_conn_map.end()) { - ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; + ldpp_dout(rctx.dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; return -ENOENT; } conn = iter->second; @@ -4179,7 +4178,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, } else { auto iter = zone_conn_map.find(source_zone); if (iter == zone_conn_map.end()) { - ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl; + ldpp_dout(rctx.dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl; return -ENOENT; } conn = iter->second; @@ -4195,7 +4194,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, std::optional override_owner; - RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data, + RGWRadosPutObj cb(rctx.dpp, cct, plugin, compressor, &processor, progress_cb, progress_data, [&](map& obj_attrs) { const rgw_placement_rule *ptail_rule; @@ -4207,7 +4206,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, &override_owner, &ptail_rule); if (ret < 0) { - ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl; + ldpp_dout(rctx.dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl; return ret; } @@ -4217,12 +4216,12 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, if (compression_type != "none") { plugin = Compressor::create(cct, compression_type); if (!plugin) { - ldpp_dout(dpp, 1) << "Cannot load plugin for compression type " + ldpp_dout(rctx.dpp, 1) << "Cannot load plugin for compression type " << compression_type << dendl; } } - ret = processor.prepare(y); + ret = processor.prepare(rctx.y); if (ret < 0) { return ret; } @@ -4243,7 +4242,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, if (copy_if_newer) { /* need to get mtime for destination */ - ret = get_obj_state(dpp, &obj_ctx, dest_bucket_info, stat_dest_obj, &dest_state, &manifest, stat_follow_olh, y); + ret = get_obj_state(rctx.dpp, &obj_ctx, dest_bucket_info, stat_dest_obj, &dest_state, &manifest, stat_follow_olh, rctx.y); if (ret < 0) goto set_err_state; @@ -4259,7 +4258,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, static constexpr bool sync_manifest = true; static constexpr bool skip_decrypt = true; static constexpr bool sync_cloudtiered = true; - ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr, + ret = conn->get_obj(rctx.dpp, user_id, info, src_obj, pmod, unmod_ptr, dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver, prepend_meta, get_op, rgwx_stat, sync_manifest, skip_decrypt, &dst_zone_trace, @@ -4270,7 +4269,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, } ret = conn->complete_request(in_stream_req, &etag, &set_mtime, - &expected_size, nullptr, nullptr, y); + &expected_size, nullptr, nullptr, rctx.y); if (ret < 0) { goto set_err_state; } @@ 
-4280,7 +4279,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, } if (cb.get_data_len() != expected_size) { ret = -EIO; - ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected " + ldpp_dout(rctx.dpp, 0) << "ERROR: object truncated during fetching, expected " << expected_size << " bytes but received " << cb.get_data_len() << dendl; goto set_err_state; } @@ -4301,8 +4300,8 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, auto& obj_attrs = cb.get_attrs(); RGWUserInfo owner_info; - if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, y) < 0) { - ldpp_dout(dpp, 10) << "owner info does not exist" << dendl; + if (ctl.user->get_info_by_uid(rctx.dpp, *override_owner, &owner_info, rctx.y) < 0) { + ldpp_dout(rctx.dpp, 10) << "owner info does not exist" << dendl; return -EINVAL; } @@ -4310,14 +4309,14 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, auto aiter = obj_attrs.find(RGW_ATTR_ACL); if (aiter == obj_attrs.end()) { - ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl; + ldpp_dout(rctx.dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl; acl.create_default(owner_info.user_id, owner_info.display_name); } else { auto iter = aiter->second.cbegin(); try { acl.decode(iter); } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl; return -EIO; } } @@ -4341,7 +4340,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, try { decode(delete_at, iter->second); } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl; } } } @@ -4395,7 +4394,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, try { decode(pg_ver, iter); } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl; /* non critical error */ } } @@ -4413,7 +4412,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, if (verifier_etag != trimmed_etag) { ret = -EIO; - ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:" + ldpp_dout(rctx.dpp, 0) << "ERROR: source and destination objects don't match. 
Expected etag:" << trimmed_etag << " Computed etag:" << verifier_etag << dendl; goto set_err_state; } @@ -4424,34 +4423,34 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, bool canceled = false; ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime, attrs, delete_at, nullptr, nullptr, nullptr, - zones_trace, &canceled, y); + zones_trace, &canceled, rctx); if (ret < 0) { goto set_err_state; } if (copy_if_newer && canceled) { - ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl; + ldpp_dout(rctx.dpp, 20) << "raced with another write of obj: " << dest_obj << dendl; obj_ctx.invalidate(dest_obj); /* object was overwritten */ - ret = get_obj_state(dpp, &obj_ctx, dest_bucket_info, stat_dest_obj, &dest_state, &manifest, stat_follow_olh, y); + ret = get_obj_state(rctx.dpp, &obj_ctx, dest_bucket_info, stat_dest_obj, &dest_state, &manifest, stat_follow_olh, rctx.y); if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl; goto set_err_state; } dest_mtime_weight.init(dest_state); dest_mtime_weight.high_precision = high_precision_time; if (!dest_state->exists || dest_mtime_weight < set_mtime_weight) { - ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + ldpp_dout(rctx.dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; continue; } else { - ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + ldpp_dout(rctx.dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; } } break; } if (i == MAX_COMPLETE_RETRY) { - ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl; + ldpp_dout(rctx.dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" 
<< dendl; ret = -EIO; goto set_err_state; } @@ -4466,8 +4465,8 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, // for OP_LINK_OLH to call set_olh() with a real olh_epoch if (olh_epoch && *olh_epoch > 0) { constexpr bool log_data_change = true; - ret = set_olh(dpp, obj_ctx, dest_bucket_info, dest_obj, false, nullptr, - *olh_epoch, real_time(), false, y, zones_trace, log_data_change); + ret = set_olh(rctx.dpp, obj_ctx, dest_bucket_info, dest_obj, false, nullptr, + *olh_epoch, real_time(), false, rctx.y, zones_trace, log_data_change); } else { // we already have the latest copy ret = 0; @@ -4590,13 +4589,14 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, if (remote_src || !source_zone.empty()) { rgw_zone_set_entry source_trace_entry{source_zone.id, std::nullopt}; + const req_context rctx{dpp, y, nullptr}; return fetch_remote_obj(obj_ctx, user_id, info, source_zone, dest_obj, src_obj, dest_bucket_info, &src_bucket_info, dest_placement, src_mtime, mtime, mod_ptr, unmod_ptr, high_precision_time, if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category, - olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp, - nullptr /* filter */, y, stat_follow_olh, stat_dest_obj, source_trace_entry); + olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, rctx, + nullptr /* filter */, stat_follow_olh, stat_dest_obj, source_trace_entry); } map src_attrs; @@ -4756,6 +4756,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, append_rand_alpha(cct, tag, tag, 32); } + const req_context rctx{dpp, y, nullptr}; std::unique_ptr aio; rgw::AioResultList all_results; if (!copy_itself) { @@ -4828,7 +4829,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, write_op.meta.delete_at = delete_at; write_op.meta.modify_tail = !copy_itself; - ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y); + ret = write_op.write_meta(obj_size, astate->accounted_size, attrs, rctx); if (ret < 0) { goto done_ret; } @@ -4951,8 +4952,9 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx, accounted_size = compressed ? 
cs_info.orig_size : ofs; } + const req_context rctx{dpp, y, nullptr}; return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, - nullptr, nullptr, nullptr, nullptr, nullptr, y); + nullptr, nullptr, nullptr, nullptr, nullptr, rctx); } int RGWRados::transition_obj(RGWObjectCtx& obj_ctx, @@ -5503,17 +5505,17 @@ int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketI return r; } -int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y) +int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y) { std::string oid, key; get_obj_bucket_and_oid_loc(obj, oid, key); - if (!rctx) + if (!octx) return 0; RGWObjState *state = NULL; RGWObjManifest *manifest = nullptr; - int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y); + int r = get_obj_state(dpp, octx, bucket_info, obj, &state, &manifest, false, y); if (r < 0) return r; @@ -5895,7 +5897,7 @@ int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& return 0; } -int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, +int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *octx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, bool follow_olh, optional_yield y, bool assume_noent) @@ -5907,16 +5909,16 @@ int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rc bool need_follow_olh = follow_olh && obj.key.instance.empty(); *manifest = nullptr; - RGWObjStateManifest *sm = rctx->get_state(obj); + RGWObjStateManifest *sm = octx->get_state(obj); RGWObjState *s = &(sm->state); - ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl; + ldpp_dout(dpp, 20) << "get_obj_state: octx=" << (void *)octx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl; *state = s; if (sm->manifest) { *manifest = &(*sm->manifest); } if (s->has_attrs) { if (s->is_olh && need_follow_olh) { - return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y); + return get_olh_target_state(dpp, *octx, bucket_info, obj, s, state, manifest, y); } return 0; } @@ -6068,7 +6070,7 @@ int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rc ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl; if (need_follow_olh) { - return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y); + return get_olh_target_state(dpp, *octx, bucket_info, obj, s, state, manifest, y); } else if (obj.key.have_null_instance() && !sm->manifest) { // read null version, and the head object only have olh info s->exists = false; @@ -6079,13 +6081,13 @@ int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rc return 0; } -int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, +int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *octx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, bool follow_olh, optional_yield y, bool assume_noent) { int ret; do { - ret = 
get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent); + ret = get_obj_state_impl(dpp, octx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent); } while (ret == -EAGAIN); return ret; @@ -6195,15 +6197,15 @@ int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp) return 0; } -int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, +int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectOperation& op, RGWObjState **pstate, RGWObjManifest** pmanifest, optional_yield y) { - if (!rctx) + if (!octx) return 0; - int r = get_obj_state(dpp, rctx, bucket_info, obj, pstate, pmanifest, false, y); + int r = get_obj_state(dpp, octx, bucket_info, obj, pstate, pmanifest, false, y); if (r < 0) return r; @@ -6339,14 +6341,14 @@ int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp, * bl: the contents of the attr * Returns: 0 on success, -ERR# otherwise. */ -int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, const char *name, bufferlist& bl, optional_yield y) +int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBucketInfo& bucket_info, const rgw_obj& obj, const char *name, bufferlist& bl, optional_yield y) { map attrs; attrs[name] = bl; - return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, y); + return set_attrs(dpp, octx, bucket_info, obj, attrs, NULL, y); } -int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& src_obj, +int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBucketInfo& bucket_info, const rgw_obj& src_obj, map& attrs, map* rmattrs, optional_yield y, @@ -6367,7 +6369,7 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBu RGWObjState *state = NULL; RGWObjManifest *manifest = nullptr; - r = append_atomic_test(dpp, rctx, bucket_info, obj, op, &state, &manifest, y); + r = append_atomic_test(dpp, octx, bucket_info, obj, op, &state, &manifest, y); if (r < 0) return r; @@ -9885,8 +9887,8 @@ int RGWRados::check_disk_state(const DoutPrefixProvider *dpp, RGWObjState *astate = NULL; RGWObjManifest *manifest = nullptr; - RGWObjectCtx rctx(this->driver); - int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, &manifest, false, y); + RGWObjectCtx octx(this->driver); + int r = get_obj_state(dpp, &octx, bucket_info, obj, &astate, &manifest, false, y); if (r < 0) return r; diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h index 5ca604c971f6..2dca3cfaad4e 100644 --- a/src/rgw/driver/rados/rgw_rados.h +++ b/src/rgw/driver/rados/rgw_rados.h @@ -816,13 +816,12 @@ class RGWRados explicit Write(RGWRados::Object *_target) : target(_target) {} - int _do_write_meta(const DoutPrefixProvider *dpp, - uint64_t size, uint64_t accounted_size, + int _do_write_meta(uint64_t size, uint64_t accounted_size, std::map& attrs, bool modify_tail, bool assume_noent, - void *index_op, optional_yield y); - int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size, - std::map& attrs, optional_yield y); + void *index_op, const req_context& rctx); + int write_meta(uint64_t size, uint64_t accounted_size, + std::map& attrs, const req_context& rctx); int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive); const req_state* get_req_state() { 
return nullptr; /* XXX dang Only used by LTTng, and it handles null anyway */ @@ -1144,8 +1143,8 @@ class RGWRados std::string *petag, void (*progress_cb)(off_t, void *), void *progress_data, - const DoutPrefixProvider *dpp, - RGWFetchObjFilter *filter, optional_yield y, + const req_context& rctx, + RGWFetchObjFilter *filter, bool stat_follow_olh, const rgw_obj& stat_dest_obj, const rgw_zone_set_entry& source_trace_entry, diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc index 0c24a36a0a58..fb684a921e6d 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.cc +++ b/src/rgw/driver/rados/rgw_sal_rados.cc @@ -2091,7 +2091,8 @@ int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp, attrs.erase(RGW_ATTR_ID_TAG); attrs.erase(RGW_ATTR_TAIL_TAG); - return obj_op.write_meta(dpp, 0, 0, attrs, y); + const req_context rctx{dpp, y, nullptr}; + return obj_op.write_meta(0, 0, attrs, rctx); } int RadosObject::get_max_chunk_size(const DoutPrefixProvider* dpp, rgw_placement_rule placement_rule, uint64_t* max_chunk_size, uint64_t* alignment) @@ -2178,12 +2179,12 @@ std::unique_ptr RadosObject::get_read_op() return std::make_unique(this, rados_ctx); } -RadosObject::RadosReadOp::RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx) : +RadosObject::RadosReadOp::RadosReadOp(RadosObject *_source, RGWObjectCtx *_octx) : source(_source), - rctx(_rctx), + octx(_octx), op_target(_source->store->getRados(), _source->get_bucket()->get_info(), - *static_cast(rctx), + *static_cast(octx), _source->get_obj()), parent_op(&op_target) { } @@ -2502,6 +2503,7 @@ int RadosMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, int ret; std::string oid = mp_obj.get_key(); RGWObjectCtx obj_ctx(store); + const req_context rctx{dpp, y, nullptr}; do { char buf[33]; @@ -2537,7 +2539,7 @@ int RadosMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, encode(upload_info, bl); obj_op.meta.data = &bl; - ret = obj_op.write_meta(dpp, bl.length(), 0, attrs, y); + ret = obj_op.write_meta(bl.length(), 0, attrs, rctx); } while (ret == -EEXIST); return ret; @@ -2820,7 +2822,8 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp, obj_op.meta.completeMultipart = true; obj_op.meta.olh_epoch = olh_epoch; - ret = obj_op.write_meta(dpp, ofs, accounted_size, attrs, y); + const req_context rctx{dpp, y, nullptr}; + ret = obj_op.write_meta(ofs, accounted_size, attrs, rctx); if (ret < 0) return ret; @@ -3099,10 +3102,10 @@ int RadosAtomicWriter::complete(size_t accounted_size, const std::string& etag, const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, - if_match, if_nomatch, user_data, zones_trace, canceled, y); + if_match, if_nomatch, user_data, zones_trace, canceled, rctx); } int RadosAppendWriter::prepare(optional_yield y) @@ -3122,10 +3125,10 @@ int RadosAppendWriter::complete(size_t accounted_size, const std::string& etag, const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, - if_match, if_nomatch, user_data, zones_trace, canceled, y); + if_match, if_nomatch, user_data, zones_trace, canceled, rctx); } int RadosMultipartWriter::prepare(optional_yield y) @@ -3145,10 
+3148,10 @@ int RadosMultipartWriter::complete(size_t accounted_size, const std::string& eta const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, - if_match, if_nomatch, user_data, zones_trace, canceled, y); + if_match, if_nomatch, user_data, zones_trace, canceled, rctx); } const std::string& RadosZoneGroup::get_endpoint() const diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h index 228ba532869c..e86a0870782a 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.h +++ b/src/rgw/driver/rados/rgw_sal_rados.h @@ -324,7 +324,7 @@ class RadosObject : public StoreObject { struct RadosReadOp : public ReadOp { private: RadosObject* source; - RGWObjectCtx* rctx; + RGWObjectCtx* octx; RGWRados::Object op_target; RGWRados::Object::Read parent_op; @@ -793,7 +793,7 @@ class RadosAtomicWriter : public StoreWriter { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; class RadosAppendWriter : public StoreWriter { @@ -840,7 +840,7 @@ class RadosAppendWriter : public StoreWriter { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; class RadosMultipartWriter : public StoreWriter { @@ -885,7 +885,7 @@ class RadosMultipartWriter : public StoreWriter { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; class RadosLuaManager : public StoreLuaManager { diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc index 66651da5cc8c..cdc4be83a054 100644 --- a/src/rgw/driver/rados/rgw_tools.cc +++ b/src/rgw/driver/rados/rgw_tools.cc @@ -417,12 +417,13 @@ int RGWDataAccess::Object::put(bufferlist& data, puser_data = &(*user_data); } + const req_context rctx{dpp, y, nullptr}; return processor->complete(obj_size, etag, &mtime, mtime, attrs, delete_at, nullptr, nullptr, puser_data, - nullptr, nullptr, y); + nullptr, nullptr, rctx); } void RGWDataAccess::Object::set_policy(const RGWAccessControlPolicy& policy) diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc index 6a55d3f1d8f5..227c212ac6be 100644 --- a/src/rgw/rgw_file.cc +++ b/src/rgw/rgw_file.cc @@ -1937,6 +1937,7 @@ namespace rgw { char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; req_state* state = get_state(); + const req_context rctx{this, state->yield, nullptr}; size_t osize = rgw_fh->get_size(); struct timespec octime = rgw_fh->get_ctime(); @@ -2018,7 +2019,7 @@ namespace rgw { op_ret = processor->complete(state->obj_size, etag, &mtime, real_time(), attrs, (delete_at ? 
*delete_at : real_time()), if_match, if_nomatch, nullptr, nullptr, nullptr, - state->yield); + rctx); if (op_ret != 0) { /* revert attr updates */ rgw_fh->set_mtime(omtime); diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc index ab2dda48ec7b..4887c9d146a6 100644 --- a/src/rgw/rgw_lc.cc +++ b/src/rgw/rgw_lc.cc @@ -480,7 +480,7 @@ struct lc_op_ctx { LCObjsLister& ol; std::unique_ptr obj; - RGWObjectCtx rctx; + RGWObjectCtx octx; const DoutPrefixProvider *dpp; WorkQ* wq; @@ -493,7 +493,7 @@ struct lc_op_ctx { : cct(env.driver->ctx()), env(env), o(o), next_key_name(next_key_name), effective_mtime(effective_mtime), driver(env.driver), bucket(env.bucket), op(env.op), ol(env.ol), - rctx(env.driver), dpp(dpp), wq(wq) + octx(env.driver), dpp(dpp), wq(wq) { obj = bucket->get_object(o.key); } diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 8c15e5bd2e3f..88c1a5abd1ab 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -4390,10 +4390,11 @@ void RGWPutObj::execute(optional_yield y) } tracepoint(rgw_op, processor_complete_enter, s->req_id.c_str()); + const req_context rctx{this, s->yield, s->trace.get()}; op_ret = processor->complete(s->obj_size, etag, &mtime, real_time(), attrs, (delete_at ? *delete_at : real_time()), if_match, if_nomatch, (user_data.empty() ? nullptr : &user_data), nullptr, nullptr, - s->yield); + rctx); tracepoint(rgw_op, processor_complete_exit, s->req_id.c_str()); // send request to notification manager @@ -4658,10 +4659,11 @@ void RGWPostObj::execute(optional_yield y) emplace_attr(RGW_ATTR_COMPRESSION, std::move(tmp)); } + const req_context rctx{this, s->yield, s->trace.get()}; op_ret = processor->complete(s->obj_size, etag, nullptr, real_time(), attrs, (delete_at ? *delete_at : real_time()), nullptr, nullptr, nullptr, nullptr, nullptr, - s->yield); + rctx); if (op_ret < 0) { return; } @@ -7772,10 +7774,11 @@ int RGWBulkUploadOp::handle_file(const std::string_view path, } /* Complete the transaction. 
*/ + const req_context rctx{this, s->yield, s->trace.get()}; op_ret = processor->complete(size, etag, nullptr, ceph::real_time(), attrs, ceph::real_time() /* delete_at */, nullptr, nullptr, nullptr, nullptr, nullptr, - s->yield); + rctx); if (op_ret < 0) { ldpp_dout(this, 20) << "processor::complete returned op_ret=" << op_ret << dendl; } diff --git a/src/rgw/rgw_req_context.h b/src/rgw/rgw_req_context.h new file mode 100644 index 000000000000..b0030ca1a94b --- /dev/null +++ b/src/rgw/rgw_req_context.h @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/async/yield_context.h" +namespace opentelemetry::trace { + class Span; +} + +class DoutPrefixProvider; + +// this struct holds information which is created at the frontend +// and should trickle down through all function calls to the backend +struct req_context { + const DoutPrefixProvider* dpp{nullptr}; + optional_yield y; + const opentelemetry::trace::Span* span{nullptr}; +}; + diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h index 84731f333d72..7cfd4923761f 100644 --- a/src/rgw/rgw_sal.h +++ b/src/rgw/rgw_sal.h @@ -20,6 +20,7 @@ #include "rgw_user.h" #include "rgw_notify_event_type.h" #include "common/tracer.h" +#include "rgw_req_context.h" #include "rgw_datalog_notify.h" #include "include/random.h" @@ -244,7 +245,7 @@ class ObjectProcessor : public DataProcessor { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) = 0; + const req_context& rctx) = 0; }; /** A list of key-value attributes */ @@ -1387,7 +1388,7 @@ class Writer : public ObjectProcessor { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) = 0; + const req_context& rctx) = 0; }; diff --git a/src/rgw/rgw_sal_dbstore.cc b/src/rgw/rgw_sal_dbstore.cc index aa1243fe5982..36d76cc12e15 100644 --- a/src/rgw/rgw_sal_dbstore.cc +++ b/src/rgw/rgw_sal_dbstore.cc @@ -738,9 +738,9 @@ namespace rgw::sal { return std::make_unique(this, nullptr); } - DBObject::DBReadOp::DBReadOp(DBObject *_source, RGWObjectCtx *_rctx) : + DBObject::DBReadOp::DBReadOp(DBObject *_source, RGWObjectCtx *_octx) : source(_source), - rctx(_rctx), + octx(_octx), op_target(_source->store->getDB(), _source->get_bucket()->get_info(), _source->get_obj()), @@ -1323,7 +1323,7 @@ namespace rgw::sal { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { /* XXX: same as AtomicWriter..consolidate code */ parent_op.meta.mtime = mtime; @@ -1477,7 +1477,7 @@ namespace rgw::sal { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { parent_op.meta.mtime = mtime; parent_op.meta.delete_at = delete_at; diff --git a/src/rgw/rgw_sal_dbstore.h b/src/rgw/rgw_sal_dbstore.h index 65ffd9091093..0c75f4b98cbb 100644 --- a/src/rgw/rgw_sal_dbstore.h +++ b/src/rgw/rgw_sal_dbstore.h @@ -524,12 +524,12 @@ class DBNotification : public StoreNotification { struct DBReadOp : public ReadOp { private: DBObject* source; - RGWObjectCtx* rctx; + RGWObjectCtx* octx; DB::Object op_target; DB::Object::Read parent_op; public: - DBReadOp(DBObject *_source, RGWObjectCtx *_rctx); + DBReadOp(DBObject *_source, RGWObjectCtx *_octx); virtual int prepare(optional_yield y, 
const DoutPrefixProvider* dpp) override; @@ -688,7 +688,7 @@ class DBNotification : public StoreNotification { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; class DBMultipartWriter : public StoreWriter { @@ -736,7 +736,7 @@ class DBNotification : public StoreNotification { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; class DBStore : public StoreDriver { diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc index 13e9155c524b..dbf688a22ab0 100644 --- a/src/rgw/rgw_sal_filter.cc +++ b/src/rgw/rgw_sal_filter.cc @@ -1276,11 +1276,11 @@ int FilterWriter::complete(size_t accounted_size, const std::string& etag, const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) + const req_context& rctx) { return next->complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace, - canceled, y); + canceled, rctx); } int FilterLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h index 6db44a191003..dcc03df95194 100644 --- a/src/rgw/rgw_sal_filter.h +++ b/src/rgw/rgw_sal_filter.h @@ -879,7 +879,7 @@ class FilterWriter : public Writer { const char *if_match, const char *if_nomatch, const std::string *user_data, rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; + const req_context& rctx) override; }; class FilterLuaManager : public LuaManager { diff --git a/src/test/rgw/test_d4n_filter.cc b/src/test/rgw/test_d4n_filter.cc index 30a508cf7097..7ceb7092c64f 100644 --- a/src/test/rgw/test_d4n_filter.cc +++ b/src/test/rgw/test_d4n_filter.cc @@ -25,6 +25,7 @@ string redisHost = ""; vector args; class Environment* env; const DoutPrefixProvider* dpp; +const req_context rctx{dpp, null_yield, nullptr}; class StoreObject : public rgw::sal::StoreObject { friend class D4NFilterFixture; @@ -194,7 +195,7 @@ class D4NFilterFixture : public ::testing::Test { &if_match, &if_nomatch, &user_data, &zones_trace, &canceled, - null_yield); + rctx); return ret; } @@ -454,7 +455,7 @@ TEST_F(D4NFilterFixture, CopyObjectReplace) { &if_match, &if_nomatch, &user_data, &zones_trace, &canceled, - null_yield), 0); + rctx), 0); unique_ptr testObject_copy = testBucket->get_object(rgw_obj_key("test_object_copy")); @@ -579,7 +580,7 @@ TEST_F(D4NFilterFixture, CopyObjectMerge) { &if_match, &if_nomatch, &user_data, &zones_trace, &canceled, - null_yield), 0); + rctx), 0); unique_ptr testObject_copy = testBucket->get_object(rgw_obj_key("test_object_copy")); @@ -1913,7 +1914,7 @@ TEST_F(D4NFilterFixture, DataCheck) { &if_match, &if_nomatch, &user_data, &zones_trace, &canceled, - null_yield), 0); + rctx), 0); client.hget("rgw-object:test_object_DataCheck:cache", "data", [&data](cpp_redis::reply& reply) { if (reply.is_string()) { @@ -1938,7 +1939,7 @@ TEST_F(D4NFilterFixture, DataCheck) { &if_match, &if_nomatch, &user_data, &zones_trace, &canceled, - null_yield), 0); + rctx), 0); client.hget("rgw-object:test_object_DataCheck:cache", "data", [&dataNew](cpp_redis::reply& reply) { if (reply.is_string()) { From 8532f596e67f82016c1247fcd67385cd7890ee02 Mon Sep 17 00:00:00 2001 From: Yuval Lifshitz Date: Tue, 10 Oct 2023 17:30:54 
+0000
Subject: [PATCH 0151/2492] tracing/osd/rgw: using jspan as alias to
 opentelemetry::Span and jspan_ptr as jspan shared pointer

Signed-off-by: Yuval Lifshitz
---
 src/common/tracer.cc      | 10 ++++-----
 src/common/tracer.h       | 43 ++++++++++++++++++---------------------
 src/osd/OpRequest.h       |  2 +-
 src/rgw/rgw_common.h      |  4 ++--
 src/rgw/rgw_op.h          |  8 ++++----
 src/rgw/rgw_req_context.h |  6 ++----
 src/rgw/rgw_sal.h         |  2 +-
 7 files changed, 35 insertions(+), 40 deletions(-)

diff --git a/src/common/tracer.cc b/src/common/tracer.cc
index 1146da319500..e98053735b48 100644
--- a/src/common/tracer.cc
+++ b/src/common/tracer.cc
@@ -17,7 +17,7 @@ namespace tracing {
 const opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> Tracer::noop_tracer = opentelemetry::trace::Provider::GetTracerProvider()->GetTracer("no-op", OPENTELEMETRY_SDK_VERSION);
-const jspan Tracer::noop_span = noop_tracer->StartSpan("noop");
+const jspan_ptr Tracer::noop_span = noop_tracer->StartSpan("noop");
 using bufferlist = ceph::buffer::list;
@@ -38,7 +38,7 @@ void Tracer::init(CephContext* _cct, opentelemetry::nostd::string_view service_n
   }
 }
-jspan Tracer::start_trace(opentelemetry::nostd::string_view trace_name) {
+jspan_ptr Tracer::start_trace(opentelemetry::nostd::string_view trace_name) {
   ceph_assert(cct);
   if (is_enabled()) {
     ceph_assert(tracer);
@@ -48,7 +48,7 @@ jspan Tracer::start_trace(opentelemetry::nostd::string_view trace_name) {
   return noop_span;
 }
-jspan Tracer::start_trace(opentelemetry::nostd::string_view trace_name, bool trace_is_enabled) {
+jspan_ptr Tracer::start_trace(opentelemetry::nostd::string_view trace_name, bool trace_is_enabled) {
   ceph_assert(cct);
   ldout(cct, 20) << "start trace enabled " << trace_is_enabled << " " << dendl;
   if (trace_is_enabled) {
@@ -59,7 +59,7 @@ jspan Tracer::start_trace(opentelemetry::nostd::string_view trace_name, bool tra
   return noop_tracer->StartSpan(trace_name);
 }
-jspan Tracer::add_span(opentelemetry::nostd::string_view span_name, const jspan& parent_span) {
+jspan_ptr Tracer::add_span(opentelemetry::nostd::string_view span_name, const jspan_ptr& parent_span) {
   if (parent_span && parent_span->IsRecording()) {
     ceph_assert(tracer);
     opentelemetry::trace::StartSpanOptions span_opts;
@@ -70,7 +70,7 @@ jspan Tracer::add_span(opentelemetry::nostd::string_view span_name, const jspan&
   return noop_span;
 }
-jspan Tracer::add_span(opentelemetry::nostd::string_view span_name, const jspan_context& parent_ctx) {
+jspan_ptr Tracer::add_span(opentelemetry::nostd::string_view span_name, const jspan_context& parent_ctx) {
   if (parent_ctx.IsValid()) {
     ceph_assert(tracer);
     opentelemetry::trace::StartSpanOptions span_opts;
diff --git a/src/common/tracer.h b/src/common/tracer.h
index 94efedbed6e1..291ff9de25a7 100644
--- a/src/common/tracer.h
+++ b/src/common/tracer.h
@@ -9,7 +9,8 @@
 #ifdef HAVE_JAEGER
 #include "opentelemetry/trace/provider.h"
-using jspan = opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span>;
+using jspan = opentelemetry::trace::Span;
+using jspan_ptr = opentelemetry::nostd::shared_ptr<jspan>;
 using jspan_context = opentelemetry::trace::SpanContext;
 using jspan_attribute = opentelemetry::common::AttributeValue;
@@ -18,7 +19,7 @@ namespace tracing {
 class Tracer {
  private:
   const static opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> noop_tracer;
-  const static jspan noop_span;
+  const static jspan_ptr noop_span;
   CephContext* cct = nullptr;;
   opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> tracer;
@@ -30,18 +31,18 @@ class Tracer {
   bool is_enabled() const;
   // creates and returns a new span with `trace_name`
   // this span represents a trace, since it has no parent.
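   // illustrative usage sketch (example names only, not code from this patch):
   // callers hold the shared pointer and reach the Span interface through it, e.g.
   //   jspan_ptr trace = tracer.start_trace("request");
   //   jspan_ptr span = tracer.add_span("subtask", trace);
   //   span->SetAttribute("step", "write");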
-  jspan start_trace(opentelemetry::nostd::string_view trace_name);
+  jspan_ptr start_trace(opentelemetry::nostd::string_view trace_name);
   // creates and returns a new span with `trace_name`
   // if false is given to `trace_is_enabled` param, noop span will be returned
-  jspan start_trace(opentelemetry::nostd::string_view trace_name, bool trace_is_enabled);
+  jspan_ptr start_trace(opentelemetry::nostd::string_view trace_name, bool trace_is_enabled);
   // creates and returns a new span with `span_name` which parent span is `parent_span'
-  jspan add_span(opentelemetry::nostd::string_view span_name, const jspan& parent_span);
+  jspan_ptr add_span(opentelemetry::nostd::string_view span_name, const jspan_ptr& parent_span);
   // creates and return a new span with `span_name`
   // the span is added to the trace which it's context is `parent_ctx`.
   // parent_ctx contains the required information of the trace.
-  jspan add_span(opentelemetry::nostd::string_view span_name, const jspan_context& parent_ctx);
+  jspan_ptr add_span(opentelemetry::nostd::string_view span_name, const jspan_context& parent_ctx);
 };
@@ -67,9 +68,9 @@ struct jspan_context {
   jspan_context(bool sampled_flag, bool is_remote) {}
 };
-namespace opentelemetry::trace {
-struct Span {
+class jspan {
   jspan_context _ctx;
+public:
   template <typename T> void SetAttribute(std::string_view key, const T& value) const noexcept {}
   void AddEvent(std::string_view) {}
@@ -79,21 +80,17 @@ struct Span {
   void UpdateName(std::string_view) {}
   bool IsRecording() { return false; }
 };
-}
-class jspan {
-  opentelemetry::trace::Span span;
+class jspan_ptr {
+  jspan span;
 public:
-  opentelemetry::trace::Span& operator*() { return span; }
-  const opentelemetry::trace::Span& operator*() const { return span; }
-
-  opentelemetry::trace::Span* operator->() { return &span; }
-  const opentelemetry::trace::Span* operator->() const { return &span; }
-
+  jspan& operator*() { return span; }
+  const jspan& operator*() const { return span; }
+  jspan* operator->() { return &span; }
+  const jspan* operator->() const { return &span; }
   operator bool() const { return false; }
-
-  opentelemetry::trace::Span* get() { return &span; }
-  const opentelemetry::trace::Span* get() const { return &span; }
+  jspan* get() { return &span; }
+  const jspan* get() const { return &span; }
 };
 namespace tracing {
@@ -101,9 +98,9 @@ namespace tracing {
 struct Tracer {
   void init(CephContext* _cct, std::string_view service_name) {}
   bool is_enabled() const { return false; }
-  jspan start_trace(std::string_view, bool enabled = true) { return {}; }
-  jspan add_span(std::string_view, const jspan&) { return {}; }
-  jspan add_span(std::string_view span_name, const jspan_context& parent_ctx) { return {}; }
+  jspan_ptr start_trace(std::string_view, bool enabled = true) { return {}; }
+  jspan_ptr add_span(std::string_view, const jspan_ptr&) { return {}; }
+  jspan_ptr add_span(std::string_view span_name, const jspan_context& parent_ctx) { return {}; }
 };
 inline void encode(const jspan_context& span, bufferlist& bl, uint64_t f=0) {}
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
index 8df4f25d53e5..23f3b1d932e9 100644
--- a/src/osd/OpRequest.h
+++ b/src/osd/OpRequest.h
@@ -94,7 +94,7 @@ struct OpRequest : public TrackedOp {
   epoch_t min_epoch = 0;      ///< min epoch needed to handle this msg
   bool hitset_inserted;
-  jspan osd_parent_span;
+  jspan_ptr osd_parent_span;
   template <typename T> const T* get_req() const { return static_cast<const T*>(request); }
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index a71f1368070e..1094d0cd443e 100644
--- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -25,6 +25,7 @@ #include "common/ceph_crypto.h" #include "common/random_string.h" +#include "common/tracer.h" #include "rgw_acl.h" #include "rgw_bucket_layout.h" #include "rgw_cors.h" @@ -43,7 +44,6 @@ #include "cls/rgw/cls_rgw_types.h" #include "include/rados/librados.hpp" #include "rgw_public_access.h" -#include "common/tracer.h" #include "rgw_sal_fwd.h" namespace ceph { @@ -1232,7 +1232,7 @@ struct req_state : DoutPrefixProvider { std::vector session_policies; - jspan trace; + jspan_ptr trace; bool trace_enabled = false; //Principal tags that come in as part of AssumeRoleWithWebIdentity diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index b5ce737147f1..ffc65836ce62 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -1220,7 +1220,7 @@ class RGWPutObj : public RGWOp { std::string multipart_upload_id; std::string multipart_part_str; int multipart_part_num = 0; - jspan multipart_trace; + jspan_ptr multipart_trace; boost::optional delete_at; //append obj @@ -1847,7 +1847,7 @@ class RGWInitMultipart : public RGWOp { std::string upload_id; RGWAccessControlPolicy policy; ceph::real_time mtime; - jspan multipart_trace; + jspan_ptr multipart_trace; public: RGWInitMultipart() {} @@ -1875,7 +1875,7 @@ class RGWCompleteMultipart : public RGWOp { std::string version_id; bufferlist data; std::unique_ptr serializer; - jspan multipart_trace; + jspan_ptr multipart_trace; public: RGWCompleteMultipart() {} @@ -1896,7 +1896,7 @@ class RGWCompleteMultipart : public RGWOp { class RGWAbortMultipart : public RGWOp { protected: - jspan multipart_trace; + jspan_ptr multipart_trace; public: RGWAbortMultipart() {} diff --git a/src/rgw/rgw_req_context.h b/src/rgw/rgw_req_context.h index b0030ca1a94b..b8c284187c87 100644 --- a/src/rgw/rgw_req_context.h +++ b/src/rgw/rgw_req_context.h @@ -1,10 +1,8 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp +#pragma once #include "common/async/yield_context.h" -namespace opentelemetry::trace { - class Span; -} class DoutPrefixProvider; @@ -13,6 +11,6 @@ class DoutPrefixProvider; struct req_context { const DoutPrefixProvider* dpp{nullptr}; optional_yield y; - const opentelemetry::trace::Span* span{nullptr}; + const jspan* span; }; diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h index 7cfd4923761f..e7a3dfe0aa52 100644 --- a/src/rgw/rgw_sal.h +++ b/src/rgw/rgw_sal.h @@ -15,11 +15,11 @@ #pragma once +#include "common/tracer.h" #include "rgw_sal_fwd.h" #include "rgw_lua.h" #include "rgw_user.h" #include "rgw_notify_event_type.h" -#include "common/tracer.h" #include "rgw_req_context.h" #include "rgw_datalog_notify.h" #include "include/random.h" From edec8f35abc2d7bb6aef68ea9b841e33e12012dd Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 10 Oct 2023 16:09:20 +0530 Subject: [PATCH 0152/2492] qa: minor improvement in ceph_test_case.py When two values (say x and y) are being printed because assert for equality of both failed (assert x == y), print both the values on a new line. 
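For illustration, with hypothetical values (a standalone sketch, not
code from the change itself), the reworked message renders the expected
and received strings on their own lines:

    exp_errmsgs = ['permission denied']
    proc_stderr = 'error: operation not permitted'
    print('didn\'t find any of the expected string in stderr.\n'
          f'expected string -\n{exp_errmsgs}\n'
          f'received error message -\n{proc_stderr}\n'
          'note: received error message is converted to lowercase')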
Signed-off-by: Rishabh Dave --- qa/tasks/ceph_test_case.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py index 649c0e53cf96..8926e5c7e1a5 100644 --- a/qa/tasks/ceph_test_case.py +++ b/qa/tasks/ceph_test_case.py @@ -76,8 +76,8 @@ def _verify(self, proc, exp_retval=None, exp_errmsgs=None): proc_stderr = proc.stderr.getvalue().lower() msg = ('didn\'t find any of the expected string in stderr.\n' - f'expected string: {exp_errmsgs}\n' - f'received error message: {proc_stderr}\n' + f'expected string -\n{exp_errmsgs}\n' + f'received error message -\n{proc_stderr}\n' 'note: received error message is converted to lowercase') for e in exp_errmsgs: if e in proc_stderr: From bd4cb58f7abca4fa20ec283f675f31d1ede14752 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 10 Oct 2023 16:12:13 +0530 Subject: [PATCH 0153/2492] qa: log stdout for commands being run through negtest_ceph_cmd Signed-off-by: Rishabh Dave --- qa/tasks/ceph_test_case.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py index 8926e5c7e1a5..877f79d06a3b 100644 --- a/qa/tasks/ceph_test_case.py +++ b/qa/tasks/ceph_test_case.py @@ -105,6 +105,8 @@ def negtest_ceph_cmd(self, args, retval=None, errmsgs=None, **kwargs): # execution is needed to not halt on command failure because we are # conducting negative testing kwargs['check_status'] = False + # log stdout since it may contain something useful when command fails + kwargs['stdout'] = StringIO() # stderr is needed to check for expected error messages. kwargs['stderr'] = StringIO() From 03df86b7c54325fd577bacec6a1f4c6441117e72 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 10 Oct 2023 16:30:00 +0530 Subject: [PATCH 0154/2492] qa/cephfs: log commands on INFO logging level Commands issued by negtest_ceph_cmd() aren't printed because log level (due to code for teuthology) changes from DEBUG to INFO in case of some files. This patch ensures that users can see commands being executed regardless of whether log level is changed or not. Signed-off-by: Rishabh Dave --- qa/tasks/vstart_runner.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py index caf7a7fe3f79..96dc9fffba2f 100644 --- a/qa/tasks/vstart_runner.py +++ b/qa/tasks/vstart_runner.py @@ -436,7 +436,13 @@ def _perform_checks_and_adjustments(self, args, omit_sudo): usr_args, args = self._omit_cmd_args(args, omit_sudo) - log.debug('> ' + usr_args) + # Let's print all commands on INFO log level since some logging level + # might be changed to INFO from DEBUG during a vstart_runner.py's + # execution due to code added for teuthology. This happened for + # ceph_test_case.RunCephCmd.negtest_ceph_cmd(). Commands it executes + # weren't printed in output because logging level for + # ceph_test_case.py is set to INFO by default. 
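A minimal standalone sketch of the underlying logging behaviour (the
logger name and command here are placeholders):

    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger('vstart_runner')

    log.debug('> ceph -s')  # suppressed: DEBUG is below the effective INFO level
    log.info('> ceph -s')   # still emitted after the DEBUG-to-INFO switch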
+ log.info('> ' + usr_args) return args, usr_args From afd899d29fbd86caf915db8c59be1fb77f9ebb2d Mon Sep 17 00:00:00 2001 From: Leonid Usov Date: Mon, 9 Oct 2023 22:13:59 +0300 Subject: [PATCH 0155/2492] doc/operations/monitoring: mention the new asok command 'raise' Signed-off-by: Leonid Usov --- doc/rados/operations/monitoring.rst | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/doc/rados/operations/monitoring.rst b/doc/rados/operations/monitoring.rst index a9171f2d8419..2343e7e1f59c 100644 --- a/doc/rados/operations/monitoring.rst +++ b/doc/rados/operations/monitoring.rst @@ -626,6 +626,21 @@ For example, the following commands are equivalent to each other: ceph daemon osd.0 foo ceph daemon /var/run/ceph/ceph-osd.0.asok foo +There are two methods of running admin socket commands: (1) +using ``ceph daemon`` as described above, which bypasses +the monitor and assumes a direct login to the daemon's host, +and (2) using the ``ceph tell {daemon-type}.{id}`` command, +which is relayed by monitors and does not require access +to the daemon's host. + +Use the ``raise`` command to send a signal to a daemon, as if by running ``kill -X {daemon.pid}``. +When run via ``ceph tell`` it allows signalling a daemon without access to its host: + +.. prompt:: bash $ + + ceph daemon {daemon-name} raise HUP + ceph tell {daemon-type}.{id} raise -9 + To view the available admin-socket commands, run the following command: .. prompt:: bash $ @@ -634,11 +649,7 @@ To view the available admin-socket commands, run the following command: Admin-socket commands enable you to view and set your configuration at runtime. For more on viewing your configuration, see `Viewing a Configuration at -Runtime`_. There are two methods of setting configuration value at runtime: (1) -using the admin socket, which bypasses the monitor and requires a direct login -to the host in question, and (2) using the ``ceph tell {daemon-type}.{id} -config set`` command, which relies on the monitor and does not require a direct -login. +Runtime`_. .. _Viewing a Configuration at Runtime: ../../configuration/ceph-conf#viewing-a-configuration-at-runtime .. _Storage Capacity: ../../configuration/mon-config-ref#storage-capacity From 7cc54b2b7323096bf7394d8502ff232a78e0c355 Mon Sep 17 00:00:00 2001 From: Leonid Usov Date: Wed, 27 Sep 2023 00:54:26 +0300 Subject: [PATCH 0156/2492] common/admin_socket: add a command to raise a signal The new command "raise [--after X]" accepts signals in the forms: '9', '-9', 'kill', '-KILL' When --after is specified, the program will fork to wait for the timeout The forked instance will bail out if it detects that the parent PID has changed which would indicate that the original parent has terminated. 
Forking an instance makes it possible to schedule delivery of signals
even if the original process is suspended, e.g.:

    ceph tell mds.a raise CONT --after 10
    ceph tell mds.a raise STOP

Signed-off-by: Leonid Usov
Fixes: https://tracker.ceph.com/issues/62882
---
 src/common/admin_socket.cc | 302 +++++++++++++++++++++++++++++++++++++
 src/common/admin_socket.h  |   1 +
 src/test/admin_socket.cc   | 214 ++++++++++++++++++++++++++
 3 files changed, 517 insertions(+)

diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc
index 8a7e0c721971..2ed3179e8070 100644
--- a/src/common/admin_socket.cc
+++ b/src/common/admin_socket.cc
@@ -13,6 +13,7 @@
  */
 #include
 #include
+#include
 #include "common/admin_socket.h"
 #include "common/admin_socket_client.h"
@@ -36,6 +37,7 @@
 #include "include/ceph_assert.h"
 #include "include/compat.h"
 #include "include/sock_compat.h"
+#include "fmt/format.h"
 #define dout_subsys ceph_subsys_asok
 #undef dout_prefix
@@ -693,6 +695,297 @@ class GetdescsHook : public AdminSocketHook {
   }
 };
+// Define a macro to simplify adding signals to the map
+#define ADD_SIGNAL(signalName) \
+  { \
+    ((const char*)#signalName) + 3, signalName \
+  }
+
+static const std::map<std::string, int> known_signals = {
+  // the following 6 signals are recognized in windows according to
+  // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/raise?view=msvc-170
+  ADD_SIGNAL(SIGABRT),
+  ADD_SIGNAL(SIGFPE),
+  ADD_SIGNAL(SIGILL),
+  ADD_SIGNAL(SIGINT),
+  ADD_SIGNAL(SIGSEGV),
+  ADD_SIGNAL(SIGTERM),
+#ifndef WIN32
+  ADD_SIGNAL(SIGTRAP),
+  ADD_SIGNAL(SIGHUP),
+  ADD_SIGNAL(SIGBUS),
+  ADD_SIGNAL(SIGQUIT),
+  ADD_SIGNAL(SIGKILL),
+  ADD_SIGNAL(SIGUSR1),
+  ADD_SIGNAL(SIGUSR2),
+  ADD_SIGNAL(SIGPIPE),
+  ADD_SIGNAL(SIGALRM),
+  ADD_SIGNAL(SIGCHLD),
+  ADD_SIGNAL(SIGCONT),
+  ADD_SIGNAL(SIGSTOP),
+  ADD_SIGNAL(SIGTSTP),
+  ADD_SIGNAL(SIGTTIN),
+  ADD_SIGNAL(SIGTTOU),
+#endif
+  // Add more signals as needed...
+};
+
+#undef ADD_SIGNAL
+
+static std::string strsignal_compat(int signal) {
+#ifndef WIN32
+  return strsignal(signal);
+#else
+  switch (signal) {
+    case SIGABRT: return "SIGABRT";
+    case SIGFPE: return "SIGFPE";
+    case SIGILL: return "SIGILL";
+    case SIGINT: return "SIGINT";
+    case SIGSEGV: return "SIGSEGV";
+    case SIGTERM: return "SIGTERM";
+    default: return fmt::format("Signal #{}", signal);
+  }
+#endif
+}
+
+class RaiseHook: public AdminSocketHook {
+  struct Killer {
+    CephContext* m_cct;
+    pid_t pid;
+    int signal;
+    ceph::coarse_mono_clock::time_point due;
+
+    std::string describe()
+    {
+      using std::chrono::duration_cast;
+      using std::chrono::seconds;
+      auto remaining = (due - coarse_mono_clock::now());
+      return fmt::format(
+          "pending signal ({}) due in {}",
+          strsignal_compat(signal),
+          duration_cast<seconds>(remaining));
+    }
+
+    bool cancel()
+    {
+# ifndef WIN32
+      int wstatus;
+      int status;
+      if (0 == (status = waitpid(pid, &wstatus, WNOHANG))) {
+        status = kill(pid, SIGKILL);
+        if (status) {
+          ldout(m_cct, 5) << __func__ << "couldn't kill the killer. Error: " << strerror(errno) << dendl;
+          return false;
+        }
+        while (pid == waitpid(pid, &wstatus, 0)) {
+          if (WIFEXITED(wstatus)) {
+            return false;
+          }
+          if (WIFSIGNALED(wstatus)) {
+            return true;
+          }
+        }
+      }
+      if (status < 0) {
+        ldout(m_cct, 5) << __func__ << "waitpid(killer, NOHANG) returned " << status << "; " << strerror(errno) << dendl;
+      } else {
+        ldout(m_cct, 20) << __func__ << "killer process " << pid << "\"" << describe() << "\" reaped. "
" + << "WIFEXITED: " << WIFEXITED(wstatus) + << "WIFSIGNALED: " << WIFSIGNALED(wstatus) + << dendl; + } +# endif + return false; + } + + static std::optional fork(CephContext *m_cct, int signal_to_send, double delay) { +# ifndef WIN32 + pid_t victim = getpid(); + auto until = ceph::coarse_mono_clock::now() + ceph::make_timespan(delay); + + int fresult = ::fork(); + if (fresult < 0) { + ldout(m_cct, 5) << __func__ << "couldn't fork the killer. Error: " << strerror(errno) << dendl; + return std::nullopt; + } + + if (fresult) { + // this is parent + return {{m_cct, fresult, signal_to_send, until}}; + } + + const auto poll_interval = ceph::make_timespan(0.1); + auto remaining = (until - ceph::coarse_mono_clock::now()); + do { + using std::chrono::duration_cast; + using std::chrono::nanoseconds; + std::this_thread::sleep_for(duration_cast(std::min(remaining, poll_interval))); + if (getppid() != victim) { + // suicide if my parent has changed + // this means that the original parent process has terminated + _exit(1); + } + remaining = (until - ceph::coarse_mono_clock::now()); + } while (remaining > ceph::signedspan::zero()); + + int status = kill(victim, signal_to_send); + if (0 != status) { + ldout(m_cct, 5) << __func__ << "couldn't kill the victim: " << strerror(errno) << dendl; + } + _exit(status); +# endif + return std::nullopt; + } + }; + + CephContext* m_cct; + std::optional killer; + + int parse_signal(std::string&& sigdesc, Formatter* f, std::ostream& errss) + { + int result = 0; + std::transform(sigdesc.begin(), sigdesc.end(), sigdesc.begin(), + [](unsigned char c) { return std::toupper(c); }); + if (sigdesc.starts_with("-")) { + sigdesc.erase(0, 1); + } + if (sigdesc.starts_with("SIG")) { + sigdesc.erase(0, 3); + } + + if (sigdesc == "L") { + f->open_object_section("known_signals"); + for (auto& [name, num] : known_signals) { + f->dump_int(name, num); + } + f->close_section(); + } else { + try { + result = std::stoi(sigdesc); + if (result < 1 || result > 64) { + errss << "signal number should be an integer in the range [1..64]" << std::endl; + return -EINVAL; + } + } catch (std::invalid_argument) { + auto sig_it = known_signals.find(sigdesc); + if (sig_it == known_signals.end()) { + errss << "unknown signal name; use -l to see recognized names" << std::endl; + return -EINVAL; + } + result = sig_it->second; + } + } + return result; + } + +public: + RaiseHook(CephContext* cct) : m_cct(cct) { } + static const char* get_cmddesc() + { + return "raise " + "name=signal,type=CephString,req=false " + "name=cancel,type=CephBool,req=false " + "name=after,type=CephFloat,range=0.0,req=false "; + } + + static const char* get_help() + { + return "deliver the to the daemon process, optionally delaying seconds; " + "when --after is used, the program will fork before sleeping, which allows to " + "schedule signal delivery to a stopped daemon; it's possible to --cancel a pending signal delivery. " + " can be in the forms '9', '-9', 'kill', '-KILL'. 
Use `raise -l` to list known signal names."; + } + + int call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter* f, + std::ostream& errss, + bufferlist& out) override + { + using std::endl; + string sigdesc; + bool cancel = cmd_getval_or(cmdmap, "cancel", false); + int signal_to_send = 0; + + if (cmd_getval(cmdmap, "signal", sigdesc)) { + signal_to_send = parse_signal(std::move(sigdesc), f, errss); + if (signal_to_send < 0) { + return signal_to_send; + } + } else if (!cancel) { + errss << "signal name or number is required" << endl; + return -EINVAL; + } + + if (cancel) { + if (killer) { + if (signal_to_send == 0 || signal_to_send == killer->signal) { + if (killer->cancel()) { + errss << "cancelled " << killer->describe() << endl; + return 0; + } + killer = std::nullopt; + } + if (signal_to_send) { + errss << "signal " << signal_to_send << " is not pending" << endl; + } + } else { + errss << "no pending signal" << endl; + } + return 1; + } + + if (!signal_to_send) { + return 0; + } + + double delay = 0; + if (cmd_getval(cmdmap, "after", delay)) { + #ifdef WIN32 + errss << "'--after' functionality is unsupported on Windows" << endl; + return -ENOTSUP; + #endif + if (killer) { + if (killer->cancel()) { + errss << "cancelled " << killer->describe() << endl; + } + } + + killer = Killer::fork(m_cct, signal_to_send, delay); + + if (killer) { + errss << "scheduled " << killer->describe() << endl; + ldout(m_cct, 20) << __func__ << "scheduled " << killer->describe() << dendl; + } else { + errss << "couldn't fork the killer" << std::endl; + return -EAGAIN; + } + } else { + ldout(m_cct, 20) << __func__ << "raising " + << " (" << strsignal_compat(signal_to_send) << ")" << dendl; + // raise the signal immediately + int status = raise(signal_to_send); + + if (0 == status) { + errss << "raised signal " + << " (" << strsignal_compat(signal_to_send) << ")" << endl; + } else { + errss << "couldn't raise signal " + << " (" << strsignal_compat(signal_to_send) << ")." + << " Error: " << strerror(errno) << endl; + + ldout(m_cct, 5) << __func__ << "couldn't raise signal " + << " (" << strsignal_compat(signal_to_send) << ")." 
+ << " Error: " << strerror(errno) << dendl; + + return 1; + } + } + + return 0; + } +}; + bool AdminSocket::init(const std::string& path) { ldout(m_cct, 5) << "init " << path << dendl; @@ -745,6 +1038,12 @@ bool AdminSocket::init(const std::string& path) register_command("get_command_descriptions", getdescs_hook.get(), "list available commands"); + raise_hook = std::make_unique(m_cct); + register_command( + RaiseHook::get_cmddesc(), + raise_hook.get(), + RaiseHook::get_help()); + th = make_named_thread("admin_socket", &AdminSocket::entry, this); add_cleanup_file(m_path.c_str()); return true; @@ -777,6 +1076,9 @@ void AdminSocket::shutdown() unregister_commands(getdescs_hook.get()); getdescs_hook.reset(); + unregister_commands(raise_hook.get()); + raise_hook.reset(); + remove_cleanup_file(m_path); m_path.clear(); } diff --git a/src/common/admin_socket.h b/src/common/admin_socket.h index 3f364a5b711c..b95a52af7beb 100644 --- a/src/common/admin_socket.h +++ b/src/common/admin_socket.h @@ -190,6 +190,7 @@ class AdminSocket std::unique_ptr version_hook; std::unique_ptr help_hook; std::unique_ptr getdescs_hook; + std::unique_ptr raise_hook; std::mutex tell_lock; std::list> tell_queue; diff --git a/src/test/admin_socket.cc b/src/test/admin_socket.cc index 369e7abbf9b6..a8236271652c 100644 --- a/src/test/admin_socket.cc +++ b/src/test/admin_socket.cc @@ -17,12 +17,14 @@ #include "common/admin_socket.h" #include "common/admin_socket_client.h" #include "common/ceph_argparse.h" +#include "json_spirit/json_spirit.h" #include "gtest/gtest.h" #include #include #include #include +#include using namespace std; @@ -328,6 +330,218 @@ TEST(AdminSocket, bind_and_listen) { } } +class AdminSocketRaise: public ::testing::Test +{ +public: + struct TestSignal { + int sig; + const char * name; + std::atomic count; + }; + + static void SetUpTestSuite() { + signal(sig1.sig, sighandler); + signal(sig2.sig, sighandler); + } + static void TearDownTestSuite() + { + signal(sig1.sig, SIG_DFL); + signal(sig2.sig, SIG_DFL); + } + void SetUp() override + { + std::string path = get_rand_socket_path(); + asock = std::make_unique(g_ceph_context); + asock_client = std::make_unique(path); + ASSERT_TRUE(asock->init(path)); + sig1.count = 0; + sig2.count = 0; + } + void TearDown() override + { + AdminSocketTest(asock.get()).shutdown(); + } +protected: + static TestSignal sig1; + static TestSignal sig2; + + std::unique_ptr asock; + std::unique_ptr asock_client; + + static void sighandler(int signal) + { + if (signal == sig1.sig) { + sig1.count++; + } else if (signal == sig2.sig) { + sig2.count++; + } + + // Windows resets the handler upon signal delivery + // as apparently some linuxes do as well. + // The below shouldn't hurt in any case. 
+ ::signal(signal, sighandler); + } + std::string send_raise(std::optional arg, std::optional after, bool cancel) + { + JSONFormatter f; + f.open_object_section(""); + f.dump_string("prefix", "raise"); + if (arg) { + f.dump_string("signal", *arg); + } + if (after) { + f.dump_float("after", *after); + } + if (cancel) { + f.dump_bool("cancel", true); + } + f.close_section(); + + bufferlist command; + f.flush(command); + + std::string response; + + asock_client->do_request(command.to_str(), &response); + return response; + } + + std::string send_raise_cancel(std::optional arg = std::nullopt) { + return send_raise(arg, std::nullopt, true); + } + + std::string send_raise(std::string arg, std::optional after = std::nullopt) { + return send_raise(arg, after, false); + } +}; + +AdminSocketRaise::TestSignal AdminSocketRaise::sig1 = { SIGINT, "INT", 0 }; +AdminSocketRaise::TestSignal AdminSocketRaise::sig2 = { SIGTERM, "TERM", 0 }; + +TEST_F(AdminSocketRaise, List) { + auto r = send_raise("-l"); + json_spirit::mValue v; + ASSERT_TRUE(json_spirit::read(r, v)); + ASSERT_EQ(json_spirit::Value_type::obj_type, v.type()); + EXPECT_EQ(sig1.sig, v.get_obj()[sig1.name].get_int()); + EXPECT_EQ(sig2.sig, v.get_obj()[sig2.name].get_int()); +} + +TEST_F(AdminSocketRaise, ImmediateFormats) { + std::string name1, name2; + + name1 = sig1.name; + std::transform(name1.begin(), name1.end(), name1.begin(), [](int c) { return std::tolower(c); }); + name2 = fmt::format("-{}", sig2.name); + std::transform(name2.begin(), name2.end(), name2.begin(), [](int c) { return std::tolower(c); }); + + send_raise(fmt::format("-{}", sig1.sig)); + send_raise(name1); + send_raise(name2); + send_raise(fmt::format("{}", sig2.sig)); + EXPECT_EQ(2, sig1.count.load()); + EXPECT_EQ(2, sig2.count.load()); +} + +TEST_F(AdminSocketRaise, Async) +{ + using std::chrono::milliseconds; + +#ifdef WIN32 + GTEST_SKIP() << "Windows doesn't support --after behavior"; +#endif + + ASSERT_EQ("", send_raise(fmt::format("{}", sig1.sig))); + ASSERT_EQ("", send_raise(sig2.name, 0.1)); + + EXPECT_EQ(1, sig1.count.load()); + EXPECT_EQ(0, sig2.count.load()); + + this_thread::sleep_for(milliseconds(150)); + + EXPECT_EQ(1, sig1.count.load()); + EXPECT_EQ(1, sig2.count.load()); +} + +TEST_F(AdminSocketRaise, AsyncReschedule) +{ + using std::chrono::milliseconds; + +#ifdef WIN32 + GTEST_SKIP() << "Windows doesn't support --after behavior"; +#endif + + ASSERT_EQ("", send_raise(sig1.name, 0.1)); + ASSERT_EQ("", send_raise(sig2.name, 0.2)); + + EXPECT_EQ(0, sig1.count.load()); + EXPECT_EQ(0, sig2.count.load()); + + this_thread::sleep_for(milliseconds(150)); + + // USR1 got overridden by the second async schedule + EXPECT_EQ(0, sig1.count.load()); + EXPECT_EQ(0, sig2.count.load()); + + this_thread::sleep_for(milliseconds(100)); + EXPECT_EQ(0, sig1.count.load()); + EXPECT_EQ(1, sig2.count.load()); +} + +TEST_F(AdminSocketRaise, AsyncCancel) +{ + using std::chrono::milliseconds; + +#ifdef WIN32 + GTEST_SKIP() << "Windows doesn't support --after behavior"; +#endif + + ASSERT_EQ("", send_raise(sig1.name, 0.1)); + + EXPECT_EQ(0, sig1.count.load()); + EXPECT_EQ(0, sig2.count.load()); + + ASSERT_EQ("", send_raise_cancel(sig2.name)); + + this_thread::sleep_for(milliseconds(150)); + + // cancel shouldn't have worked because the signals + // didn't match + EXPECT_EQ(1, sig1.count.load()); + + ASSERT_EQ("", send_raise(sig2.name, 0.1)); + ASSERT_EQ("", send_raise_cancel(sig2.name)); + + this_thread::sleep_for(milliseconds(150)); + + // cancel must have worked + EXPECT_EQ(0, 
sig2.count.load()); + + ASSERT_EQ("", send_raise(sig1.name, 0.1)); + ASSERT_EQ("", send_raise_cancel()); + + // cancel must have worked, the counter stays 1 + EXPECT_EQ(1, sig1.count.load()); +} + +TEST_F(AdminSocketRaise, StopCont) +{ + using std::chrono::duration_cast; + using std::chrono::milliseconds; + using std::chrono::system_clock; + +#ifdef WIN32 + GTEST_SKIP() << "Windows doesn't support SIGSTOP/SIGCONT and --after"; +#endif + + auto then = system_clock::now(); + ASSERT_EQ("", send_raise("CONT", 0.2)); + ASSERT_EQ("", send_raise("STOP")); + auto elapsed = system_clock::now() - then; + // give it a 1% slack + EXPECT_LE(milliseconds(198), duration_cast(elapsed)); +} + /* * Local Variables: * compile-command: "cd .. ; From fd3c941f30296995c8f03d8d29288943871c0baf Mon Sep 17 00:00:00 2001 From: Dhairya Parmar Date: Thu, 14 Sep 2023 00:24:46 +0530 Subject: [PATCH 0157/2492] mgr/nfs: report proper errno with err status Fixes: https://tracker.ceph.com/issues/62641 Signed-off-by: Dhairya Parmar --- src/pybind/mgr/nfs/export.py | 50 ++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/nfs/export.py b/src/pybind/mgr/nfs/export.py index 5887c898fef9..aaa93c34f6c1 100644 --- a/src/pybind/mgr/nfs/export.py +++ b/src/pybind/mgr/nfs/export.py @@ -167,9 +167,22 @@ class AppliedExportResults: def __init__(self) -> None: self.changes: List[Dict[str, str]] = [] self.has_error = False + self.exceptions: List[Exception] = [] + self.faulty_export_block_indices = "" + self.num_errors = 0 + self.status = "" - def append(self, value: Dict[str, str]) -> None: + def append(self, value: Dict[str, Any]) -> None: if value.get("state", "") == "error": + self.num_errors += 1 + # If there is an error then there must be an exception in the dict. + self.exceptions.append(value.pop("exception")) + # Index is for indicating at which export block in the conf/json + # file did the export creation/update failed. + if len(self.faulty_export_block_indices) == 0: + self.faulty_export_block_indices = str(value.pop("index")) + else: + self.faulty_export_block_indices += f", {value.pop('index')}" self.has_error = True self.changes.append(value) @@ -177,7 +190,29 @@ def to_simplified(self) -> List[Dict[str, str]]: return self.changes def mgr_return_value(self) -> int: - return -errno.EIO if self.has_error else 0 + if self.has_error: + if len(self.exceptions) == 1: + ex = self.exceptions[0] + if isinstance(ex, NFSException): + return ex.errno + # Some non-nfs exception occurred, this can be anything + # therefore return EAGAIN as a generalised errno. + return -errno.EAGAIN + # There are multiple failures so returning EIO as a generalised + # errno. 
+ return -errno.EIO + return 0 + + def mgr_status_value(self) -> str: + if self.has_error: + if len(self.faulty_export_block_indices) == 1: + self.status = f"{str(self.exceptions[0])} for export block" \ + f" at index {self.faulty_export_block_indices}" + elif len(self.faulty_export_block_indices) > 1: + self.status = f"{self.num_errors} export blocks (at index" \ + f" {self.faulty_export_block_indices}) failed" \ + " to be created/updated" + return self.status class ExportMgr: @@ -501,7 +536,12 @@ def apply_export(self, cluster_id: str, export_config: str) -> AppliedExportResu aeresults = AppliedExportResults() for export in exports: - aeresults.append(self._change_export(cluster_id, export)) + changed_export = self._change_export(cluster_id, export) + # This will help figure out which export blocks in conf/json file + # are problematic. + if changed_export.get("state", "") == "error": + changed_export.update({"index": exports.index(export) + 1}) + aeresults.append(changed_export) return aeresults def _read_export_config(self, cluster_id: str, export_config: str) -> List[Dict]: @@ -525,7 +565,7 @@ def _read_export_config(self, cluster_id: str, export_config: str) -> List[Dict] return j # j is already a list object return [j] # return a single object list, with j as the only item - def _change_export(self, cluster_id: str, export: Dict) -> Dict[str, str]: + def _change_export(self, cluster_id: str, export: Dict) -> Dict[str, Any]: try: return self._apply_export(cluster_id, export) except NotImplementedError: @@ -543,7 +583,7 @@ def _change_export(self, cluster_id: str, export: Dict) -> Dict[str, str]: except Exception as ex: msg = f'Failed to apply export: {ex}' log.exception(msg) - return {"state": "error", "msg": msg} + return {"state": "error", "msg": msg, "exception": ex} def _update_user_id( self, From 3a5381326c3e5e88080f6b9e249383db67fcb622 Mon Sep 17 00:00:00 2001 From: Dhairya Parmar Date: Thu, 14 Sep 2023 00:25:25 +0530 Subject: [PATCH 0158/2492] mgr: make object_format's Responder class capable of responding err status Fixes: https://tracker.ceph.com/issues/62641 Signed-off-by: Dhairya Parmar --- src/pybind/mgr/object_format.py | 49 ++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/object_format.py b/src/pybind/mgr/object_format.py index b53bc3eb060a..8b60aa1b3dc8 100644 --- a/src/pybind/mgr/object_format.py +++ b/src/pybind/mgr/object_format.py @@ -235,6 +235,15 @@ def mgr_return_value(self) -> int: ... # pragma: no cover +class StatusValueProvider(Protocol): + def mgr_status_value(self) -> str: + """Return a string value to provide the Ceph MGR with an error status + for the MGR's response tuple. Empty string means success. Return a string + containing error info otherwise. + """ + ... # pragma: no cover + + class CommonFormatter(Protocol): """A protocol that indicates the type is a formatter for multiple possible formats. @@ -276,6 +285,11 @@ def _is_return_value_provider(obj: ReturnValueProvider) -> bool: return callable(getattr(obj, 'mgr_return_value', None)) +def _is_status_value_provider(obj: StatusValueProvider) -> bool: + """Return true if obj is usable as a StatusValueProvider""" + return callable(getattr(obj, 'mgr_status_value', None)) + + class ObjectFormatAdapter: """A format adapater for a single object. 
Given an input object, this type will adapt the object, or a simplified @@ -366,6 +380,27 @@ def mgr_return_value(self) -> int: return self.default_return_value +class StatusValueAdapter: + """A status-value adapter for an object. + Given an input object, this type will attempt to get a mgr status value + from the object if provides a `mgr_status_value` function. + If not it returns a default status value, typically an empty string. + """ + + def __init__( + self, + obj: Any, + default: str = "", + ) -> None: + self.obj = obj + self.default_status = default + + def mgr_status_value(self) -> str: + if _is_status_value_provider(self.obj): + return str(self.obj.mgr_status_value()) + return self.default_status + + class ErrorResponseBase(Exception): """An exception that can directly be converted to a mgr reponse.""" @@ -448,6 +483,7 @@ def wrap( Callable[..., JSONDataProvider], Callable[..., YAMLDataProvider], Callable[..., ReturnValueProvider], + Callable[..., StatusValueProvider], ] @@ -487,6 +523,10 @@ def _retval_provider(self, obj: Any) -> ReturnValueProvider: """Return a ReturnValueProvider for the given object.""" return ReturnValueAdapter(obj) + def _statusval_provider(self, obj: Any) -> StatusValueProvider: + """Return a StatusValueProvider for the given object.""" + return StatusValueAdapter(obj) + def _get_format_func( self, obj: Any, format_req: Optional[str] = None ) -> Callable: @@ -515,6 +555,12 @@ def _return_value(self, obj: Any) -> int: """Return a mgr return-value for the given object (usually zero).""" return self._retval_provider(obj).mgr_return_value() + def _return_status(self, obj: Any) -> str: + """Return a mgr status-value for the given object (usually empty + string). + """ + return self._statusval_provider(obj).mgr_status_value() + def __call__(self, f: ObjectResponseFuncType) -> HandlerFuncType: """Wrap a python function so that the original function's return value becomes the source for an automatically formatted mgr response. @@ -528,9 +574,10 @@ def _format_response(*args: Any, **kwargs: Any) -> Tuple[int, str, str]: robj = f(*args, **kwargs) body = self._formatted(robj, format_req) retval = self._return_value(robj) + statusval = self._return_status(robj) except ErrorResponseBase as e: return e.format_response() - return retval, body, "" + return retval, body, statusval # set the extra args on our wrapper function. 
this will be consumed by # the CLICommand decorator and added to the set of optional arguments From 4d663e4e4484977fb90beeb05f67bc71215bddb3 Mon Sep 17 00:00:00 2001 From: Dhairya Parmar Date: Thu, 14 Sep 2023 00:34:25 +0530 Subject: [PATCH 0159/2492] mgr/tests: test returning error status works as expected Fixes: https://tracker.ceph.com/issues/62641 Signed-off-by: Dhairya Parmar Signed-off-by: John Mulligan --- src/pybind/mgr/tests/test_object_format.py | 26 +++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/tests/test_object_format.py b/src/pybind/mgr/tests/test_object_format.py index d2fd20870e7a..2e674c69838c 100644 --- a/src/pybind/mgr/tests/test_object_format.py +++ b/src/pybind/mgr/tests/test_object_format.py @@ -115,12 +115,18 @@ def test_format_yaml(obj: Any, compatible: bool, yaml_val: str): class Retty: - def __init__(self, v) -> None: + def __init__(self, v, status="") -> None: self.value = v + self.status = status def mgr_return_value(self) -> int: return self.value + def mgr_status_value(self) -> str: + if self.status: + return self.status + return "NOPE" + @pytest.mark.parametrize( "obj, ret", @@ -139,6 +145,24 @@ def test_return_value(obj: Any, ret: int): assert rva.mgr_return_value() == ret +@pytest.mark.parametrize( + "obj, ret", + [ + ({}, ""), + ({"fish": "sticks"}, ""), + (-55, ""), + (Retty(0), "NOPE"), + (Retty(-55, "cake"), "cake"), + (Retty(-50, "pie"), "pie"), + ], +) +def test_return_status(obj: Any, ret: str): + rva = object_format.StatusValueAdapter(obj) + # a StatusValueAdapter instance meets the StatusValueProvider protocol. + assert object_format._is_status_value_provider(rva) + assert rva.mgr_status_value() == ret + + def test_valid_formats(): ofa = object_format.ObjectFormatAdapter({"fred": "wilma"}) vf = ofa.valid_formats() From 106b88537ddd2a0ecc5b38eefa765af16fac14f6 Mon Sep 17 00:00:00 2001 From: Dhairya Parmar Date: Thu, 14 Sep 2023 15:03:35 +0530 Subject: [PATCH 0160/2492] mgr: fix some doc strings in object_format.py Fixes: https://tracker.ceph.com/issues/62641 Signed-off-by: Dhairya Parmar --- src/pybind/mgr/object_format.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/object_format.py b/src/pybind/mgr/object_format.py index 8b60aa1b3dc8..4a2b6fa8b69b 100644 --- a/src/pybind/mgr/object_format.py +++ b/src/pybind/mgr/object_format.py @@ -228,8 +228,8 @@ def format_yaml(self) -> str: class ReturnValueProvider(Protocol): def mgr_return_value(self) -> int: - """Return an integer value to provide the Ceph MGR with a error code - for the MGR's response tuple. Zero means success. Return an negative + """Return an integer value to provide the Ceph MGR with an error code + for the MGR's response tuple. Zero means success. Return a negative errno otherwise. """ ... 
# pragma: no cover @@ -281,7 +281,7 @@ def _is_yaml_data_provider(obj: YAMLDataProvider) -> bool: def _is_return_value_provider(obj: ReturnValueProvider) -> bool: - """Return true if obj is usable as a YAMLDataProvider.""" + """Return true if obj is usable as a ReturnValueProvider.""" return callable(getattr(obj, 'mgr_return_value', None)) From 9a92242e997527a07c7abe065696887f5f761546 Mon Sep 17 00:00:00 2001 From: Dhairya Parmar Date: Wed, 20 Sep 2023 17:52:50 +0530 Subject: [PATCH 0161/2492] qa: add test cases to verify error reporting works as expected Fixes: https://tracker.ceph.com/issues/62641 Signed-off-by: Dhairya Parmar --- qa/tasks/cephfs/test_nfs.py | 198 ++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py index 5fe71054ad9a..94d3182f98b5 100644 --- a/qa/tasks/cephfs/test_nfs.py +++ b/qa/tasks/cephfs/test_nfs.py @@ -404,6 +404,13 @@ def _delete_cluster_with_fs(self, fs_name, mnt_pt=None, mode=None): self._cmd('fs', 'volume', 'rm', fs_name, '--yes-i-really-mean-it') self._test_delete_cluster() + def _nfs_export_apply(self, cluster, exports, raise_on_error=False): + return self.ctx.cluster.run(args=['ceph', 'nfs', 'export', 'apply', + cluster, '-i', '-'], + check_status=raise_on_error, + stdin=json.dumps(exports), + stdout=StringIO(), stderr=StringIO()) + def test_create_and_delete_cluster(self): ''' Test successful creation and deletion of the nfs cluster. @@ -876,3 +883,194 @@ def test_nfs_export_creation_at_symlink(self): raise self.ctx.cluster.run(args=['rm', '-rf', f'{mnt_pt}/*']) self._delete_cluster_with_fs(self.fs_name, mnt_pt, preserve_mode) + + def test_nfs_export_apply_multiple_exports(self): + """ + Test multiple export creation/update with multiple + export blocks provided in the json/conf file using: + ceph nfs export apply -i <{conf/json}_file>, and check + 1) if there are multiple failure: + -> Return the EIO and error status to CLI (along with JSON output + containing status of every export). + 2) if there is single failure: + -> Return the respective errno and error status to CLI (along with + JSON output containing status of every export). 
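+        For example, with two bad export blocks the CLI is expected to
+        get EIO together with a status line such as (the indices depend
+        on the input file):
+        "2 export blocks (at index 1, 2) failed to be created/updated"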
+ """ + + mnt_pt = self._sys_cmd(['mktemp', '-d']).decode().strip() + self._create_cluster_with_fs(self.fs_name, mnt_pt) + try: + self.ctx.cluster.run(args=['mkdir', f'{mnt_pt}/testdir1']) + self.ctx.cluster.run(args=['mkdir', f'{mnt_pt}/testdir2']) + self.ctx.cluster.run(args=['mkdir', f'{mnt_pt}/testdir3']) + self._create_export(export_id='1', + extra_cmd=['--pseudo-path', self.pseudo_path, + '--path', '/testdir1']) + self._create_export(export_id='2', + extra_cmd=['--pseudo-path', + self.pseudo_path+'2', + '--path', '/testdir2']) + exports = [ + { + "export_id": 11, # export_id change not allowed + "path": "/testdir1", + "pseudo": self.pseudo_path, + "squash": "none", + "access_type": "rw", + "protocols": [4], + "fsal": { + "name": "CEPH", + "user_id": "nfs.test.1", + "fs_name": self.fs_name + } + }, + { + "export_id": 2, + "path": "/testdir2", + "pseudo": self.pseudo_path+'2', + "squash": "none", + "access_type": "rw", + "protocols": [4], + "fsal": { + "name": "CEPH", + "user_id": "nfs.test.2", + "fs_name": "invalid_fs_name" # invalid fs + } + }, + { # no error, export creation should succeed + "export_id": 3, + "path": "/testdir3", + "pseudo": self.pseudo_path+'3', + "squash": "none", + "access_type": "rw", + "protocols": [4], + "fsal": { + "name": "CEPH", + "user_id": "nfs.test.3", + "fs_name": self.fs_name + } + } + ] + + # multiple failures + ret = self._nfs_export_apply(self.cluster_id, exports) + self.assertEqual(ret[0].returncode, errno.EIO) + self.assertIn("2 export blocks (at index 1, 2) failed to be " + "created/updated", ret[0].stderr.getvalue()) + + # single failure + exports[1]["fsal"]["fs_name"] = self.fs_name # correct the fs + ret = self._nfs_export_apply(self.cluster_id, exports) + self.assertEqual(ret[0].returncode, errno.EINVAL) + self.assertIn("Export ID changed, Cannot update export for " + "export block at index 1", ret[0].stderr.getvalue()) + finally: + self._delete_cluster_with_fs(self.fs_name, mnt_pt) + self.ctx.cluster.run(args=['rm', '-rf', f'{mnt_pt}']) + + def test_nfs_export_apply_single_export(self): + """ + Test that when single export creation/update fails with multiple + export blocks provided in the json/conf file using: + ceph nfs export apply -i <{conf/json}_file>, it + returns the respective errno and error status to CLI (along with + JSON output containing status of every export). + """ + + mnt_pt = self._sys_cmd(['mktemp', '-d']).decode().strip() + self._create_cluster_with_fs(self.fs_name, mnt_pt) + try: + self.ctx.cluster.run(args=['mkdir', f'{mnt_pt}/testdir1']) + self._create_export(export_id='1', + extra_cmd=['--pseudo-path', self.pseudo_path, + '--path', '/testdir1']) + export = { + "export_id": 1, + "path": "/testdir1", + "pseudo": self.pseudo_path, + "squash": "none", + "access_type": "rw", + "protocols": [4], + "fsal": { + "name": "CEPH", + "user_id": "nfs.test.1", + "fs_name": "invalid_fs_name" # invalid fs + } + } + ret = self._nfs_export_apply(self.cluster_id, export) + self.assertEqual(ret[0].returncode, errno.ENOENT) + self.assertIn("filesystem invalid_fs_name not found for " + "export block at index 1", ret[0].stderr.getvalue()) + finally: + self._delete_cluster_with_fs(self.fs_name, mnt_pt) + self.ctx.cluster.run(args=['rm', '-rf', f'{mnt_pt}']) + + def test_nfs_export_apply_json_output_states(self): + """ + If export creation/update is done using: + ceph nfs export apply -i <{conf/json}_file> then the + "status" field in the json output maybe added, updated, error or + warning. 
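For example, a block that merely changes the pseudo path of an
+        existing export should come back with "state": "updated".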
Test different scenarios to make sure these states are + in the json output as expected. + """ + + mnt_pt = self._sys_cmd(['mktemp', '-d']).decode().strip() + self._create_cluster_with_fs(self.fs_name, mnt_pt) + try: + self.ctx.cluster.run(args=['mkdir', f'{mnt_pt}/testdir1']) + self.ctx.cluster.run(args=['mkdir', f'{mnt_pt}/testdir2']) + self.ctx.cluster.run(args=['mkdir', f'{mnt_pt}/testdir3']) + self._create_export(export_id='1', + extra_cmd=['--pseudo-path', self.pseudo_path, + '--path', '/testdir1']) + exports = [ + { # change pseudo, state should be "updated" + "export_id": 1, + "path": "/testdir1", + "pseudo": self.pseudo_path+'1', + "squash": "none", + "access_type": "rw", + "protocols": [4], + "fsal": { + "name": "CEPH", + "user_id": "nfs.test.1", + "fs_name": self.fs_name + } + }, + { # a new export, state should be "added" + "export_id": 2, + "path": "/testdir2", + "pseudo": self.pseudo_path+'2', + "squash": "none", + "access_type": "rw", + "protocols": [4], + "fsal": { + "name": "CEPH", + "user_id": "nfs.test.2", + "fs_name": self.fs_name + } + }, + { # error in export block, state should be "error" since the + # fs_name is invalid + "export_id": 3, + "path": "/testdir3", + "pseudo": self.pseudo_path+'3', + "squash": "none", + "access_type": "RW", + "protocols": [4], + "fsal": { + "name": "CEPH", + "user_id": "nfs.test.3", + "fs_name": "invalid_fs_name" + } + } + ] + ret = self._nfs_export_apply(self.cluster_id, exports) + json_output = json.loads(ret[0].stdout.getvalue().strip()) + self.assertEqual(len(json_output), 3) + self.assertEqual(json_output[0]["state"], "updated") + self.assertEqual(json_output[1]["state"], "added") + self.assertEqual(json_output[2]["state"], "error") + finally: + self._delete_cluster_with_fs(self.fs_name, mnt_pt) + self.ctx.cluster.run(args=['rm', '-rf', f'{mnt_pt}']) From 3f11cd94c0add0ac182d03ed1fc1a8708d72a06d Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 12 Oct 2023 17:56:05 +0800 Subject: [PATCH 0162/2492] crimson/osd: store "final pool info" for pools that are deleted Although "final pool info" is needed by ec which is not yet implemented for crimson, we need this to make OSD reboot work Fixes: https://tracker.ceph.com/issues/63186 Signed-off-by: Xuehan Xu --- src/crimson/osd/osd_meta.cc | 31 +++++++++++++++++++++++++++++++ src/crimson/osd/osd_meta.h | 4 ++++ src/crimson/osd/shard_services.cc | 22 +++++++++++++++++----- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/crimson/osd/osd_meta.cc b/src/crimson/osd/osd_meta.cc index e40b2b2464be..0a9b42cd594f 100644 --- a/src/crimson/osd/osd_meta.cc +++ b/src/crimson/osd/osd_meta.cc @@ -9,6 +9,7 @@ #include "crimson/os/futurized_collection.h" #include "crimson/os/futurized_store.h" #include "os/Transaction.h" +#include "osd/OSDMap.h" using std::string; using read_errorator = crimson::os::FuturizedStore::Shard::read_errorator; @@ -80,6 +81,36 @@ OSDMeta::load_final_pool_info(int64_t pool) { })); } +void OSDMeta::store_final_pool_info( + ceph::os::Transaction &t, + OSDMap* lastmap, + std::map &added_map) +{ + for (auto [e, map] : added_map) { + if (!lastmap) { + lastmap = map; + continue; + } + for (auto &[pool_id, pool] : lastmap->get_pools()) { + if (!map->have_pg_pool(pool_id)) { + ghobject_t obj = final_pool_info_oid(pool_id); + bufferlist bl; + encode(pool, bl, CEPH_FEATURES_ALL); + string name = lastmap->get_pool_name(pool_id); + encode(name, bl); + std::map profile; + if (pool.is_erasure()) { + profile = lastmap->get_erasure_code_profile( + pool.erasure_code_profile); + } 
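+        // the profile is encoded even when empty, so the on-disk record
+        // always carries pool, name and profile, in that order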
+        encode(profile, bl);
+        t.write(coll->get_cid(), obj, 0, bl.length(), bl);
+      }
+    }
+    lastmap = map;
+  }
+}
+
 ghobject_t OSDMeta::osdmap_oid(epoch_t epoch)
 {
   string name = fmt::format("osdmap.{}", epoch);
diff --git a/src/crimson/osd/osd_meta.h b/src/crimson/osd/osd_meta.h
index 652266d9e201..af18991ff74b 100644
--- a/src/crimson/osd/osd_meta.h
+++ b/src/crimson/osd/osd_meta.h
@@ -53,6 +53,10 @@ class OSDMeta {
   seastar::future<std::tuple<pg_pool_t, std::string, std::map<std::string, std::string>>>
   load_final_pool_info(int64_t pool);
+  void store_final_pool_info(
+    ceph::os::Transaction&,
+    OSDMap* lastmap,
+    std::map<epoch_t, OSDMap*>&);
 private:
   static ghobject_t osdmap_oid(epoch_t epoch);
   static ghobject_t final_pool_info_oid(int64_t pool);
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
index a6431305d806..dd11cddce733 100644
--- a/src/crimson/osd/shard_services.cc
+++ b/src/crimson/osd/shard_services.cc
@@ -420,15 +420,19 @@ seastar::future<std::unique_ptr<OSDMap>> OSDSingletonState::load_map(epoch_t e)
 seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t,
                                                 epoch_t start, Ref<MOSDMap> m)
 {
-  return seastar::do_for_each(
-    boost::make_counting_iterator(start),
-    boost::make_counting_iterator(m->get_last() + 1),
-    [&t, m, this](epoch_t e) {
+  return seastar::do_with(
+    std::map<epoch_t, OSDMap*>(),
+    [&t, m, start, this](auto &added_maps) {
+    return seastar::do_for_each(
+      boost::make_counting_iterator(start),
+      boost::make_counting_iterator(m->get_last() + 1),
+      [&t, m, this, &added_maps](epoch_t e) {
       if (auto p = m->maps.find(e); p != m->maps.end()) {
         auto o = std::make_unique<OSDMap>();
         o->decode(p->second);
         logger().info("store_maps storing osdmap.{}", e);
         store_map_bl(t, e, std::move(std::move(p->second)));
+        added_maps.emplace(e, o.get());
         osdmaps.insert(e, std::move(o));
         return seastar::now();
       } else if (auto p = m->incremental_maps.find(e);
@@ -436,7 +440,8 @@ seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t,
         logger().info("store_maps found osdmap.{} incremental map, "
                       "loading osdmap.{}", e, e - 1);
         ceph_assert(std::cmp_greater(e, 0u));
-        return load_map(e - 1).then([e, bl=p->second, &t, this](auto o) {
+        return load_map(e - 1).then(
+          [&added_maps, e, bl=p->second, &t, this](auto o) {
           OSDMap::Incremental inc;
           auto i = bl.cbegin();
           inc.decode(i);
@@ -445,6 +450,7 @@ seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t,
           o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
           logger().info("store_maps storing osdmap.{}", o->get_epoch());
           store_map_bl(t, e, std::move(fbl));
+          added_maps.emplace(e, o.get());
           osdmaps.insert(e, std::move(o));
           return seastar::now();
         });
@@ -452,7 +458,13 @@ seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t,
         logger().error("MOSDMap lied about what maps it had?");
         return seastar::now();
       }
+    }).then([&t, this, &added_maps] {
+      auto [e, map] = *added_maps.begin();
+      auto lastmap = osdmaps.find(e - 1).get();
+      meta_coll->store_final_pool_info(t, lastmap, added_maps);
+      return seastar::now();
     });
+  });
 }

 seastar::future<Ref<PG>> ShardServices::make_pg(

From 8fed26a185215d1a92c88544fd64f84a94bb106e Mon Sep 17 00:00:00 2001
From: "yite.gu"
Date: Fri, 13 Oct 2023 02:50:35 +0800
Subject: [PATCH 0163/2492] os/bluestore: add bluestore fragmentation micros
 to prometheus

High fragmentation is one of the factors that degrade performance.
Adding the fragmentation score to Prometheus helps us monitor how
fragmentation changes over time.
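As a sketch, the gauge can be read back through the admin socket, e.g.

  ceph daemon osd.0 perf dump bluestore

(osd.0 is an arbitrary example id), and fragmentation_micros / 1000
gives the free-space fragmentation ratio on a 0..1 scale.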
Signed-off-by: Yite Gu
---
 src/os/bluestore/BlueStore.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index ec03fcde14ae..67f9fb92c6fb 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -6088,7 +6088,9 @@ void BlueStore::_init_logger()
                     PerfCountersBuilder::PRIO_CRITICAL,
                     unit_t(UNIT_BYTES));
   b.add_u64(l_bluestore_fragmentation, "fragmentation_micros",
-            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
+            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000",
+            "fbss",
+            PerfCountersBuilder::PRIO_USEFUL);
   b.add_u64(l_bluestore_alloc_unit, "alloc_unit",
             "allocation unit size in bytes",
             "au_b",

From cc373de90932a4f2dd54466062fae89f6598ea67 Mon Sep 17 00:00:00 2001
From: Ali Maredia
Date: Tue, 2 May 2023 16:22:19 -0400
Subject: [PATCH 0164/2492] common: perf counters cache + rgw op labeled
 counters

This commit contains the following features:

- a perf counters cache per CephContext that acts as a wrapper around
  perf counters for storing and modifying labeled perf counters per
  CephContext
- instrumentation of the rgw with labeled perf ingest counters for the
  major rgw ops

Signed-off-by: Ali Maredia
---
 src/common/CMakeLists.txt            |    1 +
 src/common/options/rgw.yaml.in       |   18 +
 src/common/perf_counters.cc          |    7 +-
 src/common/perf_counters_cache.cc    |  116 +++
 src/common/perf_counters_cache.h     |   83 ++
 src/rgw/rgw_file.cc                  |    6 +-
 src/rgw/rgw_op.cc                    |   52 +-
 src/rgw/rgw_perf_counters.cc         |  188 +++--
 src/rgw/rgw_perf_counters.h          |   68 +-
 src/test/CMakeLists.txt              |    7 +
 src/test/test_perf_counters_cache.cc | 1063 ++++++++++++++++++++++++++
 11 files changed, 1543 insertions(+), 66 deletions(-)
 create mode 100644 src/common/perf_counters_cache.cc
 create mode 100644 src/common/perf_counters_cache.h
 create mode 100644 src/test/test_perf_counters_cache.cc

diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 695ea7a68b3e..5f07f7e8483c 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -83,6 +83,7 @@ set(common_srcs
   options.cc
   page.cc
   perf_counters.cc
+  perf_counters_cache.cc
   perf_counters_collection.cc
   perf_counters_key.cc
   perf_histogram.cc
diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in
index 3971929e412d..f2f85b648f8b 100644
--- a/src/common/options/rgw.yaml.in
+++ b/src/common/options/rgw.yaml.in
@@ -3829,8 +3829,26 @@ options:
     other form of policies that Amazon does, so if you are mirroring policies
    between RGW and AWS, you may wish to set this to false.
   default: true
+- name: rgw_perf_counters_cache
+  type: bool
+  level: dev
+  default: false
+  desc: enable rgw labeled perf counters cache
+  long_desc: If set to true, rgw creates labeled perf counters and stores them
+    in an rgw-specific labeled perf counters cache.
+ see_also: + - rgw_perf_counters_cache_size services: - rgw + with_legacy: true +- name: rgw_perf_counters_cache_size + type: uint + level: advanced + desc: Number of labeled perf counters the rgw perf counters cache can store + default: 10000 + services: + - rgw + with_legacy: true - name: rgw_d4n_host type: str level: advanced diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc index b5e361b505cd..81bf3284bdae 100644 --- a/src/common/perf_counters.cc +++ b/src/common/perf_counters.cc @@ -135,7 +135,12 @@ void PerfCountersCollectionImpl::dump_formatted_generic( const std::string &counter) const { f->open_object_section("perfcounter_collection"); - + // close out all of counters collection immediately if collection is empty + if (m_loggers.empty()) { + f->close_section(); // all of counters collection + return; + } + if (dump_labeled) { std::string prev_key_name; for (auto l = m_loggers.begin(); l != m_loggers.end(); ++l) { diff --git a/src/common/perf_counters_cache.cc b/src/common/perf_counters_cache.cc new file mode 100644 index 000000000000..e0810508ce7f --- /dev/null +++ b/src/common/perf_counters_cache.cc @@ -0,0 +1,116 @@ +#include "common/perf_counters_cache.h" +#include "common/perf_counters_key.h" + +namespace ceph::perf_counters { + +void PerfCountersCache::check_key(const std::string &key) { + std::string_view key_name = ceph::perf_counters::key_name(key); + // return false for empty key name + assert(key_name != ""); + + // if there are no labels key name is not valid + auto key_labels = ceph::perf_counters::key_labels(key); + assert(key_labels.begin() != key_labels.end()); + + // don't accept keys where any labels have an empty label name + for (auto key_label : key_labels) { + assert(key_label.first != ""); + assert(key_label.second != ""); + } +} + +std::shared_ptr PerfCountersCache::add(const std::string &key) { + check_key(key); + + auto [ref, key_existed] = cache.get_or_create(key); + if (!key_existed) { + ref->counters = create_counters(key, cct); + assert(ref->counters); + ref->cct = cct; + } + return ref->counters; +} + + +std::shared_ptr PerfCountersCache::get(const std::string &key) { + std::lock_guard lock(m_lock); + return add(key); +} + +void PerfCountersCache::inc(const std::string &key, int indx, uint64_t v) { + std::lock_guard lock(m_lock); + auto counters = add(key); + if (counters) { + counters->inc(indx, v); + } +} + +void PerfCountersCache::dec(const std::string &key, int indx, uint64_t v) { + std::lock_guard lock(m_lock); + auto counters = add(key); + if (counters) { + counters->dec(indx, v); + } +} + +void PerfCountersCache::tinc(const std::string &key, int indx, utime_t amt) { + std::lock_guard lock(m_lock); + auto counters = add(key); + if (counters) { + counters->tinc(indx, amt); + } +} + +void PerfCountersCache::tinc(const std::string &key, int indx, ceph::timespan amt) { + std::lock_guard lock(m_lock); + auto counters = add(key); + if (counters) { + counters->tinc(indx, amt); + } +} + +void PerfCountersCache::set_counter(const std::string &key, int indx, uint64_t val) { + std::lock_guard lock(m_lock); + auto counters = add(key); + if (counters) { + counters->set(indx, val); + } +} + +uint64_t PerfCountersCache::get_counter(const std::string &key, int indx) { + std::lock_guard lock(m_lock); + auto counters = add(key); + uint64_t val = 0; + if (counters) { + val = counters->get(indx); + } + return val; +} + +utime_t PerfCountersCache::tget(const std::string &key, int indx) { + std::lock_guard lock(m_lock); + auto counters = 
add(key); + utime_t val; + if (counters) { + val = counters->tget(indx); + return val; + } else { + return utime_t(); + } +} + +void PerfCountersCache::tset(const std::string &key, int indx, utime_t amt) { + std::lock_guard lock(m_lock); + auto counters = add(key); + if (counters) { + counters->tset(indx, amt); + } +} + +PerfCountersCache::PerfCountersCache(CephContext *_cct, size_t _target_size, + std::function(const std::string&, CephContext*)> _create_counters) + : cct(_cct), create_counters(_create_counters), m_lock(ceph::make_mutex("PerfCountersCache")) { cache.set_target_size(_target_size); } + +PerfCountersCache::~PerfCountersCache() { cache.set_target_size(0); } + +} // namespace ceph::perf_counters diff --git a/src/common/perf_counters_cache.h b/src/common/perf_counters_cache.h new file mode 100644 index 000000000000..866f56ee3502 --- /dev/null +++ b/src/common/perf_counters_cache.h @@ -0,0 +1,83 @@ +#pragma once + +#include "common/perf_counters.h" +#include "common/ceph_context.h" +#include "common/intrusive_lru.h" + +namespace ceph::perf_counters { + +struct perf_counters_cache_item_to_key; + +struct PerfCountersCacheEntry : public ceph::common::intrusive_lru_base< + ceph::common::intrusive_lru_config< + std::string, PerfCountersCacheEntry, perf_counters_cache_item_to_key>> { + std::string key; + std::shared_ptr counters; + CephContext *cct; + + PerfCountersCacheEntry(const std::string &_key) : key(_key) {} + + ~PerfCountersCacheEntry() { + if (counters) { + cct->get_perfcounters_collection()->remove(counters.get()); + } + } +}; + +struct perf_counters_cache_item_to_key { + using type = std::string; + const type &operator()(const PerfCountersCacheEntry &entry) { + return entry.key; + } +}; + +class PerfCountersCache { +private: + CephContext *cct; + std::function(const std::string&, CephContext*)> create_counters; + PerfCountersCacheEntry::lru_t cache; + mutable ceph::mutex m_lock; + + /* check to make sure key name is non-empty and non-empty labels + * + * A valid key has the the form + * key\0label1\0val1\0label2\0val2 ... label\0valN + * The following 3 properties checked for in this function + * 1. A non-empty key + * 2. At least 1 set of labels + * 3. Each label has a non-empty key and value + * + * See perf_counters_key.h + */ + void check_key(const std::string &key); + + // adds a new entry to the cache and returns its respective PerfCounter* + // or returns the PerfCounter* of an existing entry in the cache + std::shared_ptr add(const std::string &key); + +public: + + // get() and its associated shared_ptr reference counting should be avoided + // unless the caller intends to modify multiple counter values at the same time. + // If multiple counter values will not be modified at the same time, inc/dec/etc. + // are recommended. 
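+  //
+  // Minimal sketch of that batched use (the counter indices are
+  // illustrative only):
+  //   auto counters = cache.get(key_create("subsys", {{"shard", "0"}}));
+  //   counters->inc(l_first_counter);
+  //   counters->inc(l_second_counter);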
+ std::shared_ptr get(const std::string &key); + + void inc(const std::string &key, int indx, uint64_t v); + void dec(const std::string &key, int indx, uint64_t v); + void tinc(const std::string &key, int indx, utime_t amt); + void tinc(const std::string &key, int indx, ceph::timespan amt); + void set_counter(const std::string &key, int indx, uint64_t val); + uint64_t get_counter(const std::string &key, int indx); + utime_t tget(const std::string &key, int indx); + void tset(const std::string &key, int indx, utime_t amt); + + // _create_counters should be a function that returns a valid, newly created perf counters instance + // Ceph components utilizing the PerfCountersCache are encouraged to pass in a factory function that would + // create and initialize different kinds of counters based on the name returned from ceph::perfcounters::key_name(key) + PerfCountersCache(CephContext *_cct, size_t _target_size, + std::function(const std::string&, CephContext*)> _create_counters); + ~PerfCountersCache(); +}; + +} // namespace ceph::perf_counters diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc index 6a55d3f1d8f5..92aa66f060eb 100644 --- a/src/rgw/rgw_file.cc +++ b/src/rgw/rgw_file.cc @@ -1838,7 +1838,7 @@ namespace rgw { ceph_assert(! dlo_manifest); ceph_assert(! slo_info); - perfcounter->inc(l_rgw_put); + rgw::op_counters::global_op_counters->inc(l_rgw_op_put); op_ret = -EINVAL; if (state->object->empty()) { @@ -1944,7 +1944,7 @@ namespace rgw { real_time appx_t = real_clock::now(); state->obj_size = bytes_written; - perfcounter->inc(l_rgw_put_b, state->obj_size); + rgw::op_counters::global_op_counters->inc(l_rgw_op_put_b, state->obj_size); // flush data in filters op_ret = filter->process({}, state->obj_size); @@ -2027,7 +2027,7 @@ namespace rgw { } done: - perfcounter->tinc(l_rgw_put_lat, state->time_elapsed()); + rgw::op_counters::global_op_counters->tinc(l_rgw_op_put_lat, state->time_elapsed()); return op_ret; } /* exec_finish */ diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 8c15e5bd2e3f..cd347502e76c 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -21,6 +21,7 @@ #include "common/utf8.h" #include "common/ceph_json.h" #include "common/static_ptr.h" +#include "common/perf_counters_key.h" #include "rgw_tracer.h" #include "rgw_rados.h" @@ -1691,7 +1692,8 @@ int RGWGetObj::read_user_manifest_part(rgw::sal::Bucket* bucket, return 0; } - perfcounter->inc(l_rgw_get_b, cur_end - cur_ofs); + auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); + rgw::op_counters::inc(labeled_counters, l_rgw_op_get_b, cur_end - cur_ofs); filter->fixup_range(cur_ofs, cur_end); op_ret = read_op->iterate(this, cur_ofs, cur_end, filter, s->yield); if (op_ret >= 0) @@ -1764,8 +1766,8 @@ static int iterate_user_manifest_parts(const DoutPrefixProvider *dpp, found_end = true; } - perfcounter->tinc(l_rgw_get_lat, - (ceph_clock_now() - start_time)); + rgw::op_counters::global_op_counters->tinc(l_rgw_op_get_lat, + (ceph_clock_now() - start_time)); if (found_start && !handled_end) { len_count += end_ofs - start_ofs; @@ -1860,8 +1862,8 @@ static int iterate_slo_parts(const DoutPrefixProvider *dpp, found_end = true; } - perfcounter->tinc(l_rgw_get_lat, - (ceph_clock_now() - start_time)); + rgw::op_counters::global_op_counters->tinc(l_rgw_op_get_lat, + (ceph_clock_now() - start_time)); if (found_start) { if (cb) { @@ -2208,7 +2210,8 @@ void RGWGetObj::execute(optional_yield y) std::unique_ptr run_lua; map::iterator attr_iter; - 
perfcounter->inc(l_rgw_get); + auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); + rgw::op_counters::inc(labeled_counters, l_rgw_op_get, 1); std::unique_ptr read_op(s->object->get_read_op()); @@ -2406,14 +2409,15 @@ void RGWGetObj::execute(optional_yield y) return; } - perfcounter->inc(l_rgw_get_b, end - ofs); + rgw::op_counters::inc(labeled_counters, l_rgw_op_get_b, end-ofs); op_ret = read_op->iterate(this, ofs_x, end_x, filter, s->yield); if (op_ret >= 0) op_ret = filter->flush(); - perfcounter->tinc(l_rgw_get_lat, s->time_elapsed()); + rgw::op_counters::tinc(labeled_counters, l_rgw_op_get_lat, s->time_elapsed()); + if (op_ret < 0) { goto done_err; } @@ -2489,6 +2493,9 @@ void RGWListBuckets::execute(optional_yield y) const uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + auto labeled_counters = rgw::op_counters::get({{"User", s->user->get_id().id}}); + rgw::op_counters::inc(labeled_counters, l_rgw_op_list_buckets, 1); + auto g = make_scope_guard([this, &started] { if (!started) { send_response_begin(false); @@ -2565,6 +2572,8 @@ void RGWListBuckets::execute(optional_yield y) handle_listing_chunk(listing.buckets); } while (!marker.empty() && !done); + + rgw::op_counters::tinc(labeled_counters, l_rgw_op_list_buckets_lat, s->time_elapsed()); } void RGWGetUsage::execute(optional_yield y) @@ -3049,6 +3058,10 @@ void RGWListBucket::execute(optional_yield y) objs = std::move(results.objs); common_prefixes = std::move(results.common_prefixes); } + + auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); + rgw::op_counters::inc(labeled_counters, l_rgw_op_list_obj, 1); + rgw::op_counters::tinc(labeled_counters, l_rgw_op_list_obj_lat, s->time_elapsed()); } int RGWGetBucketLogging::verify_permission(optional_yield y) @@ -3582,6 +3595,10 @@ void RGWDeleteBucket::execute(optional_yield y) op_ret = 0; } + auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); + rgw::op_counters::inc(labeled_counters, l_rgw_op_del_bucket, 1); + rgw::op_counters::tinc(labeled_counters, l_rgw_op_del_bucket_lat, s->time_elapsed()); + return; } @@ -4008,11 +4025,14 @@ void RGWPutObj::execute(optional_yield y) off_t fst; off_t lst; + auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); + bool need_calc_md5 = (dlo_manifest == NULL) && (slo_info == NULL); - perfcounter->inc(l_rgw_put); + rgw::op_counters::inc(labeled_counters, l_rgw_op_put, 1); + // report latency on return auto put_lat = make_scope_guard([&] { - perfcounter->tinc(l_rgw_put_lat, s->time_elapsed()); + rgw::op_counters::tinc(labeled_counters, l_rgw_op_put_lat, s->time_elapsed()); }); op_ret = -EINVAL; @@ -4287,7 +4307,7 @@ void RGWPutObj::execute(optional_yield y) s->obj_size = ofs; s->object->set_obj_size(ofs); - perfcounter->inc(l_rgw_put_b, s->obj_size); + rgw::op_counters::inc(labeled_counters, l_rgw_op_put_b, s->obj_size); op_ret = do_aws4_auth_completion(); if (op_ret < 0) { @@ -5244,6 +5264,11 @@ void RGWDeleteObj::execute(optional_yield y) op_ret = 0; } + auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); + rgw::op_counters::inc(labeled_counters, l_rgw_op_del_obj, 1); + rgw::op_counters::inc(labeled_counters, l_rgw_op_del_obj_b, obj_size); + rgw::op_counters::tinc(labeled_counters, l_rgw_op_del_obj_lat, s->time_elapsed()); + // send request to notification 
manager int ret = res->publish_commit(this, obj_size, ceph::real_clock::now(), etag, version_id); if (ret < 0) { @@ -5703,6 +5728,11 @@ void RGWCopyObj::execute(optional_yield y) ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; // too late to rollback operation, hence op_ret is not set here } + + auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); + rgw::op_counters::inc(labeled_counters, l_rgw_op_copy_obj, 1); + rgw::op_counters::inc(labeled_counters, l_rgw_op_copy_obj_b, obj_size); + rgw::op_counters::tinc(labeled_counters, l_rgw_op_copy_obj_lat, s->time_elapsed()); } int RGWGetACLs::verify_permission(optional_yield y) diff --git a/src/rgw/rgw_perf_counters.cc b/src/rgw/rgw_perf_counters.cc index 6757dd8913cf..aca56a60946d 100644 --- a/src/rgw/rgw_perf_counters.cc +++ b/src/rgw/rgw_perf_counters.cc @@ -3,69 +3,171 @@ #include "rgw_perf_counters.h" #include "common/perf_counters.h" +#include "common/perf_counters_key.h" #include "common/ceph_context.h" PerfCounters *perfcounter = NULL; +ceph::perf_counters::PerfCountersCache *perf_counters_cache = NULL; +std::string rgw_op_counters_key = "rgw_op"; -int rgw_perf_start(CephContext *cct) -{ - PerfCountersBuilder plb(cct, "rgw", l_rgw_first, l_rgw_last); - +static void add_rgw_frontend_counters(PerfCountersBuilder *pcb) { // RGW emits comparatively few metrics, so let's be generous // and mark them all USEFUL to get transmission to ceph-mgr by default. - plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + pcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL); - plb.add_u64_counter(l_rgw_req, "req", "Requests"); - plb.add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests"); + pcb->add_u64_counter(l_rgw_req, "req", "Requests"); + pcb->add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests"); - plb.add_u64_counter(l_rgw_get, "get", "Gets"); - plb.add_u64_counter(l_rgw_get_b, "get_b", "Size of gets"); - plb.add_time_avg(l_rgw_get_lat, "get_initial_lat", "Get latency"); - plb.add_u64_counter(l_rgw_put, "put", "Puts"); - plb.add_u64_counter(l_rgw_put_b, "put_b", "Size of puts"); - plb.add_time_avg(l_rgw_put_lat, "put_initial_lat", "Put latency"); + pcb->add_u64(l_rgw_qlen, "qlen", "Queue length"); + pcb->add_u64(l_rgw_qactive, "qactive", "Active requests queue"); - plb.add_u64(l_rgw_qlen, "qlen", "Queue length"); - plb.add_u64(l_rgw_qactive, "qactive", "Active requests queue"); + pcb->add_u64_counter(l_rgw_cache_hit, "cache_hit", "Cache hits"); + pcb->add_u64_counter(l_rgw_cache_miss, "cache_miss", "Cache miss"); - plb.add_u64_counter(l_rgw_cache_hit, "cache_hit", "Cache hits"); - plb.add_u64_counter(l_rgw_cache_miss, "cache_miss", "Cache miss"); + pcb->add_u64_counter(l_rgw_keystone_token_cache_hit, "keystone_token_cache_hit", "Keystone token cache hits"); + pcb->add_u64_counter(l_rgw_keystone_token_cache_miss, "keystone_token_cache_miss", "Keystone token cache miss"); - plb.add_u64_counter(l_rgw_keystone_token_cache_hit, "keystone_token_cache_hit", "Keystone token cache hits"); - plb.add_u64_counter(l_rgw_keystone_token_cache_miss, "keystone_token_cache_miss", "Keystone token cache miss"); + pcb->add_u64_counter(l_rgw_gc_retire, "gc_retire_object", "GC object retires"); - plb.add_u64_counter(l_rgw_gc_retire, "gc_retire_object", "GC object retires"); - - plb.add_u64_counter(l_rgw_lc_expire_current, "lc_expire_current", + pcb->add_u64_counter(l_rgw_lc_expire_current, "lc_expire_current", "Lifecycle current 
expiration"); - plb.add_u64_counter(l_rgw_lc_expire_noncurrent, "lc_expire_noncurrent", + pcb->add_u64_counter(l_rgw_lc_expire_noncurrent, "lc_expire_noncurrent", "Lifecycle non-current expiration"); - plb.add_u64_counter(l_rgw_lc_expire_dm, "lc_expire_dm", + pcb->add_u64_counter(l_rgw_lc_expire_dm, "lc_expire_dm", "Lifecycle delete-marker expiration"); - plb.add_u64_counter(l_rgw_lc_transition_current, "lc_transition_current", + pcb->add_u64_counter(l_rgw_lc_transition_current, "lc_transition_current", "Lifecycle current transition"); - plb.add_u64_counter(l_rgw_lc_transition_noncurrent, + pcb->add_u64_counter(l_rgw_lc_transition_noncurrent, "lc_transition_noncurrent", "Lifecycle non-current transition"); - plb.add_u64_counter(l_rgw_lc_abort_mpu, "lc_abort_mpu", + pcb->add_u64_counter(l_rgw_lc_abort_mpu, "lc_abort_mpu", "Lifecycle abort multipart upload"); - plb.add_u64_counter(l_rgw_pubsub_event_triggered, "pubsub_event_triggered", "Pubsub events with at least one topic"); - plb.add_u64_counter(l_rgw_pubsub_event_lost, "pubsub_event_lost", "Pubsub events lost"); - plb.add_u64_counter(l_rgw_pubsub_store_ok, "pubsub_store_ok", "Pubsub events successfully stored"); - plb.add_u64_counter(l_rgw_pubsub_store_fail, "pubsub_store_fail", "Pubsub events failed to be stored"); - plb.add_u64(l_rgw_pubsub_events, "pubsub_events", "Pubsub events in store"); - plb.add_u64_counter(l_rgw_pubsub_push_ok, "pubsub_push_ok", "Pubsub events pushed to an endpoint"); - plb.add_u64_counter(l_rgw_pubsub_push_failed, "pubsub_push_failed", "Pubsub events failed to be pushed to an endpoint"); - plb.add_u64(l_rgw_pubsub_push_pending, "pubsub_push_pending", "Pubsub events pending reply from endpoint"); - plb.add_u64_counter(l_rgw_pubsub_missing_conf, "pubsub_missing_conf", "Pubsub events could not be handled because of missing configuration"); - - plb.add_u64_counter(l_rgw_lua_script_ok, "lua_script_ok", "Successfull executions of Lua scripts"); - plb.add_u64_counter(l_rgw_lua_script_fail, "lua_script_fail", "Failed executions of Lua scripts"); - plb.add_u64(l_rgw_lua_current_vms, "lua_current_vms", "Number of Lua VMs currently being executed"); + pcb->add_u64_counter(l_rgw_pubsub_event_triggered, "pubsub_event_triggered", "Pubsub events with at least one topic"); + pcb->add_u64_counter(l_rgw_pubsub_event_lost, "pubsub_event_lost", "Pubsub events lost"); + pcb->add_u64_counter(l_rgw_pubsub_store_ok, "pubsub_store_ok", "Pubsub events successfully stored"); + pcb->add_u64_counter(l_rgw_pubsub_store_fail, "pubsub_store_fail", "Pubsub events failed to be stored"); + pcb->add_u64(l_rgw_pubsub_events, "pubsub_events", "Pubsub events in store"); + pcb->add_u64_counter(l_rgw_pubsub_push_ok, "pubsub_push_ok", "Pubsub events pushed to an endpoint"); + pcb->add_u64_counter(l_rgw_pubsub_push_failed, "pubsub_push_failed", "Pubsub events failed to be pushed to an endpoint"); + pcb->add_u64(l_rgw_pubsub_push_pending, "pubsub_push_pending", "Pubsub events pending reply from endpoint"); + pcb->add_u64_counter(l_rgw_pubsub_missing_conf, "pubsub_missing_conf", "Pubsub events could not be handled because of missing configuration"); - perfcounter = plb.create_perf_counters(); - cct->get_perfcounters_collection()->add(perfcounter); + pcb->add_u64_counter(l_rgw_lua_script_ok, "lua_script_ok", "Successfull executions of Lua scripts"); + pcb->add_u64_counter(l_rgw_lua_script_fail, "lua_script_fail", "Failed executions of Lua scripts"); + pcb->add_u64(l_rgw_lua_current_vms, "lua_current_vms", "Number of Lua VMs currently being executed"); +} + 
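+// The labeled op counters below are keyed on rgw_op_counters_key ("rgw_op")
+// plus labels built with ceph::perf_counters::key_create(), e.g. (label
+// values are illustrative):
+//   key_create(rgw_op_counters_key, {{"Bucket", "mybucket"}, {"User", "u1"}})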
+static void add_rgw_op_counters(PerfCountersBuilder *lpcb) { + // description must match general rgw counters description above + lpcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + lpcb->add_u64_counter(l_rgw_op_put, "put_ops", "Puts"); + lpcb->add_u64_counter(l_rgw_op_put_b, "put_b", "Size of puts"); + lpcb->add_time_avg(l_rgw_op_put_lat, "put_initial_lat", "Put latency"); + + lpcb->add_u64_counter(l_rgw_op_get, "get_ops", "Gets"); + lpcb->add_u64_counter(l_rgw_op_get_b, "get_b", "Size of gets"); + lpcb->add_time_avg(l_rgw_op_get_lat, "get_initial_lat", "Get latency"); + + lpcb->add_u64_counter(l_rgw_op_del_obj, "del_obj_ops", "Delete objects"); + lpcb->add_u64_counter(l_rgw_op_del_obj_b, "del_obj_bytes", "Size of delete objects"); + lpcb->add_time_avg(l_rgw_op_del_obj_lat, "del_obj_lat", "Delete object latency"); + + lpcb->add_u64_counter(l_rgw_op_del_bucket, "del_bucket_ops", "Delete Buckets"); + lpcb->add_time_avg(l_rgw_op_del_bucket_lat, "del_bucket_lat", "Delete bucket latency"); + + lpcb->add_u64_counter(l_rgw_op_copy_obj, "copy_obj_ops", "Copy objects"); + lpcb->add_u64_counter(l_rgw_op_copy_obj_b, "copy_obj_bytes", "Size of copy objects"); + lpcb->add_time_avg(l_rgw_op_copy_obj_lat, "copy_obj_lat", "Copy object latency"); + + lpcb->add_u64_counter(l_rgw_op_list_obj, "list_obj_ops", "List objects"); + lpcb->add_time_avg(l_rgw_op_list_obj_lat, "list_obj_lat", "List objects latency"); + + lpcb->add_u64_counter(l_rgw_op_list_buckets, "list_buckets_ops", "List buckets"); + lpcb->add_time_avg(l_rgw_op_list_buckets_lat, "list_buckets_lat", "List buckets latency"); +} + +std::shared_ptr create_rgw_counters(const std::string& name, CephContext *cct) { + std::string_view key = ceph::perf_counters::key_name(name); + if (rgw_op_counters_key.compare(key) == 0) { + PerfCountersBuilder pcb(cct, name, l_rgw_op_first, l_rgw_op_last); + add_rgw_op_counters(&pcb); + std::shared_ptr new_counters(pcb.create_perf_counters()); + cct->get_perfcounters_collection()->add(new_counters.get()); + return new_counters; + } else { + PerfCountersBuilder pcb(cct, name, l_rgw_first, l_rgw_last); + add_rgw_frontend_counters(&pcb); + std::shared_ptr new_counters(pcb.create_perf_counters()); + cct->get_perfcounters_collection()->add(new_counters.get()); + return new_counters; + } +} + +void frontend_counters_init(CephContext *cct) { + PerfCountersBuilder pcb(cct, "rgw", l_rgw_first, l_rgw_last); + add_rgw_frontend_counters(&pcb); + PerfCounters *new_counters = pcb.create_perf_counters(); + cct->get_perfcounters_collection()->add(new_counters); + perfcounter = new_counters; +} + +namespace rgw::op_counters { + +PerfCounters *global_op_counters = NULL; + +void global_op_counters_init(CephContext *cct) { + PerfCountersBuilder pcb(cct, rgw_op_counters_key, l_rgw_op_first, l_rgw_op_last); + add_rgw_op_counters(&pcb); + PerfCounters *new_counters = pcb.create_perf_counters(); + cct->get_perfcounters_collection()->add(new_counters); + global_op_counters = new_counters; +} + +void inc(std::shared_ptr labeled_counters, int idx, uint64_t v) { + if (labeled_counters) { + PerfCounters *counter = labeled_counters.get(); + counter->inc(idx, v); + } + if (global_op_counters) { + global_op_counters->inc(idx, v); + } +} + +void tinc(std::shared_ptr labeled_counters, int idx, utime_t amt) { + if (labeled_counters) { + PerfCounters *counter = labeled_counters.get(); + counter->tinc(idx, amt); + } + if (global_op_counters) { + global_op_counters->tinc(idx, amt); + } +} + +void tinc(std::shared_ptr labeled_counters, int idx, 
ceph::timespan amt) { + if (labeled_counters) { + PerfCounters *counter = labeled_counters.get(); + counter->tinc(idx, amt); + } + if (global_op_counters) { + global_op_counters->tinc(idx, amt); + } +} + +} // namespace rgw::op_counters + +int rgw_perf_start(CephContext *cct) +{ + frontend_counters_init(cct); + + bool cache_enabled = cct->_conf.get_val("rgw_perf_counters_cache"); + if (cache_enabled) { + uint64_t target_size = cct->_conf.get_val("rgw_perf_counters_cache_size"); + perf_counters_cache = new ceph::perf_counters::PerfCountersCache(cct, target_size, create_rgw_counters); + } + + rgw::op_counters::global_op_counters_init(cct); return 0; } @@ -74,5 +176,5 @@ void rgw_perf_stop(CephContext *cct) ceph_assert(perfcounter); cct->get_perfcounters_collection()->remove(perfcounter); delete perfcounter; + delete perf_counters_cache; } - diff --git a/src/rgw/rgw_perf_counters.h b/src/rgw/rgw_perf_counters.h index 3c4e4e97f023..49f7e4d4218b 100644 --- a/src/rgw/rgw_perf_counters.h +++ b/src/rgw/rgw_perf_counters.h @@ -4,25 +4,23 @@ #pragma once #include "include/common_fwd.h" +#include "common/perf_counters_cache.h" +#include "common/perf_counters_key.h" extern PerfCounters *perfcounter; +extern ceph::perf_counters::PerfCountersCache *perf_counters_cache; +extern std::string rgw_op_counters_key; extern int rgw_perf_start(CephContext *cct); extern void rgw_perf_stop(CephContext *cct); +extern void frontend_counters_init(CephContext *cct); +extern std::shared_ptr create_rgw_counters(const std::string& name, CephContext *cct); enum { l_rgw_first = 15000, l_rgw_req, l_rgw_failed_req, - l_rgw_get, - l_rgw_get_b, - l_rgw_get_lat, - - l_rgw_put, - l_rgw_put_b, - l_rgw_put_lat, - l_rgw_qlen, l_rgw_qactive, @@ -58,3 +56,57 @@ enum { l_rgw_last, }; +enum { + l_rgw_op_first = 16000, + + l_rgw_op_put, + l_rgw_op_put_b, + l_rgw_op_put_lat, + + l_rgw_op_get, + l_rgw_op_get_b, + l_rgw_op_get_lat, + + l_rgw_op_del_obj, + l_rgw_op_del_obj_b, + l_rgw_op_del_obj_lat, + + l_rgw_op_del_bucket, + l_rgw_op_del_bucket_lat, + + l_rgw_op_copy_obj, + l_rgw_op_copy_obj_b, + l_rgw_op_copy_obj_lat, + + l_rgw_op_list_obj, + l_rgw_op_list_obj_lat, + + l_rgw_op_list_buckets, + l_rgw_op_list_buckets_lat, + + l_rgw_op_last +}; + +namespace rgw::op_counters { + +extern PerfCounters *global_op_counters; + +void global_op_counters_init(CephContext *cct); + +template +std::shared_ptr get(ceph::perf_counters::label_pair (&&labels)[Count]) { + if (perf_counters_cache) { + std::string key = ceph::perf_counters::key_create(rgw_op_counters_key, std::move(labels)); + return perf_counters_cache->get(key); + } else { + return std::shared_ptr(nullptr); + } +} + +void inc(std::shared_ptr labeled_counters, int idx, uint64_t v); + +void tinc(std::shared_ptr labeled_counters, int idx, utime_t); + +void tinc(std::shared_ptr labeled_counters, int idx, ceph::timespan amt); + +} // namespace rgw::op_counters diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 09281ab2dbf5..f8e2f2a1b011 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -834,6 +834,13 @@ add_executable(unittest_perf_counters add_ceph_unittest(unittest_perf_counters) target_link_libraries(unittest_perf_counters global) +# unittest_perf_counters_cache +add_executable(unittest_perf_counters_cache + test_perf_counters_cache.cc + ) +add_ceph_unittest(unittest_perf_counters_cache) +target_link_libraries(unittest_perf_counters_cache global) + # unittest_ceph_crypto add_executable(unittest_ceph_crypto ceph_crypto.cc) diff --git 
a/src/test/test_perf_counters_cache.cc b/src/test/test_perf_counters_cache.cc new file mode 100644 index 000000000000..16d92bd7d431 --- /dev/null +++ b/src/test/test_perf_counters_cache.cc @@ -0,0 +1,1063 @@ +#include "common/perf_counters_cache.h" +#include "common/perf_counters_key.h" +#include "common/admin_socket_client.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "include/msgr.h" // for CEPH_ENTITY_TYPE_CLIENT +#include "gtest/gtest.h" + +using namespace ceph::perf_counters; + +int main(int argc, char **argv) { + std::map defaults = { + { "admin_socket", get_rand_socket_path() } + }; + std::vector args; + auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE| + CINIT_FLAG_NO_CCT_PERF_COUNTERS); + common_init_finish(g_ceph_context); + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +enum { + TEST_PERFCOUNTERS1_ELEMENT_FIRST = 200, + TEST_PERFCOUNTERS_COUNTER, + TEST_PERFCOUNTERS_TIME, + TEST_PERFCOUNTERS_TIME_AVG, + TEST_PERFCOUNTERS1_ELEMENT_LAST, +}; + +std::string sd(const char *c) +{ + std::string ret(c); + std::string::size_type sz = ret.size(); + for (std::string::size_type i = 0; i < sz; ++i) { + if (ret[i] == '\'') { + ret[i] = '\"'; + } + } + return ret; +} + +void add_test_counters(PerfCountersBuilder *pcb) { + pcb->add_u64(TEST_PERFCOUNTERS_COUNTER, "test_counter"); + pcb->add_time(TEST_PERFCOUNTERS_TIME, "test_time"); + pcb->add_time_avg(TEST_PERFCOUNTERS_TIME_AVG, "test_time_avg"); +} + +static std::shared_ptr create_test_counters(const std::string& name, CephContext *cct) { + PerfCountersBuilder pcb(cct, name, TEST_PERFCOUNTERS1_ELEMENT_FIRST, TEST_PERFCOUNTERS1_ELEMENT_LAST); + add_test_counters(&pcb); + std::shared_ptr new_counters(pcb.create_perf_counters()); + cct->get_perfcounters_collection()->add(new_counters.get()); + return new_counters; +} + +static PerfCountersCache* setup_test_perf_counters_cache(CephContext *cct, uint64_t target_size = 100) +{ + return new PerfCountersCache(cct, target_size, create_test_counters); +} + + +void cleanup_test(PerfCountersCache *pcc) { + delete pcc; +} + +TEST(PerfCountersCache, NoCacheTest) { + AdminSocketClient client(get_rand_socket_path()); + std::string message; + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump" })", &message)); + ASSERT_EQ("{}\n", message); + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter schema" })", &message)); + ASSERT_EQ("{}\n", message); +} + +TEST(PerfCountersCache, TestEviction) { + PerfCountersCache *pcc = setup_test_perf_counters_cache(g_ceph_context, 4); + std::string label1 = key_create("key1", {{"label1", "val1"}}); + std::string label2 = key_create("key2", {{"label2", "val2"}}); + std::string label3 = key_create("key3", {{"label3", "val3"}}); + std::string label4 = key_create("key4", {{"label4", "val4"}}); + std::string label5 = key_create("key5", {{"label5", "val5"}}); + std::string label6 = key_create("key6", {{"label6", "val6"}}); + + pcc->set_counter(label1, TEST_PERFCOUNTERS_COUNTER, 0); + std::shared_ptr counter = pcc->get(label2); + counter->set(TEST_PERFCOUNTERS_COUNTER, 0); + pcc->set_counter(label3, TEST_PERFCOUNTERS_COUNTER, 0); + pcc->set_counter(label4, TEST_PERFCOUNTERS_COUNTER, 0); + + AdminSocketClient client(get_rand_socket_path()); + std::string message; + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + 
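// the target size is 4 and only four keys have been touched, so
+  // nothing has been evicted yet; all four entries show up in the dump:
+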
ASSERT_EQ(R"({ + "key1": [ + { + "labels": { + "label1": "val1" + }, + "counters": { + "test_counter": 0, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key2": [ + { + "labels": { + "label2": "val2" + }, + "counters": { + "test_counter": 0, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key3": [ + { + "labels": { + "label3": "val3" + }, + "counters": { + "test_counter": 0, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key4": [ + { + "labels": { + "label4": "val4" + }, + "counters": { + "test_counter": 0, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ] +} +)", message); + + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter schema", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "key1": [ + { + "labels": { + "label1": "val1" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ], + "key2": [ + { + "labels": { + "label2": "val2" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ], + "key3": [ + { + "labels": { + "label3": "val3" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ], + "key4": [ + { + "labels": { + "label4": "val4" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ] +} +)", message); + + pcc->set_counter(label5, TEST_PERFCOUNTERS_COUNTER, 0); + pcc->set_counter(label6, TEST_PERFCOUNTERS_COUNTER, 0); + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + 
ASSERT_EQ(R"({ + "key3": [ + { + "labels": { + "label3": "val3" + }, + "counters": { + "test_counter": 0, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key4": [ + { + "labels": { + "label4": "val4" + }, + "counters": { + "test_counter": 0, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key5": [ + { + "labels": { + "label5": "val5" + }, + "counters": { + "test_counter": 0, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key6": [ + { + "labels": { + "label6": "val6" + }, + "counters": { + "test_counter": 0, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ] +} +)", message); + + + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter schema", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "key3": [ + { + "labels": { + "label3": "val3" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ], + "key4": [ + { + "labels": { + "label4": "val4" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ], + "key5": [ + { + "labels": { + "label5": "val5" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ], + "key6": [ + { + "labels": { + "label6": "val6" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ] +} +)", message); + cleanup_test(pcc); +} + +TEST(PerfCountersCache, TestLabeledCounters) { + PerfCountersCache *pcc = setup_test_perf_counters_cache(g_ceph_context); + std::string label1 = key_create("key1", {{"label1", "val1"}}); + 
std::string label2 = key_create("key2", {{"label2", "val2"}}); + std::string label3 = key_create("key3", {{"label3", "val3"}}); + + // test inc() + pcc->inc(label1, TEST_PERFCOUNTERS_COUNTER, 1); + pcc->inc(label2, TEST_PERFCOUNTERS_COUNTER, 2); + + AdminSocketClient client(get_rand_socket_path()); + std::string message; + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "key1": [ + { + "labels": { + "label1": "val1" + }, + "counters": { + "test_counter": 1, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key2": [ + { + "labels": { + "label2": "val2" + }, + "counters": { + "test_counter": 2, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ] +} +)", message); + + + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter schema", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "key1": [ + { + "labels": { + "label1": "val1" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ], + "key2": [ + { + "labels": { + "label2": "val2" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ] +} +)", message); + + // tests to ensure there is no interaction with normal perf counters + ASSERT_EQ("", client.do_request(R"({ "prefix": "perf dump", "format": "raw" })", &message)); + ASSERT_EQ("{}\n", message); + ASSERT_EQ("", client.do_request(R"({ "prefix": "perf schema", "format": "raw" })", &message)); + ASSERT_EQ("{}\n", message); + + // test dec() + pcc->dec(label2, TEST_PERFCOUNTERS_COUNTER, 1); + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "key1": [ + { + "labels": { + "label1": "val1" + }, + "counters": { + "test_counter": 1, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key2": [ + { + "labels": { + "label2": "val2" + }, + "counters": { + "test_counter": 1, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ] +} +)", message); + + + // test set_counters() + pcc->set_counter(label3, TEST_PERFCOUNTERS_COUNTER, 4); + uint64_t val = pcc->get_counter(label3, TEST_PERFCOUNTERS_COUNTER); + ASSERT_EQ(val, 4); + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "key1": [ + { + "labels": { + "label1": "val1" + }, + "counters": { + "test_counter": 1, + "test_time": 0.000000000, 
+ "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key2": [ + { + "labels": { + "label2": "val2" + }, + "counters": { + "test_counter": 1, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ], + "key3": [ + { + "labels": { + "label3": "val3" + }, + "counters": { + "test_counter": 4, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ] +} +)", message); + + cleanup_test(pcc); +} + +TEST(PerfCountersCache, TestLabeledTimes) { + PerfCountersCache *pcc = setup_test_perf_counters_cache(g_ceph_context); + std::string label1 = key_create("key1", {{"label1", "val1"}}); + std::string label2 = key_create("key2", {{"label2", "val2"}}); + std::string label3 = key_create("key3", {{"label3", "val3"}}); + + // test inc() + pcc->tinc(label1, TEST_PERFCOUNTERS_TIME, utime_t(100,0)); + pcc->tinc(label2, TEST_PERFCOUNTERS_TIME, utime_t(200,0)); + + //tinc() that takes a ceph_timespan + ceph::timespan ceph_timespan = std::chrono::seconds(10); + pcc->tinc(label1, TEST_PERFCOUNTERS_TIME, ceph_timespan); + + pcc->tinc(label1, TEST_PERFCOUNTERS_TIME_AVG, utime_t(200,0)); + pcc->tinc(label1, TEST_PERFCOUNTERS_TIME_AVG, utime_t(400,0)); + pcc->tinc(label2, TEST_PERFCOUNTERS_TIME_AVG, utime_t(100,0)); + pcc->tinc(label2, TEST_PERFCOUNTERS_TIME_AVG, utime_t(200,0)); + + AdminSocketClient client(get_rand_socket_path()); + std::string message; + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "key1": [ + { + "labels": { + "label1": "val1" + }, + "counters": { + "test_counter": 0, + "test_time": 110.000000000, + "test_time_avg": { + "avgcount": 2, + "sum": 600.000000000, + "avgtime": 300.000000000 + } + } + } + ], + "key2": [ + { + "labels": { + "label2": "val2" + }, + "counters": { + "test_counter": 0, + "test_time": 200.000000000, + "test_time_avg": { + "avgcount": 2, + "sum": 300.000000000, + "avgtime": 150.000000000 + } + } + } + ] +} +)", message); + + + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter schema", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "key1": [ + { + "labels": { + "label1": "val1" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ], + "key2": [ + { + "labels": { + "label2": "val2" + }, + "counters": { + "test_counter": { + "type": 2, + "metric_type": "gauge", + "value_type": "integer", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time": { + "type": 1, + "metric_type": "gauge", + "value_type": "real", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + }, + "test_time_avg": { + "type": 5, + "metric_type": "gauge", + "value_type": "real-integer-pair", + "description": "", + "nick": "", + "priority": 0, + "units": "none" + } + } + } + ] +} +)", message); + + // test tset() & tget() + pcc->tset(label1, TEST_PERFCOUNTERS_TIME, utime_t(500,0)); + utime_t label1_time = pcc->tget(label1, 
TEST_PERFCOUNTERS_TIME); + ASSERT_EQ(utime_t(500,0), label1_time); + + cleanup_test(pcc); +} + +TEST(PerfCountersCache, TestLabelStrings) { + AdminSocketClient client(get_rand_socket_path()); + std::string message; + PerfCountersCache *pcc = setup_test_perf_counters_cache(g_ceph_context); + std::string empty_key = ""; + + // empty string as should not create a labeled entry + EXPECT_DEATH(pcc->set_counter(empty_key, TEST_PERFCOUNTERS_COUNTER, 1), ""); + EXPECT_DEATH(pcc->get(empty_key), ""); + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ("{}\n", message); + + // key name but no labels at all should not create a labeled entry + std::string only_key = "only_key"; + // run an op on an invalid key name to make sure nothing happens + EXPECT_DEATH(pcc->set_counter(only_key, TEST_PERFCOUNTERS_COUNTER, 4), ""); + EXPECT_DEATH(pcc->get(only_key), ""); + + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ("{}\n", message); + + // test valid key name with multiple valid label pairs + std::string label1 = key_create("good_ctrs", {{"label3", "val3"}, {"label2", "val4"}}); + pcc->set_counter(label1, TEST_PERFCOUNTERS_COUNTER, 8); + + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "good_ctrs": [ + { + "labels": { + "label2": "val4", + "label3": "val3" + }, + "counters": { + "test_counter": 8, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ] +} +)", message); + + // test empty val in a label pair will get the label pair added into the perf counters cache but empty key will not + std::string label2 = key_create("bad_ctrs1", {{"label3", "val4"}, {"label1", ""}}); + EXPECT_DEATH(pcc->set_counter(label2, TEST_PERFCOUNTERS_COUNTER, 2), ""); + + std::string label3 = key_create("bad_ctrs2", {{"", "val4"}, {"label1", "val1"}}); + EXPECT_DEATH(pcc->set_counter(label3, TEST_PERFCOUNTERS_COUNTER, 2), ""); + + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "good_ctrs": [ + { + "labels": { + "label2": "val4", + "label3": "val3" + }, + "counters": { + "test_counter": 8, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ] +} +)", message); + + // test empty keys in each of the label pairs will not get the label added into the perf counters cache + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "good_ctrs": [ + { + "labels": { + "label2": "val4", + "label3": "val3" + }, + "counters": { + "test_counter": 8, + "test_time": 0.000000000, + "test_time_avg": { + "avgcount": 0, + "sum": 0.000000000, + "avgtime": 0.000000000 + } + } + } + ] +} +)", message); + + // a key with a somehow odd number of entries after the the key name will omit final unfinished label pair + std::string label5 = "too_many_delimiters"; + label5 += '\0'; + label5 += "label1"; + label5 += '\0'; + label5 += "val1"; + label5 += '\0'; + label5 += "label2"; + label5 += '\0'; + pcc->set_counter(label5, TEST_PERFCOUNTERS_COUNTER, 0); + + ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message)); + ASSERT_EQ(R"({ + "good_ctrs": [ + { + "labels": { + "label2": "val4", + "label3": "val3" + }, + "counters": { + "test_counter": 8, + "test_time": 
0.000000000,
+                "test_time_avg": {
+                    "avgcount": 0,
+                    "sum": 0.000000000,
+                    "avgtime": 0.000000000
+                }
+            }
+        }
+    ],
+    "too_many_delimiters": [
+        {
+            "labels": {
+                "label1": "val1"
+            },
+            "counters": {
+                "test_counter": 0,
+                "test_time": 0.000000000,
+                "test_time_avg": {
+                    "avgcount": 0,
+                    "sum": 0.000000000,
+                    "avgtime": 0.000000000
+                }
+            }
+        }
+    ]
+}
+)", message);
+
+  cleanup_test(pcc);
+}
From 6284322af41b2d76ca2e3fcf56faef4d55701250 Mon Sep 17 00:00:00 2001
From: Ali Maredia
Date: Wed, 23 Aug 2023 01:05:37 -0400
Subject: [PATCH 0165/2492] exporter: check key exists before json::object at()

Signed-off-by: Ali Maredia
---
 src/exporter/DaemonMetricCollector.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc
index ebe85c3041e5..0ce5419a7764 100644
--- a/src/exporter/DaemonMetricCollector.cc
+++ b/src/exporter/DaemonMetricCollector.cc
@@ -160,8 +160,10 @@ void DaemonMetricCollector::dump_asok_metrics() {
       labels.insert(multisite_labels_and_name.first.begin(), multisite_labels_and_name.first.end());
       counter_name = multisite_labels_and_name.second;
     }
-    auto perf_values = counters_values.at(counter_name_init);
-    dump_asok_metric(counter_group, perf_values, counter_name, labels);
+    if (counters_values.find(counter_name_init) != counters_values.end()) {
+      auto perf_values = counters_values.at(counter_name_init);
+      dump_asok_metric(counter_group, perf_values, counter_name, labels);
+    }
   }
 }
}
From 56c7cc8c59339164c052a432cd59a55aeca0dbab Mon Sep 17 00:00:00 2001
From: Ali Maredia
Date: Fri, 8 Sep 2023 11:16:02 -0400
Subject: [PATCH 0166/2492] rgw: misc labeled op counters work

Highlights of this commit include:

- splitting the rgw perf counters cache into two caches for bucket labeled
  and user labeled op counters
- add config overrides to verify suite for CI
- add tenant label for op counters
- misc cleanup
- add docs for rgw metrics

Signed-off-by: Ali Maredia
---
 doc/dev/perf_counters.rst             |    4 +
 doc/radosgw/index.rst                 |    1 +
 doc/radosgw/metrics.rst               |  204 +++
 qa/suites/rgw/verify/overrides.yaml   |    2 +
 src/common/options/rgw.yaml.in        |   62 +-
 src/common/perf_counters.cc           |    7 +-
 src/common/perf_counters_cache.cc     |    7 +-
 src/exporter/DaemonMetricCollector.cc |    5 +-
 src/rgw/rgw_file.cc                   |    7 +-
 src/rgw/rgw_file_int.h                |    2 +
 src/rgw/rgw_op.cc                     |   62 +-
 src/rgw/rgw_perf_counters.cc          |  123 +-
 src/rgw/rgw_perf_counters.h           |   29 +-
 src/rgw/vstart.sh.swift               | 1930 +++++++++++++++++++++++++
 src/test/test_perf_counters_cache.cc  |   54 +-
 15 files changed, 2376 insertions(+), 123 deletions(-)
 create mode 100644 doc/radosgw/metrics.rst
 create mode 100755 src/rgw/vstart.sh.swift

diff --git a/doc/dev/perf_counters.rst b/doc/dev/perf_counters.rst
index a64d14d33bd0..1bcc6120e4af 100644
--- a/doc/dev/perf_counters.rst
+++ b/doc/dev/perf_counters.rst
@@ -1,3 +1,5 @@
+.. _Perf Counters:
+
 ===============
  Perf counters
 ===============
@@ -200,6 +202,8 @@ The actual dump is similar to the schema, except that average values are grouped
   }
 }
 
+.. _Labeled Perf Counters:
+
 Labeled Perf Counters
 ---------------------
 
diff --git a/doc/radosgw/index.rst b/doc/radosgw/index.rst
index 70443620237a..ed67413646d8 100644
--- a/doc/radosgw/index.rst
+++ b/doc/radosgw/index.rst
@@ -84,4 +84,5 @@ Storage Cluster with one API and then retrieve that data with the other API.
    Lua Scripting
    D3N Data Cache
    Cloud Transition
+   Metrics
 
diff --git a/doc/radosgw/metrics.rst b/doc/radosgw/metrics.rst
new file mode 100644
index 000000000000..75ef782fa6aa
--- /dev/null
+++ b/doc/radosgw/metrics.rst
@@ -0,0 +1,204 @@
+=======
+Metrics
+=======
+
+The Ceph Object Gateway uses :ref:`Perf Counters` to track metrics. The counters can be labeled (:ref:`Labeled Perf Counters`). When counters are labeled, they are stored in Ceph Object Gateway specific caches.
+
+These metrics can be sent to the time series database Prometheus to visualize a cluster wide view of usage data (for example, the number of S3 put operations on a specific bucket) over time.
+
+.. contents::
+
+Op Metrics
+==========
+
+The following metrics related to S3 or Swift operations are tracked per Ceph Object Gateway.
+
+.. list-table:: Radosgw Op Metrics
+   :widths: 25 25 75
+   :header-rows: 1
+
+   * - Name
+     - Type
+     - Description
+   * - put_ops
+     - Counter
+     - Number of put operations
+   * - put_b
+     - Counter
+     - Number of bytes put
+   * - put_initial_lat
+     - Gauge
+     - Total latency of put operations
+   * - get_ops
+     - Counter
+     - Number of get operations
+   * - get_b
+     - Counter
+     - Number of bytes from get requests
+   * - get_initial_lat
+     - Gauge
+     - Total latency of get operations
+   * - del_obj_ops
+     - Counter
+     - Number of delete object operations
+   * - del_obj_bytes
+     - Counter
+     - Number of bytes deleted
+   * - del_obj_lat
+     - Gauge
+     - Total latency of delete object operations
+   * - del_bucket_ops
+     - Counter
+     - Number of delete bucket operations
+   * - del_bucket_lat
+     - Gauge
+     - Total latency of delete bucket operations
+   * - copy_obj_ops
+     - Counter
+     - Number of copy object operations
+   * - copy_obj_bytes
+     - Counter
+     - Number of bytes copied
+   * - copy_obj_lat
+     - Gauge
+     - Total latency of copy object operations
+   * - list_object_ops
+     - Counter
+     - Number of list object operations
+   * - list_object_lat
+     - Gauge
+     - Total latency of list object operations
+   * - list_bucket_ops
+     - Counter
+     - Number of list bucket operations
+   * - list_bucket_lat
+     - Gauge
+     - Total latency of list bucket operations
+
+More information about op metrics can be seen in the ``rgw_op`` section of the output of the ``counter schema`` command.
+To view op metrics in the Ceph Object Gateway, go to the ``rgw_op`` section of the output of the ``counter dump`` command::
+
+    "rgw_op": [
+        {
+            "labels": {},
+            "counters": {
+                "put_ops": 2,
+                "put_b": 5327,
+                "put_initial_lat": {
+                    "avgcount": 2,
+                    "sum": 2.818064835,
+                    "avgtime": 1.409032417
+                },
+                "get_ops": 5,
+                "get_b": 5325,
+                "get_initial_lat": {
+                    "avgcount": 2,
+                    "sum": 0.003000069,
+                    "avgtime": 0.001500034
+                },
+                ...
+                "list_buckets_ops": 1,
+                "list_buckets_lat": {
+                    "avgcount": 1,
+                    "sum": 0.002300000,
+                    "avgtime": 0.002300000
+                }
+            }
+        },
+    ]
+
+Op Metrics Labels
+--------------------
+
+Op metrics can also be tracked per-user or per-bucket. These metrics are exported to Prometheus with labels like Bucket = {name} or User = {userid}::
+
+    "rgw_op": [
+        ...
+        {
+            "labels": {
+                "Bucket": "bucket1"
+            },
+            "counters": {
+                "put_ops": 2,
+                "put_b": 5327,
+                "put_initial_lat": {
+                    "avgcount": 2,
+                    "sum": 2.818064835,
+                    "avgtime": 1.409032417
+                },
+                "get_ops": 5,
+                "get_b": 5325,
+                "get_initial_lat": {
+                    "avgcount": 2,
+                    "sum": 0.003000069,
+                    "avgtime": 0.001500034
+                },
+                ...
+                "list_buckets_ops": 1,
+                "list_buckets_lat": {
+                    "avgcount": 1,
+                    "sum": 0.002300000,
+                    "avgtime": 0.002300000
+                }
+            }
+        },
+        ...
+    ]
+
+:ref:`rgw-multitenancy` allows buckets and users of the same name to be used simultaneously. If a user or bucket lies under a tenant, a tenant label in the form Tenant = {tenantid} is added to the metric.
+
+In a large system with many users and buckets, it may not be tractable to export all metrics to Prometheus. For that reason, the collection of these labeled metrics is disabled by default.
+
+Once enabled, the working set of tracked users and buckets is constrained to limit memory and database usage. As a result, the collection of these labeled metrics will not always be reliable.
+
+
+User & Bucket Counter Caches
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To track op metrics by user, the Ceph Object Gateway config value ``rgw_user_counters_cache`` must be set to ``true``.
+
+To track op metrics by bucket, the Ceph Object Gateway config value ``rgw_bucket_counters_cache`` must be set to ``true``.
+
+These config values are set in Ceph via the command ``ceph config set client.rgw rgw_{user,bucket}_counters_cache true``.
+
+Since the op metrics are labeled perf counters, they live in memory. If the Ceph Object Gateway is restarted or crashes, all counters in the Ceph Object Gateway, whether in a cache or not, are lost.
+
+User & Bucket Counter Cache Size & Eviction
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Both ``rgw_user_counters_cache_size`` and ``rgw_bucket_counters_cache_size`` can be used to set the number of entries in each cache.
+
+Counters are evicted from a cache once the number of counters in the cache exceeds the cache size config variable. The counters that are evicted are the least recently used (LRU).
+
+For example, if the number of buckets exceeded ``rgw_bucket_counters_cache_size`` by 1 and the counters with label ``bucket1`` were the least recently updated, the counters for ``bucket1`` would be evicted from the cache. If S3 operations tracked by the op metrics were done on ``bucket1`` after eviction, all of the metrics in the cache for ``bucket1`` would start at 0.
+
+Cache sizing can depend on a number of factors. These factors include:
+
+#. Number of users in the cluster
+#. Number of buckets in the cluster
+#. Memory usage of the Ceph Object Gateway
+#. Disk and memory usage of Prometheus.
+
+To help calculate the Ceph Object Gateway's memory usage of a cache, it should be noted that each cache entry, encompassing all of the op metrics, is 1360 bytes. This is an estimate and subject to change if metrics are added or removed from the op metrics list.
+
+Sending Metrics to Prometheus
+=============================
+
+To get metrics from a Ceph Object Gateway into the time series database Prometheus, the ceph-exporter daemon must be running and configured to scrape the Radosgw's admin socket.
+
+The ceph-exporter daemon scrapes the Ceph Object Gateway's admin socket at a regular interval, defined by the config variable ``exporter_stats_period``.
+
+Prometheus has a configurable interval in which it scrapes the exporter (see: https://prometheus.io/docs/prometheus/latest/configuration/configuration/).
+
+Config Reference
+================
+The following rgw op metrics related settings can be set via ``ceph config set client.rgw CONFIG_VARIABLE VALUE``.
+
+.. confval:: rgw_user_counters_cache
+.. confval:: rgw_user_counters_cache_size
+.. confval:: rgw_bucket_counters_cache
+.. confval:: rgw_bucket_counters_cache_size
+
+The following notable ceph-exporter related settings can be set via ``ceph config set global CONFIG_VARIABLE VALUE``.
+
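+For example, a minimal sketch combining these settings. The commands follow the ``ceph config set`` forms above; the Prometheus job is purely illustrative, and the target port is an assumption that depends on how ceph-exporter is deployed::
+
+    # enable the labeled op counter caches in RGW
+    ceph config set client.rgw rgw_user_counters_cache true
+    ceph config set client.rgw rgw_bucket_counters_cache true
+
+    # have ceph-exporter refresh its view of the admin socket every 5 seconds
+    ceph config set global exporter_stats_period 5
+
+    # prometheus.yml (hypothetical scrape job; adjust host/port to your deployment)
+    scrape_configs:
+      - job_name: 'ceph-exporter'
+        static_configs:
+          - targets: ['localhost:9926']
+
+.. 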
confval:: exporter_stats_period diff --git a/qa/suites/rgw/verify/overrides.yaml b/qa/suites/rgw/verify/overrides.yaml index eac70f30c9f3..a881ce5cbe28 100644 --- a/qa/suites/rgw/verify/overrides.yaml +++ b/qa/suites/rgw/verify/overrides.yaml @@ -9,6 +9,8 @@ overrides: rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo= rgw crypt require ssl: false rgw torrent flag: true + rgw user counters cache: true + rgw bucket counters cache: true rgw: compression type: random storage classes: LUKEWARM, FROZEN diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in index f2f85b648f8b..1dfb96228944 100644 --- a/src/common/options/rgw.yaml.in +++ b/src/common/options/rgw.yaml.in @@ -3829,26 +3829,8 @@ options: other form of policies that Amazon does, so if you are mirroring policies between RGW and AWS, you may wish to set this to false. default: true -- name: rgw_perf_counters_cache - type: bool - level: dev - default: false - desc: enable rgw labeled perf counters cache - long desc: If set to true, rgw creates labeled perf counters and stores them - in an rgw specific labeled perf counters cache. - see_also: - - rgw_perf_counters_cache_size - services: - - rgw - with_legacy: true -- name: rgw_perf_counters_cache_size - type: uint - level: advanced - desc: Number of labeled perf counters the rgw perf counters cache can store - default: 10000 services: - rgw - with_legacy: true - name: rgw_d4n_host type: str level: advanced @@ -3914,3 +3896,47 @@ options: services: - rgw with_legacy: true +- name: rgw_user_counters_cache + type: bool + level: dev + default: false + desc: enable a rgw perf counters cache for counters with user label + long desc: If set to true, rgw creates perf counters with a label for the user and stores them + in a perf counters cache. This perf counters cache contains only perf counters labeled by user. + see_also: + - rgw_user_counters_cache_size + services: + - rgw + with_legacy: true +- name: rgw_user_counters_cache_size + type: uint + level: advanced + desc: Number of labeled perf counters the user perf counters cache can store + default: 10000 + services: + - rgw + see_also: + - rgw_user_counters_cache + with_legacy: true +- name: rgw_bucket_counters_cache + type: bool + level: dev + default: false + desc: enable a rgw perf counters cache for counters with bucket label + long desc: If set to true, rgw creates perf counters with a label for the bucket and stores them + in a perf counters cache. This perf counters cache contains only perf counters labeled by bucket. 
+ see_also: + - rgw_bucket_counters_cache_size + services: + - rgw + with_legacy: true +- name: rgw_bucket_counters_cache_size + type: uint + level: advanced + desc: Number of labeled perf counters the bucket perf counters cache can store + default: 10000 + services: + - rgw + see_also: + - rgw_bucket_counters_cache + with_legacy: true diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc index 81bf3284bdae..b5e361b505cd 100644 --- a/src/common/perf_counters.cc +++ b/src/common/perf_counters.cc @@ -135,12 +135,7 @@ void PerfCountersCollectionImpl::dump_formatted_generic( const std::string &counter) const { f->open_object_section("perfcounter_collection"); - // close out all of counters collection immediately if collection is empty - if (m_loggers.empty()) { - f->close_section(); // all of counters collection - return; - } - + if (dump_labeled) { std::string prev_key_name; for (auto l = m_loggers.begin(); l != m_loggers.end(); ++l) { diff --git a/src/common/perf_counters_cache.cc b/src/common/perf_counters_cache.cc index e0810508ce7f..946b5f5b8edb 100644 --- a/src/common/perf_counters_cache.cc +++ b/src/common/perf_counters_cache.cc @@ -5,17 +5,16 @@ namespace ceph::perf_counters { void PerfCountersCache::check_key(const std::string &key) { std::string_view key_name = ceph::perf_counters::key_name(key); - // return false for empty key name + // don't accept an empty key name assert(key_name != ""); - // if there are no labels key name is not valid + // if there are no labels, key name is not valid auto key_labels = ceph::perf_counters::key_labels(key); assert(key_labels.begin() != key_labels.end()); - // don't accept keys where any labels have an empty label name + // don't accept keys where any labels in the key have an empty key name for (auto key_label : key_labels) { assert(key_label.first != ""); - assert(key_label.second != ""); } } diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc index 0ce5419a7764..23a0dd550f1c 100644 --- a/src/exporter/DaemonMetricCollector.cc +++ b/src/exporter/DaemonMetricCollector.cc @@ -160,8 +160,9 @@ void DaemonMetricCollector::dump_asok_metrics() { labels.insert(multisite_labels_and_name.first.begin(), multisite_labels_and_name.first.end()); counter_name = multisite_labels_and_name.second; } - if (counters_values.find(counter_name_init) != counters_values.end()) { - auto perf_values = counters_values.at(counter_name_init); + auto counters_values_itr = counters_values.find(counter_name_init); + if (counters_values_itr != counters_values.end()) { + auto perf_values = counters_values_itr->value(); dump_asok_metric(counter_group, perf_values, counter_name, labels); } } diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc index 92aa66f060eb..ca149ad836b8 100644 --- a/src/rgw/rgw_file.cc +++ b/src/rgw/rgw_file.cc @@ -1838,7 +1838,8 @@ namespace rgw { ceph_assert(! dlo_manifest); ceph_assert(! 
slo_info); - rgw::op_counters::global_op_counters->inc(l_rgw_op_put); + counters = rgw::op_counters::get(state); + rgw::op_counters::inc(counters, l_rgw_op_put, 1); op_ret = -EINVAL; if (state->object->empty()) { @@ -1944,7 +1945,7 @@ namespace rgw { real_time appx_t = real_clock::now(); state->obj_size = bytes_written; - rgw::op_counters::global_op_counters->inc(l_rgw_op_put_b, state->obj_size); + rgw::op_counters::inc(counters, l_rgw_op_put_b, state->obj_size); // flush data in filters op_ret = filter->process({}, state->obj_size); @@ -2027,7 +2028,7 @@ namespace rgw { } done: - rgw::op_counters::global_op_counters->tinc(l_rgw_op_put_lat, state->time_elapsed()); + rgw::op_counters::tinc(counters, l_rgw_op_put_lat, state->time_elapsed()); return op_ret; } /* exec_finish */ diff --git a/src/rgw/rgw_file_int.h b/src/rgw/rgw_file_int.h index 6ecd4b2447da..91c858e5b3bd 100644 --- a/src/rgw/rgw_file_int.h +++ b/src/rgw/rgw_file_int.h @@ -36,6 +36,7 @@ #include "rgw_putobj_processor.h" #include "rgw_aio_throttle.h" #include "rgw_compression.h" +#include "rgw_perf_counters.h" /* XXX @@ -2485,6 +2486,7 @@ class RGWWriteRequest : public RGWLibContinuedReq, off_t real_ofs; size_t bytes_written; bool eio; + rgw::op_counters::CountersContainer counters; RGWWriteRequest(rgw::sal::Driver* driver, const RGWProcessEnv& penv, std::unique_ptr _user, diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index cd347502e76c..29a1f36c0273 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1692,8 +1692,8 @@ int RGWGetObj::read_user_manifest_part(rgw::sal::Bucket* bucket, return 0; } - auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); - rgw::op_counters::inc(labeled_counters, l_rgw_op_get_b, cur_end - cur_ofs); + auto counters = rgw::op_counters::get(s); + rgw::op_counters::inc(counters, l_rgw_op_get_b, cur_end - cur_ofs); filter->fixup_range(cur_ofs, cur_end); op_ret = read_op->iterate(this, cur_ofs, cur_end, filter, s->yield); if (op_ret >= 0) @@ -1766,7 +1766,8 @@ static int iterate_user_manifest_parts(const DoutPrefixProvider *dpp, found_end = true; } - rgw::op_counters::global_op_counters->tinc(l_rgw_op_get_lat, + rgw::op_counters::CountersContainer counters; + rgw::op_counters::tinc(counters, l_rgw_op_get_lat, (ceph_clock_now() - start_time)); if (found_start && !handled_end) { @@ -1862,8 +1863,9 @@ static int iterate_slo_parts(const DoutPrefixProvider *dpp, found_end = true; } - rgw::op_counters::global_op_counters->tinc(l_rgw_op_get_lat, - (ceph_clock_now() - start_time)); + rgw::op_counters::CountersContainer counters; + rgw::op_counters::tinc(counters, l_rgw_op_get_lat, + (ceph_clock_now() - start_time)); if (found_start) { if (cb) { @@ -2210,8 +2212,8 @@ void RGWGetObj::execute(optional_yield y) std::unique_ptr run_lua; map::iterator attr_iter; - auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); - rgw::op_counters::inc(labeled_counters, l_rgw_op_get, 1); + auto counters = rgw::op_counters::get(s); + rgw::op_counters::inc(counters, l_rgw_op_get, 1); std::unique_ptr read_op(s->object->get_read_op()); @@ -2409,14 +2411,14 @@ void RGWGetObj::execute(optional_yield y) return; } - rgw::op_counters::inc(labeled_counters, l_rgw_op_get_b, end-ofs); + rgw::op_counters::inc(counters, l_rgw_op_get_b, end-ofs); op_ret = read_op->iterate(this, ofs_x, end_x, filter, s->yield); if (op_ret >= 0) op_ret = filter->flush(); - rgw::op_counters::tinc(labeled_counters, l_rgw_op_get_lat, 
s->time_elapsed()); + rgw::op_counters::tinc(counters, l_rgw_op_get_lat, s->time_elapsed()); if (op_ret < 0) { goto done_err; @@ -2493,8 +2495,8 @@ void RGWListBuckets::execute(optional_yield y) const uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; - auto labeled_counters = rgw::op_counters::get({{"User", s->user->get_id().id}}); - rgw::op_counters::inc(labeled_counters, l_rgw_op_list_buckets, 1); + auto counters = rgw::op_counters::get(s); + rgw::op_counters::inc(counters, l_rgw_op_list_buckets, 1); auto g = make_scope_guard([this, &started] { if (!started) { @@ -2573,7 +2575,7 @@ void RGWListBuckets::execute(optional_yield y) handle_listing_chunk(listing.buckets); } while (!marker.empty() && !done); - rgw::op_counters::tinc(labeled_counters, l_rgw_op_list_buckets_lat, s->time_elapsed()); + rgw::op_counters::tinc(counters, l_rgw_op_list_buckets_lat, s->time_elapsed()); } void RGWGetUsage::execute(optional_yield y) @@ -3059,9 +3061,9 @@ void RGWListBucket::execute(optional_yield y) common_prefixes = std::move(results.common_prefixes); } - auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); - rgw::op_counters::inc(labeled_counters, l_rgw_op_list_obj, 1); - rgw::op_counters::tinc(labeled_counters, l_rgw_op_list_obj_lat, s->time_elapsed()); + auto counters = rgw::op_counters::get(s); + rgw::op_counters::inc(counters, l_rgw_op_list_obj, 1); + rgw::op_counters::tinc(counters, l_rgw_op_list_obj_lat, s->time_elapsed()); } int RGWGetBucketLogging::verify_permission(optional_yield y) @@ -3595,9 +3597,9 @@ void RGWDeleteBucket::execute(optional_yield y) op_ret = 0; } - auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); - rgw::op_counters::inc(labeled_counters, l_rgw_op_del_bucket, 1); - rgw::op_counters::tinc(labeled_counters, l_rgw_op_del_bucket_lat, s->time_elapsed()); + auto counters = rgw::op_counters::get(s); + rgw::op_counters::inc(counters, l_rgw_op_del_bucket, 1); + rgw::op_counters::tinc(counters, l_rgw_op_del_bucket_lat, s->time_elapsed()); return; } @@ -4025,14 +4027,14 @@ void RGWPutObj::execute(optional_yield y) off_t fst; off_t lst; - auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); + auto counters = rgw::op_counters::get(s); bool need_calc_md5 = (dlo_manifest == NULL) && (slo_info == NULL); - rgw::op_counters::inc(labeled_counters, l_rgw_op_put, 1); + rgw::op_counters::inc(counters, l_rgw_op_put, 1); // report latency on return auto put_lat = make_scope_guard([&] { - rgw::op_counters::tinc(labeled_counters, l_rgw_op_put_lat, s->time_elapsed()); + rgw::op_counters::tinc(counters, l_rgw_op_put_lat, s->time_elapsed()); }); op_ret = -EINVAL; @@ -4307,7 +4309,7 @@ void RGWPutObj::execute(optional_yield y) s->obj_size = ofs; s->object->set_obj_size(ofs); - rgw::op_counters::inc(labeled_counters, l_rgw_op_put_b, s->obj_size); + rgw::op_counters::inc(counters, l_rgw_op_put_b, s->obj_size); op_ret = do_aws4_auth_completion(); if (op_ret < 0) { @@ -5264,10 +5266,10 @@ void RGWDeleteObj::execute(optional_yield y) op_ret = 0; } - auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); - rgw::op_counters::inc(labeled_counters, l_rgw_op_del_obj, 1); - rgw::op_counters::inc(labeled_counters, l_rgw_op_del_obj_b, obj_size); - rgw::op_counters::tinc(labeled_counters, l_rgw_op_del_obj_lat, s->time_elapsed()); + auto counters = rgw::op_counters::get(s); + 
rgw::op_counters::inc(counters, l_rgw_op_del_obj, 1); + rgw::op_counters::inc(counters, l_rgw_op_del_obj_b, obj_size); + rgw::op_counters::tinc(counters, l_rgw_op_del_obj_lat, s->time_elapsed()); // send request to notification manager int ret = res->publish_commit(this, obj_size, ceph::real_clock::now(), etag, version_id); @@ -5729,10 +5731,10 @@ void RGWCopyObj::execute(optional_yield y) // too late to rollback operation, hence op_ret is not set here } - auto labeled_counters = rgw::op_counters::get({{"Bucket", s->bucket_name}, {"User", s->user->get_id().id}}); - rgw::op_counters::inc(labeled_counters, l_rgw_op_copy_obj, 1); - rgw::op_counters::inc(labeled_counters, l_rgw_op_copy_obj_b, obj_size); - rgw::op_counters::tinc(labeled_counters, l_rgw_op_copy_obj_lat, s->time_elapsed()); + auto counters = rgw::op_counters::get(s); + rgw::op_counters::inc(counters, l_rgw_op_copy_obj, 1); + rgw::op_counters::inc(counters, l_rgw_op_copy_obj_b, obj_size); + rgw::op_counters::tinc(counters, l_rgw_op_copy_obj_lat, s->time_elapsed()); } int RGWGetACLs::verify_permission(optional_yield y) diff --git a/src/rgw/rgw_perf_counters.cc b/src/rgw/rgw_perf_counters.cc index aca56a60946d..f2d245c27d2e 100644 --- a/src/rgw/rgw_perf_counters.cc +++ b/src/rgw/rgw_perf_counters.cc @@ -5,12 +5,14 @@ #include "common/perf_counters.h" #include "common/perf_counters_key.h" #include "common/ceph_context.h" +#include "rgw_sal.h" + +using namespace ceph::perf_counters; +using namespace rgw::op_counters; PerfCounters *perfcounter = NULL; -ceph::perf_counters::PerfCountersCache *perf_counters_cache = NULL; -std::string rgw_op_counters_key = "rgw_op"; -static void add_rgw_frontend_counters(PerfCountersBuilder *pcb) { +void add_rgw_frontend_counters(PerfCountersBuilder *pcb) { // RGW emits comparatively few metrics, so let's be generous // and mark them all USEFUL to get transmission to ceph-mgr by default. 
pcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL); @@ -58,7 +60,7 @@ static void add_rgw_frontend_counters(PerfCountersBuilder *pcb) { pcb->add_u64(l_rgw_lua_current_vms, "lua_current_vms", "Number of Lua VMs currently being executed"); } -static void add_rgw_op_counters(PerfCountersBuilder *lpcb) { +void add_rgw_op_counters(PerfCountersBuilder *lpcb) { // description must match general rgw counters description above lpcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL); @@ -88,23 +90,6 @@ static void add_rgw_op_counters(PerfCountersBuilder *lpcb) { lpcb->add_time_avg(l_rgw_op_list_buckets_lat, "list_buckets_lat", "List buckets latency"); } -std::shared_ptr create_rgw_counters(const std::string& name, CephContext *cct) { - std::string_view key = ceph::perf_counters::key_name(name); - if (rgw_op_counters_key.compare(key) == 0) { - PerfCountersBuilder pcb(cct, name, l_rgw_op_first, l_rgw_op_last); - add_rgw_op_counters(&pcb); - std::shared_ptr new_counters(pcb.create_perf_counters()); - cct->get_perfcounters_collection()->add(new_counters.get()); - return new_counters; - } else { - PerfCountersBuilder pcb(cct, name, l_rgw_first, l_rgw_last); - add_rgw_frontend_counters(&pcb); - std::shared_ptr new_counters(pcb.create_perf_counters()); - cct->get_perfcounters_collection()->add(new_counters.get()); - return new_counters; - } -} - void frontend_counters_init(CephContext *cct) { PerfCountersBuilder pcb(cct, "rgw", l_rgw_first, l_rgw_last); add_rgw_frontend_counters(&pcb); @@ -115,7 +100,20 @@ void frontend_counters_init(CephContext *cct) { namespace rgw::op_counters { +ceph::perf_counters::PerfCountersCache *user_counters_cache = NULL; +ceph::perf_counters::PerfCountersCache *bucket_counters_cache = NULL; PerfCounters *global_op_counters = NULL; +const std::string rgw_op_counters_key = "rgw_op"; + +std::shared_ptr create_rgw_op_counters(const std::string& name, CephContext *cct) { + std::string_view key = ceph::perf_counters::key_name(name); + ceph_assert(rgw_op_counters_key == key); + PerfCountersBuilder pcb(cct, name, l_rgw_op_first, l_rgw_op_last); + add_rgw_op_counters(&pcb); + std::shared_ptr new_counters(pcb.create_perf_counters()); + cct->get_perfcounters_collection()->add(new_counters.get()); + return new_counters; +} void global_op_counters_init(CephContext *cct) { PerfCountersBuilder pcb(cct, rgw_op_counters_key, l_rgw_op_first, l_rgw_op_last); @@ -125,30 +123,67 @@ void global_op_counters_init(CephContext *cct) { global_op_counters = new_counters; } -void inc(std::shared_ptr labeled_counters, int idx, uint64_t v) { - if (labeled_counters) { - PerfCounters *counter = labeled_counters.get(); - counter->inc(idx, v); +CountersContainer get(req_state *s) { + CountersContainer counters; + std::string key; + + if (user_counters_cache && !s->user->get_id().id.empty()) { + if (s->user->get_tenant().empty()) { + key = std::move(ceph::perf_counters::key_create(rgw_op_counters_key, {{"User", s->user->get_id().id}})); + } else { + key = std::move(ceph::perf_counters::key_create(rgw_op_counters_key, {{"User", s->user->get_id().id}, {"Tenant", s->user->get_tenant()}})); + } + counters.user_counters = user_counters_cache->get(key); + } + + if (bucket_counters_cache && !s->bucket_name.empty()) { + if (s->bucket_tenant.empty()) { + key = std::move(ceph::perf_counters::key_create(rgw_op_counters_key, {{"Bucket", s->bucket_name}})); + } else { + key = std::move(ceph::perf_counters::key_create(rgw_op_counters_key, {{"Bucket", s->bucket_name}, {"Tenant", s->bucket_tenant}})); + } + 
counters.bucket_counters = bucket_counters_cache->get(key); + } + + return counters; +} + +void inc(const CountersContainer &counters, int idx, uint64_t v) { + if (counters.user_counters) { + PerfCounters *user_counters = counters.user_counters.get(); + user_counters->inc(idx, v); + } + if (counters.bucket_counters) { + PerfCounters *bucket_counters = counters.bucket_counters.get(); + bucket_counters->inc(idx, v); } if (global_op_counters) { global_op_counters->inc(idx, v); } } -void tinc(std::shared_ptr labeled_counters, int idx, utime_t amt) { - if (labeled_counters) { - PerfCounters *counter = labeled_counters.get(); - counter->tinc(idx, amt); +void tinc(const CountersContainer &counters, int idx, utime_t amt) { + if (counters.user_counters) { + PerfCounters *user_counters = counters.user_counters.get(); + user_counters->tinc(idx, amt); + } + if (counters.bucket_counters) { + PerfCounters *bucket_counters = counters.bucket_counters.get(); + bucket_counters->tinc(idx, amt); } if (global_op_counters) { global_op_counters->tinc(idx, amt); } } -void tinc(std::shared_ptr labeled_counters, int idx, ceph::timespan amt) { - if (labeled_counters) { - PerfCounters *counter = labeled_counters.get(); - counter->tinc(idx, amt); +void tinc(const CountersContainer &counters, int idx, ceph::timespan amt) { + if (counters.user_counters) { + PerfCounters *user_counters = counters.user_counters.get(); + user_counters->tinc(idx, amt); + } + if (counters.bucket_counters) { + PerfCounters *bucket_counters = counters.bucket_counters.get(); + bucket_counters->tinc(idx, amt); } if (global_op_counters) { global_op_counters->tinc(idx, amt); @@ -161,13 +196,19 @@ int rgw_perf_start(CephContext *cct) { frontend_counters_init(cct); - bool cache_enabled = cct->_conf.get_val("rgw_perf_counters_cache"); - if (cache_enabled) { - uint64_t target_size = cct->_conf.get_val("rgw_perf_counters_cache_size"); - perf_counters_cache = new ceph::perf_counters::PerfCountersCache(cct, target_size, create_rgw_counters); + bool user_counters_cache_enabled = cct->_conf.get_val("rgw_user_counters_cache"); + if (user_counters_cache_enabled) { + uint64_t target_size = cct->_conf.get_val("rgw_user_counters_cache_size"); + user_counters_cache = new PerfCountersCache(cct, target_size, create_rgw_op_counters); + } + + bool bucket_counters_cache_enabled = cct->_conf.get_val("rgw_bucket_counters_cache"); + if (bucket_counters_cache_enabled) { + uint64_t target_size = cct->_conf.get_val("rgw_bucket_counters_cache_size"); + bucket_counters_cache = new PerfCountersCache(cct, target_size, create_rgw_op_counters); } - rgw::op_counters::global_op_counters_init(cct); + global_op_counters_init(cct); return 0; } @@ -176,5 +217,9 @@ void rgw_perf_stop(CephContext *cct) ceph_assert(perfcounter); cct->get_perfcounters_collection()->remove(perfcounter); delete perfcounter; - delete perf_counters_cache; + ceph_assert(global_op_counters); + cct->get_perfcounters_collection()->remove(global_op_counters); + delete global_op_counters; + delete user_counters_cache; + delete bucket_counters_cache; } diff --git a/src/rgw/rgw_perf_counters.h b/src/rgw/rgw_perf_counters.h index 49f7e4d4218b..e9068f4c9ff2 100644 --- a/src/rgw/rgw_perf_counters.h +++ b/src/rgw/rgw_perf_counters.h @@ -4,17 +4,13 @@ #pragma once #include "include/common_fwd.h" +#include "rgw_common.h" #include "common/perf_counters_cache.h" #include "common/perf_counters_key.h" extern PerfCounters *perfcounter; -extern ceph::perf_counters::PerfCountersCache *perf_counters_cache; -extern std::string 
rgw_op_counters_key; - extern int rgw_perf_start(CephContext *cct); extern void rgw_perf_stop(CephContext *cct); -extern void frontend_counters_init(CephContext *cct); -extern std::shared_ptr create_rgw_counters(const std::string& name, CephContext *cct); enum { l_rgw_first = 15000, @@ -89,24 +85,17 @@ enum { namespace rgw::op_counters { -extern PerfCounters *global_op_counters; - -void global_op_counters_init(CephContext *cct); +struct CountersContainer { + std::shared_ptr user_counters; + std::shared_ptr bucket_counters; +}; -template -std::shared_ptr get(ceph::perf_counters::label_pair (&&labels)[Count]) { - if (perf_counters_cache) { - std::string key = ceph::perf_counters::key_create(rgw_op_counters_key, std::move(labels)); - return perf_counters_cache->get(key); - } else { - return std::shared_ptr(nullptr); - } -} +CountersContainer get(req_state *s); -void inc(std::shared_ptr labeled_counters, int idx, uint64_t v); +void inc(const CountersContainer &counters, int idx, uint64_t v); -void tinc(std::shared_ptr labeled_counters, int idx, utime_t); +void tinc(const CountersContainer &counters, int idx, utime_t); -void tinc(std::shared_ptr labeled_counters, int idx, ceph::timespan amt); +void tinc(const CountersContainer &counters, int idx, ceph::timespan amt); } // namespace rgw::op_counters diff --git a/src/rgw/vstart.sh.swift b/src/rgw/vstart.sh.swift new file mode 100755 index 000000000000..46e46da0e4a5 --- /dev/null +++ b/src/rgw/vstart.sh.swift @@ -0,0 +1,1930 @@ +#!/usr/bin/env bash +# -*- mode:sh; tab-width:4; sh-basic-offset:4; indent-tabs-mode:nil -*- +# vim: softtabstop=4 shiftwidth=4 expandtab + +# abort on failure +set -e + +quoted_print() { + for s in "$@"; do + if [[ "$s" =~ \ ]]; then + printf -- "'%s' " "$s" + else + printf -- "$s " + fi + done + printf '\n' +} + +debug() { + "$@" >&2 +} + +prunb() { + debug quoted_print "$@" '&' + PATH=$CEPH_BIN:$PATH "$@" & +} + +prun() { + debug quoted_print "$@" + PATH=$CEPH_BIN:$PATH "$@" +} + + +if [ -n "$VSTART_DEST" ]; then + SRC_PATH=`dirname $0` + SRC_PATH=`(cd $SRC_PATH; pwd)` + + CEPH_DIR=$SRC_PATH + CEPH_BIN=${CEPH_BIN:-${PWD}/bin} + CEPH_LIB=${CEPH_LIB:-${PWD}/lib} + + CEPH_CONF_PATH=$VSTART_DEST + CEPH_DEV_DIR=$VSTART_DEST/dev + CEPH_OUT_DIR=$VSTART_DEST/out + CEPH_ASOK_DIR=$VSTART_DEST/asok + CEPH_OUT_CLIENT_DIR=${CEPH_OUT_CLIENT_DIR:-$CEPH_OUT_DIR} +fi + +get_cmake_variable() { + local variable=$1 + grep "${variable}:" CMakeCache.txt | cut -d "=" -f 2 +} + +# for running out of the CMake build directory +if [ -e CMakeCache.txt ]; then + # Out of tree build, learn source location from CMakeCache.txt + CEPH_ROOT=$(get_cmake_variable ceph_SOURCE_DIR) + CEPH_BUILD_DIR=`pwd` + [ -z "$MGR_PYTHON_PATH" ] && MGR_PYTHON_PATH=$CEPH_ROOT/src/pybind/mgr +fi + +# use CEPH_BUILD_ROOT to vstart from a 'make install' +if [ -n "$CEPH_BUILD_ROOT" ]; then + [ -z "$CEPH_BIN" ] && CEPH_BIN=$CEPH_BUILD_ROOT/bin + [ -z "$CEPH_LIB" ] && CEPH_LIB=$CEPH_BUILD_ROOT/lib + [ -z "$CEPH_EXT_LIB" ] && CEPH_EXT_LIB=$CEPH_BUILD_ROOT/external/lib + [ -z "$EC_PATH" ] && EC_PATH=$CEPH_LIB/erasure-code + [ -z "$OBJCLASS_PATH" ] && OBJCLASS_PATH=$CEPH_LIB/rados-classes + # make install should install python extensions into PYTHONPATH +elif [ -n "$CEPH_ROOT" ]; then + [ -z "$CEPHFS_SHELL" ] && CEPHFS_SHELL=$CEPH_ROOT/src/tools/cephfs/shell/cephfs-shell + [ -z "$PYBIND" ] && PYBIND=$CEPH_ROOT/src/pybind + [ -z "$CEPH_BIN" ] && CEPH_BIN=$CEPH_BUILD_DIR/bin + [ -z "$CEPH_ADM" ] && CEPH_ADM=$CEPH_BIN/ceph + [ -z "$INIT_CEPH" ] && INIT_CEPH=$CEPH_BIN/init-ceph + [ 
-z "$CEPH_LIB" ] && CEPH_LIB=$CEPH_BUILD_DIR/lib + [ -z "$CEPH_EXT_LIB" ] && CEPH_EXT_LIB=$CEPH_BUILD_DIR/external/lib + [ -z "$OBJCLASS_PATH" ] && OBJCLASS_PATH=$CEPH_LIB + [ -z "$EC_PATH" ] && EC_PATH=$CEPH_LIB + [ -z "$CEPH_PYTHON_COMMON" ] && CEPH_PYTHON_COMMON=$CEPH_ROOT/src/python-common +fi + +if [ -z "${CEPH_VSTART_WRAPPER}" ]; then + PATH=$(pwd):$PATH +fi + +[ -z "$PYBIND" ] && PYBIND=./pybind + +[ -n "$CEPH_PYTHON_COMMON" ] && CEPH_PYTHON_COMMON="$CEPH_PYTHON_COMMON:" +CYTHON_PYTHONPATH="$CEPH_LIB/cython_modules/lib.3" +export PYTHONPATH=$PYBIND:$CYTHON_PYTHONPATH:$CEPH_PYTHON_COMMON$PYTHONPATH + +export LD_LIBRARY_PATH=$CEPH_LIB:$CEPH_EXT_LIB:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$CEPH_LIB:$CEPH_EXT_LIB:$DYLD_LIBRARY_PATH +# Suppress logging for regular use that indicated that we are using a +# development version. vstart.sh is only used during testing and +# development +export CEPH_DEV=1 + +[ -z "$CEPH_NUM_MON" ] && CEPH_NUM_MON="$MON" +[ -z "$CEPH_NUM_OSD" ] && CEPH_NUM_OSD="$OSD" +[ -z "$CEPH_NUM_MDS" ] && CEPH_NUM_MDS="$MDS" +[ -z "$CEPH_NUM_MGR" ] && CEPH_NUM_MGR="$MGR" +[ -z "$CEPH_NUM_FS" ] && CEPH_NUM_FS="$FS" +[ -z "$CEPH_NUM_RGW" ] && CEPH_NUM_RGW="$RGW" +[ -z "$GANESHA_DAEMON_NUM" ] && GANESHA_DAEMON_NUM="$NFS" + +# if none of the CEPH_NUM_* number is specified, kill the existing +# cluster. +if [ -z "$CEPH_NUM_MON" -a \ + -z "$CEPH_NUM_OSD" -a \ + -z "$CEPH_NUM_MDS" -a \ + -z "$CEPH_NUM_MGR" -a \ + -z "$GANESHA_DAEMON_NUM" ]; then + kill_all=1 +else + kill_all=0 +fi + +[ -z "$CEPH_NUM_MON" ] && CEPH_NUM_MON=3 +[ -z "$CEPH_NUM_OSD" ] && CEPH_NUM_OSD=3 +[ -z "$CEPH_NUM_MDS" ] && CEPH_NUM_MDS=3 +[ -z "$CEPH_NUM_MGR" ] && CEPH_NUM_MGR=1 +[ -z "$CEPH_NUM_FS" ] && CEPH_NUM_FS=1 +[ -z "$CEPH_MAX_MDS" ] && CEPH_MAX_MDS=1 +[ -z "$CEPH_NUM_RGW" ] && CEPH_NUM_RGW=0 +[ -z "$GANESHA_DAEMON_NUM" ] && GANESHA_DAEMON_NUM=0 + +[ -z "$CEPH_DIR" ] && CEPH_DIR="$PWD" +[ -z "$CEPH_DEV_DIR" ] && CEPH_DEV_DIR="$CEPH_DIR/dev" +[ -z "$CEPH_OUT_DIR" ] && CEPH_OUT_DIR="$CEPH_DIR/out" +[ -z "$CEPH_ASOK_DIR" ] && CEPH_ASOK_DIR="$CEPH_DIR/asok" +[ -z "$CEPH_RGW_PORT" ] && CEPH_RGW_PORT=8000 +[ -z "$CEPH_CONF_PATH" ] && CEPH_CONF_PATH=$CEPH_DIR +CEPH_OUT_CLIENT_DIR=${CEPH_OUT_CLIENT_DIR:-$CEPH_OUT_DIR} + +if [ $CEPH_NUM_OSD -gt 3 ]; then + OSD_POOL_DEFAULT_SIZE=3 +else + OSD_POOL_DEFAULT_SIZE=$CEPH_NUM_OSD +fi + +extra_conf="" +new=0 +standby=0 +debug=0 +trace=0 +ip="" +nodaemon=0 +redirect=0 +smallmds=0 +short=0 +crimson=0 +ec=0 +cephadm=0 +parallel=true +restart=1 +hitset="" +overwrite_conf=0 +cephx=1 #turn cephx on by default +gssapi_authx=0 +cache="" +if [ `uname` = FreeBSD ]; then + objectstore="memstore" +else + objectstore="bluestore" +fi +ceph_osd=ceph-osd +rgw_frontend="beast prefix=/swift" +rgw_compression="" +lockdep=${LOCKDEP:-1} +spdk_enabled=0 # disable SPDK by default +pmem_enabled=0 +zoned_enabled=0 +io_uring_enabled=0 +with_jaeger=0 + +with_mgr_dashboard=true +if [[ "$(get_cmake_variable WITH_MGR_DASHBOARD_FRONTEND)" != "ON" ]] || + [[ "$(get_cmake_variable WITH_RBD)" != "ON" ]]; then + debug echo "ceph-mgr dashboard not built - disabling." 
+ with_mgr_dashboard=false +fi +with_mgr_restful=false + +kstore_path= +declare -a block_devs +declare -a bluestore_db_devs +declare -a bluestore_wal_devs +declare -a secondary_block_devs +secondary_block_devs_type="SSD" + +VSTART_SEC="client.vstart.sh" + +MON_ADDR="" +DASH_URLS="" +RESTFUL_URLS="" + +conf_fn="$CEPH_CONF_PATH/ceph.conf" +keyring_fn="$CEPH_CONF_PATH/keyring" +monmap_fn="/tmp/ceph_monmap.$$" +inc_osd_num=0 + +msgr="21" + +read -r -d '' usage <: bind to specific ip + -n, --new + --valgrind[_{osd,mds,mon,rgw}] 'toolname args...' + --nodaemon: use ceph-run as wrapper for mon/osd/mds + --redirect-output: only useful with nodaemon, directs output to log file + --smallmds: limit mds cache memory limit + -m ip:port specify monitor address + -k keep old configuration files (default) + -x enable cephx (on by default) + -X disable cephx + -g --gssapi enable Kerberos/GSSApi authentication + -G disable Kerberos/GSSApi authentication + --hitset : enable hitset tracking + -e : create an erasure pool + -o config add extra config parameters to all sections + --rgw_port specify ceph rgw http listen port + --rgw_frontend specify the rgw frontend configuration + --rgw_arrow_flight start arrow flight frontend + --rgw_compression specify the rgw compression plugin + --seastore use seastore as crimson osd backend + -b, --bluestore use bluestore as the osd objectstore backend (default) + -K, --kstore use kstore as the osd objectstore backend + --cyanstore use cyanstore as the osd objectstore backend + --memstore use memstore as the osd objectstore backend + --cache : enable cache tiering on pool + --short: short object names only; necessary for ext4 dev + --nolockdep disable lockdep + --multimds allow multimds with maximum active count + --without-dashboard: do not run using mgr dashboard + --bluestore-spdk: enable SPDK and with a comma-delimited list of PCI-IDs of NVME device (e.g, 0000:81:00.0) + --bluestore-pmem: enable PMEM and with path to a file mapped to PMEM + --msgr1: use msgr1 only + --msgr2: use msgr2 only + --msgr21: use msgr2 and msgr1 + --crimson: use crimson-osd instead of ceph-osd + --crimson-foreground: use crimson-osd, but run it in the foreground + --osd-args: specify any extra osd specific options + --bluestore-devs: comma-separated list of blockdevs to use for bluestore + --bluestore-db-devs: comma-separated list of db-devs to use for bluestore + --bluestore-wal-devs: comma-separated list of wal-devs to use for bluestore + --bluestore-zoned: blockdevs listed by --bluestore-devs are zoned devices (HM-SMR HDD or ZNS SSD) + --bluestore-io-uring: enable io_uring backend + --inc-osd: append some more osds into existing vcluster + --cephadm: enable cephadm orchestrator with ~/.ssh/id_rsa[.pub] + --no-parallel: dont start all OSDs in parallel + --no-restart: dont restart process when using ceph-run + --jaeger: use jaegertracing for tracing + --seastore-devs: comma-separated list of blockdevs to use for seastore + --seastore-secondary-devs: comma-separated list of secondary blockdevs to use for seastore + --seastore-secondary-devs-type: device type of all secondary blockdevs. HDD, SSD(default), ZNS or RANDOM_BLOCK_SSD + --crimson-smp: number of cores to use for crimson +\n +EOF + +usage_exit() { + printf "$usage" + exit +} + +parse_block_devs() { + local opt_name=$1 + shift + local devs=$1 + shift + local dev + IFS=',' read -r -a block_devs <<< "$devs" + for dev in "${block_devs[@]}"; do + if [ ! -b $dev ] || [ ! 
-w $dev ]; then + echo "All $opt_name must refer to writable block devices" + exit 1 + fi + done +} + +parse_bluestore_db_devs() { + local opt_name=$1 + shift + local devs=$1 + shift + local dev + IFS=',' read -r -a bluestore_db_devs <<< "$devs" + for dev in "${bluestore_db_devs[@]}"; do + if [ ! -b $dev ] || [ ! -w $dev ]; then + echo "All $opt_name must refer to writable block devices" + exit 1 + fi + done +} + +parse_bluestore_wal_devs() { + local opt_name=$1 + shift + local devs=$1 + shift + local dev + IFS=',' read -r -a bluestore_wal_devs <<< "$devs" + for dev in "${bluestore_wal_devs[@]}"; do + if [ ! -b $dev ] || [ ! -w $dev ]; then + echo "All $opt_name must refer to writable block devices" + exit 1 + fi + done +} + +parse_secondary_devs() { + local opt_name=$1 + shift + local devs=$1 + shift + local dev + IFS=',' read -r -a secondary_block_devs <<< "$devs" + for dev in "${secondary_block_devs[@]}"; do + if [ ! -b $dev ] || [ ! -w $dev ]; then + echo "All $opt_name must refer to writable block devices" + exit 1 + fi + done +} + +crimson_smp=1 +while [ $# -ge 1 ]; do +case $1 in + -d | --debug) + debug=1 + ;; + -t | --trace) + trace=1 + ;; + -s | --standby_mds) + standby=1 + ;; + -l | --localhost) + ip="127.0.0.1" + ;; + -i) + [ -z "$2" ] && usage_exit + ip="$2" + shift + ;; + -e) + ec=1 + ;; + --new | -n) + new=1 + ;; + --inc-osd) + new=0 + kill_all=0 + inc_osd_num=$2 + if [ "$inc_osd_num" == "" ]; then + inc_osd_num=1 + else + shift + fi + ;; + --short) + short=1 + ;; + --crimson) + crimson=1 + ceph_osd=crimson-osd + nodaemon=1 + msgr=2 + ;; + --crimson-foreground) + crimson=1 + ceph_osd=crimson-osd + nodaemon=0 + msgr=2 + ;; + --osd-args) + extra_osd_args="$2" + shift + ;; + --msgr1) + msgr="1" + ;; + --msgr2) + msgr="2" + ;; + --msgr21) + msgr="21" + ;; + --cephadm) + cephadm=1 + ;; + --no-parallel) + parallel=false + ;; + --no-restart) + restart=0 + ;; + --valgrind) + [ -z "$2" ] && usage_exit + valgrind=$2 + shift + ;; + --valgrind_args) + valgrind_args="$2" + shift + ;; + --valgrind_mds) + [ -z "$2" ] && usage_exit + valgrind_mds=$2 + shift + ;; + --valgrind_osd) + [ -z "$2" ] && usage_exit + valgrind_osd=$2 + shift + ;; + --valgrind_mon) + [ -z "$2" ] && usage_exit + valgrind_mon=$2 + shift + ;; + --valgrind_mgr) + [ -z "$2" ] && usage_exit + valgrind_mgr=$2 + shift + ;; + --valgrind_rgw) + [ -z "$2" ] && usage_exit + valgrind_rgw=$2 + shift + ;; + --nodaemon) + nodaemon=1 + ;; + --redirect-output) + redirect=1 + ;; + --smallmds) + smallmds=1 + ;; + --rgw_port) + CEPH_RGW_PORT=$2 + shift + ;; + --rgw_frontend) + rgw_frontend=$2 + shift + ;; + --rgw_arrow_flight) + rgw_flight_frontend="yes" + ;; + --rgw_compression) + rgw_compression=$2 + shift + ;; + --kstore_path) + kstore_path=$2 + shift + ;; + -m) + [ -z "$2" ] && usage_exit + MON_ADDR=$2 + shift + ;; + -x) + cephx=1 # this is on be default, flag exists for historical consistency + ;; + -X) + cephx=0 + ;; + + -g | --gssapi) + gssapi_authx=1 + ;; + -G) + gssapi_authx=0 + ;; + + -k) + if [ ! -r $conf_fn ]; then + echo "cannot use old configuration: $conf_fn not readable." 
>&2 + exit + fi + new=0 + ;; + --memstore) + objectstore="memstore" + ;; + --cyanstore) + objectstore="cyanstore" + ;; + --seastore) + objectstore="seastore" + ;; + -b | --bluestore) + objectstore="bluestore" + ;; + -K | --kstore) + objectstore="kstore" + ;; + --hitset) + hitset="$hitset $2 $3" + shift + shift + ;; + -o) + extra_conf+=$'\n'"$2" + shift + ;; + --cache) + if [ -z "$cache" ]; then + cache="$2" + else + cache="$cache $2" + fi + shift + ;; + --nolockdep) + lockdep=0 + ;; + --multimds) + CEPH_MAX_MDS="$2" + shift + ;; + --without-dashboard) + with_mgr_dashboard=false + ;; + --with-restful) + with_mgr_restful=true + ;; + --seastore-devs) + parse_block_devs --seastore-devs "$2" + shift + ;; + --seastore-secondary-devs) + parse_secondary_devs --seastore-devs "$2" + shift + ;; + --seastore-secondary-devs-type) + secondary_block_devs_type="$2" + shift + ;; + --crimson-smp) + crimson_smp=$2 + shift + ;; + --bluestore-spdk) + [ -z "$2" ] && usage_exit + IFS=',' read -r -a bluestore_spdk_dev <<< "$2" + spdk_enabled=1 + shift + ;; + --bluestore-pmem) + [ -z "$2" ] && usage_exit + bluestore_pmem_file="$2" + pmem_enabled=1 + shift + ;; + --bluestore-devs) + parse_block_devs --bluestore-devs "$2" + shift + ;; + --bluestore-db-devs) + parse_bluestore_db_devs --bluestore-db-devs "$2" + shift + ;; + --bluestore-wal-devs) + parse_bluestore_wal_devs --bluestore-wal-devs "$2" + shift + ;; + --bluestore-zoned) + zoned_enabled=1 + ;; + --bluestore-io-uring) + io_uring_enabled=1 + shift + ;; + --jaeger) + with_jaeger=1 + echo "with_jaeger $with_jaeger" + ;; + *) + usage_exit +esac +shift +done + +if [ $kill_all -eq 1 ]; then + $SUDO $INIT_CEPH stop +fi + +if [ "$new" -eq 0 ]; then + if [ -z "$CEPH_ASOK_DIR" ]; then + CEPH_ASOK_DIR=`dirname $($CEPH_BIN/ceph-conf -c $conf_fn --show-config-value admin_socket)` + fi + mkdir -p $CEPH_ASOK_DIR + MON=`$CEPH_BIN/ceph-conf -c $conf_fn --name $VSTART_SEC --lookup num_mon 2>/dev/null` && \ + CEPH_NUM_MON="$MON" + OSD=`$CEPH_BIN/ceph-conf -c $conf_fn --name $VSTART_SEC --lookup num_osd 2>/dev/null` && \ + CEPH_NUM_OSD="$OSD" + MDS=`$CEPH_BIN/ceph-conf -c $conf_fn --name $VSTART_SEC --lookup num_mds 2>/dev/null` && \ + CEPH_NUM_MDS="$MDS" + MGR=`$CEPH_BIN/ceph-conf -c $conf_fn --name $VSTART_SEC --lookup num_mgr 2>/dev/null` && \ + CEPH_NUM_MGR="$MGR" + RGW=`$CEPH_BIN/ceph-conf -c $conf_fn --name $VSTART_SEC --lookup num_rgw 2>/dev/null` && \ + CEPH_NUM_RGW="$RGW" + NFS=`$CEPH_BIN/ceph-conf -c $conf_fn --name $VSTART_SEC --lookup num_ganesha 2>/dev/null` && \ + GANESHA_DAEMON_NUM="$NFS" +else + # only delete if -n + if [ -e "$conf_fn" ]; then + asok_dir=`dirname $($CEPH_BIN/ceph-conf -c $conf_fn --show-config-value admin_socket)` + rm -- "$conf_fn" + if [ $asok_dir != /var/run/ceph ]; then + [ -d $asok_dir ] && rm -f $asok_dir/* && rmdir $asok_dir + fi + fi + if [ -z "$CEPH_ASOK_DIR" ]; then + CEPH_ASOK_DIR=`mktemp -u -d "${TMPDIR:-/tmp}/ceph-asok.XXXXXX"` + fi +fi + +ARGS="-c $conf_fn" + +run() { + type=$1 + shift + num=$1 + shift + eval "valg=\$valgrind_$type" + [ -z "$valg" ] && valg="$valgrind" + + if [ -n "$valg" ]; then + prunb valgrind --tool="$valg" $valgrind_args "$@" -f + sleep 1 + else + if [ "$nodaemon" -eq 0 ]; then + prun "$@" + else + if [ "$restart" -eq 0 ]; then + set -- '--no-restart' "$@" + fi + if [ "$redirect" -eq 0 ]; then + prunb ${CEPH_ROOT}/src/ceph-run "$@" -f + else + ( prunb ${CEPH_ROOT}/src/ceph-run "$@" -f ) >$CEPH_OUT_DIR/$type.$num.stdout 2>&1 + fi + fi + fi +} + +wconf() { + if [ "$new" -eq 1 -o "$overwrite_conf" -eq 1 ]; then + 
cat >> "$conf_fn" + fi +} + + +do_rgw_conf() { + + if [ $CEPH_NUM_RGW -eq 0 ]; then + return 0 + fi + + # setup each rgw on a sequential port, starting at $CEPH_RGW_PORT. + # individual rgw's ids will be their ports. + current_port=$CEPH_RGW_PORT + # allow only first rgw to start arrow_flight server/port + local flight_conf=$rgw_flight_frontend + for n in $(seq 1 $CEPH_NUM_RGW); do + wconf << EOF +[client.rgw.${current_port}] + rgw frontends = $rgw_frontend port=${current_port}${flight_conf:+,arrow_flight} + admin socket = ${CEPH_OUT_DIR}/radosgw.${current_port}.asok + debug rgw_flight = 20 + rgw keystone accepted admin roles = admin + rgw keystone accepted roles = admin,Member + rgw keystone admin domain = Default + rgw keystone admin password = ADMIN + rgw keystone admin project = admin + rgw keystone admin user = admin + rgw keystone api version = 3 + rgw keystone implicit tenants = true + rgw swift account in url = true + rgw swift enforce content length = true + rgw swift versioning enabled = true +EOF + current_port=$((current_port + 1)) + unset flight_conf +done + +} + +format_conf() { + local opts=$1 + local indent=" " + local opt + local formatted + while read -r opt; do + if [ -z "$formatted" ]; then + formatted="${opt}" + else + formatted+=$'\n'${indent}${opt} + fi + done <<< "$opts" + echo "$formatted" +} + +prepare_conf() { + local DAEMONOPTS=" + log file = $CEPH_OUT_DIR/\$name.log + admin socket = $CEPH_ASOK_DIR/\$name.asok + chdir = \"\" + pid file = $CEPH_OUT_DIR/\$name.pid + heartbeat file = $CEPH_OUT_DIR/\$name.heartbeat +" + + local mgr_modules="iostat nfs" + if $with_mgr_dashboard; then + mgr_modules+=" dashboard" + fi + if $with_mgr_restful; then + mgr_modules+=" restful" + fi + + local msgr_conf='' + if [ $msgr -eq 21 ]; then + msgr_conf="ms bind msgr2 = true + ms bind msgr1 = true" + fi + if [ $msgr -eq 2 ]; then + msgr_conf="ms bind msgr2 = true + ms bind msgr1 = false" + fi + if [ $msgr -eq 1 ]; then + msgr_conf="ms bind msgr2 = false + ms bind msgr1 = true" + fi + + wconf < $logrotate_conf_path + fi +} + +start_mon() { + local MONS="" + local count=0 + for f in a b c d e f g h i j k l m n o p q r s t u v w x y z + do + [ $count -eq $CEPH_NUM_MON ] && break; + count=$(($count + 1)) + if [ -z "$MONS" ]; then + MONS="$f" + else + MONS="$MONS $f" + fi + done + + if [ "$new" -eq 1 ]; then + if [ `echo $IP | grep '^127\\.'` ]; then + echo + echo "NOTE: hostname resolves to loopback; remote hosts will not be able to" + echo " connect. either adjust /etc/hosts, or edit this script to use your" + echo " machine's real IP." + echo + fi + + prun $SUDO "$CEPH_BIN/ceph-authtool" --create-keyring --gen-key --name=mon. 
"$keyring_fn" --cap mon 'allow *' + prun $SUDO "$CEPH_BIN/ceph-authtool" --gen-key --name=client.admin \ + --cap mon 'allow *' \ + --cap osd 'allow *' \ + --cap mds 'allow *' \ + --cap mgr 'allow *' \ + "$keyring_fn" + + # build a fresh fs monmap, mon fs + local params=() + local count=0 + local mon_host="" + for f in $MONS + do + if [ $msgr -eq 1 ]; then + A="v1:$IP:$(($CEPH_PORT+$count+1))" + fi + if [ $msgr -eq 2 ]; then + A="v2:$IP:$(($CEPH_PORT+$count+1))" + fi + if [ $msgr -eq 21 ]; then + A="[v2:$IP:$(($CEPH_PORT+$count)),v1:$IP:$(($CEPH_PORT+$count+1))]" + fi + params+=("--addv" "$f" "$A") + mon_host="$mon_host $A" + wconf < /dev/null; then + for f in $CEPH_DEV_DIR/osd$osd/*; do btrfs sub delete $f &> /dev/null || true; done + fi + if [ -n "$kstore_path" ]; then + ln -s $kstore_path $CEPH_DEV_DIR/osd$osd + else + mkdir -p $CEPH_DEV_DIR/osd$osd + if [ -n "${block_devs[$osd]}" ]; then + dd if=/dev/zero of=${block_devs[$osd]} bs=1M count=1 + ln -s ${block_devs[$osd]} $CEPH_DEV_DIR/osd$osd/block + fi + if [ -n "${bluestore_db_devs[$osd]}" ]; then + dd if=/dev/zero of=${bluestore_db_devs[$osd]} bs=1M count=1 + ln -s ${bluestore_db_devs[$osd]} $CEPH_DEV_DIR/osd$osd/block.db + fi + if [ -n "${bluestore_wal_devs[$osd]}" ]; then + dd if=/dev/zero of=${bluestore_wal_devs[$osd]} bs=1M count=1 + ln -s ${bluestore_wal_devs[$osd]} $CEPH_DEV_DIR/osd$osd/block.wal + fi + if [ -n "${secondary_block_devs[$osd]}" ]; then + dd if=/dev/zero of=${secondary_block_devs[$osd]} bs=1M count=1 + mkdir -p $CEPH_DEV_DIR/osd$osd/block.${secondary_block_devs_type}.1 + ln -s ${secondary_block_devs[$osd]} $CEPH_DEV_DIR/osd$osd/block.${secondary_block_devs_type}.1/block + fi + fi + if [ "$objectstore" == "bluestore" ]; then + wconf < $CEPH_DEV_DIR/osd$osd/new.json + ceph_adm osd new $uuid -i $CEPH_DEV_DIR/osd$osd/new.json + rm $CEPH_DEV_DIR/osd$osd/new.json + prun $SUDO $CEPH_BIN/$ceph_osd $extra_osd_args -i $osd $ARGS --mkfs --key $OSD_SECRET --osd-uuid $uuid $extra_seastar_args \ + 2>&1 | tee $CEPH_OUT_DIR/osd-mkfs.$osd.log + + local key_fn=$CEPH_DEV_DIR/osd$osd/keyring + cat > $key_fn< /dev/null; then + secret_file=`mktemp` + ceph_adm restful create-key admin -o $secret_file + RESTFUL_SECRET=`cat $secret_file` + rm $secret_file + else + debug echo MGR Restful is not working, perhaps the package is not installed? + fi +} + +start_mgr() { + local mgr=0 + local ssl=${DASHBOARD_SSL:-1} + # avoid monitors on nearby ports (which test/*.sh use extensively) + MGR_PORT=$(($CEPH_PORT + 1000)) + PROMETHEUS_PORT=9283 + for name in x y z a b c d e f g h i j k l m n o p + do + [ $mgr -eq $CEPH_NUM_MGR ] && break + mgr=$(($mgr + 1)) + if [ "$new" -eq 1 ]; then + mkdir -p $CEPH_DEV_DIR/mgr.$name + key_fn=$CEPH_DEV_DIR/mgr.$name/keyring + $SUDO $CEPH_BIN/ceph-authtool --create-keyring --gen-key --name=mgr.$name $key_fn + ceph_adm -i $key_fn auth add mgr.$name mon 'allow profile mgr' mds 'allow *' osd 'allow *' + + wconf < "${DASHBOARD_ADMIN_SECRET_FILE}" + ceph_adm dashboard ac-user-create admin -i "${DASHBOARD_ADMIN_SECRET_FILE}" \ + administrator --force-password + if [ "$ssl" != "0" ]; then + if ! ceph_adm dashboard create-self-signed-cert; then + debug echo dashboard module not working correctly! 
+ fi + fi + fi + if $with_mgr_restful; then + create_mgr_restful_secret + fi + fi + + if [ "$cephadm" -eq 1 ]; then + debug echo Enabling cephadm orchestrator + if [ "$new" -eq 1 ]; then + digest=$(curl -s \ + https://hub.docker.com/v2/repositories/ceph/daemon-base/tags/latest-master-devel \ + | jq -r '.images[0].digest') + ceph_adm config set global container_image "docker.io/ceph/daemon-base@$digest" + fi + ceph_adm config-key set mgr/cephadm/ssh_identity_key -i ~/.ssh/id_rsa + ceph_adm config-key set mgr/cephadm/ssh_identity_pub -i ~/.ssh/id_rsa.pub + ceph_adm mgr module enable cephadm + ceph_adm orch set backend cephadm + ceph_adm orch host add "$(hostname)" + ceph_adm orch apply crash '*' + ceph_adm config set mgr mgr/cephadm/allow_ptrace true + fi +} + +start_mds() { + local mds=0 + for name in a b c d e f g h i j k l m n o p + do + [ $mds -eq $CEPH_NUM_MDS ] && break + mds=$(($mds + 1)) + + if [ "$new" -eq 1 ]; then + prun mkdir -p "$CEPH_DEV_DIR/mds.$name" + key_fn=$CEPH_DEV_DIR/mds.$name/keyring + wconf < m #--debug_ms 20 + #$CEPH_BIN/ceph-mds -d $ARGS --mds_thrash_fragments 0 --mds_thrash_exports 0 #--debug_ms 20 + #ceph_adm mds set max_mds 2 + done + + if [ $new -eq 1 ]; then + if [ "$CEPH_NUM_FS" -gt "0" ] ; then + sleep 5 # time for MDS to come up as standby to avoid health warnings on fs creation + if [ "$CEPH_NUM_FS" -gt "1" ] ; then + ceph_adm fs flag set enable_multiple true --yes-i-really-mean-it + fi + + # wait for volume module to load + while ! ceph_adm fs volume ls ; do sleep 1 ; done + local fs=0 + for name in a b c d e f g h i j k l m n o p + do + ceph_adm fs volume create ${name} + ceph_adm fs authorize ${name} "client.fs_${name}" / rwp >> "$keyring_fn" + fs=$(($fs + 1)) + [ $fs -eq $CEPH_NUM_FS ] && break + done + fi + fi + +} + +# Ganesha Daemons requires nfs-ganesha nfs-ganesha-ceph nfs-ganesha-rados-grace +# nfs-ganesha-rados-urls (version 3.3 and above) packages installed. On +# Fedora>=31 these packages can be installed directly with 'dnf'. 
For CentOS>=8 +# the packages are available at +# https://wiki.centos.org/SpecialInterestGroup/Storage +# Similarly for Ubuntu>=16.04 follow the instructions on +# https://launchpad.net/~nfs-ganesha + +start_ganesha() { + cluster_id="vstart" + GANESHA_PORT=$(($CEPH_PORT + 4000)) + local ganesha=0 + test_user="$cluster_id" + pool_name=".nfs" + namespace=$cluster_id + url="rados://$pool_name/$namespace/conf-nfs.$test_user" + + prun ceph_adm auth get-or-create client.$test_user \ + mon "allow r" \ + osd "allow rw pool=$pool_name namespace=$namespace, allow rw tag cephfs data=a" \ + mds "allow rw path=/" \ + >> "$keyring_fn" + + ceph_adm mgr module enable test_orchestrator + ceph_adm orch set backend test_orchestrator + ceph_adm test_orchestrator load_data -i $CEPH_ROOT/src/pybind/mgr/test_orchestrator/dummy_data.json + prun ceph_adm nfs cluster create $cluster_id + prun ceph_adm nfs export create cephfs --fsname "a" --cluster-id $cluster_id --pseudo-path "/cephfs" + + for name in a b c d e f g h i j k l m n o p + do + [ $ganesha -eq $GANESHA_DAEMON_NUM ] && break + + port=$(($GANESHA_PORT + ganesha)) + ganesha=$(($ganesha + 1)) + ganesha_dir="$CEPH_DEV_DIR/ganesha.$name" + prun rm -rf $ganesha_dir + prun mkdir -p $ganesha_dir + + echo "NFS_CORE_PARAM { + Enable_NLM = false; + Enable_RQUOTA = false; + Protocols = 4; + NFS_Port = $port; + } + + MDCACHE { + Dir_Chunk = 0; + } + + NFSv4 { + RecoveryBackend = rados_cluster; + Minor_Versions = 1, 2; + } + + RADOS_KV { + pool = '$pool_name'; + namespace = $namespace; + UserId = $test_user; + nodeid = $name; + } + + RADOS_URLS { + Userid = $test_user; + watch_url = '$url'; + } + + %url $url" > "$ganesha_dir/ganesha-$name.conf" + wconf <.+:${CEPH_PORT}\s+" 1>/dev/null 2>&1 || break + done +fi + +[ -z "$INIT_CEPH" ] && INIT_CEPH=$CEPH_BIN/init-ceph + +# sudo if btrfs +[ -d $CEPH_DEV_DIR/osd0/. ] && [ -e $CEPH_DEV_DIR/sudo ] && SUDO="sudo" + +if [ $inc_osd_num -eq 0 ]; then + prun $SUDO rm -f core* +fi + +[ -d $CEPH_ASOK_DIR ] || mkdir -p $CEPH_ASOK_DIR +[ -d $CEPH_OUT_DIR ] || mkdir -p $CEPH_OUT_DIR +[ -d $CEPH_DEV_DIR ] || mkdir -p $CEPH_DEV_DIR +[ -d $CEPH_OUT_CLIENT_DIR ] || mkdir -p $CEPH_OUT_CLIENT_DIR +if [ $inc_osd_num -eq 0 ]; then + $SUDO find "$CEPH_OUT_DIR" -type f -delete +fi +[ -d gmon ] && $SUDO rm -rf gmon/* + +[ "$cephx" -eq 1 ] && [ "$new" -eq 1 ] && [ -e $keyring_fn ] && rm $keyring_fn + + +# figure machine's ip +HOSTNAME=`hostname -s` +if [ -n "$ip" ]; then + IP="$ip" +else + echo hostname $HOSTNAME + if [ -x "$(which ip 2>/dev/null)" ]; then + IP_CMD="ip addr" + else + IP_CMD="ifconfig" + fi + # filter out IPv4 and localhost addresses + IP="$($IP_CMD | sed -En 's/127.0.0.1//;s/.*inet (addr:)?(([0-9]*\.){3}[0-9]*).*/\2/p' | head -n1)" + # if nothing left, try using localhost address, it might work + if [ -z "$IP" ]; then IP="127.0.0.1"; fi +fi +echo "ip $IP" +echo "port $CEPH_PORT" + + +[ -z $CEPH_ADM ] && CEPH_ADM=$CEPH_BIN/ceph + +ceph_adm() { + if [ "$cephx" -eq 1 ]; then + prun $SUDO "$CEPH_ADM" -c "$conf_fn" -k "$keyring_fn" "$@" + else + prun $SUDO "$CEPH_ADM" -c "$conf_fn" "$@" + fi +} + +if [ $inc_osd_num -gt 0 ]; then + start_osd + exit +fi + +if [ "$new" -eq 1 ]; then + prepare_conf +fi + +if [ $CEPH_NUM_MON -gt 0 ]; then + start_mon + + debug echo Populating config ... + cat <> "$keyring_fn" +fi + +# Don't set max_mds until all the daemons are started, otherwise +# the intended standbys might end up in active roles. 
+if [ "$CEPH_MAX_MDS" -gt 1 ]; then + sleep 5 # wait for daemons to make it into FSMap before increasing max_mds +fi +fs=0 +for name in a b c d e f g h i j k l m n o p +do + [ $fs -eq $CEPH_NUM_FS ] && break + fs=$(($fs + 1)) + if [ "$CEPH_MAX_MDS" -gt 1 ]; then + ceph_adm fs set "${name}" max_mds "$CEPH_MAX_MDS" + fi +done + +# mgr + +if [ "$ec" -eq 1 ]; then + ceph_adm < "$CEPH_OUT_DIR/$rgw_python_file" + prun python $CEPH_OUT_DIR/$rgw_python_file +} + +do_rgw_create_users() +{ + # Create S3 user + s3_akey='0555b35654ad1656d804' + s3_skey='h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==' + debug echo "setting up user testid" + $CEPH_BIN/radosgw-admin user create --uid testid --access-key $s3_akey --secret $s3_skey --display-name 'M. Tester' --email tester@ceph.com -c $conf_fn > /dev/null + + # Create S3-test users + # See: https://github.com/ceph/s3-tests + debug echo "setting up s3-test users" + $CEPH_BIN/radosgw-admin user create \ + --uid 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef \ + --access-key ABCDEFGHIJKLMNOPQRST \ + --secret abcdefghijklmnopqrstuvwxyzabcdefghijklmn \ + --display-name youruseridhere \ + --email s3@example.com --caps="user-policy=*" -c $conf_fn > /dev/null + $CEPH_BIN/radosgw-admin user create \ + --uid 56789abcdef0123456789abcdef0123456789abcdef0123456789abcdef01234 \ + --access-key NOPQRSTUVWXYZABCDEFG \ + --secret nopqrstuvwxyzabcdefghijklmnabcdefghijklm \ + --display-name john.doe \ + --email john.doe@example.com -c $conf_fn > /dev/null + $CEPH_BIN/radosgw-admin user create \ + --tenant testx \ + --uid 9876543210abcdef0123456789abcdef0123456789abcdef0123456789abcdef \ + --access-key HIJKLMNOPQRSTUVWXYZA \ + --secret opqrstuvwxyzabcdefghijklmnopqrstuvwxyzab \ + --display-name tenanteduser \ + --email tenanteduser@example.com -c $conf_fn > /dev/null + + # Create Swift user + debug echo "setting up user tester" + $CEPH_BIN/radosgw-admin user create -c $conf_fn --subuser=test:tester --display-name=Tester-Subuser --key-type=swift --secret=testing --access=full > /dev/null + + echo "" + echo "S3 User Info:" + echo " access key: $s3_akey" + echo " secret key: $s3_skey" + echo "" + echo "Swift User Info:" + echo " account : test" + echo " user : tester" + echo " password : testing" + echo "" +} + +do_rgw() +{ + if [ "$new" -eq 1 ]; then + do_rgw_create_users + if [ -n "$rgw_compression" ]; then + debug echo "setting compression type=$rgw_compression" + $CEPH_BIN/radosgw-admin zone placement modify -c $conf_fn --rgw-zone=default --placement-id=default-placement --compression=$rgw_compression > /dev/null + fi + fi + + if [ -n "$rgw_flight_frontend" ] ;then + debug echo "starting arrow_flight frontend on first rgw" + fi + + # Start server + if [ "$cephadm" -gt 0 ]; then + ceph_adm orch apply rgw rgwTest + return + fi + + RGWDEBUG="" + if [ "$debug" -ne 0 ]; then + RGWDEBUG="--debug-rgw=20 --debug-ms=1" + fi + + local CEPH_RGW_PORT_NUM="${CEPH_RGW_PORT}" + local CEPH_RGW_HTTPS="${CEPH_RGW_PORT: -1}" + if [[ "${CEPH_RGW_HTTPS}" = "s" ]]; then + CEPH_RGW_PORT_NUM="${CEPH_RGW_PORT::-1}" + else + CEPH_RGW_HTTPS="" + fi + RGWSUDO= + [ $CEPH_RGW_PORT_NUM -lt 1024 ] && RGWSUDO=sudo + + current_port=$CEPH_RGW_PORT + # allow only first rgw to start arrow_flight server/port + local flight_conf=$rgw_flight_frontend + for n in $(seq 1 $CEPH_NUM_RGW); do + rgw_name="client.rgw.${current_port}" + + ceph_adm auth get-or-create $rgw_name \ + mon 'allow rw' \ + osd 'allow rwx' \ + mgr 'allow rw' \ + >> "$keyring_fn" + + debug echo start rgw on 
http${CEPH_RGW_HTTPS}://localhost:${current_port} + run 'rgw' $current_port $RGWSUDO $CEPH_BIN/radosgw -c $conf_fn \ + --log-file=${CEPH_OUT_DIR}/radosgw.${current_port}.log \ + --admin-socket=${CEPH_OUT_DIR}/radosgw.${current_port}.asok \ + --pid-file=${CEPH_OUT_DIR}/radosgw.${current_port}.pid \ + --rgw_luarocks_location=${CEPH_OUT_DIR}/luarocks \ + --rgw_keystone_url=http://localhost:5000 \ + ${RGWDEBUG} \ + -n ${rgw_name} \ + "--rgw_frontends=${rgw_frontend} port=${current_port}${CEPH_RGW_HTTPS}${flight_conf:+,arrow_flight}" + + i=$(($i + 1)) + [ $i -eq $CEPH_NUM_RGW ] && break + + current_port=$((current_port+1)) + unset flight_conf + done +} +if [ "$CEPH_NUM_RGW" -gt 0 ]; then + do_rgw +fi + +# Ganesha Daemons +if [ $GANESHA_DAEMON_NUM -gt 0 ]; then + pseudo_path="/cephfs" + if [ "$cephadm" -gt 0 ]; then + cluster_id="vstart" + port="2049" + prun ceph_adm nfs cluster create $cluster_id + if [ $CEPH_NUM_MDS -gt 0 ]; then + prun ceph_adm nfs export create cephfs --fsname "a" --cluster-id $cluster_id --pseudo-path $pseudo_path + echo "Mount using: mount -t nfs -o port=$port $IP:$pseudo_path mountpoint" + fi + if [ "$CEPH_NUM_RGW" -gt 0 ]; then + pseudo_path="/rgw" + do_rgw_create_bucket + prun ceph_adm nfs export create rgw --cluster-id $cluster_id --pseudo-path $pseudo_path --bucket "nfs-bucket" + echo "Mount using: mount -t nfs -o port=$port $IP:$pseudo_path mountpoint" + fi + else + start_ganesha + echo "Mount using: mount -t nfs -o port= $IP:$pseudo_path mountpoint" + fi +fi + +docker_service(){ + local service='' + #prefer podman + if command -v podman > /dev/null; then + service="podman" + elif pgrep -f docker > /dev/null; then + service="docker" + fi + if [ -n "$service" ]; then + echo "using $service for deploying jaeger..." + #check for exited container, remove them and restart container + if [ "$($service ps -aq -f status=exited -f name=jaeger)" ]; then + $service rm jaeger + fi + if [ ! "$(podman ps -aq -f name=jaeger)" ]; then + $service "$@" + fi + else + echo "cannot find docker or podman, please restart service and rerun." + fi +} + +echo "" +if [ $with_jaeger -eq 1 ]; then + debug echo "Enabling jaegertracing..." + docker_service run -d --name jaeger \ + -p 5775:5775/udp \ + -p 6831:6831/udp \ + -p 6832:6832/udp \ + -p 5778:5778 \ + -p 16686:16686 \ + -p 14268:14268 \ + -p 14250:14250 \ + quay.io/jaegertracing/all-in-one +fi + +debug echo "vstart cluster complete. Use stop.sh to stop. See out/* (e.g. 'tail -f out/????') for debug output." + +echo "" +if [ "$new" -eq 1 ]; then + if $with_mgr_dashboard; then + cat < $CEPH_DIR/vstart_environment.sh +{ + echo "export PYTHONPATH=$PYBIND:$CYTHON_PYTHONPATH:$CEPH_PYTHON_COMMON\$PYTHONPATH" + echo "export LD_LIBRARY_PATH=$CEPH_LIB:\$LD_LIBRARY_PATH" + echo "export PATH=$CEPH_DIR/bin:\$PATH" + echo "export CEPH_CONF=$conf_fn" + # We cannot set CEPH_KEYRING if this is sourced by vstart_runner.py (API tests) + if [ "$CEPH_DIR" != "$PWD" ]; then + echo "export CEPH_KEYRING=$keyring_fn" + fi + + if [ -n "$CEPHFS_SHELL" ]; then + echo "alias cephfs-shell=$CEPHFS_SHELL" + fi +} | tee -a $CEPH_DIR/vstart_environment.sh + +echo "CEPH_DEV=1" + +# always keep this section at the very bottom of this file +STRAY_CONF_PATH="/etc/ceph/ceph.conf" +if [ -f "$STRAY_CONF_PATH" -a -n "$conf_fn" -a ! "$conf_fn" -ef "$STRAY_CONF_PATH" ]; then + echo "" + echo "" + echo "WARNING:" + echo " Please remove stray $STRAY_CONF_PATH if not needed." 
+    echo "  Your conf files $conf_fn and $STRAY_CONF_PATH may not be in sync"
+    echo "  and may lead to undesired results."
+    echo ""
+    echo "NOTE:"
+    echo "  Remember to restart the cluster after removing $STRAY_CONF_PATH"
+fi
+
+init_logrotate
diff --git a/src/test/test_perf_counters_cache.cc b/src/test/test_perf_counters_cache.cc
index 16d92bd7d431..8867896c02f2 100644
--- a/src/test/test_perf_counters_cache.cc
+++ b/src/test/test_perf_counters_cache.cc
@@ -960,13 +960,31 @@ TEST(PerfCountersCache, TestLabelStrings) {
   // test empty val in a label pair will get the label pair added into the perf counters cache but empty key will not
   std::string label2 = key_create("bad_ctrs1", {{"label3", "val4"}, {"label1", ""}});
-  EXPECT_DEATH(pcc->set_counter(label2, TEST_PERFCOUNTERS_COUNTER, 2), "");
+  //EXPECT_DEATH(pcc->set_counter(label2, TEST_PERFCOUNTERS_COUNTER, 2), "");
+  pcc->set_counter(label2, TEST_PERFCOUNTERS_COUNTER, 2);
   std::string label3 = key_create("bad_ctrs2", {{"", "val4"}, {"label1", "val1"}});
   EXPECT_DEATH(pcc->set_counter(label3, TEST_PERFCOUNTERS_COUNTER, 2), "");

   ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message));
   ASSERT_EQ(R"({
+    "bad_ctrs1": [
+        {
+            "labels": {
+                "label1": "",
+                "label3": "val4"
+            },
+            "counters": {
+                "test_counter": 2,
+                "test_time": 0.000000000,
+                "test_time_avg": {
+                    "avgcount": 0,
+                    "sum": 0.000000000,
+                    "avgtime": 0.000000000
+                }
+            }
+        }
+    ],
     "good_ctrs": [
         {
             "labels": {
@@ -990,6 +1008,23 @@ TEST(PerfCountersCache, TestLabelStrings) {
   // test empty keys in each of the label pairs will not get the label added into the perf counters cache
   ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message));
   ASSERT_EQ(R"({
+    "bad_ctrs1": [
+        {
+            "labels": {
+                "label1": "",
+                "label3": "val4"
+            },
+            "counters": {
+                "test_counter": 2,
+                "test_time": 0.000000000,
+                "test_time_avg": {
+                    "avgcount": 0,
+                    "sum": 0.000000000,
+                    "avgtime": 0.000000000
+                }
+            }
+        }
+    ],
     "good_ctrs": [
         {
             "labels": {
@@ -1023,6 +1058,23 @@ TEST(PerfCountersCache, TestLabelStrings) {
   ASSERT_EQ("", client.do_request(R"({ "prefix": "counter dump", "format": "raw" })", &message));
   ASSERT_EQ(R"({
+    "bad_ctrs1": [
+        {
+            "labels": {
+                "label1": "",
+                "label3": "val4"
+            },
+            "counters": {
+                "test_counter": 2,
+                "test_time": 0.000000000,
+                "test_time_avg": {
+                    "avgcount": 0,
+                    "sum": 0.000000000,
+                    "avgtime": 0.000000000
+                }
+            }
+        }
+    ],
     "good_ctrs": [
         {
             "labels": {

From 5d1b5da21591c57cb0cbbbc8775b6ea0ced953a4 Mon Sep 17 00:00:00 2001
From: Kamoltat
Date: Wed, 11 Oct 2023 21:12:03 +0000
Subject: [PATCH 0167/2492] src/mon/Monitor: Fix set_elector_disallowed_leaders

Problem:

In the monitors we hold 2 copies of disallowed_leader ...

1. MonMap class
2. Elector class.

When computing the ConnectivityScore for the monitors during the
election, we use the `disallowed_leader` from the Elector class to
determine which monitors we shouldn't allow to lead.

Now, we rely on the function `set_elector_disallowed_leaders` to set
the `disallowed_leader` of the Elector class. The MonMap class copy of
`disallowed_leader` contains the `tiebreaker_monitor`, so we inherit
that, and we also add the monitors that are dead due to a zone
failure. Hence, the `adding dead monitors` phase is only allowed if we
can enter stretch_mode.
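For reference, here is a condensed sketch of the pre-fix flow,
reconstructed from the removed lines in the hunk below (a simplified
illustration, not verbatim code): the dead-zone monitors are only ever
disallowed behind the `is_stretch_mode()` gate.

  set<int> dl;
  // inherit MonMap's disallowed_leaders (which includes the tiebreaker)
  for (auto name : monmap->disallowed_leaders) {
    dl.insert(monmap->get_rank(name));
  }
  // dead-zone mons are only added behind this gate, which a monitor
  // that is still probing cannot pass
  if (is_stretch_mode()) {
    for (auto name : monmap->stretch_marked_down_mons) {
      dl.insert(monmap->get_rank(name));
    }
    dl.insert(monmap->get_rank(monmap->tiebreaker_mon));
  }
  elector.set_disallowed_leaders(dl);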
However, there is a problem when failing over a stretch cluster zone
and then reviving the entire zone: the revived monitors cannot enter
stretch_mode while they are still in the "probing" state, since
PaxosServices like osdmon become unreadable (this is expected).

Solution:

We unconditionally add monitors that are in
`monmap->stretch_marked_down_mons` to the `disallowed_leaders` list in
`Monitor::set_elector_disallowed_leaders`, since monitors in
`monmap->stretch_marked_down_mons` most likely belong to a marked-down
zone and are not fit to lead.

This fixes the problem of newly revived monitors having a different
disallowed_leaders set and getting stuck in the election.

Fixes: https://tracker.ceph.com/issues/63183

Signed-off-by: Kamoltat
---
 src/mon/Monitor.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 6866536d0654..27151e60b220 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -6658,14 +6658,16 @@ void Monitor::notify_new_monmap(bool can_change_external_state, bool remove_rank
 void Monitor::set_elector_disallowed_leaders(bool allow_election) {
   set<int> dl;
+  // inherit dl from monmap
   for (auto name : monmap->disallowed_leaders) {
     dl.insert(monmap->get_rank(name));
-  }
-  if (is_stretch_mode()) {
-    for (auto name : monmap->stretch_marked_down_mons) {
-      dl.insert(monmap->get_rank(name));
-    }
-    dl.insert(monmap->get_rank(monmap->tiebreaker_mon));
+  } // unconditionally add stretch_marked_down_mons to the new dl copy
+  for (auto name : monmap->stretch_marked_down_mons) {
+    dl.insert(monmap->get_rank(name));
+  } // add the tiebreaker_mon in case it is not in monmap->disallowed_leaders
+  if (!monmap->tiebreaker_mon.empty() &&
+      monmap->contains(monmap->tiebreaker_mon)) {
+    dl.insert(monmap->get_rank(monmap->tiebreaker_mon));
   }

   bool disallowed_changed = elector.set_disallowed_leaders(dl);

From 99d12712c566800d9a710bc281980d052c3ed144 Mon Sep 17 00:00:00 2001
From: galsalomon66
Date: Sat, 9 Sep 2023 16:38:23 +0300
Subject: [PATCH 0168/2492] s3select: fix per QE defect; update s3select
 submodule

s3select engine changes: aggregation functions upon no data.

Signed-off-by: galsalomon66
---
 src/s3select | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/s3select b/src/s3select
index 8f86167c65cc..9ade26c63ef1 160000
--- a/src/s3select
+++ b/src/s3select
@@ -1 +1 @@
-Subproject commit 8f86167c65ccd4f134b6baec0eeb0ed7ea193bf8
+Subproject commit 9ade26c63ef1016dac868f53270a61e6232be9ba

From a5027e37ec1856b92e2af3fd19e23af537d040af Mon Sep 17 00:00:00 2001
From: Nizamudeen A
Date: Mon, 25 Sep 2023 18:58:06 +0530
Subject: [PATCH 0169/2492] mgr/dashboard: fix broken alert generator

Currently the alert generator is broken if you try to run
`tox -ealerts-fix`. This fixes it; running the command also
rebuilt the generated JSON file.

Signed-off-by: Nizamudeen A
---
 monitoring/ceph-mixin/README.md                   | 6 ++++++
 monitoring/ceph-mixin/alerts.jsonnet              | 2 +-
 monitoring/ceph-mixin/jsonnetfile.lock.json       | 2 +-
 monitoring/ceph-mixin/prometheus_alerts.libsonnet | 2 +-
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/monitoring/ceph-mixin/README.md b/monitoring/ceph-mixin/README.md
index 4772021939ef..f34d67f92a05 100644
--- a/monitoring/ceph-mixin/README.md
+++ b/monitoring/ceph-mixin/README.md
@@ -73,4 +73,10 @@ The jsonnet code located in this directory depends on some Jsonnet third party
 libraries.
To update those libraries you can run `jb update` and then update the
generated files using `tox -egrafonnet-fix`.
+### Building alerts from `prometheus_alerts.libsonnet`
+
+To rebuild the `prometheus_alerts.yml` file from the corresponding libsonnet,
+you can run `tox -ealerts-fix`.
+
+
 ##### Any upgrade or downgrade to different major versions of the recommended tools mentioned above is not supported.
diff --git a/monitoring/ceph-mixin/alerts.jsonnet b/monitoring/ceph-mixin/alerts.jsonnet
index ab7907c76fd1..13e70179f14d 100644
--- a/monitoring/ceph-mixin/alerts.jsonnet
+++ b/monitoring/ceph-mixin/alerts.jsonnet
@@ -1 +1 @@
-std.manifestYamlDoc(((import 'config.libsonnet') + (import 'alerts.libsonnet')).prometheusAlerts, indent_array_in_object=true, quote_keys=false)
+std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts, indent_array_in_object=true, quote_keys=false)
diff --git a/monitoring/ceph-mixin/jsonnetfile.lock.json b/monitoring/ceph-mixin/jsonnetfile.lock.json
index 3c9d38d935ce..480438230f39 100644
--- a/monitoring/ceph-mixin/jsonnetfile.lock.json
+++ b/monitoring/ceph-mixin/jsonnetfile.lock.json
@@ -8,7 +8,7 @@
         "subdir": "grafonnet"
       }
     },
-      "version": "30280196507e0fe6fa978a3e0eaca3a62844f817",
+      "version": "a1d61cce1da59c71409b99b5c7568511fec661ea",
       "sum": "342u++/7rViR/zj2jeJOjshzglkZ1SY+hFNuyCBFMdc="
     }
   ],
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index a7c994ba9b6a..b90573be6c01 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -236,7 +236,7 @@
       annotations: {
         documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany',
         summary: 'Too many devices are predicted to fail, unable to resolve%(cluster)s' % $.MultiClusterSummary(),
-        description: 'The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availabililty. Prevent data integrity issues by adding new OSDs so that data may be relocated.',
+        description: 'The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated.',
       },
     },
     {

From b18a1441bebdb5c997ed3970d0a53dd39cebd277 Mon Sep 17 00:00:00 2001
From: Nizamudeen A
Date: Fri, 13 Oct 2023 13:17:05 +0530
Subject: [PATCH 0170/2492] mgr/dashboard: disable hosts field while editing
 the filesystem

Even though the Placement field was disabled, the Host field was still
showing up in the UI while editing. That option is not applicable in
the fs form.

Fixes: https://tracker.ceph.com/issues/63193

Signed-off-by: Nizamudeen A
---
 .../cephfs-form/cephfs-form.component.html    |  4 +--
 .../cephfs-form/cephfs-form.component.spec.ts | 29 +++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.html
index 76e51b2c5f39..05235d16ccd4 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.html
@@ -59,7 +59,7 @@

-