apache · qinzuoyan · Jun 18, 2019 · Jun 16, 2019 · Jun 17, 2019 · Jun 17, 2019
diff --git a/rdsn b/rdsn
diff --git a/src/server/config-server.ini b/src/server/config-server.ini
@@ -274,16 +274,19 @@ falcon_path = /v1/push
 
 [pegasus.collector]
 cluster = onebox
+
 available_detect_app = @APP_NAME@
 available_detect_alert_script_dir = ./package/bin
 available_detect_alert_email_address =
 available_detect_interval_seconds = 3
 available_detect_alert_fail_count = 30
 available_detect_timeout = 5000
+
 app_stat_interval_seconds = 10
 
-cu_stat_app = stat
+cu_stat_app = @APP_NAME@
 cu_fetch_interval_seconds = 8
+st_fetch_interval_seconds = 60
 
 [pegasus.clusters]
 onebox = @LOCAL_IP@:34601,@LOCAL_IP@:34602,@LOCAL_IP@:34603

diff --git a/src/server/config.ini b/src/server/config.ini
@@ -285,16 +285,19 @@
 
 [pegasus.collector]
   cluster = %{cluster.name}
+
   available_detect_app = temp
   available_detect_alert_script_dir = ./package/bin
   available_detect_alert_email_address =
   available_detect_interval_seconds = 3
   available_detect_alert_fail_count = 30
   available_detect_timeout = 5000
+
   app_stat_interval_seconds = 10
 
   cu_stat_app = stat
   cu_fetch_interval_seconds = 8
+  st_fetch_interval_seconds = 3600
 
 [pegasus.clusters]
   %{cluster.name} = %{meta.server.list}

diff --git a/src/server/info_collector.cpp b/src/server/info_collector.cpp
@@ -26,6 +26,7 @@ namespace server {
 
 DEFINE_TASK_CODE(LPC_PEGASUS_APP_STAT_TIMER, TASK_PRIORITY_COMMON, ::dsn::THREAD_POOL_DEFAULT)
 DEFINE_TASK_CODE(LPC_PEGASUS_CU_STAT_TIMER, TASK_PRIORITY_COMMON, ::dsn::THREAD_POOL_DEFAULT)
+DEFINE_TASK_CODE(LPC_PEGASUS_ST_STAT_TIMER, TASK_PRIORITY_COMMON, ::dsn::THREAD_POOL_DEFAULT) // storage size stat timer and its retry tasks
 
 info_collector::info_collector()
 {
@@ -65,6 +66,17 @@ info_collector::info_collector()
                                               "cu_fetch_interval_seconds",
                                               8, // default value 8s
                                               "capacity unit fetch interval seconds");
+    _cu_fetch_retry_count = 3;
+    _cu_fetch_retry_wait_seconds = 1;
+
+    _st_fetch_interval_seconds =
+        (uint32_t)dsn_config_get_value_uint64("pegasus.collector",
+                                              "st_fetch_interval_seconds",
+                                              3600, // default value 1h
+                                              "storage size fetch interval seconds");
+    _st_fetch_retry_count = 3;
+    // _st_fetch_retry_wait_seconds is in range of [1, 60]
+    _st_fetch_retry_wait_seconds = std::min(60u, std::max(1u, _st_fetch_interval_seconds / 10));
 }
 
 info_collector::~info_collector()
@@ -88,10 +100,18 @@ void info_collector::start()
     _cu_stat_timer_task =
         ::dsn::tasking::enqueue_timer(LPC_PEGASUS_CU_STAT_TIMER,
                                       &_tracker,
-                                      [this] { on_capacity_unit_stat(); },
+                                      [this] { on_capacity_unit_stat(_cu_fetch_retry_count); },
                                       std::chrono::seconds(_cu_fetch_interval_seconds),
                                       0,
                                       std::chrono::minutes(1));
+
+    _st_stat_timer_task =
+        ::dsn::tasking::enqueue_timer(LPC_PEGASUS_ST_STAT_TIMER,
+                                      &_tracker,
+                                      [this] { on_storage_size_stat(_st_fetch_retry_count); },
+                                      std::chrono::seconds(_st_fetch_interval_seconds),
+                                      0,
+                                      std::chrono::minutes(1));
 }
 
 void info_collector::stop() { _tracker.cancel_outstanding_tasks(); }
@@ -230,21 +250,34 @@ info_collector::AppStatCounters *info_collector::get_app_counters(const std::str
     return counters;
 }
 
-void info_collector::on_capacity_unit_stat()
+void info_collector::on_capacity_unit_stat(int remaining_retry_count) // remaining_retry_count: retries still allowed
 {
     ddebug("start to stat capacity unit");
     std::vector<node_capacity_unit_stat> nodes_stat;
     if (!get_capacity_unit_stat(&_shell_context, nodes_stat)) {
-        derror("get capacity unit stat failed");
+        if (remaining_retry_count > 0) { // fetch failed: re-enqueue this handler after a delay
+            derror("get capacity unit stat failed, remaining_retry_count = %d, "
+                   "wait %u seconds to retry",
+                   remaining_retry_count,
+                   _cu_fetch_retry_wait_seconds);
+            ::dsn::tasking::enqueue(LPC_PEGASUS_CU_STAT_TIMER,
+                                    &_tracker,
+                                    [=] { on_capacity_unit_stat(remaining_retry_count - 1); },
+                                    0,
+                                    std::chrono::seconds(_cu_fetch_retry_wait_seconds));
+        } else { // retries exhausted: log and give up
+            derror("get capacity unit stat failed, remaining_retry_count = 0, no retry anymore");
+        }
         return;
     }
-    for (auto elem : nodes_stat) {
-        if (!has_capacity_unit_updated(elem.node_address, elem.timestamp)) {
+    for (node_capacity_unit_stat &elem : nodes_stat) { // by reference: no per-element copy
+        if (elem.node_address.empty() || elem.timestamp.empty() || // skip incomplete entries
+            !has_capacity_unit_updated(elem.node_address, elem.timestamp)) {
             dinfo("recent read/write capacity unit value of node %s has not updated",
                   elem.node_address.c_str());
             continue;
         }
-        _result_writer->set_result(elem.timestamp, elem.node_address, elem.dump_to_json());
+        _result_writer->set_result(elem.timestamp, "cu@" + elem.node_address, elem.dump_to_json());
     }
 }
 
@@ -258,10 +291,34 @@ bool info_collector::has_capacity_unit_updated(const std::string &node_address,
         return true;
     }
     if (timestamp > find->second) {
-        _cu_update_info[node_address] = timestamp;
+        find->second = timestamp;
         return true;
     }
     return false;
 }
+
+// Fetches the storage size stat and writes it under key "st". When the fetch fails and
+// remaining_retry_count is positive, schedules another attempt with one fewer retry left.
+void info_collector::on_storage_size_stat(int remaining_retry_count)
+{
+    ddebug("start to stat storage size");
+    app_storage_size_stat stat;
+    if (get_storage_size_stat(&_shell_context, stat)) {
+        // Success path: publish the result and finish.
+        _result_writer->set_result(stat.timestamp, "st", stat.dump_to_json());
+        return;
+    }
+    if (remaining_retry_count <= 0) {
+        derror("get storage size stat failed, remaining_retry_count = 0, no retry anymore");
+        return;
+    }
+    derror("get storage size stat failed, remaining_retry_count = %d, "
+           "wait %u seconds to retry",
+           remaining_retry_count,
+           _st_fetch_retry_wait_seconds);
+    const int next_retry_count = remaining_retry_count - 1;
+    ::dsn::tasking::enqueue(LPC_PEGASUS_ST_STAT_TIMER,
+                            &_tracker,
+                            [this, next_retry_count] { on_storage_size_stat(next_retry_count); },
+                            0,
+                            std::chrono::seconds(_st_fetch_retry_wait_seconds));
+}
+
 } // namespace server
 } // namespace pegasus
diff --git a/src/server/info_collector.h b/src/server/info_collector.h
@@ -66,9 +66,11 @@ class info_collector
     void on_app_stat();
     AppStatCounters *get_app_counters(const std::string &app_name);
 
-    void on_capacity_unit_stat();
+    void on_capacity_unit_stat(int remaining_retry_count);
     bool has_capacity_unit_updated(const std::string &node_address, const std::string &timestamp);
 
+    void on_storage_size_stat(int remaining_retry_count);
+
 private:
     dsn::task_tracker _tracker;
     ::dsn::rpc_address _meta_servers;
@@ -86,7 +88,13 @@ class info_collector
     // for writing cu stat result
     std::unique_ptr<result_writer> _result_writer;
     uint32_t _cu_fetch_interval_seconds;
+    uint32_t _cu_fetch_retry_count;
+    uint32_t _cu_fetch_retry_wait_seconds;
     ::dsn::task_ptr _cu_stat_timer_task;
+    uint32_t _st_fetch_interval_seconds;
+    uint32_t _st_fetch_retry_count;
+    uint32_t _st_fetch_retry_wait_seconds;
+    ::dsn::task_ptr _st_stat_timer_task;
     ::dsn::utils::ex_lock_nr _cu_update_info_lock;
     // mapping 'node address' --> 'last updated timestamp'
     std::map<std::string, string> _cu_update_info;
+1 −1		bin/dsn.cmake
+0 −5		doc/about.h
+0 −25		doc/api.h
+0 −19		doc/extensions.h
+0 −8		doc/footer.rDSN.html
+0 −54		doc/header.rDSN.html
+0 −7		doc/how-to.h
+0 −39		doc/install.h
+0 −1,475		doc/rDSN.css
+0 −35		doc/tools.h
+0 −54		doc/tutorials.h
+8 −1		include/dsn/perf_counter/perf_counters.h
+0 −80		run.cmd
+72 −2		src/core/perf_counter/perf_counters.cpp
+188 −0		src/core/tests/http_server_test.cpp
+95 −34		src/core/tools/http/http_message_parser.cpp
+49 −24		src/core/tools/http/http_message_parser.h
+5 −1		src/core/tools/http/http_server.cpp
+35 −9		src/dist/failure_detector/failure_detector.cpp
+8 −8		src/dist/failure_detector_multimaster/failure_detector_multimaster.cpp
+8 −5		src/dist/replication/meta_server/meta_server_failure_detector.cpp