From c24982b6f0ee02d934b9f974a01aa3455d52daa4 Mon Sep 17 00:00:00 2001 From: Jerry Blakley Date: Fri, 29 Apr 2016 23:54:48 +0000 Subject: [PATCH] Send stats through socket rather than through logs --- src/fusedav-statsd.c | 68 +++++++++++++--- src/fusedav-statsd.h | 6 ++ src/fusedav.c | 4 +- src/log.c | 2 +- src/session.c | 181 ++++++------------------------------------- 5 files changed, 88 insertions(+), 173 deletions(-) diff --git a/src/fusedav-statsd.c b/src/fusedav-statsd.c index 48052a67..a7472fc0 100644 --- a/src/fusedav-statsd.c +++ b/src/fusedav-statsd.c @@ -35,8 +35,8 @@ // e.g. fusedav.valhallayolo1b.server-104_130_221_144.exceeded-time-small-GET-latency (82 characters) #define STATS_MSG_LEN 128 -// Needs to hold the prefix for a stat, e.g. fusedav.valhalla2.server- -#define STATS_PREFIX_LEN 64 +// Needs to hold the prefix for a stat, e.g. fusedav +#define STATS_PREFIX_LEN 32 /* Socket and server for function sendto */ /* server is a sockaddr_storage so it can hold either IPV4 or IPV6 */ @@ -188,8 +188,7 @@ int stats_init(const char *domain, const char *port) { memset(&server, 0, sizeof(server)); // stats_prefix is, after being set here, an unchanging global string - // fusedav keeps many stats by filesystem cluster, so include it - snprintf(server.stats_prefix, STATS_PREFIX_LEN, "fusedav.%s.server-", get_filesystem_cluster()); + snprintf(server.stats_prefix, STATS_PREFIX_LEN, "fusedav"); if (set_addr(domain, port, &server)) { // FAILURE; pass it on @@ -238,16 +237,26 @@ static int stats_send(const char *statmsg) { return 0; } -static int compose_message(const char *statname, const signed int value, char *type, char *msg) { - return snprintf(msg, STATS_MSG_LEN, "%s%s.%s:%d|%s\n", server.stats_prefix, get_nodeaddr(), statname, value, type); +static int compose_message(const char *statname, const signed int value, char *type, char *msg, const char *cluster, const char *node) { + if (node) { + return snprintf(msg, STATS_MSG_LEN, "%s.%s.server-%s.%s:%d|%s\n", + server.stats_prefix, cluster, node, statname, value, type); + } else if (cluster) { + return snprintf(msg, STATS_MSG_LEN, "%s.%s.%s:%d|%s\n", + server.stats_prefix, cluster, statname, value, type); + } else { + return snprintf(msg, STATS_MSG_LEN, "%s.%s:%d|%s\n", + server.stats_prefix, statname, value, type); + } + } -int stats_counter(const char *statname, const int value) { +static int stats_counter_common(const char *statname, const int value, const char *cluster, const char *node) { int error; int res; char msg[STATS_MSG_LEN]; char type[] = "c"; - res = compose_message(statname, value, type, msg); + res = compose_message(statname, value, type, msg, cluster, node); if (res < 0) { return res; } @@ -255,12 +264,24 @@ int stats_counter(const char *statname, const int value) { return error; } -int stats_gauge(const char *statname, const int value) { +int stats_counter(const char *statname, const int value) { + return stats_counter_common(statname, value, get_filesystem_cluster(), get_nodeaddr()); +} + +int stats_counter_cluster(const char *statname, const int value) { + return stats_counter_common(statname, value, get_filesystem_cluster(), NULL); +} + +int stats_counter_local(const char *statname, const int value) { + return stats_counter_common(statname, value, NULL, NULL); +} + +static int stats_gauge_common(const char *statname, const int value, const char *cluster, const char *node) { int error; int res; char msg[STATS_MSG_LEN]; char type[] = "g"; - res = compose_message(statname, value, type, msg); + res = compose_message(statname, value, type, msg, cluster, node); if (res < 0) { return res; } @@ -268,12 +289,24 @@ int stats_gauge(const char *statname, const int value) { return error; } -int stats_timer(const char *statname, const int value) { +int stats_gauge(const char *statname, const int value) { + return stats_gauge_common(statname, value, get_filesystem_cluster(), get_nodeaddr()); +} + +int stats_gauge_cluster(const char *statname, const int value) { + return stats_gauge_common(statname, value, get_filesystem_cluster(), NULL); +} + +int stats_gauge_local(const char *statname, const int value) { + return stats_gauge_common(statname, value, NULL, NULL); +} + +static int stats_timer_common(const char *statname, const int value, const char *cluster, const char *node) { int error; int res; char msg[STATS_MSG_LEN]; char type[] = "ms"; - res = compose_message(statname, value, type, msg); + res = compose_message(statname, value, type, msg, cluster, node); if (res < 0) { return res; } @@ -281,6 +314,17 @@ int stats_timer(const char *statname, const int value) { return error; } +int stats_timer(const char *statname, const int value) { + return stats_timer_common(statname, value, get_filesystem_cluster(), get_nodeaddr()); +} + +int stats_timer_cluster(const char *statname, const int value) { + return stats_timer_common(statname, value, get_filesystem_cluster(), NULL); +} + +int stats_timer_local(const char *statname, const int value) { + return stats_timer_common(statname, value, NULL, NULL); +} /* For reference: * diff --git a/src/fusedav-statsd.h b/src/fusedav-statsd.h index 720b5bad..1c48f24d 100644 --- a/src/fusedav-statsd.h +++ b/src/fusedav-statsd.h @@ -23,7 +23,13 @@ int stats_init(const char *domain, const char *port); int stats_close(void); int stats_counter(const char *statname, const int value); +int stats_counter_cluster(const char *statname, const int value); +int stats_counter_local(const char *statname, const int value); int stats_gauge(const char *statname, const int value); +int stats_gauge_cluster(const char *statname, const int value); +int stats_gauge_local(const char *statname, const int value); int stats_timer(const char *statname, const int value); +int stats_timer_cluster(const char *statname, const int value); +int stats_timer_local(const char *statname, const int value); #endif diff --git a/src/fusedav.c b/src/fusedav.c index ce86919d..e8ba9973 100644 --- a/src/fusedav.c +++ b/src/fusedav.c @@ -711,7 +711,7 @@ static void get_stat(const char *path, struct stat *stbuf, GError **gerr) { if (parent_children_update_ts < (time(NULL) - STAT_CACHE_NEGATIVE_TTL)) { GError *subgerr = NULL; - stats_counter("propfind-nonnegative-cache", 1); + stats_counter_local("propfind-nonnegative-cache", 1); log_print(LOG_INFO, SECTION_FUSEDAV_STAT, "get_stat: Calling update_directory: %s; attempt_progressive_update will be %d", parent_path, (parent_children_update_ts > 0)); // If parent_children_update_ts is 0, there are no entries for updated_children in statcache @@ -722,7 +722,7 @@ static void get_stat(const char *path, struct stat *stbuf, GError **gerr) { goto fail; } } else { - stats_counter("propfind-negative-cache", 1); + stats_counter_local("propfind-negative-cache", 1); BUMP(propfind_negative_cache); } diff --git a/src/log.c b/src/log.c index 773d5b0c..0f76f3ad 100644 --- a/src/log.c +++ b/src/log.c @@ -149,7 +149,7 @@ void set_dynamic_logging(void) { else { dynamic_logging_start = now.tv_sec; log_print(LOG_NOTICE, SECTION_FUSEDAV_DEFAULT, - "Setting dynamic_logging for %lu seconds. fusedav.dynamic_logging:1|c", dynamic_logging_duration); + "Setting dynamic_logging for %lu seconds.", dynamic_logging_duration); LOG_DYNAMIC = LOG_INFO - 1; } } diff --git a/src/session.c b/src/session.c index 281ab0cd..e7e78630 100644 --- a/src/session.c +++ b/src/session.c @@ -42,6 +42,7 @@ #include "log_sections.h" #include "util.h" #include "session.h" +#include "fusedav-statsd.h" static pthread_once_t session_once = PTHREAD_ONCE_INIT; static pthread_key_t session_tsd_key; @@ -242,10 +243,7 @@ static void print_errors(const int iter, const char *type_str, const char *fcn_n // Stats log for all errors // Distinguish curl from 500-status failures from slow requests - log_print(LOG_INFO, SECTION_ENHANCED, - "%s: curl iter %d on path %s; %s -- fusedav.%s.server-%s.%s:1|c", - fcn_name, iter, path, error_str, filesystem_cluster, nodeaddr, type_str); - + stats_counter(error_str, 1); log_print(LOG_ERR, SECTION_SESSION_DEFAULT, "%s: curl iter %d on path %s; %s -- fusedav.%s.server-%s.%s", fcn_name, iter, path, error_str, filesystem_cluster, nodeaddr, type_str); @@ -255,19 +253,13 @@ static void print_errors(const int iter, const char *type_str, const char *fcn_n char *failure_str = NULL; asprintf(&failure_str, "%d_failures", iter + 1); - // Stats log // Is this the first, second, or third failure for this request? - log_print(LOG_INFO, SECTION_ENHANCED, - "%s: curl iter %d on path %s; %s -- fusedav.%s.server-%s.%s:1|c", - fcn_name, iter, path, error_str, filesystem_cluster, nodeaddr, failure_str); + stats_counter(failure_str, 1); free(failure_str); - // Stats log // Total failures - log_print(LOG_INFO, SECTION_ENHANCED, - "%s: curl iter %d on path %s; %s -- fusedav.%s.server-%s.failures:1|c", - fcn_name, iter, path, error_str, filesystem_cluster, nodeaddr); + stats_counter("failures", 1); } free(error_str); @@ -339,70 +331,16 @@ static void increment_node_success(char *addr) { } } -static void log_filesystem_nodes(const char *fcn_name, const int iter, const char *path) { - static __thread unsigned long count = 0; - static __thread time_t previous_time = 0; - static __thread char previous_nodeaddr[LOGSTRSZ]; - // Print every 100th access - const unsigned long count_trigger = 1000; - // Print every 60th second - const time_t time_trigger = 60; - time_t current_time; - bool print_it; - int nodeaddr_changed; - - ++count; - // Track curl accesses to this filesystem node - // fusedav.conf will always set SECTION_ENHANCED to 6 in LOG_SECTIONS. These log entries will always - // print, but at INFO will be easier to filter out - // We're overloading the journal, so only log every print_count_trigger count or every print_interval time - current_time = time(NULL); - // Always print the first one. Then print if our interval has expired - print_it = (previous_time == 0) || (current_time - previous_time >= time_trigger); - // If this is the very first call, initialize previous to current - if (previous_nodeaddr[0] == '\0') strncpy(previous_nodeaddr, nodeaddr, LOGSTRSZ); - nodeaddr_changed = strncmp(nodeaddr, previous_nodeaddr, LOGSTRSZ); - // Also print if we have exceeded count - if (print_it || count >= count_trigger || nodeaddr_changed) { - if (nodeaddr_changed) --count; // Print for previous node, which doesn't include this call, then for this call - log_print(LOG_INFO, SECTION_ENHANCED, - "curl iter %d on path %s -- fusedav.%s.server-%s.attempts:%lu|c", iter, path, filesystem_cluster, previous_nodeaddr, count); - count = 0; - previous_time = current_time; - if (nodeaddr_changed) { - log_print(LOG_INFO, SECTION_ENHANCED, - "curl iter %d changed to path %s -- fusedav.%s.server-%s.attempts:%lu|c", - iter, path, filesystem_cluster, nodeaddr, 1); - strncpy(previous_nodeaddr, nodeaddr, LOGSTRSZ); - } - } - - if (iter > 0) { - log_print(LOG_INFO, SECTION_ENHANCED, - "%s: curl iter %d on path %s -- fusedav.%s.server-%s.recoveries:1|c", fcn_name, iter, path, filesystem_cluster, nodeaddr); - log_print(LOG_NOTICE, SECTION_SESSION_DEFAULT, - "%s: curl iter %d on path %s -- fusedav.%s.server-%s.recoveries", fcn_name, iter, path, filesystem_cluster, nodeaddr); - increment_node_success(nodeaddr); - } - else if (iter != -1) { // -1 is like a sentinel for when we call this during switch to new handle - increment_node_success(nodeaddr); - } -} - // Call session_cleanup when reinitializing a handle, or called from session_destroy when thread exits static void session_cleanup(void *s) { CURL *session = s; if (!session) return; - // The first log statement will get stripped from logstash because it has the stats designator |c, so log a second one - log_print(LOG_INFO, SECTION_ENHANCED, - "Destroying cURL handle -- fusedav.%s.sessions:-1|c fusedav.%s.session-duration:%lu|c", - filesystem_cluster, filesystem_cluster, time(NULL) - session_start_time); + stats_counter_cluster("sessions", 1); + stats_timer_cluster("session-duration", time(NULL) - session_start_time); log_print(LOG_INFO, SECTION_SESSION_DEFAULT, "Destroying cURL handle"); - // Before we go, make sure we've printed the number of curl accesses we accumulated - log_filesystem_nodes("session_cleanup", -1, "no path"); curl_easy_cleanup(session); session = NULL; pthread_setspecific(session_tsd_key, session); @@ -540,7 +478,8 @@ void action_s3_e1 (void) { failure_timestamp = now.tv_sec; try_release_request_outstanding(); saint_state = STATE_SAINT_MODE; - log_print(LOG_INFO, SECTION_ENHANCED, "Setting cluster saint mode for %lu seconds. fusedav.saint_mode:1|c", saint_mode_duration); + stats_counter_cluster("saint_mode", 1); + log_print(LOG_NOTICE, SECTION_ENHANCED, "Setting cluster saint mode for %lu seconds.", saint_mode_duration); log_print(LOG_NOTICE, SECTION_SESSION_DEFAULT, "Event CLUSTER_FAILURE; transitioned to STATE_SAINT_MODE from STATE_ATTEMPTING_TO_EXIT_SAINT_MODE."); } void action_s3_e2 (void) {} @@ -557,8 +496,9 @@ void trigger_saint_mode_expired_if_needed(void) { state_table[saint_state][SAINT_MODE_DURATION_EXPIRED](); // If we've been in saintmode for longer than saint_mode_warning_threshold, emit a stat saying so. if (now.tv_sec >= unhealthy_since_timestamp + saint_mode_warning_threshold) { - log_print(LOG_INFO, SECTION_ENHANCED, - "saint_mode active for %d seconds -- fusedav.%s.server-%s.long_running_saint_mode:1|c", now.tv_sec-unhealthy_since_timestamp, filesystem_cluster, nodeaddr); + // TODO JB 'local' + stats_counter("long_running_saint_mode", 1); + log_print(LOG_INFO, SECTION_ENHANCED, "saint_mode active for %d seconds", now.tv_sec-unhealthy_since_timestamp); } } } @@ -1055,7 +995,8 @@ void process_status(const char *fcn_name, CURL *session, const CURLcode res, const long response_code, const long elapsed_time, const int iter, const char *path, bool tmp_session) { - log_filesystem_nodes(fcn_name, iter, path); + stats_counter("attempts", 1); + if (res != CURLE_OK) { print_errors(iter, "curl_failures", fcn_name, res, response_code, elapsed_time, path); increment_node_failure(nodeaddr, res, response_code, elapsed_time); @@ -1080,6 +1021,14 @@ void process_status(const char *fcn_name, CURL *session, const CURLcode res, delete_session(session, tmp_session); return; } + + // If it wasn't an error, and it isn't the 0'th iter, then we must have failed previously and now recovered + if (iter > 0) { + stats_counter("recoveries", 1); + log_print(LOG_NOTICE, SECTION_SESSION_DEFAULT, + "%s: curl iter %d on path %s -- fusedav.%s.server-%s.recoveries", fcn_name, iter, path, filesystem_cluster, nodeaddr); + increment_node_success(nodeaddr); + } } static bool valid_slist(void) { @@ -1183,8 +1132,8 @@ static CURL *update_session(bool tmp_session) { // We do this when the thread is initialized. We want the hashtable to survive reinitialization of the handle, // since the hashtable keeps track of the health status of connections causing the reinitialization - // The first log print will be stripped by log stash because it has the stats designator 1|c, so log one without it - log_print(LOG_INFO, SECTION_ENHANCED, "Opening cURL session -- fusedav.%s.sessions:1|c", filesystem_cluster); + // TODO JB 'local' + stats_counter("sessions", 1); log_print(LOG_INFO, SECTION_SESSION_DEFAULT, "Opening cURL session"); // if tmp_session, we need to get a new session for this request; otherwise see if we already have a session @@ -1302,87 +1251,3 @@ CURL *session_request_init(const char *path, const char *query_string, bool tmp_ return session; } - -static void common_aggregate_log_print(unsigned int log_level, unsigned int section, const char *cluster, const char *server, - const char *name, time_t *previous_time, const char *description1, unsigned long *count1, unsigned long value1, - const char *description2, long *count2, long value2) { - - /* Print aggregate stats. It is in this file for ready access to server names and the ability - * to call it on thread destroy. - */ - // Print every 100th access - const unsigned long count_trigger = 1000; - // Print every 60th second - const time_t time_trigger = 60; - time_t current_time; - bool print_it = false; - - *count1 += value1; - if (count2) *count2 += value2; - // Track curl accesses to this filesystem node - // fusedav.conf will always set SECTION_ENHANCED to 6 in LOG_SECTIONS. These log entries will always - // print, but at INFO will be easier to filter out - // We're overloading the journal, so only log every print_count_trigger count or every print_interval time - current_time = time(NULL); - - // if previous_time is NULL then this is a pair to an earlier call, and we always print it - if (previous_time != NULL) { - // Always print the first one. Then print if our interval has expired - print_it = (*previous_time == 0) || (current_time - *previous_time >= time_trigger); - } - else { - print_it = true; - } - // Also print if we have exceeded count - if (print_it || *count1 >= count_trigger) { - if (cluster && server) { - log_print(log_level, section, "%s: fusedav.%s.server-%s.%s:%lu|c", name, filesystem_cluster, nodeaddr, description1, *count1); - } - else if(cluster) { - log_print(log_level, section, "%s: fusedav.%s.%s:%lu|c", name, filesystem_cluster, description1, *count1); - } - else { - log_print(log_level, section, "%s: fusedav.%s:%lu|c", name, description1, *count1); - } - if (description2 && count2) { - long result; - // Cheating. We just know that the second value is a latency total which needs to - // be passed through as an average latency. - if (*count1 == 0) result = 0; - else result = (*count2 / *count1); - if (cluster && server) { - log_print(log_level, section, "%s: fusedav.%s.server-%s.%s:%ld|c", name, filesystem_cluster, nodeaddr, description2, result); - } - else if (cluster) { - log_print(log_level, section, "%s: fusedav.%s.%s:%ld|c", name, filesystem_cluster, description2, result); - } - else { - log_print(log_level, section, "%s: fusedav.%s:%ld|c", name, description2, result); - } - *count2 = 0; - } - *count1 = 0; - if (previous_time) *previous_time = current_time; - } - return; -} - -void aggregate_log_print_server(unsigned int log_level, unsigned int section, const char *name, time_t *previous_time, - const char *description1, unsigned long *count1, unsigned long value1, - const char *description2, long *count2, long value2) { - - // pass in filesystem_cluster and nodeaddr - common_aggregate_log_print(log_level, section, filesystem_cluster, nodeaddr, name, previous_time, - description1, count1, value1, description2, count2, value2); - -} - -void aggregate_log_print_local(unsigned int log_level, unsigned int section, const char *name, time_t *previous_time, - const char *description1, unsigned long *count1, unsigned long value1, - const char *description2, long *count2, long value2) { - - // don't pass in filesystem_cluster and nodeaddr - common_aggregate_log_print(log_level, section, NULL, NULL, name, previous_time, - description1, count1, value1, description2, count2, value2); - -}