Skip to content

Commit

Permalink
[PLAT-906] counters for sainted read and write (#487) (#489)
Browse files Browse the repository at this point in the history
* [PLAT-870] Problem: stats for types of requests not distinct

Solution: Account for their number and latency distinctly.

With the Prometheus Statsd exporter in place, we have real histograms that are
fed by `stats_timer`, which means we can ditch related calls to `stats_counter`
and we can just reconfigure the bucket widths in the exporter config rather than
having a const that says how long is too long.

We instrument in `print_errors` because it has the error and latency information
as well as the function name. To that end, I've changed the function name passed
to `process_status` to more closely align with what someone looking at logs or
timeseries might understand because

1) that's easier than more extensive refactoring
2) the other people who care are the ones reading the source and they can just
as easily search for calls to `print_errors`

* move the timers to process_status to observe successes also

* counters for sainted read/write
  • Loading branch information
winmillwill authored May 7, 2019
1 parent b3fa1ee commit 48961c0
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 36 deletions.
10 changes: 5 additions & 5 deletions src/filecache.c
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ static void get_fresh_fd(filecache_t *cache,
if (response_fd >= 0) close(response_fd);
if (response_filename[0] != '\0') unlink(response_filename);

session = session_request_init(path, NULL, false);
session = session_request_init(path, NULL, false, false);
if (!session || inject_error(filecache_error_freshsession)) {
g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: Failed session_request_init on GET", funcname);
// TODO(kibra): Manually cleaning up this lock sucks. We should make sure this happens in a better way.
Expand Down Expand Up @@ -503,7 +503,7 @@ static void get_fresh_fd(filecache_t *cache,

if (slist) curl_slist_free_all(slist);

bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status("get", session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
Expand Down Expand Up @@ -1059,9 +1059,9 @@ static void put_return_etag(const char *path, int fd, char *etag, GError **gerr)

// REVIEW: We didn't use to check for sesssion == NULL, so now we
// also call try_release_request_outstanding. Is this OK?
session = session_request_init(path, NULL, false);
session = session_request_init(path, NULL, false, true);
if (!session || inject_error(filecache_error_freshsession)) {
g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: Failed session_request_init on GET", funcname);
g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: Failed session_request_init on PUT", funcname);
// TODO(kibra): Manually cleaning up this lock sucks. We should make sure this happens in a better way.
try_release_request_outstanding();
goto finish;
Expand All @@ -1086,7 +1086,7 @@ static void put_return_etag(const char *path, int fd, char *etag, GError **gerr)

if (slist) curl_slist_free_all(slist);

bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status("put", session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
Expand Down
29 changes: 6 additions & 23 deletions src/fusedav.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,29 +102,12 @@ static int simple_propfind_with_redirect(
GError **gerr) {

GError *subgerr = NULL;
struct timespec start_time;
struct timespec now;
long elapsed_time;
// Alert on propfind taking longer than 4 seconds. This is rather arbitrary.
static const unsigned propfind_time_allotment = 4000; // 4 seconds
int ret;
float samplerate = 1.0; // Always sample these stats

log_print(LOG_DEBUG, SECTION_FUSEDAV_STAT, "simple_propfind_with_redirect: Performing (%s) PROPFIND of depth %d on path %s.",
last_updated > 0 ? "progressive" : "complete", depth, path);

clock_gettime(CLOCK_MONOTONIC, &start_time);
ret = simple_propfind(path, depth, last_updated, result_callback, userdata, &subgerr);
clock_gettime(CLOCK_MONOTONIC, &now);
elapsed_time = ((now.tv_sec - start_time.tv_sec) * 1000) + ((now.tv_nsec - start_time.tv_nsec) / (1000 * 1000));
stats_counter("propfind-count", 1, samplerate);
stats_timer("propfind-latency", elapsed_time);
if (elapsed_time > propfind_time_allotment) {
log_print(LOG_WARNING, SECTION_FUSEDAV_STAT, "simple_propfind_with_redirect: (%s) PROPFIND exceeded time allotment of %u ms; took %u ms.",
last_updated > 0 ? "progressive" : "complete", propfind_time_allotment, elapsed_time);
stats_counter("exceeded-time-propfind-count", 1, samplerate);
stats_timer("exceeded-time-propfind-latency", elapsed_time);
}
if (subgerr) {
g_propagate_prefixed_error(gerr, subgerr, "simple_propfind_with_redirect: ");
return ret;
Expand Down Expand Up @@ -326,7 +309,7 @@ static void getdir_propfind_callback(__unused void *userdata, const char *path,
bool tmp_session = true;
long elapsed_time = 0;

if (!(session = session_request_init(path, NULL, tmp_session)) || inject_error(fusedav_error_propfindsession)) {
if (!(session = session_request_init(path, NULL, tmp_session, false)) || inject_error(fusedav_error_propfindsession)) {
g_set_error(gerr, fusedav_quark(), ENETDOWN, "%s(%s): failed to get request session", funcname, path);
// TODO(kibra): Manually cleaning up this lock sucks. We should make sure this happens in a better way.
try_release_request_outstanding();
Expand All @@ -341,7 +324,7 @@ static void getdir_propfind_callback(__unused void *userdata, const char *path,
"%s: saw %lu; calling HEAD on %s", funcname, status_code, path);
timed_curl_easy_perform(session, &res, &response_code, &elapsed_time);

bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, tmp_session);
bool non_retriable_error = process_status("propfind-head", session, res, response_code, elapsed_time, idx, path, tmp_session);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
Expand Down Expand Up @@ -1170,7 +1153,7 @@ static void common_unlink(const char *path, bool do_unlink, GError **gerr) {
struct curl_slist *slist = NULL;
long elapsed_time = 0;

if (!(session = session_request_init(path, NULL, false)) || inject_error(fusedav_error_cunlinksession)) {
if (!(session = session_request_init(path, NULL, false, true)) || inject_error(fusedav_error_cunlinksession)) {
g_set_error(gerr, fusedav_quark(), ENETDOWN, "%s(%s): failed to get request session", funcname, path);
// TODO(kibra): Manually cleaning up this lock sucks. We should make sure this happens in a better way.
try_release_request_outstanding();
Expand Down Expand Up @@ -1302,7 +1285,7 @@ static int dav_rmdir(const char *path) {
struct curl_slist *slist = NULL;
long elapsed_time = 0;

if (!(session = session_request_init(fn, NULL, false))) {
if (!(session = session_request_init(fn, NULL, false, true))) {
log_print(LOG_ERR, SECTION_FUSEDAV_DIR, "%s(%s): failed to get session", funcname, path);
// TODO(kibra): Manually cleaning up this lock sucks. We should make sure this happens in a better way.
try_release_request_outstanding();
Expand Down Expand Up @@ -1377,7 +1360,7 @@ static int dav_mkdir(const char *path, mode_t mode) {
struct curl_slist *slist = NULL;
long elapsed_time = 0;

if (!(session = session_request_init(fn, NULL, false))) {
if (!(session = session_request_init(fn, NULL, false, true))) {
log_print(LOG_ERR, SECTION_FUSEDAV_DIR, "%s(%s): failed to get session", funcname, path);
// TODO(kibra): Manually cleaning up this lock sucks. We should make sure this happens in a better way.
try_release_request_outstanding();
Expand Down Expand Up @@ -1470,7 +1453,7 @@ static int dav_rename(const char *from, const char *to) {
char *escaped_to;
long elapsed_time = 0;

if (!(session = session_request_init(from, NULL, false))) {
if (!(session = session_request_init(from, NULL, false, true))) {
log_print(LOG_ERR, SECTION_FUSEDAV_FILE, "%s: failed to get session for %d:%s", funcname, fd, from);
// TODO(kibra): Manually cleaning up this lock sucks. We should make sure this happens in a better way.
try_release_request_outstanding();
Expand Down
7 changes: 3 additions & 4 deletions src/props.c
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ int simple_propfind(const char *path, size_t depth, time_t last_updated, props_r
void *userdata, GError **gerr) {
static const char *funcname = "simple_propfind";
char *description = NULL;
asprintf(&description, "%s-propfind", last_updated > 0 ? "progressive" : "complete");
// Local variables for cURL.
long response_code = 500; // seed it as bad so we can enter the loop
CURLcode res = CURLE_OK;
Expand All @@ -393,7 +394,7 @@ int simple_propfind(const char *path, size_t depth, time_t last_updated, props_r
if (last_updated > 0) {
asprintf(&query_string, "changes_since=%lu", last_updated);
}
session = session_request_init(path, query_string, false);
session = session_request_init(path, query_string, false, false);
if (!session || inject_error(props_error_spropfindsession)) {
g_set_error(gerr, props_quark(), ENETDOWN, "%s(%s): failed to get request session", funcname, path);
free(query_string);
Expand Down Expand Up @@ -451,7 +452,7 @@ int simple_propfind(const char *path, size_t depth, time_t last_updated, props_r
free(query_string);
query_string = NULL;

bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(description, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
Expand Down Expand Up @@ -538,8 +539,6 @@ int simple_propfind(const char *path, size_t depth, time_t last_updated, props_r
ret = 0;

finish:
asprintf(&description, "%s-propfinds", last_updated > 0 ? "progressive" : "complete");
stats_counter(description, 1, pfsamplerate);
free(description);
XML_ParserFree(parser);
return ret;
Expand Down
22 changes: 19 additions & 3 deletions src/session.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,13 +257,12 @@ static void print_errors(const int iter, const char *type_str, const char *fcn_n
log_print(LOG_ERR, SECTION_SESSION_DEFAULT,
"%s: curl iter %d on path %s; %s -- fusedav.%s.server-%s.%s",
fcn_name, iter, path, error_str, filesystem_cluster, nodeaddr, type_str);

// Don't treat slow requests as 'failures'; it messes up the failure/recovery stats
if (!slow_request) {
// Is this the first, second, or third failure for this request?
char *failure_str = NULL;
asprintf(&failure_str, "%d_failures", iter + 1);

// Is this the first, second, or third failure for this request?
stats_counter(failure_str, 1, samplerate);

free(failure_str);
Expand Down Expand Up @@ -1027,6 +1026,8 @@ bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
bool non_retriable_error = false; // default to retry
float samplerate = 1.0;

char *metric_str = NULL;

stats_counter("attempts", 1, samplerate);

if (res != CURLE_OK) {
Expand All @@ -1036,13 +1037,23 @@ bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
} else {
print_errors(iter, "curl_failures", fcn_name, res, response_code, elapsed_time, path);
}
// set the response code part of the metric str to 000 because that's what commandline curl does,
// and we need some kind of value for path globbing to be well-behaved.
asprintf(&metric_str, "%s.%s", fcn_name, "000");
stats_timer(metric_str, elapsed_time);
free(metric_str);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
delete_session(session, tmp_session);
return non_retriable_error;
}

// We want to count the outcome and latency for every response code, regardless of whether its an error.
asprintf(&metric_str, "%s.%lu", fcn_name, response_code);
stats_timer(metric_str, elapsed_time);
free(metric_str);
if (response_code >= 500) {
// We could treat 50x errors differently here

print_errors(iter, "status500_failures", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
delete_session(session, tmp_session);
Expand Down Expand Up @@ -1224,7 +1235,7 @@ static CURL *get_session(bool tmp_session) {
return session;
}

CURL *session_request_init(const char *path, const char *query_string, bool tmp_session) {
CURL *session_request_init(const char *path, const char *query_string, bool tmp_session, bool rw) {
CURL *session;
char *full_url = NULL;
char *escaped_path;
Expand All @@ -1234,6 +1245,11 @@ CURL *session_request_init(const char *path, const char *query_string, bool tmp_
// Calls to this function, on detecting this error, set ENETDOWN, which is appropriate
if (use_saint_mode()) {
log_print(LOG_NOTICE, SECTION_SESSION_DEFAULT, "%s: already in saint mode", funcname);
if (rw) {
stats_counter("saint_write", 1, 1.0);
return NULL;
}
stats_counter("saint_read", 1, 1.0);
return NULL;
}

Expand Down
2 changes: 1 addition & 1 deletion src/session.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
extern int num_filesystem_server_nodes;

int session_config_init(char *base, char *ca_cert, char *client_cert, bool grace);
CURL *session_request_init(const char *path, const char *query_string, bool temporary_handle);
CURL *session_request_init(const char *path, const char *query_string, bool temporary_handle, bool rw);
void session_config_free(void);
bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
const long response_code, const long elapsed_time, const int iter,
Expand Down

0 comments on commit 48961c0

Please sign in to comment.