Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Track DB scheduling delay per-request
Browse files Browse the repository at this point in the history
For each request, track the amount of time spent waiting for a db
connection. This entails adding it to the LoggingContext and we may as well add
metrics for it while we are passing.
  • Loading branch information
richvdh committed Jan 12, 2018
1 parent ca8c4cd commit 39b2998
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 6 deletions.
12 changes: 12 additions & 0 deletions synapse/http/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@
"response_db_txn_duration", labels=["method", "servlet", "tag"]
)

# seconds spent waiting for a db connection, when processing this request
#
# it's a counter rather than a distribution, because the count would always
# be the same as that of all the other distributions.
#
# FIXME: use a floating-point rather than integer metric
response_db_sched_duration = metrics.register_distribution(
"response_db_sched_duration", labels=["method", "servlet", "tag"]
)

_next_request_id = 0

Expand Down Expand Up @@ -345,6 +354,9 @@ def stop(self, clock, request):
response_db_txn_duration.inc_by(
context.db_txn_duration_ms / 1000., request.method, self.name, tag
)
response_db_sched_duration.inc_by(
context.db_sched_duration_ms / 1000., request.method, self.name, tag
)


class RootRedirect(resource.Resource):
Expand Down
4 changes: 3 additions & 1 deletion synapse/http/site.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,22 @@ def finished_processing(self):
ru_utime, ru_stime = context.get_resource_usage()
db_txn_count = context.db_txn_count
db_txn_duration_ms = context.db_txn_duration_ms
db_sched_duration_ms = context.db_sched_duration_ms
except Exception:
ru_utime, ru_stime = (0, 0)
db_txn_count, db_txn_duration_ms = (0, 0)

self.site.access_logger.info(
"%s - %s - {%s}"
" Processed request: %dms (%dms, %dms) (%dms/%d)"
" Processed request: %dms (%dms, %dms) (%dms/%dms/%d)"
" %sB %s \"%s %s %s\" \"%s\"",
self.getClientIP(),
self.site.site_tag,
self.authenticated_entity,
int(time.time() * 1000) - self.start_time,
int(ru_utime * 1000),
int(ru_stime * 1000),
db_sched_duration_ms,
db_txn_duration_ms,
int(db_txn_count),
self.sentLength,
Expand Down
4 changes: 3 additions & 1 deletion synapse/storage/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,9 @@ def inner_func(conn, *args, **kwargs):
txn_end_time_ms = time.time() * 1000
txn_duration = txn_end_time_ms - txn_start_time_ms

current_context.add_database_transaction(txn_duration)
current_context.add_database_transaction(
txn_duration, sched_delay_ms,
)
self._current_txn_total_time += txn_duration
self._txn_perf_counters.update(
desc, txn_start_time_ms, txn_end_time_ms,
Expand Down
11 changes: 8 additions & 3 deletions synapse/util/logcontext.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ class LoggingContext(object):

__slots__ = [
"previous_context", "name", "ru_stime", "ru_utime",
"db_txn_count", "db_txn_duration_ms", "usage_start", "usage_end",
"db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms",
"usage_start", "usage_end",
"main_thread", "alive",
"request", "tag",
]
Expand All @@ -83,7 +84,7 @@ def start(self):
def stop(self):
pass

def add_database_transaction(self, duration_ms):
def add_database_transaction(self, duration_ms, sched_ms):
pass

def __nonzero__(self):
Expand All @@ -101,6 +102,9 @@ def __init__(self, name=None):
# ms spent waiting for db txns, excluding scheduling time
self.db_txn_duration_ms = 0

# ms spent waiting for db txns to be scheduled
self.db_sched_duration_ms = 0

self.usage_start = None
self.usage_end = None
self.main_thread = threading.current_thread()
Expand Down Expand Up @@ -206,9 +210,10 @@ def get_resource_usage(self):

return ru_utime, ru_stime

def add_database_transaction(self, duration_ms):
def add_database_transaction(self, duration_ms, sched_ms):
self.db_txn_count += 1
self.db_txn_duration_ms += duration_ms
self.db_sched_duration_ms += sched_ms


class LoggingContextFilter(logging.Filter):
Expand Down
19 changes: 18 additions & 1 deletion synapse/util/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@
"block_db_txn_duration", labels=["block_name"]
)

# seconds spent waiting for a db connection, in this block
#
# it's a counter rather than a distribution, because the count would always
# be the same as that of all the other distributions.
#
# FIXME: use a floating-point rather than integer metric
block_db_sched_duration = metrics.register_counter(
"block_db_sched_duration", labels=["block_name"]
)


def measure_func(name):
def wrapper(func):
Expand All @@ -65,7 +75,9 @@ def measured_func(self, *args, **kwargs):
class Measure(object):
__slots__ = [
"clock", "name", "start_context", "start", "new_context", "ru_utime",
"ru_stime", "db_txn_count", "db_txn_duration_ms", "created_context"
"ru_stime",
"db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms",
"created_context",
]

def __init__(self, clock, name):
Expand All @@ -86,6 +98,7 @@ def __enter__(self):
self.ru_utime, self.ru_stime = self.start_context.get_resource_usage()
self.db_txn_count = self.start_context.db_txn_count
self.db_txn_duration_ms = self.start_context.db_txn_duration_ms
self.db_sched_duration_ms = self.start_context.db_sched_duration_ms

def __exit__(self, exc_type, exc_val, exc_tb):
if isinstance(exc_type, Exception) or not self.start_context:
Expand Down Expand Up @@ -118,6 +131,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
(context.db_txn_duration_ms - self.db_txn_duration_ms) / 1000.,
self.name
)
block_db_sched_duration.inc_by(
(context.db_sched_duration_ms - self.db_sched_duration_ms) / 1000.,
self.name
)

if self.created_context:
self.start_context.__exit__(exc_type, exc_val, exc_tb)

0 comments on commit 39b2998

Please sign in to comment.