Skip to content

Commit

Permalink
Merge #25308
Browse files Browse the repository at this point in the history
25308: sql: expose metrics snapshot in crdb_internal r=bdarnell a=tschottdorf

This includes a metrics snapshot in crdb_internal.node_metrics, and also
scrapes this table as part of `debug zip`.

In follow-up work, we can extract from these metrics a health check that
operates on the metric snapshot. For example, you might check whether
there are underreplicated ranges, or high heartbeat latencies. Taking
this one step further, we may introduce a periodic query on this table
as a health check, with a distress signal sent through gossip when
problems are detected.

Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
  • Loading branch information
craig[bot] and tbg committed May 6, 2018
2 parents 4615dd1 + 21e0784 commit a03c07b
Show file tree
Hide file tree
Showing 10 changed files with 183 additions and 117 deletions.
1 change: 1 addition & 0 deletions pkg/cli/cli_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2203,6 +2203,7 @@ writing ` + os.DevNull + `
debug/settings
debug/gossip/liveness
debug/gossip/nodes
debug/metrics
debug/nodes/1/status
debug/nodes/1/gossip
debug/nodes/1/stacks
Expand Down
11 changes: 8 additions & 3 deletions pkg/cli/zip.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ func runDebugZip(cmd *cobra.Command, args []string) error {
eventsName = base + "/events"
gossipLName = base + "/gossip/liveness"
gossipNName = base + "/gossip/nodes"
metricsName = base + "/metrics"
livenessName = base + "/liveness"
nodesPrefix = base + "/nodes"
schemaPrefix = base + "/schema"
Expand Down Expand Up @@ -190,11 +191,15 @@ func runDebugZip(cmd *cobra.Command, args []string) error {
{
queryLiveness := "SELECT * FROM crdb_internal.gossip_liveness;"
queryNodes := "SELECT * FROM crdb_internal.gossip_nodes;"
queryMetrics := "SELECT * FROM crdb_internal.node_metrics;"

if err := dumpGossipData(z, sqlConn, queryLiveness, gossipLName); err != nil {
if err := dumpTableDataForZip(z, sqlConn, queryLiveness, gossipLName); err != nil {
return err
}
if err := dumpGossipData(z, sqlConn, queryNodes, gossipNName); err != nil {
if err := dumpTableDataForZip(z, sqlConn, queryNodes, gossipNName); err != nil {
return err
}
if err := dumpTableDataForZip(z, sqlConn, queryMetrics, metricsName); err != nil {
return err
}
}
Expand Down Expand Up @@ -358,7 +363,7 @@ func runDebugZip(cmd *cobra.Command, args []string) error {
return nil
}

func dumpGossipData(z *zipper, conn *sqlConn, query string, name string) error {
func dumpTableDataForZip(z *zipper, conn *sqlConn, query string, name string) error {
w, err := z.create(name)
if err != nil {
return err
Expand Down
5 changes: 4 additions & 1 deletion pkg/cmd/roachtest/debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func registerDebug(r *registry) {
return err
}

if err := c.RunE(ctx, c.Node(node), "sudo apt-get install unzip"); err != nil {
if err := c.RunE(ctx, c.Node(node), "unzip -v || sudo apt-get install unzip"); err != nil {
return err
}

Expand All @@ -53,6 +53,9 @@ func registerDebug(r *registry) {
return err
}

if err := c.RunE(ctx, c.Node(node), "grep -F 'liveness.heartbeatlatency-p99' ./debug/metrics"); err != nil {
return err
}
if err := c.RunE(ctx, c.Node(node), "rm -rf debug"); err != nil {
return err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
AmbientCtx: s.cfg.AmbientCtx,
DB: s.db,
Gossip: s.gossip,
MetricsRecorder: s.recorder,
DistSender: s.distSender,
RPCContext: s.rpcContext,
LeaseManager: s.leaseMgr,
Expand Down
41 changes: 41 additions & 0 deletions pkg/sql/crdb_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ var crdbInternal = virtualSchema{
crdbInternalLeasesTable,
crdbInternalLocalQueriesTable,
crdbInternalLocalSessionsTable,
crdbInternalLocalMetricsTable,
crdbInternalPartitionsTable,
crdbInternalRangesTable,
crdbInternalRuntimeInfoTable,
Expand Down Expand Up @@ -867,6 +868,46 @@ func populateSessionsTable(
return nil
}

// crdbInternalLocalMetricsTable backs the virtual table
// crdb_internal.node_metrics. Each row is one (store_id, name, value) entry
// taken from the status summary of the current node's metrics recorder:
// node-level metrics are emitted first with a NULL store_id, then the metrics
// of every store, tagged with that store's ID.
var crdbInternalLocalMetricsTable = virtualSchemaTable{
	schema: `CREATE TABLE crdb_internal.node_metrics (
store_id INT NULL, -- the store, if any, to which this metric belongs
name STRING, -- name of the metric
value FLOAT -- value of the metric
);`,

	populate: func(ctx context.Context, p *planner, _ *DatabaseDescriptor, addRow func(...tree.Datum) error) error {
		// Only superusers may read this table.
		if err := p.RequireSuperUser(ctx, "read crdb_internal.node_metrics"); err != nil {
			return err
		}

		recorder := p.ExecCfg().MetricsRecorder
		if recorder == nil {
			// Without a recorder there is nothing to report; the table is empty.
			return nil
		}
		summary := recorder.GetStatusSummary(ctx)

		// Node-level metrics are not attached to any store, hence the NULL.
		for metricName, metricValue := range summary.Metrics {
			err := addRow(
				tree.DNull,
				tree.NewDString(metricName),
				tree.NewDFloat(tree.DFloat(metricValue)),
			)
			if err != nil {
				return err
			}
		}

		// Store-level metrics, one group of rows per store status.
		for idx := range summary.StoreStatuses {
			storeID := tree.NewDInt(tree.DInt(summary.StoreStatuses[idx].Desc.StoreID))
			for metricName, metricValue := range summary.StoreStatuses[idx].Metrics {
				err := addRow(
					storeID,
					tree.NewDString(metricName),
					tree.NewDFloat(tree.DFloat(metricValue)),
				)
				if err != nil {
					return err
				}
			}
		}
		return nil
	},
}

// crdbInternalBuiltinFunctionsTable exposes the built-in function
// metadata.
var crdbInternalBuiltinFunctionsTable = virtualSchemaTable{
Expand Down
2 changes: 2 additions & 0 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/rpc"
"github.com/cockroachdb/cockroach/pkg/security"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/server/status"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/distsqlrun"
Expand Down Expand Up @@ -194,6 +195,7 @@ type ExecutorConfig struct {
Clock *hlc.Clock
DistSQLSrv *distsqlrun.ServerImpl
StatusServer serverpb.StatusServer
MetricsRecorder *status.MetricsRecorder
SessionRegistry *SessionRegistry
JobRegistry *jobs.Registry
VirtualSchemas *VirtualSchemaHolder
Expand Down
4 changes: 4 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/crdb_internal
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ kv_node_status
kv_store_status
leases
node_build_info
node_metrics
node_queries
node_runtime_info
node_sessions
Expand Down Expand Up @@ -314,6 +315,9 @@ select * from crdb_internal.gossip_nodes
query error pq: only superusers are allowed to read crdb_internal.gossip_liveness
select * from crdb_internal.gossip_liveness

query error pq: only superusers are allowed to read crdb_internal.node_metrics
select * from crdb_internal.node_metrics

query error pq: only superusers are allowed to read crdb_internal.kv_node_status
select * from crdb_internal.kv_node_status

Expand Down
6 changes: 3 additions & 3 deletions pkg/sql/logictest/testdata/logic_test/explain
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ sort · ·
└── render · ·
└── filter · ·
└── values · ·
· size 6 columns, 81 rows
· size 6 columns, 82 rows

query TTT
EXPLAIN SHOW DATABASE
Expand Down Expand Up @@ -251,7 +251,7 @@ sort · ·
├── render · ·
│ └── filter · ·
│ └── values · ·
│ size 17 columns, 744 rows
│ size 17 columns, 747 rows
└── render · ·
└── filter · ·
└── values · ·
Expand All @@ -265,7 +265,7 @@ sort · ·
└── render · ·
└── filter · ·
└── values · ·
· size 8 columns, 370 rows
· size 8 columns, 373 rows


query TTT
Expand Down
Loading

0 comments on commit a03c07b

Please sign in to comment.