Skip to content

Commit

Permalink
Merge pull request #20 from pingcap/update-for-value
Browse files Browse the repository at this point in the history
Rule: increase the `for` value from 1m to 5m.
  • Loading branch information
qiffang authored Nov 11, 2019
2 parents 033c940 + 18114d9 commit 72d6aa3
Show file tree
Hide file tree
Showing 97 changed files with 101,064 additions and 620 deletions.
16 changes: 14 additions & 2 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"fmt"
"github.com/hashicorp/go-version"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/pkg/rulefmt"
"github.com/spf13/cobra"
"github.com/tidwall/gjson"
Expand All @@ -33,10 +34,14 @@ import (
"reflect"
"strings"
yaml "gopkg.in/yaml.v2"
"time"
)

// expect_basic_file_size is used to check file number in auto generated directory.
const expect_basic_file_size = 17
const (
// expect_basic_file_size is used to check the number of files in the auto generated directory.
expect_basic_file_size = 17
// ALERT_FOR_CONFIG is the duration string parsed into forConfig, which
// replaces a rule's `for` value when that value is 60 seconds or less.
ALERT_FOR_CONFIG = "5m"
)

var (
lowest_version string
Expand Down Expand Up @@ -77,9 +82,12 @@ var (
strings.ToUpper("pd_cluster_lost_connect_tikv_nums"): `(sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)`,
strings.ToUpper("pd_pending_peer_region_count"): `(sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0)`,
}

forConfig, configerr = model.ParseDuration(ALERT_FOR_CONFIG)
)

func main() {
checkErr(configerr, "config for duration failed")
var rootCmd = &cobra.Command{
Use: "monitoring",
Run: func(co *cobra.Command, args []string) {
Expand Down Expand Up @@ -392,6 +400,10 @@ func replaceAlertExpr(content []byte) ([]byte, error){
return rule
}

if time.Duration(rule.For) <= (time.Second * 60) {
rule.For = forConfig
}

rule.Expr = newExpr
if _, ok := rule.Labels["expr"]; ok {
rule.Labels["expr"] = newExpr
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.10/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.11/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.12/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.13/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.14/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.15/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.16/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.17/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
11 changes: 11 additions & 0 deletions monitor/v2.1.18/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Packages init.sh together with the files matched under dashboards/, rules/
# and datasources/, and runs init.sh as the container entrypoint.
FROM busybox

# COPY rather than ADD: init.sh is a plain local file, so ADD's extra
# URL-fetch and archive-extraction behaviour is not wanted here.
COPY init.sh /usr/bin/init.sh
RUN chmod +x /usr/bin/init.sh

# All three file sets land in the same /tmp/ directory.
COPY dashboards/*.json /tmp/
COPY rules/*.rules.yml /tmp/
COPY datasources/*.yaml /tmp/

ENTRYPOINT ["/usr/bin/init.sh"]
# Default arguments handed to init.sh. Their meaning is defined by init.sh,
# which is not visible here -- TODO confirm before changing them.
CMD ["TIDB-Cluster", "/grafana-dashboard-definitions/tidb/", "false", "/etc/prometheus"]
Loading

0 comments on commit 72d6aa3

Please sign in to comment.