Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rule: increase the `for` value from 1m to 5m. #20

Merged
merged 2 commits into from
Nov 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"fmt"
"github.com/hashicorp/go-version"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/pkg/rulefmt"
"github.com/spf13/cobra"
"github.com/tidwall/gjson"
Expand All @@ -33,10 +34,14 @@ import (
"reflect"
"strings"
yaml "gopkg.in/yaml.v2"
"time"
)

// expect_basic_file_size is used to check file number in auto generated directory.
const expect_basic_file_size = 17
const (
// expect_basic_file_size is used to check file number in auto generated directory.
expect_basic_file_size = 17
// ALERT_FOR_CONFIG is the duration string that model.ParseDuration turns into forConfig.
// replaceAlertExpr assigns forConfig to rule.For whenever a rule's existing For is 60 seconds or less.
ALERT_FOR_CONFIG = "5m"
)

var (
lowest_version string
Expand Down Expand Up @@ -77,9 +82,12 @@ var (
strings.ToUpper("pd_cluster_lost_connect_tikv_nums"): `(sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)`,
strings.ToUpper("pd_pending_peer_region_count"): `(sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0)`,
}

forConfig, configerr = model.ParseDuration(ALERT_FOR_CONFIG)
)

func main() {
checkErr(configerr, "config for duration failed")
var rootCmd = &cobra.Command{
Use: "monitoring",
Run: func(co *cobra.Command, args []string) {
Expand Down Expand Up @@ -392,6 +400,10 @@ func replaceAlertExpr(content []byte) ([]byte, error){
return rule
}

if time.Duration(rule.For) <= (time.Second * 60) {
rule.For = forConfig
}

rule.Expr = newExpr
if _, ok := rule.Labels["expr"]; ok {
rule.Labels["expr"] = newExpr
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.10/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.11/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.12/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.13/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.14/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.15/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.16/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
6 changes: 3 additions & 3 deletions monitor/v2.1.17/rules/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ groups:
- alert: PD_cluster_lost_connect_tikv_nums
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
> 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance)
Expand All @@ -56,7 +56,7 @@ groups:
- alert: PD_cluster_low_space
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) >
Expand Down Expand Up @@ -122,7 +122,7 @@ groups:
- alert: PD_pending_peer_region_count
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
100) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
for: 5m
labels:
env: ENV_LABELS_ENV
expr: (sum( pd_regions_status{type="pending_peer_region_count"} ) by (instance) >
Expand Down
11 changes: 11 additions & 0 deletions monitor/v2.1.18/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Image that bundles the monitoring dashboards, rules and datasources together with init.sh.
FROM busybox

# COPY rather than ADD: init.sh is a plain local file, so ADD's archive
# extraction and remote-URL handling are not needed here.
COPY init.sh /usr/bin/init.sh
RUN chmod +x /usr/bin/init.sh

# Stage the dashboard, rule and datasource files under /tmp/ inside the image.
COPY dashboards/*.json /tmp/
COPY rules/*.rules.yml /tmp/
COPY datasources/*.yaml /tmp/

# init.sh is the entrypoint; CMD supplies its default arguments, which can be
# overridden on the `docker run` command line.
# NOTE(review): the meaning of each positional argument is defined by init.sh,
# which is not shown here — confirm against that script before changing them.
ENTRYPOINT ["/usr/bin/init.sh"]
CMD ["TIDB-Cluster", "/grafana-dashboard-definitions/tidb/", "false", "/etc/prometheus"]
Loading