diff --git a/README.md b/README.md index 2bb8040..134e332 100644 --- a/README.md +++ b/README.md @@ -3,15 +3,33 @@ The user setup behind [stats.c4dt.org](https://stats.c4dt.org). At root, it contains a bunch of script to run measurement, each regularly -started by an user systemd timer, found in `.config`. +started by a user systemd timer, found in `.config`. The timers output the results in a graphite database and shown in a grafana. Both are handle by the root `docker-compose.yaml`. To add or modify grafana's panels, look in `data/grafana`. -Deployment is done via Github Actions, which do: - -- rsync of the repo to the home directory of the user on the stats' server -- stop all timers of the user -- start timers in the repo +Deployment is done via Ansible, using the [stats role](https://github.com/c4dt/ansible-config/tree/main/playbooks/roles/stats). If you need help with systemd, there is a [Systemd Cheatsheet](README.systemd.md) + +## Testing + +To test the new values, the easiest way is to do the following: + +- ssh to the stats server, then run: + +```bash +sudo -iu stats +git pull +git checkout your_branch +make +``` + +Once you've finished testing, don't forget to switch back to `main`: + +```bash +git checkout main +make +``` + +And then re-apply the Ansible stats role. diff --git a/dashboards-to-provision/servers.dashboard.py b/dashboards-to-provision/servers.dashboard.py index f57d775..9c5d550 100644 --- a/dashboards-to-provision/servers.dashboard.py +++ b/dashboards-to-provision/servers.dashboard.py @@ -22,6 +22,8 @@ from grafanalib.influxdb import InfluxDBTarget +# The 'every' duration passed to 'aggregateWindow' is the displayed time range +# divided by 800, so about 800 points are returned whatever the range shown. 
def target(server_name: str) -> InfluxDBTarget: """Generate Target for disk usage on given server""" return InfluxDBTarget( @@ -33,7 +35,8 @@ def target(server_name: str) -> InfluxDBTarget: |> filter(fn: (r) => r.host == "{server_name}") |> filter(fn: (r) => r._field == "used_percent") |> keep(columns: ["_time", "_value", "path"]) - |> last() + |> aggregateWindow(every: duration(v:(uint(v: v.timeRangeStop) - + uint(v: v.timeRangeStart))/uint(v: 800)), fn: mean) """, ) @@ -66,46 +69,46 @@ def target(server_name: str) -> InfluxDBTarget: ] ), panels=[ - GaugePanel( - title="$server: disk usage", - gridPos=GridPos(h=8, w=32, x=0, y=0), - targets=[target("$server")], - dataSource=DATA_SOURCE, - calc="lastNotNull", - thresholds=[ - Threshold(index=0, color="green", value=0.0), - Threshold(index=1, color="orange", value=75.0), - Threshold(index=2, color="red", value=90.0), - ], - format=UNITS.PERCENT_FORMAT, - thresholdMarkers=False, - repeat=Repeat(direction="v", variable="server"), - ) - ] - + [ - # one graph/serie per host to actually generate alerts - Graph( - title=f"{host}: trigger disk usage alert", - gridPos=GridPos(h=0, w=0, x=0, y=0), # invisible - targets=[target(host)], - dataSource=DATA_SOURCE, - nullPointMode=NULL_AS_NULL, - alert=Alert( - name=f"{host} disk nearly full", - message=f"{host} disk nearly full", - frequency="10s", - gracePeriod="1m", - alertConditions=[ - AlertCondition( - target=target(host), - timeRange=TimeRange("1m", "now"), - evaluator=GreaterThan(90), - operator=OP_AND, - reducerType=RTYPE_LAST, - ), - ], - ), - ) - for host in HOSTS - ], + GaugePanel( + title="$server: disk usage", + gridPos=GridPos(h=8, w=32, x=0, y=0), + targets=[target("$server")], + dataSource=DATA_SOURCE, + calc="lastNotNull", + thresholds=[ + Threshold(index=0, color="green", value=0.0), + Threshold(index=1, color="orange", value=75.0), + Threshold(index=2, color="red", value=90.0), + ], + format=UNITS.PERCENT_FORMAT, + thresholdMarkers=False, + 
repeat=Repeat(direction="v", variable="server"), + ) + ] + + [ + # one graph/serie per host to actually generate alerts + Graph( + title=f"{host}: trigger disk usage alert", + gridPos=GridPos(h=0, w=0, x=0, y=0), # invisible + targets=[target(host)], + dataSource=DATA_SOURCE, + nullPointMode=NULL_AS_NULL, + alert=Alert( + name=f"{host} disk nearly full", + message=f"{host} disk nearly full", + frequency="10s", + gracePeriod="1m", + alertConditions=[ + AlertCondition( + target=target(host), + timeRange=TimeRange("1m", "now"), + evaluator=GreaterThan(90), + operator=OP_AND, + reducerType=RTYPE_LAST, + ), + ], + ), + ) + for host in HOSTS + ], ).auto_panel_ids() diff --git a/dashboards-to-provision/services/drand.dashboard.py b/dashboards-to-provision/services/drand.dashboard.py index a5319bb..8e267f8 100644 --- a/dashboards-to-provision/services/drand.dashboard.py +++ b/dashboards-to-provision/services/drand.dashboard.py @@ -29,9 +29,9 @@ "drand: CPU usage", Target(target="drand.get-cpu-percentage"), GridPos(h=8, w=12, x=12, y=8), - UNITS.PERCENT_FORMAT, + UNITS.NO_FORMAT, frequency=1 * 60, - alert_at=1.5, + alert_at=10, ), simple_graph( "drand.c4dt.org: TTY activity",