Skip to content

Commit

Permalink
Fixing small issues
Browse files Browse the repository at this point in the history
Closes #68
Closes #69 (except the 'edit' part which I cannot find)

Also updates the README.md wrt how it's deployed.
  • Loading branch information
ineiti committed Jan 24, 2024
1 parent 68ab716 commit 0c38621
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 51 deletions.
30 changes: 24 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,33 @@
The user setup behind [stats.c4dt.org](https://stats.c4dt.org).

At the root, it contains a bunch of scripts to run measurements, each regularly
started by an user systemd timer, found in `.config`.
started by a user systemd timer, found in `.config`.
The timers output the results to a graphite database, and the results are shown in grafana.
Both are handled by the root `docker-compose.yaml`.
To add or modify grafana's panels, look in `data/grafana`.

Deployment is done via Github Actions, which do:

- rsync of the repo to the home directory of the user on the stats' server
- stop all timers of the user
- start timers in the repo
Deployment is done via ansible: [stats role](https://github.com/c4dt/ansible-config/tree/main/playbooks/roles/stats)

If you need help with systemd, there is a [Systemd Cheatsheet](README.systemd.md)

## Testing

To test the new values, the easiest way is to do the following:

- ssh to the stats-server, then

```bash
sudo -iu stats
git pull
git checkout your_branch
make
```

Once you've finished testing, don't forget to switch back to `main`:

```bash
git checkout main
make
```

And then re-apply ansible.
89 changes: 46 additions & 43 deletions dashboards-to-provision/servers.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from grafanalib.influxdb import InfluxDBTarget


# The 'every' argument of 'aggregateWindow' is computed from the displayed time
# range divided by 800, so the data is always aggregated to 800 points,
# no matter the range being shown.
def target(server_name: str) -> InfluxDBTarget:
"""Generate Target for disk usage on given server"""
return InfluxDBTarget(
Expand All @@ -33,7 +35,8 @@ def target(server_name: str) -> InfluxDBTarget:
|> filter(fn: (r) => r.host == "{server_name}")
|> filter(fn: (r) => r._field == "used_percent")
|> keep(columns: ["_time", "_value", "path"])
|> last()
|> aggregateWindow(every: duration(v:(uint(v: v.timeRangeStop) -
uint(v: v.timeRangeStart))/uint(v: 800)), fn: mean)
""",
)

Expand Down Expand Up @@ -66,46 +69,46 @@ def target(server_name: str) -> InfluxDBTarget:
]
),
panels=[
GaugePanel(
title="$server: disk usage",
gridPos=GridPos(h=8, w=32, x=0, y=0),
targets=[target("$server")],
dataSource=DATA_SOURCE,
calc="lastNotNull",
thresholds=[
Threshold(index=0, color="green", value=0.0),
Threshold(index=1, color="orange", value=75.0),
Threshold(index=2, color="red", value=90.0),
],
format=UNITS.PERCENT_FORMAT,
thresholdMarkers=False,
repeat=Repeat(direction="v", variable="server"),
)
]
+ [
# one graph/serie per host to actually generate alerts
Graph(
title=f"{host}: trigger disk usage alert",
gridPos=GridPos(h=0, w=0, x=0, y=0), # invisible
targets=[target(host)],
dataSource=DATA_SOURCE,
nullPointMode=NULL_AS_NULL,
alert=Alert(
name=f"{host} disk nearly full",
message=f"{host} disk nearly full",
frequency="10s",
gracePeriod="1m",
alertConditions=[
AlertCondition(
target=target(host),
timeRange=TimeRange("1m", "now"),
evaluator=GreaterThan(90),
operator=OP_AND,
reducerType=RTYPE_LAST,
),
],
),
)
for host in HOSTS
],
GaugePanel(
title="$server: disk usage",
gridPos=GridPos(h=8, w=32, x=0, y=0),
targets=[target("$server")],
dataSource=DATA_SOURCE,
calc="lastNotNull",
thresholds=[
Threshold(index=0, color="green", value=0.0),
Threshold(index=1, color="orange", value=75.0),
Threshold(index=2, color="red", value=90.0),
],
format=UNITS.PERCENT_FORMAT,
thresholdMarkers=False,
repeat=Repeat(direction="v", variable="server"),
)
]
+ [
# one graph/serie per host to actually generate alerts
Graph(
title=f"{host}: trigger disk usage alert",
gridPos=GridPos(h=0, w=0, x=0, y=0), # invisible
targets=[target(host)],
dataSource=DATA_SOURCE,
nullPointMode=NULL_AS_NULL,
alert=Alert(
name=f"{host} disk nearly full",
message=f"{host} disk nearly full",
frequency="10s",
gracePeriod="1m",
alertConditions=[
AlertCondition(
target=target(host),
timeRange=TimeRange("1m", "now"),
evaluator=GreaterThan(90),
operator=OP_AND,
reducerType=RTYPE_LAST,
),
],
),
)
for host in HOSTS
],
).auto_panel_ids()
4 changes: 2 additions & 2 deletions dashboards-to-provision/services/drand.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
"drand: CPU usage",
Target(target="drand.get-cpu-percentage"),
GridPos(h=8, w=12, x=12, y=8),
UNITS.PERCENT_FORMAT,
UNITS.NO_FORMAT,
frequency=1 * 60,
alert_at=1.5,
alert_at=10,
),
simple_graph(
"drand.c4dt.org: TTY activity",
Expand Down

0 comments on commit 0c38621

Please sign in to comment.