Skip to content

Commit

Permalink
Merge pull request #8 from tmknight/develop
Browse files Browse the repository at this point in the history
Refine several points of logic and error handling
  • Loading branch information
tmknight authored Jan 11, 2024
2 parents 63c508b + de56207 commit 3119aad
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 44 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "docker-autoheal"
version = "0.2.3"
version = "0.2.7"
authors = ["Travis M Knight <[email protected]>"]
license = "MIT"
description = "Monitor and restart unhealthy docker containers"
Expand Down
15 changes: 9 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ The `docker-autoheal` binary may be executed via a native OS or via a Docker con
| Variable | Default | Description |
|:----------------------------:|:---------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
| **AUTOHEAL_CONNECTON_TYPE** | local | This determines how `docker-autoheal` connects to Docker (One of: local, socket, http) |
| **AUTOHEAL_CONTAINER_LABEL** | autoheal | This is the label (set to `true`) that `docker-autoheal` will monitor and remediate - or set to `all` to simply monitor all containers on the host |
| **AUTOHEAL_CONTAINER_LABEL** | not set | This is the container label that `docker-autoheal` will use as filter criteria for monitoring - or set to `all` to simply monitor all containers on the host |
| **AUTOHEAL_STOP_TIMEOUT** | 10 | Docker waits `n` seconds for a container to stop before killing it during restarts <!-- (overridable via label; see below) --> |
| **AUTOHEAL_INTERVAL** | 5 | Check container health every `n` seconds** |
| **AUTOHEAL_START_DELAY** | 0 | Wait `n` seconds before first health check |
Expand Down Expand Up @@ -45,6 +45,7 @@ The `docker-autoheal` binary may be executed via a native OS or via a Docker con
export AUTOHEAL_CONTAINER_LABEL=all
/usr/local/bin/docker-autoheal > /var/log/docker-autoheal.log &
```
Will connect to the local Docker host and monitor all containers

### Socket

Expand All @@ -53,10 +54,11 @@ docker run -d \
--name autoheal \
--restart=always \
-e AUTOHEAL_CONNECTON_TYPE=socket \
-e AUTOHEAL_CONTAINER_LABEL=all \
-e AUTOHEAL_CONTAINER_LABEL=autoheal \
-v /var/run/docker.sock:/var/run/docker.sock \
tmknight/docker-autoheal
```
Will connect to the Docker host via unix socket location /var/run/docker.sock or Windows named pipe location //./pipe/docker_engine and monitor only containers with a label named `autoheal`

### Http

Expand All @@ -65,19 +67,20 @@ docker run -d \
--name autoheal \
--restart=always \
-e AUTOHEAL_CONNECTON_TYPE=http \
-e AUTOHEAL_CONTAINER_LABEL=all \
-e DOCKER_SOCK=tcp://HOST:PORT \
-e AUTOHEAL_CONTAINER_LABEL=watch-me \
-e DOCKER_SOCK=MYHOST:2375 \
-v /path/to/certs/:/certs/:ro \
tmknight/docker-autoheal
```
Will connect to the Docker host via hostname or IP and the specified port and monitor only containers with a label named `watch-me`

## Other info

### Docker labels

a) Apply the label `autoheal=true` to your container to have it watched
a) Apply the label `autoheal=true` to your container to have it watched (only the label name is assessed, the value is not currently used)

b) Set ENV `AUTOHEAL_CONTAINER_LABEL` to the label name that has the value `true` (e.g. `AUTOHEAL_CONTAINER_LABEL=autoheal`)
b) Set ENV `AUTOHEAL_CONTAINER_LABEL` to that label name (e.g. `AUTOHEAL_CONTAINER_LABEL=autoheal`)

OR

Expand Down
1 change: 1 addition & 0 deletions docker/docker-autoheal.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

101 changes: 65 additions & 36 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ async fn log_message(msg: &str) {
/// Returns the value of the environment variable `key`, lowercased.
///
/// Falls back to `default` (also lowercased) when `std::env::var` returns an
/// error, i.e. the variable is unset or its value is not valid Unicode.
fn get_env(key: &str, default: &str) -> String {
    match std::env::var(key) {
        Ok(val) => val.to_lowercase(),
        // `to_lowercase` already yields an owned `String`, so no intermediate
        // `to_string()` allocation is needed for the fallback.
        Err(_) => default.to_lowercase(),
    }
}

Expand Down Expand Up @@ -83,13 +83,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
}
}
// Unwrap final connection parameters
let msg0 = format!("Monitoring Docker via {}", autoheal_connection_type);
let msg0 = format!("[INFO] Monitoring Docker via {}", autoheal_connection_type);
log_message(&msg0).await;
if autoheal_connection_type == "http" {
let msg1 = format!(
"[INFO] Connecting to {}:{}",
autoheal_tcp_host, autoheal_tcp_port
);
log_message(&msg1).await;
}
let docker = docker_tmp.unwrap();

// Delay start of loop if specified
if autoheal_start_delay > 0 {
let msg0 = format!("Delaying evaluation {}s on request", autoheal_start_delay);
let msg0 = format!(
"[INFO] Delaying evaluation {}s on request",
autoheal_start_delay
);
log_message(&msg0).await;
std::thread::sleep(Duration::from_secs(autoheal_start_delay));
}
Expand All @@ -100,6 +110,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Build container assessment criteria
let mut filters = HashMap::new();
filters.insert("health", vec!["unhealthy"]);
filters.insert("status", vec!["running", "exited", "dead"]);
if autoheal_container_label != "all" {
filters.insert("label", vec![&autoheal_container_label]);
}
Expand All @@ -116,46 +127,64 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let docker_clone = docker.clone();
let join = tokio::task::spawn(async move {
// Get name of container
let name0 = &container.names.unwrap()[0];
let name = name0.trim_matches('/').trim();
let name_tmp = match &container.names {
Some(names) => &names[0],
None => {
let msg0 = format!("[ERROR] Could not reliably determine container name");
log_message(&msg0).await;
""
}
};
let name = name_tmp.trim_matches('/').trim();

// Get id of container
let id: String = container.id.unwrap().chars().take(12).collect();
let id: String = match container.id {
Some(id) => id.chars().take(12).collect(),
None => {
let msg0 = format!("[ERROR] Could not reliably determine container id");
log_message(&msg0).await;
"".to_string()
}
};

// Determine if state is readable
if let Some(state) = container.state {
// Determine if matches restart criteria
if !matches!(state.as_str(), "paused" | "restarting") {
// Build restart options
let restart_options = Some(RestartContainerOptions {
t: autoheal_stop_timeout,
..Default::default()
});
if !(name.is_empty() && id.is_empty()) {
// Report unhealthy container
let msg0 = format!("[WARNING] [{}] Container ({}) unhealthy", name, id);
log_message(&msg0).await;

// Report what is transpiring
let msg0 = format!("Container '{}' ({}) unhealthy", name, id);
let msg1 = format!(
"Restarting '{}' with {}s timeout",
name, autoheal_stop_timeout
);
log_message(&msg0).await;
log_message(&msg1).await;
// Build restart options
let restart_options = Some(RestartContainerOptions {
t: autoheal_stop_timeout,
..Default::default()
});

// Restart unhealthy container
let rslt = docker_clone.restart_container(&id, restart_options).await;
match rslt {
Ok(()) => {
let msg0 = format!("Restart of '{}' was successful", name);
log_message(&msg0).await;
}
Err(e) => {
let msg0 = format!("Restart of '{}' failed: {}", name, e);
log_message(&msg0).await;
}
// Report container restart
let msg1 = format!(
"[WARNING] [{}] Restarting container ({}) with {}s timeout",
name, id, autoheal_stop_timeout
);
log_message(&msg1).await;

// Restart unhealthy container
let rslt = docker_clone.restart_container(&id, restart_options).await;
match rslt {
Ok(()) => {
let msg0 = format!(
"[INFO] [{}] Restart of container ({}) was successful",
name, id
);
log_message(&msg0).await;
}
Err(e) => {
let msg0 = format!(
"[ERROR] [{}] Restart of container ({}) failed: {}",
name, id, e
);
log_message(&msg0).await;
}
}
} else {
let msg0 = format!("Could not determine state of {}", name);
let msg0 = format!("[ERROR] Could not reliably identify the container");
log_message(&msg0).await;
}
});
Expand Down

0 comments on commit 3119aad

Please sign in to comment.