-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathk3sAndProviderServices.sh
executable file
·330 lines (293 loc) · 12.1 KB
/
k3sAndProviderServices.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#!/usr/bin/env bash
# Kubernetes Installation with Provider Services Script
# Default values for options.
# Every variable the rest of the script tests is initialised here so that a
# same-named variable inherited from the caller's environment can never
# trigger an action (notably the node-removal paths).
disable_components="traefik"   # k3s components passed to --disable (-d)
external_ip=""                 # public IP advertised by the node (-e)
testing_mode=false             # set by -t
all_in_one_mode=false          # set by -a
install_gpu_drivers=false      # set by -g
mode="init" # 'init' for initial setup, 'add' for adding control-plane nodes
master_ip=""                   # existing server to join (-m)
token=""                       # cluster join token (-c)
remove_node_ip=""              # control-plane node to remove (-r)
remove_worker_ip=""            # worker node to remove (-w)
internal_network=""            # network used to pick the node IP (-n)
nodefs_dir=""                  # becomes --kubelet-arg=root-dir=... (-k)
imagefs_dir=""                 # becomes --data-dir=... (-o)
tls_san="" # Example: provider.h100.sdg.val.akash.pub
# Flags shared by every k3s server install, built from the value
# disable_components holds at this point.
k3s_common_args="--disable=${disable_components} --flannel-backend=none"
# Process command-line options

#######################################
# Parse the script's short options into the global configuration variables.
# Globals (written): disable_components, external_ip, testing_mode,
#   all_in_one_mode, install_gpu_drivers, master_ip, token, mode,
#   remove_node_ip, remove_worker_ip, internal_network, tls_san, nodefs_dir,
#   imagefs_dir, k3s_common_args, OPTIND
# Arguments: the command-line arguments ("$@")
# Exits:     1 on an unknown option or a missing option argument
#######################################
parse_options() {
  local opt
  while getopts ":d:e:tagm:c:r:w:n:s:k:o:" opt; do
    case "${opt}" in
      d)
        disable_components=$OPTARG
        ;;
      e)
        external_ip=$OPTARG
        ;;
      t)
        testing_mode=true
        ;;
      a)
        all_in_one_mode=true
        ;;
      g)
        install_gpu_drivers=true
        ;;
      m)
        master_ip=$OPTARG
        ;;
      c)
        # Supplying a join token switches the script to "add a server" mode.
        token=$OPTARG
        mode="add"
        ;;
      r)
        remove_node_ip=$OPTARG
        ;;
      w)
        remove_worker_ip=$OPTARG
        ;;
      n)
        internal_network=$OPTARG
        ;;
      s)
        tls_san=$OPTARG
        ;;
      k)
        nodefs_dir="--kubelet-arg=root-dir=$OPTARG"
        ;;
      o)
        imagefs_dir="--data-dir=$OPTARG"
        ;;
      \?)
        echo "Invalid option: $OPTARG" 1>&2
        exit 1
        ;;
      :)
        echo "Invalid option: $OPTARG requires an argument" 1>&2
        exit 1
        ;;
    esac
  done

  # Build the shared k3s flags only now, so a -d override is actually honoured
  # by the install commands instead of the pre-parse default.
  k3s_common_args="--disable=${disable_components} --flannel-backend=none"
}

parse_options "$@"
# OPTIND is deliberately not local to parse_options, so it still reflects how
# many arguments were consumed.
shift $((OPTIND - 1))
# The internal network is mandatory: it decides which local address k3s uses.
if [[ -z "$internal_network" ]]; then
  echo "Please provide the internal network using the -n option."
  exit 1
fi

# Ensure only the first two octets (e.g., 172.18.) are used from the provided network
internal_network=$(printf '%s\n' "$internal_network" | cut -d'.' -f1,2)

# Detect the internal IP: the first address reported by `hostname -I` that
# starts with "<prefix>.". The prefix is compared literally (not as a regex),
# so its dots cannot match arbitrary characters (e.g. "1.2" vs "192.…").
internal_ip=""
read -r -a host_addresses <<<"$(hostname -I)"
for candidate in "${host_addresses[@]}"; do
  if [[ "$candidate" == "${internal_network}."* ]]; then
    internal_ip=$candidate
    break
  fi
done

if [[ -z "$internal_ip" ]]; then
  echo "No IP found in the network ${internal_network}. Please verify."
  exit 1
fi
echo "Selected internal IP: $internal_ip"
# Remove control plane node logic

#######################################
# Print the ID of every etcd member whose `etcdctl member list` line contains
# the given text.
# The text is matched literally (no regex), and the separator that etcdctl
# glues to the ID column ("," or ":") is stripped so the result can be passed
# straight to `etcdctl member remove`.
# Arguments: $1 - node name or IP to look for
# Inputs:    `etcdctl member list` output on stdin
# Outputs:   one member ID per matching line on stdout
#######################################
etcd_member_ids_matching() {
  awk -v needle="$1" '
    index($0, needle) > 0 {
      id = $1
      sub(/[,:]$/, "", id)
      print id
    }
  '
}

if [[ -n "$remove_node_ip" ]]; then
  # Check if kubectl is available
  if ! command -v kubectl &> /dev/null; then
    echo "kubectl command could not be found, please install it to proceed."
    exit 1
  fi

  # Check if etcdctl is available; try to install it once if it is not
  if ! command -v etcdctl &> /dev/null; then
    echo "etcdctl command could not be found, attempting to install it..."
    apt update
    apt install -y etcd-client
    if ! command -v etcdctl &> /dev/null; then
      echo "Failed to install etcdctl, please install it manually."
      exit 1
    fi
  fi

  # Validate node exists
  if ! kubectl get node "$remove_node_ip" &> /dev/null; then
    echo "Specified node does not exist in the cluster."
    exit 1
  fi

  echo "Draining the node..."
  # --delete-emptydir-data is the current name of the deprecated
  # --delete-local-data drain flag.
  kubectl drain --ignore-daemonsets --delete-emptydir-data "$remove_node_ip" \
    || { echo "Failed to drain node"; exit 1; }

  echo "Removing the node from the cluster..."
  kubectl delete node "$remove_node_ip" || { echo "Failed to delete node"; exit 1; }

  # If etcd member needs to be removed:
  echo "Removing the etcd member..."
  mapfile -t etcd_member_ids < <(etcdctl member list | etcd_member_ids_matching "$remove_node_ip")
  if (( ${#etcd_member_ids[@]} == 1 )); then
    etcd_member_id=${etcd_member_ids[0]}
    etcdctl member remove "$etcd_member_id" || { echo "Failed to remove etcd member"; exit 1; }
  elif (( ${#etcd_member_ids[@]} > 1 )); then
    # Never guess which member to drop: removing the wrong one can break quorum.
    echo "Multiple etcd members match ${remove_node_ip}; remove the correct one manually."
    exit 1
  else
    echo "No etcd member found for the specified IP."
  fi

  echo "Control plane node removed successfully."
  exit 0
fi
# Remove worker node logic

#######################################
# Drain a worker node and delete it from the cluster.
# Arguments: $1 - node name as known to `kubectl get node`
# Outputs:   progress and error messages on stdout
# Returns:   0 on success, 1 if kubectl is missing, the node is unknown, or
#            the drain/delete fails
#######################################
remove_worker_node() {
  local node=$1

  # Same precondition as the control-plane removal path.
  if ! command -v kubectl &> /dev/null; then
    echo "kubectl command could not be found, please install it to proceed."
    return 1
  fi

  if ! kubectl get node "$node" &> /dev/null; then
    echo "Specified worker node does not exist in the cluster."
    return 1
  fi

  echo "Draining the worker node..."
  # --delete-emptydir-data is the current name of the deprecated
  # --delete-local-data drain flag.
  if ! kubectl drain "$node" --ignore-daemonsets --delete-emptydir-data --force; then
    echo "Failed to drain worker node"
    return 1
  fi

  echo "Deleting the worker node from the cluster..."
  if ! kubectl delete node "$node"; then
    echo "Failed to delete worker node"
    return 1
  fi

  echo "Worker node removed successfully."
}

if [[ -n "$remove_worker_ip" ]]; then
  remove_worker_node "$remove_worker_ip" || exit 1
  exit 0
fi
#######################################
# Point CoreDNS at the 8.8.8.8 / 1.1.1.1 upstream resolvers.
# Waits (polling every 2 seconds) until the coredns ConfigMap exists in
# kube-system, then replaces its Corefile through a merge patch.
# Arguments: none
# Outputs:   progress messages on stdout
#######################################
update_coredns_config() {
  # Complete replacement Corefile, embedded as the JSON body of the patch.
  local corefile_patch='{"data":{"Corefile":".:53 {\n errors\n health\n ready\n kubernetes cluster.local in-addr.arpa ip6.arpa {\n pods insecure\n fallthrough in-addr.arpa ip6.arpa\n }\n hosts /etc/coredns/NodeHosts {\n ttl 60\n reload 15s\n fallthrough\n }\n prometheus :9153\n forward . 8.8.8.8 1.1.1.1\n cache 30\n loop\n reload\n loadbalance\n import /etc/coredns/custom/*.override\n }\n import /etc/coredns/custom/*.server"}}'

  # The ConfigMap may not exist yet, so poll until it can be read.
  until kubectl -n kube-system get cm coredns >/dev/null 2>&1; do
    echo "waiting for the coredns configmap resource ..."
    sleep 2
  done

  echo "Patching CoreDNS configuration to use 8.8.8.8 1.1.1.1 servers instead of the systemd-resolved default..."
  kubectl patch configmap coredns -n kube-system --type merge -p "$corefile_patch"
  echo "CoreDNS configuration patched."
}
# Cluster bootstrap / join.
# mode "init" (the default): install the first k3s server on this host, then
# Calico, the CoreDNS override, provider-services and the Akash namespaces.
# Any other mode (set to "add" when a token is passed with -c): join this host
# to an existing cluster as an additional control-plane node.
if [[ "$mode" == "init" ]]; then
# Ensure jq is installed for JSON processing
if ! command -v jq &> /dev/null; then
echo "jq is not installed. Installing jq..."
apt-get update && apt-get install -y jq
fi
# Ensure yq is installed for YAML processing (used below to edit calico.yaml).
# NOTE(review): the download is pinned to the linux_amd64 build of the latest release.
if ! command -v yq &> /dev/null; then
echo "yq is not installed. Installing yq..."
curl -L https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -o /usr/local/bin/yq && chmod +x /usr/local/bin/yq
fi
echo "Starting initial K3s installation on master node..."
# Assemble the server flags: --cluster-init always, the external IP and TLS SAN
# only when they were supplied, and the detected internal IP as --node-ip.
install_exec="--cluster-init"
if [[ -n "$external_ip" ]]; then
install_exec+=" --node-external-ip=${external_ip}"
fi
install_exec+=" --node-ip=${internal_ip}"
if [[ -n "$tls_san" ]]; then
install_exec+=" --tls-san=${tls_san}"
fi
# Run the upstream k3s installer with the assembled flags.
# NOTE(review): the pipeline's exit status is not checked, so the message
# below is printed even if the download or the install failed.
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="$k3s_common_args $install_exec $nodefs_dir $imagefs_dir" sh -
echo "K3s installation completed."
# Display the server token.
# server_token_path is only assigned when a custom data dir (-o) was given.
if [[ -n "$imagefs_dir" ]]; then
# Extract the base path from imagefs_dir
base_path=$(echo "$imagefs_dir" | sed 's/--data-dir=//')
server_token_path="${base_path}/server/token"
fi
# Try to read the token, falling back to default location if needed
# (with no custom data dir the -f test sees an empty path and fails).
if [[ -f "$server_token_path" ]]; then
token=$(cat "$server_token_path")
else
token=$(cat "/var/lib/rancher/k3s/server/token")
fi
echo "K3s control-plane and worker node token: $token"
echo "Installing Calico CNI..."
# Download the Calico v3.28.2 manifest into the current directory.
curl -O https://raw.githubusercontent.com/projectcalico/calico/refs/tags/v3.28.2/manifests/calico.yaml
# Edit calico.yaml in place: append an IP_AUTODETECTION_METHOD=kubernetes-internal-ip
# entry to the env list of the calico-node container in the calico-node DaemonSet.
yq eval-all '(select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[] | select(.name == "calico-node").env) += {"name": "IP_AUTODETECTION_METHOD", "value": "kubernetes-internal-ip"}' -i calico.yaml
kubectl apply -f calico.yaml
echo "Calico CNI installation completed."
update_coredns_config # Update the CoreDNS ConfigMap
# Install provider-services on master
echo "Installing provider-services..."
# The installer is run from the home directory.
cd ~
apt-get update
apt-get install -y unzip
curl -sfL https://raw.githubusercontent.com/akash-network/provider/main/install.sh | bash
# Add /root/bin to the path for the current session
# NOTE(review): presumably where the installer above places the binary — confirm.
NEW_PATH="/root/bin"
export PATH="$PATH:$NEW_PATH"
# Validate provider-services installation: the version output (stdout and
# stderr combined) must start with "v", otherwise the script aborts.
echo "Validating provider-services installation..."
provider_services_version=$(provider-services version 2>&1)
if [[ "$provider_services_version" =~ ^v ]]; then
echo "Provider-services is successfully installed. Version: $provider_services_version"
else
echo "Provider-services installation failed or not accessible in the PATH."
exit 1
fi
# Create and label Kubernetes namespaces (creation is skipped for a namespace
# that already exists; labels are always (re)applied with --overwrite).
echo "Creating and labeling Kubernetes namespaces..."
for ns in akash-services lease; do
if kubectl get ns $ns > /dev/null 2>&1; then
echo "Namespace $ns already exists."
else
kubectl create ns $ns
echo "Namespace $ns created."
fi
done
kubectl label ns akash-services akash.network/name=akash-services akash.network=true --overwrite
kubectl label ns lease akash.network=true --overwrite
# Outside all-in-one mode, remind the operator of the manual follow-up step.
if [[ "$all_in_one_mode" == "false" ]]; then
echo "Please proceed with Akash provider account creation/import and export/storage of private key before running the next script."
fi
else
# Join path: both the address of an existing server and the token are required.
if [[ -z "$master_ip" || -z "$token" ]]; then
echo "Both master IP (-m) and token (-c) must be provided to add a control-plane node."
exit 1
fi
echo "Adding a new control-plane node to the cluster..."
# Same optional flags as the init path, but without --cluster-init.
install_exec=""
if [[ -n "$external_ip" ]]; then
install_exec+=" --node-external-ip=${external_ip}"
fi
install_exec+=" --node-ip=${internal_ip}"
if [[ -n "$tls_san" ]]; then
install_exec+=" --tls-san=${tls_san}"
fi
# when K3S_URL is used, must add "server" when adding a new control-plane nodes to the cluster
# it also must go first in the order, otherwise k3s.service will fail to start
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server ${k3s_common_args} ${install_exec} $nodefs_dir $imagefs_dir" K3S_URL="https://$master_ip:6443" K3S_TOKEN="$token" sh -
echo "Control-plane node added to the cluster."
fi
# Update the kubeconfig file if an external IP is specified

#######################################
# Print a single-cluster, single-context kubeconfig to stdout.
# Arguments:
#   $1 - IP address of the API server (port 6443 is appended)
#   $2 - base64 certificate-authority-data
#   $3 - base64 client-certificate-data
#   $4 - base64 client-key-data
# Outputs: the kubeconfig YAML on stdout
#######################################
render_kubeconfig() {
  local server_ip=$1 ca=$2 client_cert=$3 client_key=$4
  cat <<EOF
apiVersion: v1
clusters:
- cluster:
    certificate-authority-data: ${ca}
    server: https://${server_ip}:6443
  name: k3s-cluster
contexts:
- context:
    cluster: k3s-cluster
    user: default
  name: default
current-context: default
kind: Config
preferences: {}
users:
- name: default
  user:
    client-certificate-data: ${client_cert}
    client-key-data: ${client_key}
EOF
}

if [[ -n "$external_ip" ]]; then
  echo "Updating kubeconfig file to use both internal and external IP addresses..."

  # Define paths for the kubeconfig files
  kubeconfig_path=/etc/rancher/k3s/k3s.yaml

  # Read every credential BEFORE touching the file, so a failing kubectl can
  # never leave a kubeconfig with empty certificate fields behind.
  ca_data=$(kubectl config view --raw -o jsonpath='{.clusters[0].cluster.certificate-authority-data}')
  client_cert_data=$(kubectl config view --raw -o jsonpath='{.users[0].user.client-certificate-data}')
  client_key_data=$(kubectl config view --raw -o jsonpath='{.users[0].user.client-key-data}')

  if [[ -z "$ca_data" || -z "$client_cert_data" || -z "$client_key_data" ]]; then
    # Keep going with the rest of the setup; the existing kubeconfig is intact.
    echo "Could not read the current cluster credentials; leaving ${kubeconfig_path} unchanged." 1>&2
  else
    render_kubeconfig "$external_ip" "$ca_data" "$client_cert_data" "$client_key_data" \
      > "${kubeconfig_path}"
    echo "kubeconfig file updated to use both internal and external IP addresses with a single context."
  fi
fi
# Optional GPU stage (-g): upgrade the host, install the NVIDIA driver and the
# NVIDIA container toolkit/runtime, then adjust the runtime configuration.
if [[ "$install_gpu_drivers" == "true" ]]; then
  echo "Starting GPU host preparation, driver, and toolkit installation..."

  # Non-interactive full upgrade; on config-file conflicts take the default
  # action, falling back to keeping the currently installed version.
  apt update
  DEBIAN_FRONTEND=noninteractive apt-get -y \
    -o Dpkg::Options::="--force-confdef" \
    -o Dpkg::Options::="--force-confold" \
    dist-upgrade
  apt-get autoremove -y

  echo "Installing NVIDIA drivers..."
  apt-get install -y ubuntu-drivers-common
  # List the detected devices for the log, then install the drivers.
  ubuntu-drivers devices
  ubuntu-drivers autoinstall
  echo "NVIDIA GPU drivers installation completed."

  echo "Installing NVIDIA container runtime..."
  # Register the libnvidia-container apt repository (key, then source list).
  distribution="stable/deb"
  curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey \
    | apt-key add -
  curl -s -L "https://nvidia.github.io/libnvidia-container/${distribution}/libnvidia-container.list" \
    | tee /etc/apt/sources.list.d/libnvidia-container.list
  apt-get update
  DEBIAN_FRONTEND=noninteractive apt-get install -y nvidia-container-toolkit nvidia-container-runtime
  echo "NVIDIA container runtime installation completed."

  CONFIG_FILE="/etc/nvidia-container-runtime/config.toml"
  if [[ ! -f "$CONFIG_FILE" ]]; then
    echo "NVIDIA runtime configuration file not found."
  else
    echo "Updating NVIDIA runtime configuration..."
    # Uncomment both settings and flip their shipped defaults: allow device
    # selection via volume mounts, disallow it via env var when unprivileged.
    sed -i \
      -e 's/#accept-nvidia-visible-devices-as-volume-mounts = false/accept-nvidia-visible-devices-as-volume-mounts = true/' \
      -e 's/#accept-nvidia-visible-devices-envvar-when-unprivileged = true/accept-nvidia-visible-devices-envvar-when-unprivileged = false/' \
      "$CONFIG_FILE"
  fi
fi

echo "Setup completed."