-
Notifications
You must be signed in to change notification settings - Fork 2k
/
init.go
313 lines (256 loc) · 9.87 KB
/
init.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
//go:build linux
package cgroupslib
import (
"bytes"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/hashicorp/go-hclog"
)
// Names of the cpuset interface files used by this package.
const (
	// memsFile is the name of the cpuset mems interface file.
	memsFile = "cpuset.mems"

	// cpusetFile is the name of the cpuset cpus interface file.
	cpusetFile = "cpuset.cpus"
)
// Init will initialize the cgroup tree that the Nomad client will use for
// isolating resources of tasks. cores is the cpuset granted for use by Nomad.
//
// The work done depends on the mode reported by GetMode:
//
//   - CG1: the nomad parent cgroup is created under each of the freezer,
//     memory, cpu and cpuset controllers, then the cpuset parent and its
//     share and reserve partitions are configured.
//   - CG2: the required controllers are activated on the root cgroup (when
//     needed), then the nomad parent cgroup and its share and reserve
//     members are created and configured.
//
// Any other mode is left untouched. The first failing step is returned,
// wrapped with a message naming that step; nil is returned on success.
func Init(log hclog.Logger, cores string) error {
	log.Info("initializing nomad cgroups", "cores", cores)

	switch GetMode() {
	case CG1:
		// the value to disable inheriting values from parent cgroup
		const noClone = "0"

		// the name of the clone_children interface file
		const cloneFile = "cgroup.clone_children"

		// create the /nomad cgroup (or whatever the name is configured to be)
		// for each cgroup controller we are going to use
		controllers := []string{"freezer", "memory", "cpu", "cpuset"}
		for _, ctrl := range controllers {
			p := filepath.Join(root, ctrl, NomadCgroupParent)
			if err := os.MkdirAll(p, 0755); err != nil {
				return fmt.Errorf("failed to create nomad cgroup %s: %w", ctrl, err)
			}
		}

		// determine the memset that will be set on the cgroup for each task
		//
		// nominally this will be all available but we have to read the root
		// cgroup to actually know what those are
		//
		// additionally if the nomad cgroup parent already exists, we must
		// use that memset instead, because it could have been setup out of
		// band from nomad itself
		memsSet, err := detectMemsCG1()
		if err != nil {
			return fmt.Errorf("failed to detect memset: %w", err)
		}

		//
		// configure cpuset partitioning
		//
		// the tree is lopsided - tasks making use of reserved cpu cores get
		// their own cgroup with a static cpuset.cpus value. other tasks are
		// placed in the single share cgroup and share its dynamic cpuset.cpus
		// value
		//
		// e.g.,
		//   root/cpuset/nomad/
		//     share/{cgroup.procs, cpuset.cpus, cpuset.mems}
		//     reserve/
		//       abc123.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
		//       def456.task/{cgroup.procs, cpuset.cpus, cpuset.mems}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on nomad cpuset cgroup: %w", err)
		}

		if err := writeCG(cores, "cpuset", NomadCgroupParent, cpusetFile); err != nil {
			return fmt.Errorf("failed to write cores to nomad cpuset cgroup: %w", err)
		}

		//
		// share partition
		//

		if err := mkCG("cpuset", NomadCgroupParent, SharePartition()); err != nil {
			return fmt.Errorf("failed to create share cpuset partition: %w", err)
		}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, SharePartition(), cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on share cpuset partition: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, SharePartition(), memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on share cpuset partition: %w", err)
		}

		//
		// reserve partition
		//

		if err := mkCG("cpuset", NomadCgroupParent, ReservePartition()); err != nil {
			return fmt.Errorf("failed to create reserve cpuset partition: %w", err)
		}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, ReservePartition(), cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on reserve cpuset partition: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, ReservePartition(), memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on reserve cpuset partition: %w", err)
		}

		log.Debug("nomad cpuset partitions initialized", "cores", cores)

	case CG2:
		// the cgroup controllers we need to activate at the root and on the nomad slice
		const activation = "+cpuset +cpu +io +memory +pids"

		// the name of the cgroup subtree interface file
		const subtreeFile = "cgroup.subtree_control"

		//
		// configuring root cgroup (/sys/fs/cgroup)
		//
		// clients with delegated cgroups typically won't be able to write to
		// the subtree file, but that's ok so long as the required controllers
		// are activated

		if !functionalCgroups2(subtreeFile) {
			if err := writeCG(activation, subtreeFile); err != nil {
				return fmt.Errorf("failed to set subtree control on root cgroup: %w", err)
			}
		}

		//
		// configuring nomad.slice
		//

		if err := mkCG(NomadCgroupParent); err != nil {
			return fmt.Errorf("failed to create nomad cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on nomad cgroup: %w", err)
		}

		if err := writeCG(cores, NomadCgroupParent, cpusetFile); err != nil {
			return fmt.Errorf("failed to write root partition cpuset: %w", err)
		}

		log.Debug("top level partition root nomad.slice cgroup initialized")

		//
		// configuring nomad.slice/share (member)
		//

		if err := mkCG(NomadCgroupParent, SharePartition()); err != nil {
			return fmt.Errorf("failed to create share cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, SharePartition(), subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on cpuset share partition: %w", err)
		}

		log.Debug("partition member nomad.slice/share cgroup initialized")

		//
		// configuring nomad.slice/reserve (member)
		//

		if err := mkCG(NomadCgroupParent, ReservePartition()); err != nil {
			return fmt.Errorf("failed to create reserve cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, ReservePartition(), subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on cpuset reserve partition: %w", err)
		}

		log.Debug("partition member nomad.slice/reserve cgroup initialized")
	}

	return nil
}
// detectMemsCG1 picks the cpuset.mems value that Nomad managed cgroups
// should use.
//
// The root cpuset cgroup is always read first, and a failure to read it is
// returned to the caller. The nomad parent cpuset cgroup is consulted next:
// when its cpuset.mems file can be read and is non-empty after trimming
// whitespace, that value wins. In every other case (file unreadable, or
// empty) the root value is returned.
func detectMemsCG1() (string, error) {
	// the root value is mandatory; it doubles as the fallback below
	rootMems, err := os.ReadFile(filepath.Join(root, "cpuset", memsFile))
	if err != nil {
		return "", err
	}
	fallback := string(bytes.TrimSpace(rootMems))

	// the parent cgroup may not exist yet, so a read error is not fatal
	parentMems, err := os.ReadFile(filepath.Join(root, "cpuset", NomadCgroupParent, memsFile))
	if err != nil {
		return fallback, nil
	}

	// prefer a value already present on the parent cgroup
	if existing := string(bytes.TrimSpace(parentMems)); existing != "" {
		return existing, nil
	}

	return fallback, nil
}
// readRootCG2 reads the interface file named filename directly under the
// cgroup root. It returns whatever os.ReadFile produced, with surrounding
// whitespace trimmed, together with the error from os.ReadFile.
func readRootCG2(filename string) (string, error) {
	content, err := os.ReadFile(filepath.Join(root, filename))
	trimmed := bytes.TrimSpace(content)
	return string(trimmed), err
}
// filepathCG joins the given path elements onto the cgroup root and returns
// the resulting path.
func filepathCG(paths ...string) string {
	// root always comes first, followed by the caller's elements in order
	return filepath.Join(append([]string{root}, paths...)...)
}
// writeCG writes content to the cgroup interface file located at paths
// (relative to the cgroup root), using os.WriteFile with mode 0644. The
// error from os.WriteFile is returned as-is.
func writeCG(content string, paths ...string) error {
	target := filepathCG(paths...)
	data := []byte(content)
	return os.WriteFile(target, data, 0644)
}
// mkCG creates the cgroup directory located at paths (relative to the cgroup
// root) using os.MkdirAll with mode 0755. The error from os.MkdirAll is
// returned as-is.
func mkCG(paths ...string) error {
	dir := filepathCG(paths...)
	return os.MkdirAll(dir, 0755)
}
// ReadNomadCG2 reads the interface file named filename from the nomad.slice
// parent cgroup (or whatever its name is configured to be). The content is
// returned with surrounding whitespace trimmed, along with the error from
// os.ReadFile.
func ReadNomadCG2(filename string) (string, error) {
	raw, err := os.ReadFile(filepath.Join(root, NomadCgroupParent, filename))
	return string(bytes.TrimSpace(raw)), err
}
// ReadNomadCG1 reads the interface file named filename from the /nomad cgroup
// of the cgroup interface iface. The content is returned with surrounding
// whitespace trimmed, along with the error from os.ReadFile.
func ReadNomadCG1(iface, filename string) (string, error) {
	target := filepath.Join(root, iface, NomadCgroupParent, filename)
	raw, err := os.ReadFile(target)
	text := string(bytes.TrimSpace(raw))
	return text, err
}
// WriteNomadCG1 writes content to the interface file named filename in the
// /nomad cgroup of the cgroup interface iface, using os.WriteFile with mode
// 0644. The error from os.WriteFile is returned as-is.
func WriteNomadCG1(iface, filename, content string) error {
	target := filepath.Join(root, iface, NomadCgroupParent, filename)
	data := []byte(content)
	return os.WriteFile(target, data, 0644)
}
// PathCG1 returns the filepath to the cgroup directory of the given interface
// and allocID / taskName.
func PathCG1(allocID, taskName, iface string) string {
return filepath.Join(root, iface, NomadCgroupParent, ScopeCG1(allocID, taskName))
}
// LinuxResourcesPath returns the filepath to the directory that the field
// x.Resources.LinuxResources.CpusetCgroupPath is expected to hold on to.
//
// The partition element is chosen by GetPartitionFromBool(reserveCores). The
// shape of the result then depends on GetMode:
//   - CG1 with reserveCores: the task scope directory inside the partition
//     of the cpuset controller
//   - CG1 without reserveCores: the partition directory of the cpuset
//     controller itself
//   - any other mode: the task scope directory inside the partition under
//     the nomad parent cgroup
func LinuxResourcesPath(allocID, task string, reserveCores bool) string {
	partition := GetPartitionFromBool(reserveCores)

	// everything that is not cgroups v1 uses the unified layout
	if GetMode() != CG1 {
		return filepath.Join(root, NomadCgroupParent, partition, scopeCG2(allocID, task))
	}

	// cgroups v1: only tasks with reserved cores get a per-task directory
	if reserveCores {
		return filepath.Join(root, "cpuset", NomadCgroupParent, partition, ScopeCG1(allocID, task))
	}
	return filepath.Join(root, "cpuset", NomadCgroupParent, partition)
}
// CustomPathCG1 resolves path to the absolute directory path of a cgroup of
// the given controller. A path that is already absolute (starts with /) is
// returned without modification; anything else is joined onto the cgroup
// root and the controller name.
func CustomPathCG1(controller, path string) string {
	isAbsolute := strings.HasPrefix(path, "/")
	if !isAbsolute {
		return filepath.Join(root, controller, path)
	}
	return path
}
// CustomPathCG2 resolves path to the absolute directory path of a cgroup.
// An empty path, or a path that is already absolute (starts with /), is
// returned without modification; anything else is joined onto the cgroup
// root.
func CustomPathCG2(path string) string {
	switch {
	case path == "":
		return path
	case strings.HasPrefix(path, "/"):
		return path
	default:
		return filepath.Join(root, path)
	}
}