Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release v0.0.7 #112

Merged
merged 26 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
0bc6d13
Update clientmountd service account name/namespace after DWS move
bdevcich Sep 25, 2023
7b1f34c
add tds configuration information (#89)
ajfloeder Oct 4, 2023
c9c6563
Move to cert-manager 1.13.1 (#90)
roehrich-hpe Oct 5, 2023
cd9d287
Update submodules for ginkgo and controller-runtime (#91)
roehrich-hpe Oct 10, 2023
c0b389c
Move DWS from HewlettPackard/dws to DataWorkflowServices/dws (#92)
roehrich-hpe Oct 20, 2023
3c49e3c
Update the project for kubebuilder go/v4 layout (#93)
roehrich-hpe Oct 31, 2023
ae771cb
Adjust copy action for the texas admin node. Need to use the -O optio…
ajfloeder Nov 1, 2023
5df80d4
Set the maxsize for the audit log in a KIND env (#95)
roehrich-hpe Nov 1, 2023
df39ba6
Reduce the size of the audit log summary (#96)
roehrich-hpe Nov 2, 2023
8620e4f
Requirements for Prometheus (#97)
roehrich-hpe Nov 6, 2023
f432daf
Account for nnf-dm no longer having CRDs
bdevcich Nov 14, 2023
3621330
Add k8sHost + k8sPort to systems config (#99)
bdevcich Nov 17, 2023
4071634
Readme: fix build instructions
jameshcorbett Nov 28, 2023
de5f990
Add examples-htx overlay to htx (#101)
bdevcich Dec 6, 2023
d38a156
Rename the htx computes in the SystemConfiguration
bdevcich Dec 6, 2023
d977d17
Use a bare SystemConfiguration CR (#103)
roehrich-hpe Dec 8, 2023
290687d
Use clientmountd in nnf-sos
matthew-richerson Dec 18, 2023
11cfab5
Merge pull request #104 from NearNodeFlash/clientmountd_move
matthew-richerson Dec 21, 2023
2869774
Add daemon env var support; tune daemon garbage collection, max procs…
bdevcich Jan 3, 2024
8f9794a
Use master for lustre-csi-driver and lustre-fs-operator + update subm…
bdevcich Jan 4, 2024
7fde887
Toss deprecated build.sh. (#107)
roehrich-hpe Jan 11, 2024
525da05
Update the system configuration for texas and the submodules (#110)
ajfloeder Jan 12, 2024
a5561fd
Move the prometheus helm chart values to a helm-specific subdir (#109)
roehrich-hpe Jan 12, 2024
237dfbf
release v0.0.7
ajfloeder Jan 16, 2024
b11a781
release v0.0.7
ajfloeder Jan 16, 2024
6c7c489
Correct the build version tag
ajfloeder Jan 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
all: fmt vet nnf-deploy

nnf-deploy: main.go
go build
nnf-deploy: cmd/main.go
go build -o ./nnf-deploy cmd/main.go

.PHONY: fmt
fmt:
go fmt ./main.go
go fmt cmd/main.go
go fmt ./config/*.go

.PHONY: vet
vet:
go vet ./main.go
go vet cmd/main.go

.PHONY: test
test:
Expand Down
2 changes: 1 addition & 1 deletion Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ nnf-deploy is a golang executable capable of building components of the Rabbit s

### Build

Build using: `go build`
Build using: `make`

Prior to running, ensure that the correct NNF systems are loaded in [./config/systems.yaml](./config/systems.yaml) and that the correct ghcr repositories are defined in [./config/repositories.yaml](./config/repositories.yaml).

Expand Down
28 changes: 0 additions & 28 deletions build.sh

This file was deleted.

140 changes: 77 additions & 63 deletions main.go → cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
package main

import (
"encoding/json"
"errors"
"fmt"
"os"
Expand All @@ -30,10 +29,8 @@ import (
"time"

"github.com/alecthomas/kong"
"gopkg.in/yaml.v2"
"k8s.io/apimachinery/pkg/util/intstr"
"gopkg.in/yaml.v3"

dwsv1alpha2 "github.com/DataWorkflowServices/dws/api/v1alpha2"
"github.com/NearNodeFlash/nnf-deploy/config"
)

Expand Down Expand Up @@ -141,7 +138,7 @@ func (cmd *UndeployCmd) Run(ctx *Context) error {

// Uninstall first to ensure the CRDs, and therefore all related custom
// resources, are deleted while the controllers are still running.
if module != "lustre-csi-driver" {
if module != "lustre-csi-driver" && module != "nnf-dm" {
if err := runMakeCommand(ctx, system, module, "uninstall"); err != nil {
return err
}
Expand Down Expand Up @@ -235,6 +232,12 @@ func (cmd *InstallCmd) Run(ctx *Context) error {
return err
}

sysConfigCR, err := config.ReadSystemConfigurationCR("config/" + system.SystemConfiguration)
if err != nil {
return err
}
perRabbit := sysConfigCR.RabbitsAndComputes()

clusterConfig, err := currentClusterConfig()
if err != nil {
return err
Expand All @@ -247,6 +250,16 @@ func (cmd *InstallCmd) Run(ctx *Context) error {
k8sServerHost := clusterConfig[:strings.Index(clusterConfig, ":")]
k8sServerPort := clusterConfig[strings.Index(clusterConfig, ":")+1:]

// Let the config override these values pulled from the cluster config. The values are used for
// daemons on the compute nodes, which may need a different IP/network to hit the cluster than
// the public facing cluster IP that the cluster config is using.
if system.K8sHost != "" {
k8sServerHost = system.K8sHost
}
if system.K8sPort != "" {
k8sServerPort = system.K8sPort
}

return config.EnumerateDaemons(ctx.Daemons, func(d config.Daemon) error {

var token []byte
Expand Down Expand Up @@ -293,10 +306,10 @@ func (cmd *InstallCmd) Run(ctx *Context) error {
}
}

for rabbit := range system.Rabbits {
for rabbit, computes := range perRabbit {
fmt.Printf(" Check clients of rabbit %s\n", rabbit)

for _, compute := range system.Rabbits[rabbit] {
for _, compute := range computes {
fmt.Printf(" Checking for install on Compute Node %s\n", compute)

if shouldSkipNode(compute) {
Expand Down Expand Up @@ -388,24 +401,27 @@ func (cmd *InstallCmd) Run(ctx *Context) error {
fmt.Printf("\n")
}

execStart := ""
execStart += "[Service]\n"
execStart += "ExecStart=\n"
execStart += "ExecStart=/usr/bin/" + d.Bin + " \\\n"
execStart += " --kubernetes-service-host=" + k8sServerHost + " \\\n"
execStart += " --kubernetes-service-port=" + k8sServerPort + " \\\n"
execStart += " --node-name=" + compute + " \\\n"
overrideContents := ""
overrideContents += "[Service]\n"
overrideContents += "ExecStart=\n"
overrideContents += "ExecStart=/usr/bin/" + d.Bin + " \\\n"
overrideContents += " --kubernetes-service-host=" + k8sServerHost + " \\\n"
overrideContents += " --kubernetes-service-port=" + k8sServerPort + " \\\n"
overrideContents += " --node-name=" + compute + " "
if !d.SkipNnfNodeName {
execStart += " --nnf-node-name=" + rabbit + " \\\n"
overrideContents += "\\\n" + " --nnf-node-name=" + rabbit + " "
}
if len(token) != 0 {
execStart += " --service-token-file=" + path.Join(serviceTokenPath, "service.token") + " \\\n"
overrideContents += "\\\n" + " --service-token-file=" + path.Join(serviceTokenPath, "service.token") + " "
}
if len(cert) != 0 {
execStart += " --service-cert-file=" + path.Join(certFilePath, "service.cert") + " \\\n"
overrideContents += "\\\n" + " --service-cert-file=" + path.Join(certFilePath, "service.cert") + " "
}
if len(d.ExtraArgs) > 0 {
execStart += " " + d.ExtraArgs + " \\\n"
overrideContents += "\\\n" + d.ExtraArgs + " "
}
for _, e := range d.Environment {
overrideContents += "\n" + "Environment=" + e.Name + "=" + e.Value
}

fmt.Printf(" Creating override directory...")
Expand All @@ -417,7 +433,7 @@ func (cmd *InstallCmd) Run(ctx *Context) error {
fmt.Printf("\n")

fmt.Println(" Creating override configuration...")
if err := os.WriteFile("override.conf", []byte(execStart), 0644); err != nil {
if err := os.WriteFile("override.conf", []byte(overrideContents), 0644); err != nil {
return err
}

Expand Down Expand Up @@ -466,7 +482,13 @@ func (cmd *InitCmd) Run(ctx *Context) error {
return err
}

if err := applyLabelsTaints(system, ctx); err != nil {
sysConfigCR, err := config.ReadSystemConfigurationCR("config/" + system.SystemConfiguration)
if err != nil {
return err
}
perRabbit := sysConfigCR.RabbitsAndComputes()

if err := applyLabelsTaints(perRabbit, ctx); err != nil {
return err
}

Expand Down Expand Up @@ -529,7 +551,7 @@ func installThirdPartyServices(ctx *Context) error {
return nil
}

func applyLabelsTaints(system *config.System, ctx *Context) error {
func applyLabelsTaints(perRabbit config.Rabbits, ctx *Context) error {
// Labels/Taints to apply to nnf nodes
nnfNodeLabels := []string{
"cray.nnf.node=true",
Expand All @@ -539,7 +561,7 @@ func applyLabelsTaints(system *config.System, ctx *Context) error {
}

nnfNodes := []string{}
for rabbit := range system.Rabbits {
for rabbit := range perRabbit {
nnfNodes = append(nnfNodes, rabbit)
}

Expand Down Expand Up @@ -690,7 +712,7 @@ func checkNeedsUpdate(ctx *Context, name string, compute string, destination str

func copyToNode(ctx *Context, name string, compute string, destination string) error {
fmt.Printf(" Copying %s to %s at %s...", name, compute, destination)
if _, err := runCommand(ctx, exec.Command("scp", "-C", name, compute+":"+destination)); err != nil {
if _, err := runCommand(ctx, exec.Command("scp", "-OC", name, compute+":"+destination)); err != nil {
return err
}

Expand Down Expand Up @@ -749,6 +771,25 @@ func getOverlay(ctx *Context, system *config.System, module string) (string, err
return "", nil
}

// getExampleOverlay looks up the repository for module and returns the first
// of that repository's overlays that is also listed in the system's overlays
// and whose name begins with "examples-".
//
// It returns an empty string with a nil error when no such overlay exists, and
// passes through any error reported by config.FindRepository.
func getExampleOverlay(ctx *Context, system *config.System, module string) (string, error) {
	repo, _, err := config.FindRepository(ctx.Repos, module)
	if err != nil {
		return "", err
	}

	for _, candidate := range repo.Overlays {
		// A match requires the two names to be equal, so the prefix test can
		// be applied to the repository's overlay before scanning the system's.
		if !strings.HasPrefix(candidate, "examples-") {
			continue
		}

		for _, enabled := range system.Overlays {
			if enabled != candidate {
				continue
			}

			fmt.Printf(" Examples Overlay for %s found: %s\n", module, candidate)
			return candidate, nil
		}
	}

	return "", nil
}

func deployModule(ctx *Context, system *config.System, module string) error {

cmd := exec.Command("make", "deploy")
Expand All @@ -758,6 +799,13 @@ func deployModule(ctx *Context, system *config.System, module string) error {
return err
}

// Some repos apply examples (e.g. nnf-sos' container/storage profiles) in an additional step in
// deploy.sh, so account for an additional overlay to use in that case.
overlayExample, err := getExampleOverlay(ctx, system, module)
if err != nil {
return err
}

fmt.Print(" Finding Repository...")
repo, buildConfig, err := config.FindRepository(ctx.Repos, module)
if err != nil {
Expand Down Expand Up @@ -802,6 +850,10 @@ func deployModule(ctx *Context, system *config.System, module string) error {
"OVERLAY="+overlay,
)

if len(overlayExample) > 0 {
cmd.Env = append(cmd.Env, "OVERLAY_EXAMPLES="+overlayExample)
}

fmt.Println(" Running Deploy...")
_, err = runCommand(ctx, cmd)
return err
Expand Down Expand Up @@ -923,45 +975,7 @@ func createSystemConfigFromSOS(ctx *Context, system *config.System, module strin

fmt.Println("Creating SystemConfiguration...")

config := dwsv1alpha2.SystemConfiguration{}

config.Name = "default"
config.Namespace = "default"
config.Kind = "SystemConfiguration"
config.APIVersion = fmt.Sprintf("%s/%s", dwsv1alpha2.GroupVersion.Group, dwsv1alpha2.GroupVersion.Version)

// Convert port strings to IntOrString slice
ports := []intstr.IntOrString{}
for _, port := range system.Ports {
ports = append(ports, intstr.FromString(port))
}
config.Spec.Ports = append(config.Spec.Ports, ports...)

for storageName, computes := range system.Rabbits {
storage := dwsv1alpha2.SystemConfigurationStorageNode{}
storage.Type = "Rabbit"
storage.Name = storageName
for index, computeName := range computes {
compute := dwsv1alpha2.SystemConfigurationComputeNode{
Name: computeName,
}
config.Spec.ComputeNodes = append(config.Spec.ComputeNodes, compute)

computeReference := dwsv1alpha2.SystemConfigurationComputeNodeReference{
Name: computeName,
Index: index,
}
storage.ComputesAccess = append(storage.ComputesAccess, computeReference)
}
config.Spec.StorageNodes = append(config.Spec.StorageNodes, storage)
}

configjson, err := json.Marshal(config)
if err != nil {
return err
}

cmd := exec.Command("bash", "-c", fmt.Sprintf("cat <<EOF | kubectl apply -f - \n%s", configjson))
_, err = runCommand(ctx, cmd)
cmd := exec.Command("kubectl", "apply", "-f", "../config/"+system.SystemConfiguration)
_, err := runCommand(ctx, cmd)
return err
}
4 changes: 2 additions & 2 deletions config/audit-policy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# $ jq -M . kube-apiserver-audit.log | less
#
# Dump a quick-to-digest summary of the log events:
# $ jq -M '[.auditID,.verb,.requestURI,.user.username,.objectRef.name,.responseStatus.code,.stageTimestamp]' kube-apiserver-audit.log | less
# $ jq -M '[.auditID,.verb,.requestURI,.user.username,.responseStatus.code,.stageTimestamp]' kube-apiserver-audit.log | less
#
# Extract a specific event record from the log:
# $ jq -M '. | select(.auditID=="d1053ee5-0734-4b40-815f-3f6831f82bac")' kube-apiserver-audit.log | less
Expand All @@ -34,7 +34,7 @@ rules:
# - lustrefilesystems
# - lustrefilesystems/status

- group: dws.cray.hpe.com
- group: dataworkflowservices.github.io
# resources:
# - clientmounts
# - clientmounts/status
Expand Down
Loading