From f5ed02c4c86eea5c1719a2505e90d26bee71ea6a Mon Sep 17 00:00:00 2001 From: Philip Laine Date: Wed, 10 Jul 2024 11:43:11 +0200 Subject: [PATCH] refactor: test and refactor split file --- src/pkg/layout/package.go | 2 +- src/pkg/layout/split.go | 109 +++++++++++++++++++++++++++ src/pkg/layout/split_test.go | 96 +++++++++++++++++++++++ src/pkg/utils/io.go | 142 ----------------------------------- 4 files changed, 206 insertions(+), 143 deletions(-) create mode 100644 src/pkg/layout/split.go create mode 100644 src/pkg/layout/split_test.go diff --git a/src/pkg/layout/package.go b/src/pkg/layout/package.go index e68f77b551..1b99379e81 100644 --- a/src/pkg/layout/package.go +++ b/src/pkg/layout/package.go @@ -243,7 +243,7 @@ func (pp *PackagePaths) ArchivePackage(destinationTarball string, maxPackageSize return fmt.Errorf("unable to split the package archive into multiple files: must be less than 1,000 files") } message.Notef("Package is larger than %dMB, splitting into multiple files", maxPackageSizeMB) - err := utils.SplitFile(destinationTarball, chunkSize) + err := splitFile(destinationTarball, chunkSize) if err != nil { return fmt.Errorf("unable to split the package archive into multiple files: %w", err) } diff --git a/src/pkg/layout/split.go b/src/pkg/layout/split.go new file mode 100644 index 0000000000..aecf295dad --- /dev/null +++ b/src/pkg/layout/split.go @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2021-Present The Zarf Authors + +// Package layout contains functions for interacting with Zarf's package layout on disk. +package layout + +import ( + "crypto/sha256" + "encoding/json" + "errors" + "fmt" + "io" + "os" + + "github.com/defenseunicorns/pkg/helpers/v2" + "github.com/defenseunicorns/zarf/src/pkg/message" + "github.com/defenseunicorns/zarf/src/types" +) + +// splitFile will split the file into chunks and remove the original file. 
+func splitFile(srcPath string, chunkSize int) error {
+	srcFile, err := os.Open(srcPath)
+	if err != nil {
+		return err
+	}
+	defer srcFile.Close()
+	fi, err := srcFile.Stat()
+	if err != nil {
+		return err
+	}
+
+	title := fmt.Sprintf("[0/%d] MB bytes written", fi.Size()/1000/1000)
+	progressBar := message.NewProgressBar(fi.Size(), title)
+	defer progressBar.Close()
+
+	hash := sha256.New()
+	fileCount := 0
+	for {
+		path := fmt.Sprintf("%s.part%03d", srcPath, fileCount+1)
+		dstFile, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, helpers.ReadAllWriteUser)
+		if err != nil {
+			return err
+		}
+		defer dstFile.Close()
+
+		written, copyErr := io.CopyN(dstFile, srcFile, int64(chunkSize))
+		// A non-EOF copy error must surface as copyErr itself; `err` is nil
+		// here (last set by the successful OpenFile) and returning it would
+		// silently swallow the failure.
+		if copyErr != nil && !errors.Is(copyErr, io.EOF) {
+			return copyErr
+		}
+		progressBar.Add(int(written))
+		title := fmt.Sprintf("[%d/%d] MB bytes written", progressBar.GetCurrent()/1000/1000, fi.Size()/1000/1000)
+		progressBar.Updatef(title)
+
+		_, err = dstFile.Seek(0, io.SeekStart)
+		if err != nil {
+			return err
+		}
+		_, err = io.Copy(hash, dstFile)
+		if err != nil {
+			return err
+		}
+		err = dstFile.Close()
+		if err != nil {
+			return err
+		}
+
+		// EOF error could be returned on 0 bytes written.
+ if written == 0 { + err = os.Remove(path) + if err != nil { + return err + } + break + } + + fileCount++ + if errors.Is(copyErr, io.EOF) { + break + } + } + + // Remove original file + err = srcFile.Close() + if err != nil { + return err + } + err = os.Remove(srcPath) + if err != nil { + return err + } + + // Write header file + data := types.ZarfSplitPackageData{ + Count: fileCount, + Bytes: fi.Size(), + Sha256Sum: fmt.Sprintf("%x", hash.Sum(nil)), + } + b, err := json.Marshal(data) + if err != nil { + return fmt.Errorf("unable to marshal the split package data: %w", err) + } + path := fmt.Sprintf("%s.part000", srcPath) + if err := os.WriteFile(path, b, helpers.ReadAllWriteUser); err != nil { + return fmt.Errorf("unable to write the file %s: %w", path, err) + } + progressBar.Successf("Package split across %d files", fileCount+1) + + return nil +} diff --git a/src/pkg/layout/split_test.go b/src/pkg/layout/split_test.go new file mode 100644 index 0000000000..b7f48fc9f6 --- /dev/null +++ b/src/pkg/layout/split_test.go @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2021-Present The Zarf Authors + +package layout + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "testing" + + "github.com/defenseunicorns/zarf/src/types" + "github.com/stretchr/testify/require" +) + +func TestSplitFile(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + fileSize int + chunkSize int + expectedFileSize int64 + expectedLastFileSize int64 + expectedFileCount int + expectedSha256Sum string + }{ + { + name: "split evenly", + fileSize: 2048, + chunkSize: 16, + expectedFileSize: 16, + expectedLastFileSize: 16, + expectedFileCount: 128, + expectedSha256Sum: "93ecad679eff0df493aaf5d7d615211b0f1d7a919016efb15c98f0b8efb1ba43", + }, + { + name: "split with remainder", + fileSize: 2048, + chunkSize: 10, + expectedFileSize: 10, + expectedLastFileSize: 8, + expectedFileCount: 205, + expectedSha256Sum: 
"fe8460f4d53d3578aa37191acf55b3db7bbcb706056f4b6b02a0c70f24b0d95a", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + name := "random" + p := filepath.Join(dir, name) + f, err := os.Create(p) + require.NoError(t, err) + b := make([]byte, tt.fileSize) + for i := range tt.fileSize { + b[i] = byte(tt.chunkSize) + } + require.NoError(t, err) + _, err = f.Write(b) + require.NoError(t, err) + f.Close() + + err = splitFile(p, tt.chunkSize) + require.NoError(t, err) + + _, err = os.Stat(p) + require.ErrorIs(t, err, os.ErrNotExist) + entries, err := os.ReadDir(dir) + require.NoError(t, err) + require.Len(t, entries, tt.expectedFileCount+1) + for i, entry := range entries[1:] { + require.Equal(t, fmt.Sprintf("%s.part%03d", name, i+1), entry.Name()) + + fi, err := entry.Info() + require.NoError(t, err) + if i == len(entries)-2 { + require.Equal(t, tt.expectedLastFileSize, fi.Size()) + } else { + require.Equal(t, tt.expectedFileSize, fi.Size()) + } + } + + b, err = os.ReadFile(filepath.Join(dir, fmt.Sprintf("%s.part000", name))) + require.NoError(t, err) + var data types.ZarfSplitPackageData + err = json.Unmarshal(b, &data) + require.NoError(t, err) + require.Equal(t, tt.expectedFileCount, data.Count) + require.Equal(t, int64(tt.fileSize), data.Bytes) + require.Equal(t, tt.expectedSha256Sum, data.Sha256Sum) + }) + } +} diff --git a/src/pkg/utils/io.go b/src/pkg/utils/io.go index 8b9592def3..2fd4cfc72b 100755 --- a/src/pkg/utils/io.go +++ b/src/pkg/utils/io.go @@ -5,17 +5,13 @@ package utils import ( - "crypto/sha256" - "encoding/json" "fmt" - "io" "os" "path/filepath" "github.com/defenseunicorns/pkg/helpers/v2" "github.com/defenseunicorns/zarf/src/config" "github.com/defenseunicorns/zarf/src/pkg/message" - "github.com/defenseunicorns/zarf/src/types" ) const ( @@ -73,141 +69,3 @@ func GetFinalExecutableCommand() (string, error) { return zarfCommand, err } - -// SplitFile will take 
a srcFile path and split it into files based on chunkSizeBytes -// the first file will be a metadata file containing: -// - sha256sum of the original file -// - number of bytes in the original file -// - number of files the srcFile was split into -// SplitFile will delete the original file -// -// Returns: -// - fileNames: list of file paths srcFile was split across -// - sha256sum: sha256sum of the srcFile before splitting -// - err: any errors encountered -func SplitFile(srcPath string, chunkSizeBytes int) (err error) { - var fileNames []string - var sha256sum string - hash := sha256.New() - - // Set buffer size to some multiple of 4096 KiB for modern file system cluster sizes - bufferSize := 16 * 1024 * 1024 // 16 MiB - // if chunkSizeBytes is less than bufferSize, use chunkSizeBytes as bufferSize for simplicity - if chunkSizeBytes < bufferSize { - bufferSize = chunkSizeBytes - } - buf := make([]byte, bufferSize) - - // get file size - fi, err := os.Stat(srcPath) - if err != nil { - return err - } - fileSize := fi.Size() - - // start progress bar - title := fmt.Sprintf("[0/%d] MB bytes written", fileSize/1000/1000) - progressBar := message.NewProgressBar(fileSize, title) - defer progressBar.Close() - - // open srcFile - srcFile, err := os.Open(srcPath) - if err != nil { - return err - } - defer srcFile.Close() - - // create file path starting from part 001 - path := fmt.Sprintf("%s.part001", srcPath) - chunkFile, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, helpers.ReadAllWriteUser) - if err != nil { - return err - } - fileNames = append(fileNames, path) - defer chunkFile.Close() - - // setup counter for tracking how many bytes are left to write to file - chunkBytesRemaining := chunkSizeBytes - // Loop over the tarball hashing as we go and breaking it into chunks based on the chunkSizeBytes - for { - bytesRead, err := srcFile.Read(buf) - - if err != nil { - if err == io.EOF { - // At end of file, break out of loop - break - } - return err - } - - // Pass 
data to hash - hash.Write(buf[0:bytesRead]) - - // handle if we should split the data between two chunks - if chunkBytesRemaining < bytesRead { - // write the remaining chunk size to file - _, err := chunkFile.Write(buf[0:chunkBytesRemaining]) - if err != nil { - return err - } - err = chunkFile.Close() - if err != nil { - return err - } - - // create new file - path = fmt.Sprintf("%s.part%03d", srcPath, len(fileNames)+1) - chunkFile, err = os.OpenFile(path, os.O_CREATE|os.O_WRONLY, helpers.ReadAllWriteUser) - if err != nil { - return err - } - fileNames = append(fileNames, path) - defer chunkFile.Close() - - // write to new file where we left off - _, err = chunkFile.Write(buf[chunkBytesRemaining:bytesRead]) - if err != nil { - return err - } - - // set chunkBytesRemaining considering how many bytes are already written to new file - chunkBytesRemaining = chunkSizeBytes - (bufferSize - chunkBytesRemaining) - } else { - _, err := chunkFile.Write(buf[0:bytesRead]) - if err != nil { - return err - } - chunkBytesRemaining = chunkBytesRemaining - bytesRead - } - - // update progress bar - progressBar.Add(bufferSize) - title := fmt.Sprintf("[%d/%d] MB bytes written", progressBar.GetCurrent()/1000/1000, fileSize/1000/1000) - progressBar.Updatef(title) - } - srcFile.Close() - _ = os.RemoveAll(srcPath) - - // calculate sha256 sum - sha256sum = fmt.Sprintf("%x", hash.Sum(nil)) - - // Marshal the data into a json file. 
- jsonData, err := json.Marshal(types.ZarfSplitPackageData{ - Count: len(fileNames), - Bytes: fileSize, - Sha256Sum: sha256sum, - }) - if err != nil { - return fmt.Errorf("unable to marshal the split package data: %w", err) - } - - // write header file - path = fmt.Sprintf("%s.part000", srcPath) - if err := os.WriteFile(path, jsonData, helpers.ReadAllWriteUser); err != nil { - return fmt.Errorf("unable to write the file %s: %w", path, err) - } - fileNames = append(fileNames, path) - progressBar.Successf("Package split across %d files", len(fileNames)) - - return nil -}