From 8ace82d825124a545730dbfb1ff6c7ff29d2b00b Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 7 Jun 2022 14:25:53 +0200 Subject: [PATCH] chunked: drop host dedup feature drop host deduplication by just looking at the file path. It could be useful in very specific use cases, but it is too expensive for generic images. If the need arises, we first need to create an index of the files that we can deduplicate so there is no need to calculate the checksum on the fly. Signed-off-by: Giuseppe Scrivano --- pkg/chunked/storage_linux.go | 105 ++--------------------------------- 1 file changed, 6 insertions(+), 99 deletions(-) diff --git a/pkg/chunked/storage_linux.go b/pkg/chunked/storage_linux.go index 9434499d2d..7b6cd8fe40 100644 --- a/pkg/chunked/storage_linux.go +++ b/pkg/chunked/storage_linux.go @@ -272,14 +272,6 @@ func canDedupFileWithHardLink(file *internal.FileMetadata, fd int, s os.FileInfo return canDedupMetadataWithHardLink(file, &otherFile) } -func getFileDigest(f *os.File, buf []byte) (digest.Digest, error) { - digester := digest.Canonical.Digester() - if _, err := io.CopyBuffer(digester.Hash(), f, buf); err != nil { - return "", err - } - return digester.Digest(), nil -} - // findFileInOSTreeRepos checks whether the requested file already exist in one of the OSTree repo and copies the file content from there if possible. // file is the file to look for. // ostreeRepos is a list of OSTree repos. @@ -330,75 +322,6 @@ func findFileInOSTreeRepos(file *internal.FileMetadata, ostreeRepos []string, di return false, nil, 0, nil } -// findFileOnTheHost checks whether the requested file already exist on the host and copies the file content from there if possible. -// It is currently implemented to look only at the file with the same path. Ideally it can detect the same content also at different -// paths. -// file is the file to look for. -// dirfd is an open fd to the destination checkout. -// useHardLinks defines whether the deduplication can be performed using hard links. -func findFileOnTheHost(file *internal.FileMetadata, dirfd int, useHardLinks bool, buf []byte) (bool, *os.File, int64, error) { - sourceFile := filepath.Clean(filepath.Join("/", file.Name)) - if !strings.HasPrefix(sourceFile, "/usr/") { - // limit host deduplication to files under /usr. - return false, nil, 0, nil - } - - st, err := os.Stat(sourceFile) - if err != nil || !st.Mode().IsRegular() { - return false, nil, 0, nil - } - - if st.Size() != file.Size { - return false, nil, 0, nil - } - - fd, err := unix.Open(sourceFile, unix.O_RDONLY|unix.O_NONBLOCK, 0) - if err != nil { - return false, nil, 0, nil - } - - f := os.NewFile(uintptr(fd), "fd") - defer f.Close() - - manifestChecksum, err := digest.Parse(file.Digest) - if err != nil { - return false, nil, 0, err - } - - checksum, err := getFileDigest(f, buf) - if err != nil { - return false, nil, 0, err - } - - if checksum != manifestChecksum { - return false, nil, 0, nil - } - - // check if the open file can be deduplicated with hard links - useHardLinks = useHardLinks && canDedupFileWithHardLink(file, fd, st) - - dstFile, written, err := copyFileContent(fd, file.Name, dirfd, 0, useHardLinks) - if err != nil { - return false, nil, 0, nil - } - - // calculate the checksum again to make sure the file wasn't modified while it was copied - if _, err := f.Seek(0, 0); err != nil { - dstFile.Close() - return false, nil, 0, err - } - checksum, err = getFileDigest(f, buf) - if err != nil { - dstFile.Close() - return false, nil, 0, err - } - if checksum != manifestChecksum { - dstFile.Close() - return false, nil, 0, nil - } - return true, dstFile, written, nil -} - // findFileInOtherLayers finds the specified file in other layers. // cache is the layers cache to use. // file is the file to look for. @@ -1297,10 +1220,9 @@ func parseBooleanPullOption(storeOpts *storage.StoreOptions, name string, def bo } type findAndCopyFileOptions struct { - useHardLinks bool - enableHostDedup bool - ostreeRepos []string - options *archive.TarOptions + useHardLinks bool + ostreeRepos []string + options *archive.TarOptions } func (c *chunkedDiffer) findAndCopyFile(dirfd int, r *internal.FileMetadata, copyOptions *findAndCopyFileOptions, mode os.FileMode) (bool, error) { @@ -1336,18 +1258,6 @@ func (c *chunkedDiffer) findAndCopyFile(dirfd int, r *internal.FileMetadata, cop return true, nil } - if copyOptions.enableHostDedup { - found, dstFile, _, err = findFileOnTheHost(r, dirfd, copyOptions.useHardLinks, c.copyBuffer) - if err != nil { - return false, err - } - if found { - if err := finalizeFile(dstFile); err != nil { - return false, err - } - return true, nil - } - } return false, nil } @@ -1376,8 +1286,6 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra return output, errors.New("enable_partial_images not configured") } - enableHostDedup := parseBooleanPullOption(&storeOpts, "enable_host_deduplication", false) - // When the hard links deduplication is used, file attributes are ignored because setting them // modifies the source file as well. useHardLinks := parseBooleanPullOption(&storeOpts, "use_hard_links", false) @@ -1426,10 +1334,9 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra missingPartsSize, totalChunksSize := int64(0), int64(0) copyOptions := findAndCopyFileOptions{ - useHardLinks: useHardLinks, - enableHostDedup: enableHostDedup, - ostreeRepos: ostreeRepos, - options: options, + useHardLinks: useHardLinks, + ostreeRepos: ostreeRepos, + options: options, } type copyFileJob struct {