Skip to content

Commit

Permalink
chunked: drop host dedup feature
Browse files Browse the repository at this point in the history
drop host deduplication by just looking at the file path.  It could be
useful in very specific use cases, but it is too expensive for generic
images.  If the need arises, we first need to create an index of the
files that we can deduplicate so there is no need to calculate the
checksum on the fly.

Signed-off-by: Giuseppe Scrivano <[email protected]>
  • Loading branch information
giuseppe committed Jun 7, 2022
1 parent 2b2fb9f commit 8ace82d
Showing 1 changed file with 6 additions and 99 deletions.
105 changes: 6 additions & 99 deletions pkg/chunked/storage_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,6 @@ func canDedupFileWithHardLink(file *internal.FileMetadata, fd int, s os.FileInfo
return canDedupMetadataWithHardLink(file, &otherFile)
}

func getFileDigest(f *os.File, buf []byte) (digest.Digest, error) {
digester := digest.Canonical.Digester()
if _, err := io.CopyBuffer(digester.Hash(), f, buf); err != nil {
return "", err
}
return digester.Digest(), nil
}

// findFileInOSTreeRepos checks whether the requested file already exist in one of the OSTree repo and copies the file content from there if possible.
// file is the file to look for.
// ostreeRepos is a list of OSTree repos.
Expand Down Expand Up @@ -330,75 +322,6 @@ func findFileInOSTreeRepos(file *internal.FileMetadata, ostreeRepos []string, di
return false, nil, 0, nil
}

// findFileOnTheHost checks whether the requested file already exist on the host and copies the file content from there if possible.
// It is currently implemented to look only at the file with the same path. Ideally it can detect the same content also at different
// paths.
// file is the file to look for.
// dirfd is an open fd to the destination checkout.
// useHardLinks defines whether the deduplication can be performed using hard links.
func findFileOnTheHost(file *internal.FileMetadata, dirfd int, useHardLinks bool, buf []byte) (bool, *os.File, int64, error) {
sourceFile := filepath.Clean(filepath.Join("/", file.Name))
if !strings.HasPrefix(sourceFile, "/usr/") {
// limit host deduplication to files under /usr.
return false, nil, 0, nil
}

st, err := os.Stat(sourceFile)
if err != nil || !st.Mode().IsRegular() {
return false, nil, 0, nil
}

if st.Size() != file.Size {
return false, nil, 0, nil
}

fd, err := unix.Open(sourceFile, unix.O_RDONLY|unix.O_NONBLOCK, 0)
if err != nil {
return false, nil, 0, nil
}

f := os.NewFile(uintptr(fd), "fd")
defer f.Close()

manifestChecksum, err := digest.Parse(file.Digest)
if err != nil {
return false, nil, 0, err
}

checksum, err := getFileDigest(f, buf)
if err != nil {
return false, nil, 0, err
}

if checksum != manifestChecksum {
return false, nil, 0, nil
}

// check if the open file can be deduplicated with hard links
useHardLinks = useHardLinks && canDedupFileWithHardLink(file, fd, st)

dstFile, written, err := copyFileContent(fd, file.Name, dirfd, 0, useHardLinks)
if err != nil {
return false, nil, 0, nil
}

// calculate the checksum again to make sure the file wasn't modified while it was copied
if _, err := f.Seek(0, 0); err != nil {
dstFile.Close()
return false, nil, 0, err
}
checksum, err = getFileDigest(f, buf)
if err != nil {
dstFile.Close()
return false, nil, 0, err
}
if checksum != manifestChecksum {
dstFile.Close()
return false, nil, 0, nil
}
return true, dstFile, written, nil
}

// findFileInOtherLayers finds the specified file in other layers.
// cache is the layers cache to use.
// file is the file to look for.
Expand Down Expand Up @@ -1297,10 +1220,9 @@ func parseBooleanPullOption(storeOpts *storage.StoreOptions, name string, def bo
}

type findAndCopyFileOptions struct {
useHardLinks bool
enableHostDedup bool
ostreeRepos []string
options *archive.TarOptions
useHardLinks bool
ostreeRepos []string
options *archive.TarOptions
}

func (c *chunkedDiffer) findAndCopyFile(dirfd int, r *internal.FileMetadata, copyOptions *findAndCopyFileOptions, mode os.FileMode) (bool, error) {
Expand Down Expand Up @@ -1336,18 +1258,6 @@ func (c *chunkedDiffer) findAndCopyFile(dirfd int, r *internal.FileMetadata, cop
return true, nil
}

if copyOptions.enableHostDedup {
found, dstFile, _, err = findFileOnTheHost(r, dirfd, copyOptions.useHardLinks, c.copyBuffer)
if err != nil {
return false, err
}
if found {
if err := finalizeFile(dstFile); err != nil {
return false, err
}
return true, nil
}
}
return false, nil
}

Expand Down Expand Up @@ -1376,8 +1286,6 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
return output, errors.New("enable_partial_images not configured")
}

enableHostDedup := parseBooleanPullOption(&storeOpts, "enable_host_deduplication", false)

// When the hard links deduplication is used, file attributes are ignored because setting them
// modifies the source file as well.
useHardLinks := parseBooleanPullOption(&storeOpts, "use_hard_links", false)
Expand Down Expand Up @@ -1426,10 +1334,9 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
missingPartsSize, totalChunksSize := int64(0), int64(0)

copyOptions := findAndCopyFileOptions{
useHardLinks: useHardLinks,
enableHostDedup: enableHostDedup,
ostreeRepos: ostreeRepos,
options: options,
useHardLinks: useHardLinks,
ostreeRepos: ostreeRepos,
options: options,
}

type copyFileJob struct {
Expand Down

0 comments on commit 8ace82d

Please sign in to comment.