Skip to content

Commit

Permalink
chunked: drop host dedup feature
Browse files Browse the repository at this point in the history
drop host deduplication by just looking at the file path.  It could be
useful in very specific use cases, but it is too expensive for generic
images.  If the need arises, we first need to create an index of the
files that we can deduplicate so there is no need to calculate the
checksum on the fly.

Signed-off-by: Giuseppe Scrivano <[email protected]>
  • Loading branch information
giuseppe committed Jun 7, 2022
1 parent 72b9d5d commit 268af00
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 109 deletions.
6 changes: 1 addition & 5 deletions docs/containers-storage.conf.5.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ The `storage.options` table supports the following options:
**additionalimagestores**=[]
Paths to additional container image stores. Usually these are read/only and stored on remote network shares.

**pull_options** = {enable_partial_images = "false", enable_host_deduplication = "false", use_hard_links = "false", ostree_repos=""}
**pull_options** = {enable_partial_images = "false", use_hard_links = "false", ostree_repos=""}

Allows specification of how storage is populated when pulling images. This
option can speed the pulling process of images compressed with format zstd:chunked. Containers/storage looks
Expand All @@ -92,10 +92,6 @@ containers/storage supports four keys
* use_hard_links = "false" | "true"
Tells containers/storage to use hard links rather then create new files in
the image, if an identical file already existed in storage.
* enable_host_deduplication = "false" | "true"
Tells containers/storage to search for files under `/usr` in addition to
files in other images when attempting to avoid pulling files from the
container registry.
* ostree_repos = ""
Tells containers/storage where an ostree repository exists that might have
previously pulled content which can be used when attempting to avoid
Expand Down
105 changes: 6 additions & 99 deletions pkg/chunked/storage_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,6 @@ func canDedupFileWithHardLink(file *internal.FileMetadata, fd int, s os.FileInfo
return canDedupMetadataWithHardLink(file, &otherFile)
}

func getFileDigest(f *os.File, buf []byte) (digest.Digest, error) {
digester := digest.Canonical.Digester()
if _, err := io.CopyBuffer(digester.Hash(), f, buf); err != nil {
return "", err
}
return digester.Digest(), nil
}

// findFileInOSTreeRepos checks whether the requested file already exist in one of the OSTree repo and copies the file content from there if possible.
// file is the file to look for.
// ostreeRepos is a list of OSTree repos.
Expand Down Expand Up @@ -330,75 +322,6 @@ func findFileInOSTreeRepos(file *internal.FileMetadata, ostreeRepos []string, di
return false, nil, 0, nil
}

// findFileOnTheHost checks whether the requested file already exist on the host and copies the file content from there if possible.
// It is currently implemented to look only at the file with the same path. Ideally it can detect the same content also at different
// paths.
// file is the file to look for.
// dirfd is an open fd to the destination checkout.
// useHardLinks defines whether the deduplication can be performed using hard links.
func findFileOnTheHost(file *internal.FileMetadata, dirfd int, useHardLinks bool, buf []byte) (bool, *os.File, int64, error) {
sourceFile := filepath.Clean(filepath.Join("/", file.Name))
if !strings.HasPrefix(sourceFile, "/usr/") {
// limit host deduplication to files under /usr.
return false, nil, 0, nil
}

st, err := os.Stat(sourceFile)
if err != nil || !st.Mode().IsRegular() {
return false, nil, 0, nil
}

if st.Size() != file.Size {
return false, nil, 0, nil
}

fd, err := unix.Open(sourceFile, unix.O_RDONLY|unix.O_NONBLOCK, 0)
if err != nil {
return false, nil, 0, nil
}

f := os.NewFile(uintptr(fd), "fd")
defer f.Close()

manifestChecksum, err := digest.Parse(file.Digest)
if err != nil {
return false, nil, 0, err
}

checksum, err := getFileDigest(f, buf)
if err != nil {
return false, nil, 0, err
}

if checksum != manifestChecksum {
return false, nil, 0, nil
}

// check if the open file can be deduplicated with hard links
useHardLinks = useHardLinks && canDedupFileWithHardLink(file, fd, st)

dstFile, written, err := copyFileContent(fd, file.Name, dirfd, 0, useHardLinks)
if err != nil {
return false, nil, 0, nil
}

// calculate the checksum again to make sure the file wasn't modified while it was copied
if _, err := f.Seek(0, 0); err != nil {
dstFile.Close()
return false, nil, 0, err
}
checksum, err = getFileDigest(f, buf)
if err != nil {
dstFile.Close()
return false, nil, 0, err
}
if checksum != manifestChecksum {
dstFile.Close()
return false, nil, 0, nil
}
return true, dstFile, written, nil
}

// findFileInOtherLayers finds the specified file in other layers.
// cache is the layers cache to use.
// file is the file to look for.
Expand Down Expand Up @@ -1297,10 +1220,9 @@ func parseBooleanPullOption(storeOpts *storage.StoreOptions, name string, def bo
}

type findAndCopyFileOptions struct {
useHardLinks bool
enableHostDedup bool
ostreeRepos []string
options *archive.TarOptions
useHardLinks bool
ostreeRepos []string
options *archive.TarOptions
}

func (c *chunkedDiffer) findAndCopyFile(dirfd int, r *internal.FileMetadata, copyOptions *findAndCopyFileOptions, mode os.FileMode) (bool, error) {
Expand Down Expand Up @@ -1336,18 +1258,6 @@ func (c *chunkedDiffer) findAndCopyFile(dirfd int, r *internal.FileMetadata, cop
return true, nil
}

if copyOptions.enableHostDedup {
found, dstFile, _, err = findFileOnTheHost(r, dirfd, copyOptions.useHardLinks, c.copyBuffer)
if err != nil {
return false, err
}
if found {
if err := finalizeFile(dstFile); err != nil {
return false, err
}
return true, nil
}
}
return false, nil
}

Expand Down Expand Up @@ -1376,8 +1286,6 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
return output, errors.New("enable_partial_images not configured")
}

enableHostDedup := parseBooleanPullOption(&storeOpts, "enable_host_deduplication", false)

// When the hard links deduplication is used, file attributes are ignored because setting them
// modifies the source file as well.
useHardLinks := parseBooleanPullOption(&storeOpts, "use_hard_links", false)
Expand Down Expand Up @@ -1426,10 +1334,9 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
missingPartsSize, totalChunksSize := int64(0), int64(0)

copyOptions := findAndCopyFileOptions{
useHardLinks: useHardLinks,
enableHostDedup: enableHostDedup,
ostreeRepos: ostreeRepos,
options: options,
useHardLinks: useHardLinks,
ostreeRepos: ostreeRepos,
options: options,
}

type copyFileJob struct {
Expand Down
6 changes: 1 addition & 5 deletions storage.conf
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,11 @@ additionalimagestores = [
# * use_hard_links = "false" | "true"
# Tells containers/storage to use hard links rather then create new files in
# the image, if an identical file already existed in storage.
# * enable_host_deduplication = "false" | "true"
# Tells containers/storage to search for files under `/usr` in addition to
# files in other images when attempting to avoid pulling files from the
# container registry.
# * ostree_repos = ""
# Tells containers/storage where an ostree repository exists that might have
# previously pulled content which can be used when attempting to avoid
# pulling content from the container registry
pull_options = {enable_partial_images = "false", enable_host_deduplication = "false", use_hard_links = "false", ostree_repos=""}
pull_options = {enable_partial_images = "false", use_hard_links = "false", ostree_repos=""}

# Remap-UIDs/GIDs is the mapping from UIDs/GIDs as they should appear inside of
# a container, to the UIDs/GIDs as they should appear outside of the container,
Expand Down

0 comments on commit 268af00

Please sign in to comment.