diff --git a/pkg/sketch/sketch.go b/pkg/sketch/sketch.go
index 44888f6..f8fdb05 100644
--- a/pkg/sketch/sketch.go
+++ b/pkg/sketch/sketch.go
@@ -19,8 +19,14 @@ type Sketcher struct {
 }
 
 type cloneCandidate struct {
-	descriptors []v1.Descriptor
+	descriptors []filesegment.Descriptor
 	dirPath     string
+	filename    string
+}
+
+// FilePath returns the candidate's file location: dirPath joined with filename.
+func (cc *cloneCandidate) FilePath() string {
+	return filepath.Join(cc.dirPath, cc.filename)
 }
 
 func resizeFile(filePath string, newSize int64) error {
@@ -40,6 +46,49 @@ func resizeFile(filePath string, newSize int64) error {
 	return nil
 }
 
+// fileExists reports whether filePath names an existing non-directory.
+// Any os.Stat failure counts as "not there": info is nil whenever err is
+// non-nil, so only checking os.IsNotExist would panic on other errors.
+func fileExists(filePath string) bool {
+	info, err := os.Stat(filePath)
+	if err != nil {
+		return false
+	}
+	return !info.IsDir()
+}
+
+// findBestCloneCandidate scores every clone candidate against every file
+// blueprint and returns the single candidate with the highest number of
+// matching segment digests. It returns an error when no candidate shares at
+// least one digest with any blueprint.
+func (sc *Sketcher) findBestCloneCandidate(fileBlueprints []*fileBlueprint, cloneCandidates []*cloneCandidate) (*cloneCandidate, error) {
+	var bestCloneCandidate *cloneCandidate
+	bestScore := 0
+
+	for _, fr := range fileBlueprints {
+		// Create a map of digest segments for each file blueprint
+		segmentsDigestMap := make(map[string]filesegment.Descriptor)
+		for _, seg := range fr.Segments {
+			segmentsDigestMap[seg.Digest().String()] = *seg
+		}
+
+		// Find the best matching candidate
+		for _, cc := range cloneCandidates {
+			score := sc.computeScore(segmentsDigestMap, cc)
+			if score > bestScore {
+				bestScore = score
+				bestCloneCandidate = cc
+			}
+		}
+	}
+
+	if bestCloneCandidate == nil {
+		return nil, fmt.Errorf("no matching clone candidate found")
+	}
+
+	return bestCloneCandidate, nil
+}
+
 func (sc *Sketcher) Sketch(dir string, manifest v1.Manifest) (bytesClonedCount int64, matchedSegmentsCount int64, err error) {
 
 	fileBlueprints, err := createBlueprintsFromManifest(manifest)
@@ -79,7 +128,7 @@ func (sc *Sketcher) Sketch(dir string, manifest v1.Manifest) (bytesClonedCount i
 		}
 		bytesClonedCount += fr.Size()
 		matchedSegmentsCount += int64(bestScore)
-		src := filepath.Join(bestCloneCandidate.dirPath, fr.Filename)
+		src := bestCloneCandidate.FilePath()
 		dest := filepath.Join(dir, fr.Filename)
 		if src == dest {
 			continue
@@ -122,21 +171,34 @@ func (sc *Sketcher) findCloneCandidates() ([]*cloneCandidate, error) {
 		if err != nil {
 			return nil, err
 		}
 		manifest, err := v1.ParseManifest(f)
-		if err == nil {
-			descriptors := make([]v1.Descriptor, 0)
-			for _, d := range manifest.Layers {
-				descriptors = append(descriptors, d)
+		// Close right away rather than deferring: a deferred Close would not
+		// run until findCloneCandidates returns, keeping every manifest open.
+		f.Close()
+		if err != nil {
+			return nil, err
+		}
+		// Map to group descriptors by filename
+		fileDescriptorMap := make(map[string][]filesegment.Descriptor)
+
+		// Parse each layer and group by filename
+		for _, l := range manifest.Layers {
+			segmentDescriptor, err := filesegment.ParseDescriptor(l)
+			if err != nil {
+				return nil, fmt.Errorf("unable to parse descriptor: %w", err)
 			}
+			filename := segmentDescriptor.Filename()
+			fileDescriptorMap[filename] = append(fileDescriptorMap[filename], *segmentDescriptor)
+		}
+
+		// Create clone candidates for each file
+		for filename, descriptors := range fileDescriptorMap {
 			candidates = append(candidates, &cloneCandidate{
-				descriptors: descriptors,
-				dirPath:     filepath.Dir(job.path),
+				descriptors: descriptors,            // All descriptors from the same file
+				dirPath:     filepath.Dir(job.path), // Directory path from the job
+				filename:    filename,
 			})
 		}
-		f.Close()
-		if err != nil {
-			return nil, err
-		}
 	}
 	return candidates, nil
 }
@@ -144,7 +206,7 @@ func (sc *Sketcher) findCloneCandidates() ([]*cloneCandidate, error) {
 func (sc *Sketcher) computeScore(segmentDigestMap map[string]filesegment.Descriptor, m *cloneCandidate) int {
 	score := 0
 	for _, descriptor := range m.descriptors {
-		_, ok := segmentDigestMap[descriptor.Digest.String()]
+		_, ok := segmentDigestMap[descriptor.Digest().String()]
 		if ok {
 			score += 1
 		}
diff --git a/pkg/sketch/sketch_test.go b/pkg/sketch/sketch_test.go index 4629447..621d660 100644 --- a/pkg/sketch/sketch_test.go +++ b/pkg/sketch/sketch_test.go @@ -1,9 +1,10 @@ package sketch import ( + 
"context" "fmt" v1 "github.com/google/go-containerregistry/pkg/v1" - "github.com/google/go-containerregistry/pkg/v1/random" + "github.com/mobileinf/geranos/pkg/dirimage" "github.com/mobileinf/geranos/pkg/filesegment" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -15,30 +16,33 @@ import ( func TestDefaultSketchConstructor_ConstructConstruct(t *testing.T) { const layersCount = 10 const testManifestName = ".oci.test.json" - prepare5CloneCandidatesWith10Layers := func(rootDir string) []*v1.Descriptor { - descriptors := make([]*v1.Descriptor, 0) + prepare5CloneCandidatesWith10Layers := func(rootDir string) []*filesegment.Descriptor { + descriptors := make([]*filesegment.Descriptor, 0) for i := 0; i < 5; i++ { - img, err := random.Image(1024, layersCount) - require.NoError(t, err) - manifestBytes, err := img.RawManifest() - require.NoError(t, err) localDir := filepath.Join(rootDir, fmt.Sprintf("v%d", i)) - err = os.MkdirAll(localDir, os.ModePerm) + err := os.MkdirAll(localDir, os.ModePerm) require.NoError(t, err) - require.NoError(t, err) - manifest, err := img.Manifest() - require.NoError(t, err) - // Create fake file + // Create fake files err = os.WriteFile(filepath.Join(localDir, "disk.img"), []byte("0123456789"), 0o755) require.NoError(t, err) - // Create fake file err = os.WriteFile(filepath.Join(localDir, "disk2.img"), []byte("0123456789"), 0o755) require.NoError(t, err) + + img, err := dirimage.Read(context.Background(), localDir, dirimage.WithChunkSize(1)) + require.NoError(t, err) + // Convert manifest layers to filesegment.Descriptor + manifest, err := img.Manifest() + require.NoError(t, err) for _, d := range manifest.Layers { - descriptors = append(descriptors, &d) + fDescriptor, err := filesegment.ParseDescriptor(d) + require.NoError(t, err) + descriptors = append(descriptors, fDescriptor) } + manifestBytes, err := img.RawManifest() + require.NoError(t, err) err = os.WriteFile(filepath.Join(localDir, testManifestName), 
manifestBytes, 0o777) + require.NoError(t, err) } return descriptors } @@ -58,16 +62,16 @@ func TestDefaultSketchConstructor_ConstructConstruct(t *testing.T) { tests := []struct { name string - prepareManifest func(ds []*v1.Descriptor) v1.Manifest + prepareManifest func(ds []*filesegment.Descriptor) v1.Manifest bytesClonedCount int64 matchedSegmentsCount int64 - prepareCloneCandidates func(rootDir string) []*v1.Descriptor + prepareCloneCandidates func(rootDir string) []*filesegment.Descriptor expectedErr error }{ { name: "successful construct of single recipe", - prepareManifest: func(ds []*v1.Descriptor) v1.Manifest { - return makeManifest(filesegment.NewDescriptor("disk.img", 0, 1, ds[0].Digest)) + prepareManifest: func(ds []*filesegment.Descriptor) v1.Manifest { + return makeManifest(filesegment.NewDescriptor("disk.img", 0, 1, ds[0].Digest())) }, bytesClonedCount: 2, matchedSegmentsCount: 1, @@ -76,46 +80,35 @@ func TestDefaultSketchConstructor_ConstructConstruct(t *testing.T) { }, { name: "successful construct of all file recipes", - prepareManifest: func(ds []*v1.Descriptor) v1.Manifest { + prepareManifest: func(ds []*filesegment.Descriptor) v1.Manifest { return makeManifest( - filesegment.NewDescriptor("disk.img", 0, 2, ds[1].Digest), - filesegment.NewDescriptor("disk.img", 3, 4, ds[2].Digest), - filesegment.NewDescriptor("disk2.img", 0, 10, ds[0].Digest), + filesegment.NewDescriptor("disk.img", 0, 2, ds[1].Digest()), + filesegment.NewDescriptor("disk.img", 3, 4, ds[2].Digest()), + filesegment.NewDescriptor("disk2.img", 0, 10, ds[0].Digest()), ) }, - bytesClonedCount: 16, - matchedSegmentsCount: 3, - + bytesClonedCount: 16, + matchedSegmentsCount: 3, prepareCloneCandidates: prepare5CloneCandidatesWith10Layers, expectedErr: nil, }, { name: "successful construct from best clone", - prepareManifest: func(ds []*v1.Descriptor) v1.Manifest { - /*fr1 := makeFR("disk.img", - Seg{0, 1, ds[0].Digest}, - Seg{2, 3, ds[1*layersCount+0].Digest}, - Seg{4, 5, 
ds[2*layersCount+0].Digest}, - Seg{6, 7, ds[2*layersCount+1].Digest}, - Seg{8, 9, ds[2*layersCount+2].Digest}, - Seg{10, 11, ds[3*layersCount+0].Digest}, - Seg{12, 13, ds[3*layersCount+1].Digest}, - )*/ + prepareManifest: func(ds []*filesegment.Descriptor) v1.Manifest { return makeManifest( - filesegment.NewDescriptor("disk.img", 0, 1, ds[0].Digest), - filesegment.NewDescriptor("disk.img", 2, 3, ds[1*layersCount+0].Digest), - filesegment.NewDescriptor("disk.img", 4, 5, ds[2*layersCount+0].Digest), - filesegment.NewDescriptor("disk.img", 6, 7, ds[2*layersCount+1].Digest), - filesegment.NewDescriptor("disk.img", 8, 9, ds[2*layersCount+2].Digest), - filesegment.NewDescriptor("disk.img", 10, 11, ds[3*layersCount+0].Digest), - filesegment.NewDescriptor("disk.img", 12, 13, ds[3*layersCount+1].Digest), + filesegment.NewDescriptor("disk.img", 0, 1, ds[0].Digest()), + filesegment.NewDescriptor("disk.img", 2, 3, ds[1].Digest()), + filesegment.NewDescriptor("disk.img", 4, 5, ds[2].Digest()), + filesegment.NewDescriptor("disk.img", 6, 7, ds[3].Digest()), + filesegment.NewDescriptor("disk.img", 8, 9, ds[4].Digest()), + filesegment.NewDescriptor("disk.img", 10, 11, ds[5].Digest()), + filesegment.NewDescriptor("disk.img", 12, 13, ds[6].Digest()), ) }, - bytesClonedCount: 14, - matchedSegmentsCount: 3, - + bytesClonedCount: 14, + matchedSegmentsCount: 7, prepareCloneCandidates: prepare5CloneCandidatesWith10Layers, expectedErr: nil, }, @@ -148,25 +141,37 @@ func TestDefaultSketchConstructor_ConstructConstruct(t *testing.T) { } func TestSketchConstructor_FindCloneCandidates(t *testing.T) { - const localManifestFile = ".oci.test.json" + const testManifestName = ".oci.test.json" tests := []struct { name string - setup func(rootDir string) // Function to setup the test's file system state + setup func(rootDir string) // Function to set up the test's file system state expectedLength int expectedErr bool // Whether an error is expected }{ { name: "single manifest file", setup: 
func(rootDir string) { - // Create a directory structure with one manifest - dirPath := filepath.Join(rootDir, "directory1") - err := os.MkdirAll(dirPath, os.ModePerm) + localDir := filepath.Join(rootDir, "directory1") + err := os.MkdirAll(localDir, os.ModePerm) + require.NoError(t, err) + + // Create fake files + err = os.WriteFile(filepath.Join(localDir, "disk.img"), []byte("0123456789"), 0o755) require.NoError(t, err) - manifestPath := filepath.Join(dirPath, localManifestFile) - err = os.WriteFile(manifestPath, []byte("{}"), 0644) // Write an empty JSON object as a placeholder + err = os.WriteFile(filepath.Join(localDir, "disk2.img"), []byte("0123456789"), 0o755) + require.NoError(t, err) + + // Use dirimage to read and generate manifest + img, err := dirimage.Read(context.Background(), localDir, dirimage.WithChunkSize(1)) + require.NoError(t, err) + + manifestBytes, err := img.RawManifest() + require.NoError(t, err) + + err = os.WriteFile(filepath.Join(localDir, testManifestName), manifestBytes, 0o777) require.NoError(t, err) }, - expectedLength: 1, + expectedLength: 2, expectedErr: false, }, { @@ -181,35 +186,47 @@ func TestSketchConstructor_FindCloneCandidates(t *testing.T) { { name: "multiple manifest files", setup: func(rootDir string) { - // Create a directory structure with one manifest for i := 0; i < 5; i++ { - dirPath := filepath.Join(rootDir, fmt.Sprintf("directory%d", i)) - err := os.MkdirAll(dirPath, os.ModePerm) + localDir := filepath.Join(rootDir, fmt.Sprintf("directory%d", i)) + err := os.MkdirAll(localDir, os.ModePerm) + require.NoError(t, err) + + // Create fake files + err = os.WriteFile(filepath.Join(localDir, "disk.img"), []byte("0123456789"), 0o755) + require.NoError(t, err) + err = os.WriteFile(filepath.Join(localDir, "disk2.img"), []byte("01234567"), 0o755) require.NoError(t, err) - err = os.WriteFile(filepath.Join(dirPath, localManifestFile), []byte("{}"), 0644) + err = os.WriteFile(filepath.Join(localDir, "disk3.img"), []byte("66666"), 
0o755) require.NoError(t, err) - err = os.WriteFile(filepath.Join(dirPath, "disk.img"), []byte(""), 0644) + + // Use dirimage to read and generate manifest + img, err := dirimage.Read(context.Background(), localDir, dirimage.WithChunkSize(1)) + require.NoError(t, err) + + manifestBytes, err := img.RawManifest() + require.NoError(t, err) + + err = os.WriteFile(filepath.Join(localDir, testManifestName), manifestBytes, 0o777) require.NoError(t, err) } }, - expectedLength: 5, + expectedLength: 15, expectedErr: false, }, - // Add more test cases as needed } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - // Setup temporary directory for the test + // Set up temporary directory for the test rootDir, err := os.MkdirTemp("", "test_find_clone_candidates") assert.NoError(t, err) defer os.RemoveAll(rootDir) // Cleanup after the test - // Setup the specific file system state for this test + // Set up the specific file system state for this test tt.setup(rootDir) // Call the method under test - sc := NewSketcher(rootDir, localManifestFile) + sc := NewSketcher(rootDir, testManifestName) candidates, err := sc.findCloneCandidates() if tt.expectedErr { @@ -222,25 +239,26 @@ func TestSketchConstructor_FindCloneCandidates(t *testing.T) { } } -// TestComputeScore tests the computeScore method of defaultSketchConstructor for various scenarios. 
func TestSketchConstructor_ComputeScore(t *testing.T) { + // Define test cases // Define test cases newDescriptor := func(hash string) filesegment.Descriptor { return *filesegment.NewDescriptor("test", 0, 0, v1.Hash{Algorithm: "sha256", Hex: hash}) } + tests := []struct { name string segmentDescriptors map[string]filesegment.Descriptor - descriptors []v1.Descriptor + descriptors []filesegment.Descriptor expectedScore int }{ { name: "No match - different hash", segmentDescriptors: map[string]filesegment.Descriptor{ - "hash1": newDescriptor("hash1"), + "sha256:hash1": newDescriptor("hash1"), }, - descriptors: []v1.Descriptor{ - {Digest: v1.Hash{Hex: "hash2"}}, + descriptors: []filesegment.Descriptor{ + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash2"}), }, expectedScore: 0, }, @@ -249,8 +267,8 @@ func TestSketchConstructor_ComputeScore(t *testing.T) { segmentDescriptors: map[string]filesegment.Descriptor{ "sha256:hash1": newDescriptor("hash1"), }, - descriptors: []v1.Descriptor{ - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash1"}}, + descriptors: []filesegment.Descriptor{ + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash1"}), }, expectedScore: 1, }, @@ -261,10 +279,10 @@ func TestSketchConstructor_ComputeScore(t *testing.T) { "sha256:hash2": newDescriptor("hash2"), "sha256:hash3": newDescriptor("hash3"), }, - descriptors: []v1.Descriptor{ - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash2"}}, - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash1"}}, - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash3"}}, + descriptors: []filesegment.Descriptor{ + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash2"}), + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash1"}), + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash3"}), }, expectedScore: 3, }, @@ -278,11 +296,11 @@ func 
TestSketchConstructor_ComputeScore(t *testing.T) { "sha256:hash4": newDescriptor("hash4"), "sha256:hash5": newDescriptor("hash5"), }, - descriptors: []v1.Descriptor{ - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash5"}}, - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash3"}}, - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash1"}}, - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash11"}}, + descriptors: []filesegment.Descriptor{ + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash5"}), + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash3"}), + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash1"}), + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash11"}), }, expectedScore: 3, }, @@ -292,10 +310,9 @@ func TestSketchConstructor_ComputeScore(t *testing.T) { "sha256:hash1": newDescriptor("hash1"), "sha256:hash4": newDescriptor("hash4"), }, - descriptors: []v1.Descriptor{ - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash1"}}, - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash2"}}, - {Digest: v1.Hash{Algorithm: "sha256", Hex: "hash3"}}, + descriptors: []filesegment.Descriptor{ + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash1"}), + *filesegment.NewDescriptor("testfile", 0, 0, v1.Hash{Algorithm: "sha256", Hex: "hash2"}), }, expectedScore: 1, },