merge: Integrate file level data from scancode
This resolves #480

The first commit modifies cache loading and saving to cover
file level data.
The second commit modifies the scancode executor to leverage
file level data caching and collection for an image.

Signed-off-by: Nisha K [email protected]
Authored by Nisha K on Mar 18, 2020
Merge commit aa87d3e (2 parents: 2c216e5 + 9dc4f2c)
Showing 4 changed files with 103 additions and 80 deletions.
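
Throughout this diff, the cache is keyed by each layer's filesystem hash. For orientation, here is a minimal sketch of one cached entry, with field names inferred from the accessors used below (cache.get_packages, cache.get_files, cache.cache[fs_hash]['files_analyzed']); the authoritative schema lives in tern/utils/cache.py and may differ:

    # Illustrative sketch only; names inferred from this diff, values made up
    cache_contents = {
        '<layer fs_hash>': {
            'files_analyzed': True,
            'packages': [{
                'name': 'musl',
                'origins': [{
                    'origin_str': 'command_lib',
                    'notices': [{'message': '...', 'level': 'warning'}],
                }],
            }],
            'files': [{
                'name': 'os-release',
                'path': 'etc/os-release',
            }],
        },
    }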
tern/analyze/common.py (78 changes: 64 additions & 14 deletions)
@@ -11,6 +11,7 @@
 import os

 from tern.classes.package import Package
+from tern.classes.file_data import FileData
 from tern.classes.notice import Notice
 from tern.classes.command import Command
 from tern.command_lib import command_lib
@@ -37,22 +38,71 @@ def get_shell_commands(shell_command_line):

 def load_from_cache(layer, redo=False):
     '''Given a layer object, check against cache to see if that layer id exists
-    if yes then get the package list and load it in the layer and return true.
-    If it doesn't exist return false. Default operation is to not redo the
-    cache. Add notices to the layer's origins matching the origin_str'''
+    if yes then load any relevant layer level information. The default
+    operation is to not redo the cache. Add notices to the layer's origins
+    matching the origin_str'''
     loaded = False
-    if not layer.packages and not redo:
-        # there are no packages in this layer and we are not repopulating the
-        # cache, try to get it from the cache
-        raw_pkg_list = cache.get_packages(layer.fs_hash)
-        if raw_pkg_list:
-            logger.debug('Loaded from cache: layer \"%s\"', layer.fs_hash[:10])
-            for pkg_dict in raw_pkg_list:
-                pkg = Package(pkg_dict['name'])
-                pkg.fill(pkg_dict)
-                layer.add_package(pkg)
+    if not redo:
+        # check if packages are available in the cache
+        if load_packages_from_cache(layer):
+            loaded = True
+        # check if files are available in the cache
+        if load_files_from_cache(layer):
+            loaded = True
+        # load some extra properties into the layer if available
+        if layer.fs_hash in cache.get_layers():
+            layer.files_analyzed = cache.cache[layer.fs_hash]['files_analyzed']
+    # load any origin data
+    load_notices_from_cache(layer)
     return loaded
+
+
+def load_packages_from_cache(layer):
+    '''Given a layer object, populate package level information'''
+    loaded = False
+    raw_pkg_list = cache.get_packages(layer.fs_hash)
+    if raw_pkg_list:
+        logger.debug(
+            'Loading packages from cache: layer \"%s\"', layer.fs_hash[:10])
+        for pkg_dict in raw_pkg_list:
+            pkg = Package(pkg_dict['name'])
+            pkg.fill(pkg_dict)
+            # collect package origins
+            if 'origins' in pkg_dict.keys():
+                for origin_dict in pkg_dict['origins']:
+                    for notice in origin_dict['notices']:
+                        pkg.origins.add_notice_to_origins(
+                            origin_dict['origin_str'], Notice(
+                                notice['message'], notice['level']))
+            layer.add_package(pkg)
+        loaded = True
+    return loaded
+
+
+def load_files_from_cache(layer):
+    '''Given a layer object, populate file level information'''
+    loaded = False
+    raw_file_list = cache.get_files(layer.fs_hash)
+    if raw_file_list:
+        logger.debug(
+            'Loading files from cache: layer \"%s\"', layer.fs_hash[:10])
+        for file_dict in raw_file_list:
+            f = FileData(file_dict['name'], file_dict['path'])
+            f.fill(file_dict)
+            # collect file origins
+            if 'origins' in file_dict.keys():
+                for origin_dict in file_dict['origins']:
+                    for notice in origin_dict['notices']:
+                        f.origins.add_notice_to_origins(
+                            origin_dict['origin_str'], Notice(
+                                notice['message'], notice['level']))
+            layer.add_file(f)
+        loaded = True
+    else:
+        # if there are no files, generate them from the pre-calculated
+        # hash file
+        logger.debug('Reading files in filesystem...')
+        layer.add_files()
+    return loaded


@@ -70,7 +120,7 @@ def load_notices_from_cache(layer):
 def save_to_cache(image):
     '''Given an image object, save all layers to the cache'''
     for layer in image.layers:
-        if layer.packages:
+        if layer.packages or layer.files_analyzed:
             cache.add_layer(layer)
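
With the changes above, a cache-aware analysis pass over an image reduces to a load-or-analyze loop followed by one save. A hedged sketch (not code from this commit; analyze_layer_contents is a hypothetical stand-in for whichever analyzer runs):

    from tern.analyze import common

    for layer in image.layers:  # image: a previously loaded Image object
        if not common.load_from_cache(layer):
            analyze_layer_contents(layer)  # hypothetical analyzer
            layer.files_analyzed = True
    # persist package and file level data for the next run
    common.save_to_cache(image)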
tern/classes/docker_image.py (12 changes: 0 additions & 12 deletions)
@@ -7,9 +7,7 @@
 import os
 import subprocess  # nosec

-from tern.classes.file_data import FileData
 from tern.utils import rootfs
-from tern.utils.cache import get_files
 from tern.utils.general import pushd
 from tern.utils.constants import manifest_file
 from tern.analyze.docker.container import extract_image_metadata
@@ -161,16 +159,6 @@ def load_image(self):
             while layer_diffs and layer_paths:
                 layer = ImageLayer(layer_diffs.pop(0), layer_paths.pop(0))
                 layer.gen_fs_hash()
-                raw_file_list = get_files(layer.fs_hash)
-                # Fetch file info from cache if exists
-                # else extract and store file info
-                if raw_file_list:
-                    for file_dict in raw_file_list:
-                        file = FileData(file_dict['name'], file_dict['path'])
-                        file.fill(file_dict)
-                        layer.add_file(file)
-                else:
-                    layer.add_files()
                 self._layers.append(layer)
             self.set_layer_created_by()
         except NameError:  # pylint: disable=try-except-raise
tern/extensions/scancode/executor.py (91 changes: 37 additions & 54 deletions)
@@ -16,9 +16,10 @@
 import json
 import logging
+import os

 from tern.analyze.passthrough import get_filesystem_command
-from tern.analyze.passthrough import get_file_command
+from tern.analyze import common
 from tern.classes.notice import Notice
 from tern.classes.file_data import FileData
 from tern.extensions.executor import Executor
@@ -29,10 +30,12 @@
 logger = logging.getLogger(constants.logger_name)


-def analyze_layer(layer_obj):
-    '''Use scancode to analyze the layer's contents. Create file objects
-    and add them to the layer object. Add any Notices to the FileData objects
+def collect_layer_data(layer_obj):
+    '''Use scancode to collect data from a layer filesystem. This function will
+    create a FileData object for every file found. After scanning, it will
+    return a list of FileData objects.
     '''
+    files = []
     # run scancode against a directory
     command = 'scancode -ilpcu --quiet --json -'
     full_cmd = get_filesystem_command(layer_obj, command)
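
collect_layer_data parses scancode's JSON report. For reference, an abbreviated per-file record shown as a Python dict, trimmed to the fields the loop below reads; values are illustrative and the path prefix is a stand-in:

    scancode_file_record = {
        'type': 'file',
        'name': 'os-release',
        'path': 'untar_dir/etc/os-release',  # prefix is trimmed below
        'date': '2020-03-18',
        'file_type': 'ASCII text',
        'licenses': [{'short_name': 'GPL 2.0'}],
        'license_expressions': ['gpl-2.0'],
        'copyrights': [{'value': 'Copyright (c) ...'}],
        'urls': [{'url': 'https://example.com'}],
        'packages': [],
        'authors': [{'value': 'Jane Doe'}],
        'scan_errors': [],
    }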
@@ -48,8 +51,12 @@ def analyze_layer(layer_obj):
         data = json.loads(result)
         for f in data['files']:
             if f['type'] == 'file':
-                fd = FileData(f['name'], f['path'], f['date'], f['file_type'])
-                fd.set_checksum('sha1', f['sha1'])
+                # scancode records paths from the target directory onwards
+                # which in tern's case is tern.utils.constants.untar_dir
+                # removing that portion of the file path
+                fspath = f['path'].replace(
+                    constants.untar_dir + os.path.sep, '')
+                fd = FileData(f['name'], fspath, f['date'], f['file_type'])
                 if f['licenses']:
                     fd.licenses = [l['short_name'] for l in f['licenses']]
                     fd.license_expressions = f['license_expressions']
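
The trimming above is a plain string replace anchored at the start of the path. For illustration, assuming untar_dir were 'mergedir' (a hypothetical value; the real constant is defined in tern/utils/constants.py):

    import os

    untar_dir = 'mergedir'  # hypothetical stand-in for constants.untar_dir
    raw_path = 'mergedir' + os.path.sep + 'etc/os-release'
    clean_path = raw_path.replace(untar_dir + os.path.sep, '')
    assert clean_path == 'etc/os-release'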
@@ -58,52 +65,27 @@
                 if f['urls']:
                     fd.urls = [u['url'] for u in f['urls']]
                 fd.packages = f['packages']
-                fd.authors = f['authors']
+                fd.authors = [a['value'] for a in f['authors']]
                 if f['scan_errors']:
                     # for each scan error make a notice
                     for err in f['scan_errors']:
                         fd.origins.add_notice_to_origins(
                             'File: ' + fd.path, Notice(err, 'error'))
-                # add filedata object to layer
-                layer_obj.add_file(fd)
+                files.append(fd)
+    return files


-def analyze_file(layer_obj):
-    '''Use scancode to analyze files Tern has already found in an image layer.
-    For each file in the layer, run scancode on the file. We assume that we
-    already have the files names, paths and checksums filled out'''
-    # run scancode against each file
-    command = 'scancode -ilpcu --quiet --json -'
-    for fd in layer_obj.files:
-        full_cmd = get_file_command(layer_obj.tar_file, fd, command)
-        origin_file = 'File: ' + fd.path
-        result, error = rootfs.shell_command(True, full_cmd)
-        if not result:
-            logger.error(
-                "No scancode results for this file: %s", str(error))
-            fd.origins.add_notice_to_origins(
-                origin_file, Notice(str(error), 'error'))
-        else:
-            # Fill the results into the FileData object
-            data = json.loads(result)['files'][0]
-            fd.date = data['date']
-            fd.file_type = data['file_type']
-            if data['licenses']:
-                fd.licenses = [l['short_name'] for l in data['licenses']]
-                fd.license_expressions = data['license_expressions']
-            if data['copyrights']:
-                fd.copyrights = [c['value'] for c in data['copyrights']]
-            if data['urls']:
-                fd.urls = [u['url'] for u in data['urls']]
-            fd.packages = data['packages']
-            fd.authors = data['authors']
-            if data['scan_errors']:
-                # for each scan error make a notice
-                for err in data['scan_errors']:
-                    fd.origins.add_notice_to_origins(
-                        origin_file, Notice(err, 'error'))
-            # add filedata object to layer
-            layer_obj.add_file(fd)
+def add_file_data(layer_obj, collected_files):
+    '''Use the file data collected with scancode to fill in the file level
+    data for an ImageLayer object'''
+    # we'll assume that we are merging the collected_files data with
+    # the file level data already in the layer object
+    logger.debug("Collecting file data...")
+    while collected_files:
+        checkfile = collected_files.pop()
+        for f in layer_obj.files:
+            if f.merge(checkfile):
+                break


 class Scancode(Executor):
@@ -113,12 +95,13 @@ def execute(self, image_obj):
         scancode -ilpcu --quiet --json - /path/to/directory
         '''
         for layer in image_obj.layers:
-            layer.files_analyzed = True
-            if layer.files:
-                # If the layer already has files processed, then run
-                # scancode per file
-                analyze_file(layer)
-            else:
-                # If there was no file processing done, scancode will process
-                # them for you
-                analyze_layer(layer)
+            # load the layers from cache
+            common.load_from_cache(layer)
+            if not layer.files_analyzed:
+                # the layer doesn't have analyzed files, so run analysis
+                file_list = collect_layer_data(layer)
+                if file_list:
+                    add_file_data(layer, file_list)
+                layer.files_analyzed = True
+        # save data to the cache
+        common.save_to_cache(image_obj)
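
add_file_data leans on FileData.merge, which this commit does not show. A plausible sketch of the contract it assumes (match on path, absorb the scan results, report success), offered as a guess rather than the actual implementation in tern/classes/file_data.py:

    def merge(self, other):
        '''Hypothetical contract sketch: if other describes the same file,
        copy its scan results into self and return True; else False.'''
        if not isinstance(other, FileData) or self.path != other.path:
            return False
        self.date = other.date
        self.file_type = other.file_type
        self.licenses = other.licenses
        self.license_expressions = other.license_expressions
        self.copyrights = other.copyrights
        self.urls = other.urls
        self.packages = other.packages
        self.authors = other.authors
        return True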
tests/test_class_docker_image.py (2 changes: 2 additions & 0 deletions)
@@ -97,6 +97,8 @@ def testGetLayerDiffIds(self):

     def testLayerFiles(self):
         self.image.load_image()
+        self.assertFalse(self.image.layers[0].files)
+        self.image.layers[0].add_files()
         for file in self.image.layers[0].files:
             self.assertTrue(
                 (file.name, file.path, file.checksum,
