From 9dc4f2cf43254ab048ad79f27921733086de9681 Mon Sep 17 00:00:00 2001
From: Nisha K
Date: Mon, 16 Mar 2020 09:27:28 -0700
Subject: [PATCH] extensions:scancode: Only scan at directory level

Scancode takes a ridiculous amount of time to scan every file in a
full OS. It is much faster at deciding what to scan when operating at
the directory level, especially since it does its own file level
inventorying. To provide the best possible user experience, we will
scan only at the directory level. For this to work, we also need to
check whether the files were analyzed before.

This patch introduces the following changes:
1. Remove the file level data collection function.
2. Introduce a collect_layer_data function which will collect the file
   level data and return a list of FileData objects.
3. Introduce an add_file_data function which will use the 'merge'
   method in FileData to add the collected file level information to
   the ImageLayer object.
4. Load from and save to the cache any information that is collected.

Work towards #480

Signed-off-by: Nisha K
---
 tern/extensions/scancode/executor.py | 91 +++++++++++-----------------
 1 file changed, 37 insertions(+), 54 deletions(-)

diff --git a/tern/extensions/scancode/executor.py b/tern/extensions/scancode/executor.py
index f75677a3..e0c6e934 100644
--- a/tern/extensions/scancode/executor.py
+++ b/tern/extensions/scancode/executor.py
@@ -16,9 +16,10 @@
 
 import json
 import logging
+import os
 
 from tern.analyze.passthrough import get_filesystem_command
-from tern.analyze.passthrough import get_file_command
+from tern.analyze import common
 from tern.classes.notice import Notice
 from tern.classes.file_data import FileData
 from tern.extensions.executor import Executor
@@ -29,10 +30,12 @@
 logger = logging.getLogger(constants.logger_name)
 
 
-def analyze_layer(layer_obj):
-    '''Use scancode to analyze the layer's contents. Create file objects
-    and add them to the layer object. Add any Notices to the FileData objects
+def collect_layer_data(layer_obj):
+    '''Use scancode to collect data from a layer filesystem. This function will
+    create a FileData object for every file found. After scanning, it will
+    return a list of FileData objects.
     '''
+    files = []
     # run scancode against a directory
     command = 'scancode -ilpcu --quiet --json -'
     full_cmd = get_filesystem_command(layer_obj, command)
@@ -48,8 +51,12 @@ def analyze_layer(layer_obj):
         data = json.loads(result)
         for f in data['files']:
             if f['type'] == 'file':
-                fd = FileData(f['name'], f['path'], f['date'], f['file_type'])
-                fd.set_checksum('sha1', f['sha1'])
+                # scancode records paths from the target directory onwards
+                # which in tern's case is tern.utils.constants.untar_dir
+                # removing that portion of the file path
+                fspath = f['path'].replace(
+                    constants.untar_dir + os.path.sep, '')
+                fd = FileData(f['name'], fspath, f['date'], f['file_type'])
                 if f['licenses']:
                     fd.licenses = [l['short_name'] for l in f['licenses']]
                 fd.license_expressions = f['license_expressions']
@@ -58,52 +65,27 @@ def analyze_layer(layer_obj):
                 if f['urls']:
                     fd.urls = [u['url'] for u in f['urls']]
                 fd.packages = f['packages']
-                fd.authors = f['authors']
+                fd.authors = [a['value'] for a in f['authors']]
                 if f['scan_errors']:
                     # for each scan error make a notice
                     for err in f['scan_errors']:
                         fd.origins.add_notice_to_origins(
                             'File: ' + fd.path, Notice(err, 'error'))
-                # add filedata object to layer
-                layer_obj.add_file(fd)
+                files.append(fd)
+    return files
 
 
-def analyze_file(layer_obj):
-    '''Use scancode to analyze files Tern has already found in an image layer.
-    For each file in the layer, run scancode on the file. We assume that we
-    already have the files names, paths and checksums filled out'''
-    # run scancode against each file
-    command = 'scancode -ilpcu --quiet --json -'
-    for fd in layer_obj.files:
-        full_cmd = get_file_command(layer_obj.tar_file, fd, command)
-        origin_file = 'File: ' + fd.path
-        result, error = rootfs.shell_command(True, full_cmd)
-        if not result:
-            logger.error(
-                "No scancode results for this file: %s", str(error))
-            fd.origins.add_notice_to_origins(
-                origin_file, Notice(str(error), 'error'))
-        else:
-            # Fill the results into the FileData object
-            data = json.loads(result)['files'][0]
-            fd.date = data['date']
-            fd.file_type = data['file_type']
-            if data['licenses']:
-                fd.licenses = [l['short_name'] for l in data['licenses']]
-            fd.license_expressions = data['license_expressions']
-            if data['copyrights']:
-                fd.copyrights = [c['value'] for c in data['copyrights']]
-            if data['urls']:
-                fd.urls = [u['url'] for u in data['urls']]
-            fd.packages = data['packages']
-            fd.authors = data['authors']
-            if data['scan_errors']:
-                # for each scan error make a notice
-                for err in data['scan_errors']:
-                    fd.origins.add_notice_to_origins(
-                        origin_file, Notice(err, 'error'))
-            # add filedata object to layer
-            layer_obj.add_file(fd)
+def add_file_data(layer_obj, collected_files):
+    '''Use the file data collected with scancode to fill in the file level
+    data for an ImageLayer object'''
+    # we'll assume that we are merging the collected_files data with
+    # the file level data already in the layer object
+    logger.debug("Collecting file data...")
+    while collected_files:
+        checkfile = collected_files.pop()
+        for f in layer_obj.files:
+            if f.merge(checkfile):
+                break
 
 
 class Scancode(Executor):
@@ -113,12 +95,13 @@ def execute(self, image_obj):
         scancode -ilpcu --quiet --json - /path/to/directory
         '''
         for layer in image_obj.layers:
-            layer.files_analyzed = True
-            if layer.files:
-                # If the layer already has files processed, then run
-                # scancode per file
-                analyze_file(layer)
-            else:
-                # If there was no file processing done, scancode will process
-                # them for you
-                analyze_layer(layer)
+            # load the layers from cache
+            common.load_from_cache(layer)
+            if not layer.files_analyzed:
+                # the layer doesn't have analyzed files, so run analysis
+                file_list = collect_layer_data(layer)
+                if file_list:
+                    add_file_data(layer, file_list)
+                    layer.files_analyzed = True
+        # save data to the cache
+        common.save_to_cache(image_obj)
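
Note (not part of the patch to apply): add_file_data relies on FileData.merge
returning a truthy value when a collected record refers to a file already
present in the layer, so that the scancode results are folded into the
existing file entries. The snippet below is a minimal, self-contained sketch
of that match-and-merge pattern; SimpleFileData, its fields, and the example
paths are hypothetical stand-ins and do not reflect tern's actual FileData
implementation.

# Sketch of the merge-based matching that add_file_data depends on.
# SimpleFileData is a hypothetical stand-in, not tern's FileData class.
class SimpleFileData:
    def __init__(self, name, path, licenses=None):
        self.name = name
        self.path = path
        self.licenses = licenses or []

    def merge(self, other):
        # Assume two records describe the same file when their paths match;
        # copy over any collected license data and report success.
        if isinstance(other, SimpleFileData) and other.path == self.path:
            self.licenses = sorted(set(self.licenses) | set(other.licenses))
            return True
        return False


def add_file_data(layer_files, collected_files):
    # Same shape as the helper in the patch: pop each collected record and
    # merge it into the first matching file entry in the layer.
    while collected_files:
        checkfile = collected_files.pop()
        for f in layer_files:
            if f.merge(checkfile):
                break


if __name__ == '__main__':
    layer_files = [SimpleFileData('libc.so.6', 'lib/libc.so.6')]
    collected = [SimpleFileData('libc.so.6', 'lib/libc.so.6',
                                licenses=['LGPL-2.1'])]
    add_file_data(layer_files, collected)
    print(layer_files[0].licenses)  # ['LGPL-2.1']

Because the inner loop breaks after the first successful merge, the helper
implicitly assumes that each collected path matches at most one file entry
in a layer.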