extensions:scancode: Only scan at directory level
Scancode takes a ridiculous amount of time to scan every file in
a full OS. It is much faster at deciding what to scan when it
operates at the directory level, especially since it does its own
file level inventorying. To allow for the best possible user
experience, we will scan only at the directory level. For this to
work, we also need to check whether the files were analyzed
before.

This commit makes the following changes:
1. Remove the file level data collection function.
2. Introduce a collect_layer_data function that collects the
file level data and returns a list of FileData objects.
3. Introduce an add_file_data function that uses the 'merge'
method in FileData to add the collected file level information
to the ImageLayer object.
4. Load previously collected information from the cache and save
newly collected information back to it (the resulting executor
flow is sketched below).
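
A condensed sketch of how these pieces fit together in the
executor; it mirrors the diff below, with comments added here
for illustration:

    # condensed sketch of the new Scancode.execute() loop
    for layer in image_obj.layers:
        # a cache hit can mark the layer's files as already analyzed
        common.load_from_cache(layer)
        if not layer.files_analyzed:
            # one scancode invocation over the whole layer filesystem
            file_list = collect_layer_data(layer)
            if file_list:
                # merge the results into the layer's FileData objects
                add_file_data(layer, file_list)
            layer.files_analyzed = True
    # persist everything that was collected
    common.save_to_cache(image_obj)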

Work towards tern-tools#480

Signed-off-by: Nisha K <[email protected]>
Nisha K committed Mar 16, 2020
1 parent d99461e commit 9dc4f2c
Showing 1 changed file with 37 additions and 54 deletions.
91 changes: 37 additions & 54 deletions tern/extensions/scancode/executor.py
@@ -16,9 +16,10 @@
 
 import json
 import logging
+import os
 
 from tern.analyze.passthrough import get_filesystem_command
-from tern.analyze.passthrough import get_file_command
+from tern.analyze import common
 from tern.classes.notice import Notice
 from tern.classes.file_data import FileData
 from tern.extensions.executor import Executor
@@ -29,10 +30,12 @@
 logger = logging.getLogger(constants.logger_name)
 
 
-def analyze_layer(layer_obj):
-    '''Use scancode to analyze the layer's contents. Create file objects
-    and add them to the layer object. Add any Notices to the FileData objects
+def collect_layer_data(layer_obj):
+    '''Use scancode to collect data from a layer filesystem. This function will
+    create a FileData object for every file found. After scanning, it will
+    return a list of FileData objects.
     '''
+    files = []
     # run scancode against a directory
     command = 'scancode -ilpcu --quiet --json -'
     full_cmd = get_filesystem_command(layer_obj, command)
@@ -48,8 +51,12 @@ def analyze_layer(layer_obj):
         data = json.loads(result)
         for f in data['files']:
             if f['type'] == 'file':
-                fd = FileData(f['name'], f['path'], f['date'], f['file_type'])
-                fd.set_checksum('sha1', f['sha1'])
+                # scancode records paths from the target directory onwards
+                # which in tern's case is tern.utils.constants.untar_dir
+                # removing that portion of the file path
+                fspath = f['path'].replace(
+                    constants.untar_dir + os.path.sep, '')
+                fd = FileData(f['name'], fspath, f['date'], f['file_type'])
                 if f['licenses']:
                     fd.licenses = [l['short_name'] for l in f['licenses']]
                     fd.license_expressions = f['license_expressions']
@@ -58,52 +65,27 @@ def analyze_layer(layer_obj):
                 if f['urls']:
                     fd.urls = [u['url'] for u in f['urls']]
                 fd.packages = f['packages']
-                fd.authors = f['authors']
+                fd.authors = [a['value'] for a in f['authors']]
                 if f['scan_errors']:
                     # for each scan error make a notice
                     for err in f['scan_errors']:
                         fd.origins.add_notice_to_origins(
                             'File: ' + fd.path, Notice(err, 'error'))
-                # add filedata object to layer
-                layer_obj.add_file(fd)
+                files.append(fd)
+    return files
 
 
-def analyze_file(layer_obj):
-    '''Use scancode to analyze files Tern has already found in an image layer.
-    For each file in the layer, run scancode on the file. We assume that we
-    already have the files names, paths and checksums filled out'''
-    # run scancode against each file
-    command = 'scancode -ilpcu --quiet --json -'
-    for fd in layer_obj.files:
-        full_cmd = get_file_command(layer_obj.tar_file, fd, command)
-        origin_file = 'File: ' + fd.path
-        result, error = rootfs.shell_command(True, full_cmd)
-        if not result:
-            logger.error(
-                "No scancode results for this file: %s", str(error))
-            fd.origins.add_notice_to_origins(
-                origin_file, Notice(str(error), 'error'))
-        else:
-            # Fill the results into the FileData object
-            data = json.loads(result)['files'][0]
-            fd.date = data['date']
-            fd.file_type = data['file_type']
-            if data['licenses']:
-                fd.licenses = [l['short_name'] for l in data['licenses']]
-                fd.license_expressions = data['license_expressions']
-            if data['copyrights']:
-                fd.copyrights = [c['value'] for c in data['copyrights']]
-            if data['urls']:
-                fd.urls = [u['url'] for u in data['urls']]
-            fd.packages = data['packages']
-            fd.authors = data['authors']
-            if data['scan_errors']:
-                # for each scan error make a notice
-                for err in data['scan_errors']:
-                    fd.origins.add_notice_to_origins(
-                        origin_file, Notice(err, 'error'))
-            # add filedata object to layer
-            layer_obj.add_file(fd)
+def add_file_data(layer_obj, collected_files):
+    '''Use the file data collected with scancode to fill in the file level
+    data for an ImageLayer object'''
+    # we'll assume that we are merging the collected_files data with
+    # the file level data already in the layer object
+    logger.debug("Collecting file data...")
+    while collected_files:
+        checkfile = collected_files.pop()
+        for f in layer_obj.files:
+            if f.merge(checkfile):
+                break
 
 
 class Scancode(Executor):
@@ -113,12 +95,13 @@ def execute(self, image_obj):
             scancode -ilpcu --quiet --json - /path/to/directory
         '''
        for layer in image_obj.layers:
-            layer.files_analyzed = True
-            if layer.files:
-                # If the layer already has files processed, then run
-                # scancode per file
-                analyze_file(layer)
-            else:
-                # If there was no file processing done, scancode will process
-                # them for you
-                analyze_layer(layer)
+            # load the layers from cache
+            common.load_from_cache(layer)
+            if not layer.files_analyzed:
+                # the layer doesn't have analyzed files, so run analysis
+                file_list = collect_layer_data(layer)
+                if file_list:
+                    add_file_data(layer, file_list)
+                layer.files_analyzed = True
+        # save data to the cache
+        common.save_to_cache(image_obj)
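
add_file_data relies on FileData's merge method, which is defined
outside this diff (FileData is imported from tern.classes.file_data).
A minimal, hypothetical sketch of the contract the pop/break loop
assumes, namely that merge returns True only when both objects
describe the same file, copying the scanned fields over:

    # hypothetical sketch of the FileData.merge contract assumed by
    # add_file_data; attribute names follow the fields populated in
    # collect_layer_data above
    def merge(self, other):
        if not isinstance(other, FileData):
            return False
        if self.path != other.path:  # assumption: files are matched by path
            return False
        # copy the scancode-collected data onto the existing object
        self.date = other.date
        self.file_type = other.file_type
        self.licenses = other.licenses
        self.license_expressions = other.license_expressions
        self.copyrights = other.copyrights
        self.urls = other.urls
        self.packages = other.packages
        self.authors = other.authors
        return True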
