merge: Integrate file level data from scancode
This resolves #480

The first commit modifies cache loading and saving to cover
file level data.
The second commit modifies the scancode executor to leverage
file level data caching and collection for an image.

Signed-off-by: Nisha K [email protected]
Authored by Nisha K on Mar 18, 2020
Merge commit aa87d3e (2 parents: 2c216e5 + 9dc4f2c)
Showing 4 changed files with 103 additions and 80 deletions.
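
Throughout this diff, the cache is keyed by each layer's filesystem hash. For orientation, here is a minimal sketch of one cached entry, with field names inferred from the accessors used below (cache.get_packages, cache.get_files, cache.cache[fs_hash]['files_analyzed']); the authoritative schema lives in tern/utils/cache.py and may differ:

    # Illustrative sketch only; names inferred from this diff, values made up
    cache_contents = {
        '<layer fs_hash>': {
            'files_analyzed': True,
            'packages': [{
                'name': 'musl',
                'origins': [{
                    'origin_str': 'command_lib',
                    'notices': [{'message': '...', 'level': 'warning'}],
                }],
            }],
            'files': [{
                'name': 'os-release',
                'path': 'etc/os-release',
            }],
        },
    }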
tern/analyze/common.py (78 changes: 64 additions & 14 deletions)
@@ -11,6 +11,7 @@
 import os

 from tern.classes.package import Package
+from tern.classes.file_data import FileData
 from tern.classes.notice import Notice
 from tern.classes.command import Command
 from tern.command_lib import command_lib
@@ -37,22 +38,71 @@ def get_shell_commands(shell_command_line):

 def load_from_cache(layer, redo=False):
     '''Given a layer object, check against cache to see if that layer id exists
-    if yes then get the package list and load it in the layer and return true.
-    If it doesn't exist return false. Default operation is to not redo the
-    cache. Add notices to the layer's origins matching the origin_str'''
+    if yes then load any relevant layer level information. The default
+    operation is to not redo the cache. Add notices to the layer's origins
+    matching the origin_str'''
     loaded = False
-    if not layer.packages and not redo:
-        # there are no packages in this layer and we are not repopulating the
-        # cache, try to get it from the cache
-        raw_pkg_list = cache.get_packages(layer.fs_hash)
-        if raw_pkg_list:
-            logger.debug('Loaded from cache: layer \"%s\"', layer.fs_hash[:10])
-            for pkg_dict in raw_pkg_list:
-                pkg = Package(pkg_dict['name'])
-                pkg.fill(pkg_dict)
-                layer.add_package(pkg)
+    if not redo:
+        # check if packages are available in the cache
+        if load_packages_from_cache(layer):
+            loaded = True
+        # check if files are available in the cache
+        if load_files_from_cache(layer):
+            loaded = True
+        # load some extra properties into the layer if available
+        if layer.fs_hash in cache.get_layers():
+            layer.files_analyzed = cache.cache[layer.fs_hash]['files_analyzed']
+    # load any origin data
+    load_notices_from_cache(layer)
     return loaded
+
+
+def load_packages_from_cache(layer):
+    '''Given a layer object, populate package level information'''
+    loaded = False
+    raw_pkg_list = cache.get_packages(layer.fs_hash)
+    if raw_pkg_list:
+        logger.debug(
+            'Loading packages from cache: layer \"%s\"', layer.fs_hash[:10])
+        for pkg_dict in raw_pkg_list:
+            pkg = Package(pkg_dict['name'])
+            pkg.fill(pkg_dict)
+            # collect package origins
+            if 'origins' in pkg_dict.keys():
+                for origin_dict in pkg_dict['origins']:
+                    for notice in origin_dict['notices']:
+                        pkg.origins.add_notice_to_origins(
+                            origin_dict['origin_str'], Notice(
+                                notice['message'], notice['level']))
+            layer.add_package(pkg)
+        loaded = True
+    return loaded
+
+
+def load_files_from_cache(layer):
+    '''Given a layer object, populate file level information'''
+    loaded = False
+    raw_file_list = cache.get_files(layer.fs_hash)
+    if raw_file_list:
+        logger.debug(
+            'Loading files from cache: layer \"%s\"', layer.fs_hash[:10])
+        for file_dict in raw_file_list:
+            f = FileData(file_dict['name'], file_dict['path'])
+            f.fill(file_dict)
+            # collect file origins
+            if 'origins' in file_dict.keys():
+                for origin_dict in file_dict['origins']:
+                    for notice in origin_dict['notices']:
+                        f.origins.add_notice_to_origins(
+                            origin_dict['origin_str'], Notice(
+                                notice['message'], notice['level']))
+            layer.add_file(f)
+        loaded = True
+    else:
+        # if there are no files, generate them from the pre-calculated
+        # hash file
+        logger.debug('Reading files in filesystem...')
+        layer.add_files()
+    return loaded


@@ -70,7 +120,7 @@ def load_notices_from_cache(layer):
 def save_to_cache(image):
     '''Given an image object, save all layers to the cache'''
     for layer in image.layers:
-        if layer.packages:
+        if layer.packages or layer.files_analyzed:
             cache.add_layer(layer)
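
With the changes above, a cache-aware analysis pass over an image reduces to a load-or-analyze loop followed by one save. A hedged sketch (not code from this commit; analyze_layer_contents is a hypothetical stand-in for whichever analyzer runs):

    from tern.analyze import common

    for layer in image.layers:  # image: a previously loaded Image object
        if not common.load_from_cache(layer):
            analyze_layer_contents(layer)  # hypothetical analyzer
            layer.files_analyzed = True
    # persist package and file level data for the next run
    common.save_to_cache(image)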
tern/classes/docker_image.py (12 changes: 0 additions & 12 deletions)
@@ -7,9 +7,7 @@
 import os
 import subprocess  # nosec

-from tern.classes.file_data import FileData
 from tern.utils import rootfs
-from tern.utils.cache import get_files
 from tern.utils.general import pushd
 from tern.utils.constants import manifest_file
 from tern.analyze.docker.container import extract_image_metadata
@@ -161,16 +159,6 @@ def load_image(self):
             while layer_diffs and layer_paths:
                 layer = ImageLayer(layer_diffs.pop(0), layer_paths.pop(0))
                 layer.gen_fs_hash()
-                raw_file_list = get_files(layer.fs_hash)
-                # Fetch file info from cache if exists
-                # else extract and store file info
-                if raw_file_list:
-                    for file_dict in raw_file_list:
-                        file = FileData(file_dict['name'], file_dict['path'])
-                        file.fill(file_dict)
-                        layer.add_file(file)
-                else:
-                    layer.add_files()
                 self._layers.append(layer)
             self.set_layer_created_by()
         except NameError:  # pylint: disable=try-except-raise
tern/extensions/scancode/executor.py (91 changes: 37 additions & 54 deletions)
@@ -16,9 +16,10 @@
 import json
 import logging
+import os

 from tern.analyze.passthrough import get_filesystem_command
-from tern.analyze.passthrough import get_file_command
+from tern.analyze import common
 from tern.classes.notice import Notice
 from tern.classes.file_data import FileData
 from tern.extensions.executor import Executor
@@ -29,10 +30,12 @@
 logger = logging.getLogger(constants.logger_name)


-def analyze_layer(layer_obj):
-    '''Use scancode to analyze the layer's contents. Create file objects
-    and add them to the layer object. Add any Notices to the FileData objects
+def collect_layer_data(layer_obj):
+    '''Use scancode to collect data from a layer filesystem. This function will
+    create a FileData object for every file found. After scanning, it will
+    return a list of FileData objects.
     '''
+    files = []
     # run scancode against a directory
     command = 'scancode -ilpcu --quiet --json -'
     full_cmd = get_filesystem_command(layer_obj, command)
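
collect_layer_data parses scancode's JSON report. For reference, an abbreviated per-file record shown as a Python dict, trimmed to the fields the loop below reads; values are illustrative and the path prefix is a stand-in:

    scancode_file_record = {
        'type': 'file',
        'name': 'os-release',
        'path': 'untar_dir/etc/os-release',  # prefix is trimmed below
        'date': '2020-03-18',
        'file_type': 'ASCII text',
        'licenses': [{'short_name': 'GPL 2.0'}],
        'license_expressions': ['gpl-2.0'],
        'copyrights': [{'value': 'Copyright (c) ...'}],
        'urls': [{'url': 'https://example.com'}],
        'packages': [],
        'authors': [{'value': 'Jane Doe'}],
        'scan_errors': [],
    }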
@@ -48,8 +51,12 @@ def analyze_layer(layer_obj):
         data = json.loads(result)
         for f in data['files']:
             if f['type'] == 'file':
-                fd = FileData(f['name'], f['path'], f['date'], f['file_type'])
-                fd.set_checksum('sha1', f['sha1'])
+                # scancode records paths from the target directory onwards
+                # which in tern's case is tern.utils.constants.untar_dir
+                # removing that portion of the file path
+                fspath = f['path'].replace(
+                    constants.untar_dir + os.path.sep, '')
+                fd = FileData(f['name'], fspath, f['date'], f['file_type'])
                 if f['licenses']:
                     fd.licenses = [l['short_name'] for l in f['licenses']]
                     fd.license_expressions = f['license_expressions']
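
The trimming above is a plain string replace anchored at the start of the path. For illustration, assuming untar_dir were 'mergedir' (a hypothetical value; the real constant is defined in tern/utils/constants.py):

    import os

    untar_dir = 'mergedir'  # hypothetical stand-in for constants.untar_dir
    raw_path = 'mergedir' + os.path.sep + 'etc/os-release'
    clean_path = raw_path.replace(untar_dir + os.path.sep, '')
    assert clean_path == 'etc/os-release'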
@@ -58,52 +65,27 @@
                 if f['urls']:
                     fd.urls = [u['url'] for u in f['urls']]
                 fd.packages = f['packages']
-                fd.authors = f['authors']
+                fd.authors = [a['value'] for a in f['authors']]
                 if f['scan_errors']:
                     # for each scan error make a notice
                     for err in f['scan_errors']:
                         fd.origins.add_notice_to_origins(
                             'File: ' + fd.path, Notice(err, 'error'))
-                # add filedata object to layer
-                layer_obj.add_file(fd)
+                files.append(fd)
+    return files


-def analyze_file(layer_obj):
-    '''Use scancode to analyze files Tern has already found in an image layer.
-    For each file in the layer, run scancode on the file. We assume that we
-    already have the files names, paths and checksums filled out'''
-    # run scancode against each file
-    command = 'scancode -ilpcu --quiet --json -'
-    for fd in layer_obj.files:
-        full_cmd = get_file_command(layer_obj.tar_file, fd, command)
-        origin_file = 'File: ' + fd.path
-        result, error = rootfs.shell_command(True, full_cmd)
-        if not result:
-            logger.error(
-                "No scancode results for this file: %s", str(error))
-            fd.origins.add_notice_to_origins(
-                origin_file, Notice(str(error), 'error'))
-        else:
-            # Fill the results into the FileData object
-            data = json.loads(result)['files'][0]
-            fd.date = data['date']
-            fd.file_type = data['file_type']
-            if data['licenses']:
-                fd.licenses = [l['short_name'] for l in data['licenses']]
-                fd.license_expressions = data['license_expressions']
-            if data['copyrights']:
-                fd.copyrights = [c['value'] for c in data['copyrights']]
-            if data['urls']:
-                fd.urls = [u['url'] for u in data['urls']]
-            fd.packages = data['packages']
-            fd.authors = data['authors']
-            if data['scan_errors']:
-                # for each scan error make a notice
-                for err in data['scan_errors']:
-                    fd.origins.add_notice_to_origins(
-                        origin_file, Notice(err, 'error'))
-            # add filedata object to layer
-            layer_obj.add_file(fd)
+def add_file_data(layer_obj, collected_files):
+    '''Use the file data collected with scancode to fill in the file level
+    data for an ImageLayer object'''
+    # we'll assume that we are merging the collected_files data with
+    # the file level data already in the layer object
+    logger.debug("Collecting file data...")
+    while collected_files:
+        checkfile = collected_files.pop()
+        for f in layer_obj.files:
+            if f.merge(checkfile):
+                break


 class Scancode(Executor):
@@ -113,12 +95,13 @@ def execute(self, image_obj):
         scancode -ilpcu --quiet --json - /path/to/directory
         '''
         for layer in image_obj.layers:
-            layer.files_analyzed = True
-            if layer.files:
-                # If the layer already has files processed, then run
-                # scancode per file
-                analyze_file(layer)
-            else:
-                # If there was no file processing done, scancode will process
-                # them for you
-                analyze_layer(layer)
+            # load the layers from cache
+            common.load_from_cache(layer)
+            if not layer.files_analyzed:
+                # the layer doesn't have analyzed files, so run analysis
+                file_list = collect_layer_data(layer)
+                if file_list:
+                    add_file_data(layer, file_list)
+                layer.files_analyzed = True
+        # save data to the cache
+        common.save_to_cache(image_obj)
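
add_file_data leans on FileData.merge, which this commit does not show. A plausible sketch of the contract it assumes (match on path, absorb the scan results, report success), offered as a guess rather than the actual implementation in tern/classes/file_data.py:

    def merge(self, other):
        '''Hypothetical contract sketch: if other describes the same file,
        copy its scan results into self and return True; else False.'''
        if not isinstance(other, FileData) or self.path != other.path:
            return False
        self.date = other.date
        self.file_type = other.file_type
        self.licenses = other.licenses
        self.license_expressions = other.license_expressions
        self.copyrights = other.copyrights
        self.urls = other.urls
        self.packages = other.packages
        self.authors = other.authors
        return True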
tests/test_class_docker_image.py (2 changes: 2 additions & 0 deletions)
@@ -97,6 +97,8 @@ def testGetLayerDiffIds(self):

     def testLayerFiles(self):
         self.image.load_image()
+        self.assertFalse(self.image.layers[0].files)
+        self.image.layers[0].add_files()
         for file in self.image.layers[0].files:
             self.assertTrue(
                 (file.name, file.path, file.checksum,
