From 12ba2c5d52abea8801ec6b6645fe7176b3c0df6b Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Wed, 18 Jul 2018 15:49:19 +0200 Subject: [PATCH] Add CLI option to detect generated code #377 Signed-off-by: Philippe Ombredanne --- setup.py | 1 + src/summarycode/generated.py | 76 ++++++++++++++++++- tests/scancode/data/help/help.txt | 1 + .../data/generated/cli.expected.json | 59 ++++++++++++++ tests/summarycode/test_generated.py | 29 ++++--- 5 files changed, 153 insertions(+), 13 deletions(-) create mode 100644 tests/summarycode/data/generated/cli.expected.json diff --git a/setup.py b/setup.py index 1bf7082525c..ab6b67e2b89 100644 --- a/setup.py +++ b/setup.py @@ -231,6 +231,7 @@ def read(*names, **kwargs): 'packages = scancode.plugin_package:PackageManifestScanner', 'emails = scancode.plugin_email:EmailScanner', 'urls = scancode.plugin_url:UrlScanner', + 'generated = summarycode.generated:GeneratedCodeDetector', ], # scancode_post_scan is the entry point for post_scan plugins executed diff --git a/src/summarycode/generated.py b/src/summarycode/generated.py index 0599dfde781..fbb3bb9eb53 100644 --- a/src/summarycode/generated.py +++ b/src/summarycode/generated.py @@ -27,12 +27,79 @@ from __future__ import print_function from __future__ import unicode_literals +from collections import OrderedDict from itertools import islice +from commoncode.datautils import Boolean +from commoncode.text import toascii +from plugincode.scan import ScanPlugin +from plugincode.scan import scan_impl +from scancode import CommandLineOption +from scancode import OTHER_SCAN_GROUP import typecode.contenttype -import commoncode.text -generated_keywords = ( +""" +Tag files as generated. +""" + +# Tracing flag +TRACE = False + + +def logger_debug(*args): + pass + + +if TRACE: + import logging + import sys + + logger = logging.getLogger(__name__) + logging.basicConfig(stream=sys.stdout) + logger.setLevel(logging.DEBUG) + + def logger_debug(*args): + return logger.debug(' '.join(isinstance(a, unicode) and a or repr(a) for a in args)) + + +@scan_impl +class GeneratedCodeDetector(ScanPlugin): + """ + Tag a file as generated. + """ + attributes = OrderedDict([ + ('is_generated', + Boolean(help='True if this file is likely an automatically generated file.')), + ]) + + sort_order = 50 + + options = [ + CommandLineOption(('--generated',), + is_flag=True, default=False, + help='Classify automatically generated code files with a flag.', + help_group=OTHER_SCAN_GROUP, + sort_order=50, + ) + ] + + def is_enabled(self, generated, **kwargs): + return generated + + def get_scanner(self, **kwargs): + return generated_scanner + + +def generated_scanner(location, **kwargs): + is_generated = False + for _clue in get_generated_code_hint(location): + # TODO: consider returning the "clue" + is_generated = True + break + return dict(is_generated=is_generated) + + +GENERATED_KEYWORDS = ( 'generated by', 'auto-generated', 'automatically generated', @@ -83,7 +150,7 @@ max_lines = 150 -def generated_code(location): +def get_generated_code_hint(location, generated_keywords=GENERATED_KEYWORDS): """ Return a line of extracted text from a file if that file is likely generated source code. @@ -97,8 +164,9 @@ def generated_code(location): return with open(location, 'rb') as filein: for line in islice(filein, max_lines): - text = commoncode.text.toascii(line.strip()) + text = toascii(line.strip()) textl = text.lower() if any(kw in textl for kw in generated_keywords): # yield only the first 100 chars.. yield text[:100] + diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index 0bb7ae04f33..61d94ecb43b 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -14,6 +14,7 @@ Options: other scans: -i, --info Scan for file information (size, checksums, etc). + --generated Classify automatically generated code files with a flag. -e, --email Scan for emails. -u, --url Scan for urls. diff --git a/tests/summarycode/data/generated/cli.expected.json b/tests/summarycode/data/generated/cli.expected.json new file mode 100644 index 00000000000..7a5f8461b5f --- /dev/null +++ b/tests/summarycode/data/generated/cli.expected.json @@ -0,0 +1,59 @@ +{ + "scancode_notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", + "scancode_options": { + "input": "", + "--generated": true, + "--json-pp": "" + }, + "files_count": 7, + "files": [ + { + "path": "simple", + "type": "directory", + "is_generated": false, + "scan_errors": [] + }, + { + "path": "simple/configure", + "type": "file", + "is_generated": true, + "scan_errors": [] + }, + { + "path": "simple/generated_1.java", + "type": "file", + "is_generated": true, + "scan_errors": [] + }, + { + "path": "simple/generated_2.java", + "type": "file", + "is_generated": true, + "scan_errors": [] + }, + { + "path": "simple/generated_3.java", + "type": "file", + "is_generated": true, + "scan_errors": [] + }, + { + "path": "simple/generated_4.java", + "type": "file", + "is_generated": true, + "scan_errors": [] + }, + { + "path": "simple/generated_5.java", + "type": "file", + "is_generated": true, + "scan_errors": [] + }, + { + "path": "simple/generated_6.c", + "type": "file", + "is_generated": true, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/summarycode/test_generated.py b/tests/summarycode/test_generated.py index 070d943666b..0551adeee05 100644 --- a/tests/summarycode/test_generated.py +++ b/tests/summarycode/test_generated.py @@ -22,11 +22,14 @@ # ScanCode is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode-toolkit/ for support and download. -from __future__ import absolute_import, print_function +from __future__ import absolute_import +from __future__ import unicode_literals import os from commoncode.testcase import FileBasedTesting +from scancode.cli_test_utils import check_json_scan +from scancode.cli_test_utils import run_scan_click from summarycode import generated @@ -39,22 +42,22 @@ def test_basic(self): 'for XML Binding(JAXB) Reference Implementation' ] test_file = self.get_test_loc('generated/simple/generated_1.java') - result = list(generated.generated_code(location=test_file)) + result = list(generated.get_generated_code_hint(location=test_file)) assert expected == result test_file = self.get_test_loc('generated/simple/generated_3.java') - result = list(generated.generated_code(location=test_file)) + result = list(generated.get_generated_code_hint(location=test_file)) assert expected == result def test_basic2(self): expected = ['* This class was generated by the JAX-WS RI.'] test_file = self.get_test_loc('generated/simple/generated_2.java') - result = list(generated.generated_code(location=test_file)) + result = list(generated.get_generated_code_hint(location=test_file)) assert expected == result def test_basic3(self): expected = ['/* This class was automatically generated'] test_file = self.get_test_loc('generated/simple/generated_4.java') - result = list(generated.generated_code(location=test_file)) + result = list(generated.get_generated_code_hint(location=test_file)) assert expected == result def test_basic4(self): @@ -63,13 +66,13 @@ def test_basic4(self): 'expected content contained within this class.' ] test_file = self.get_test_loc('generated/simple/generated_5.java') - result = list(generated.generated_code(location=test_file)) + result = list(generated.get_generated_code_hint(location=test_file)) assert expected == result def test_basic5(self): expected = ['/* DO NOT EDIT THIS FILE - it is machine generated */'] test_file = self.get_test_loc('generated/simple/generated_6.c') - result = list(generated.generated_code(location=test_file)) + result = list(generated.get_generated_code_hint(location=test_file)) assert expected == result def test_configure(self): @@ -77,7 +80,7 @@ def test_configure(self): '# Generated by GNU Autoconf 2.64 for Apache CouchDB 1.0.1.' ] test_file = self.get_test_loc('generated/simple/configure') - result = list(generated.generated_code(location=test_file)) + result = list(generated.get_generated_code_hint(location=test_file)) assert expected == result def test_tomcat_jspc(self): @@ -85,5 +88,13 @@ def test_tomcat_jspc(self): '