diff --git a/CHANGELOG.md b/CHANGELOG.md index 7dcbafb..1d5c774 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # CHANGELOG +## v0.1.10 +Resolved [#7](https://github.com/SamuraiAku/PkgToSoftwareBOM.jl/issues/7), Fill in Declared License field in SBOM +* Uses LicenseCheck.jl to scan packages and artifacts for license files and licenses embedded in source files. +* Also fill in package field License Info From Files. + ## v0.1.9 Update SPDX package compatibility to v0.4. This update enables the following: * Updates the algorithm for computing the package verification code to a hopefully correct implementation. diff --git a/Project.toml b/Project.toml index 97b6e8d..6cbf26b 100644 --- a/Project.toml +++ b/Project.toml @@ -10,11 +10,14 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" RegistryInstances = "2792f1a3-b283-48e8-9a74-f99dce5104f3" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +LicenseCheck = "726dbf0d-6eb6-41af-b36c-cd770e0f00cc" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" [compat] SPDX = "0.4" RegistryInstances = "0.1" Reexport = "1" +LicenseCheck = "0.2" julia = "1.8" [extras] diff --git a/README.md b/README.md index 4b30e15..d47eebd 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,15 @@ PkgToSoftwareBOM interfaces with the standard library Pkg to fill in the SBOM da - versions in use - where the package can be downloaded from - SPDX verification code + - determines the declared license and scans all source files for additional licenses present - A complete artifact list - artifact version resolved to the target platform - target platform may be changed by an advanced user - where the artifact can be downloaded from - download checksum + - determines the declared license and scans all source files for additional licenses present -Future versions may be able to fill in additional fields including copyright text and software license. 
+Future versions may be able to fill in additional fields including copyright text. PkgToSoftwareBOM defaults to using the General registry but can use other registries and even mutiple registries as the source(s) of package information. @@ -61,6 +63,15 @@ There are two use cases envisioned: - Users: Create an SBOM of your current environment. Submit this file to your organization - Developers: Create an SBOM to be included with your package source code. This becomes your official declaration of what your package dependencies, copyright, license, and download location. +### !!! Important note about stability and license scanning !!! +PkgToSoftwareBOM uses [LicenseCheck.jl](https://github.com/ericphanson/LicenseCheck.jl) to scan package and artifact directories for license file information. LicenseCheck has been known to occasionally crash when run on Apple Silicon; see [Issue #11](https://github.com/ericphanson/LicenseCheck.jl/issues/11). I have observed it happening every time when run within VSCode with the julia-vscode extension. There are some early indications this issue may be resolved in Julia 1.11 when it is released, but it is not certain yet. + +If you wish to disable license scanning for stability reasons, use the keyword `licenseScan` when creating a `spdxCreationData` object (see examples below). + +```julia +spdxCreationData(licenseScan= false) +``` + ### User Environment SBOM To create an SBOM of your entire environment type: @@ -190,6 +201,16 @@ sbom= generateSPDX(spdxCreationData(), ["PrivateRegistry", "General"]); The second argument is a list of all the registries you would like to use. 
If you have a package that exists in both registries (for example, you've cloned the respository to your local network and you want to list that as the download location), PkgToSoftwareBOM will use the information from the first registry in the list that has valid information and ignore all subsequent registries +## How does PkgToSoftwareBOM determine what the license of the package or artifact is? +PkgToSoftwareBOM scans the entire Julia package or artifact for license information. If the scan locates a file containing a recognized software license, the license is recorded in the `LicenseInfoFromFiles` property of the SBOM package description, but the file(s) in which the license was found are not recorded. The license scan follows these rules (LicenseCheck.jl, version 0.2.2): +- All plaintext files less than 450 KB are scanned + +During that search, PkgToSoftwareBOM looks for an overall package license in the following locations: +- For Julia packages, in the package root directory +- For artifacts, in the root directory and in the directory `share/licenses` + +If files with a valid license are found in the expected location, PkgToSoftwareBOM declares the license from the file in which the license text takes up the greatest percentage of the file to be the package license, as you would expect a package license file to contain only the license text and nothing else. + ## How does PkgToSoftwareBOM target hardware platforms other than the one it is running on? Advanced users may wish to create an SBOM in which the artifacts are targeted to a different platform than the one that PkgToSoftwareBOM is running on. For example, create an SBOM for an x86 linux installation from an M1 Macbook. 
diff --git a/src/PkgToSoftwareBOM.jl b/src/PkgToSoftwareBOM.jl index 4474132..e7b9bb2 100644 --- a/src/PkgToSoftwareBOM.jl +++ b/src/PkgToSoftwareBOM.jl @@ -5,10 +5,12 @@ module PkgToSoftwareBOM using Pkg using UUIDs using Reexport +using LicenseCheck @reexport using SPDX using Artifacts using RegistryInstances using Base.BinaryPlatforms +using Logging export spdxCreationData, spdxPackageInstructions @@ -36,7 +38,7 @@ Base.@kwdef struct spdxPackageInstructions excluded_dirs::Vector{String}= String[".git"] excluded_patterns::Vector{Regex}= Regex[] originator::SpdxCreatorV2= SpdxCreatorV2("NOASSERTION") - declaredLicense= SpdxLicenseExpressionV2("NOASSERTION") + declaredLicense::Union{SpdxSimpleLicenseExpressionV2, SpdxComplexLicenseExpressionV2}= SpdxLicenseExpressionV2("NOASSERTION") copyright::String= "NOASSERTION" end @@ -47,6 +49,7 @@ Base.@kwdef struct spdxPackageData packagesinsbom::Set{UUID}= Set{UUID}() packageInstructions::Dict{UUID, spdxPackageInstructions} artifactsinsbom::Set{String}= Set{String}() + licenseScan::Bool end # TODO: When abandoning julia 1.8 compatibility, update the default Creator below to include the package version @@ -60,6 +63,7 @@ Base.@kwdef struct spdxCreationData DocumentComment::Union{AbstractString, Missing}= missing rootpackages::Dict{String, Base.UUID}= Pkg.project().dependencies packageInstructions::Dict{UUID, spdxPackageInstructions}= Dict{UUID, spdxPackageInstructions}() + licenseScan::Bool= true end include("Registry.jl") diff --git a/src/Registry.jl b/src/Registry.jl index b5d101f..403c0f9 100644 --- a/src/Registry.jl +++ b/src/Registry.jl @@ -1,5 +1,6 @@ # SPDX-License-Identifier: MIT +############################### # Think of a name that would be good fit for the Pkg API function registry_packagequery(packages::Dict{UUID, Pkg.API.PackageInfo}, registries::Vector{<:AbstractString}) if length(registries) == 1 @@ -17,6 +18,7 @@ function registry_packagequery(packages::Dict{UUID, Pkg.API.PackageInfo}, regist return 
registry_pkg end +############################### function _registry_packagequery(packages::Dict{UUID, Pkg.API.PackageInfo}, registry::AbstractString) #Get the requested registry active_regs= reachable_registries() @@ -38,6 +40,7 @@ function _registry_packagequery(packages::Dict{UUID, Pkg.API.PackageInfo}, regis return registry_pkg end +############################### function populate_registryinfo(uuid::UUID, package::Pkg.API.PackageInfo, registry::RegistryInstance) package.is_tracking_repo && return nothing is_stdlib(uuid) && return nothing diff --git a/src/packageInfo.jl b/src/packageInfo.jl index aa233fb..e64e243 100644 --- a/src/packageInfo.jl +++ b/src/packageInfo.jl @@ -1,5 +1,6 @@ # SPDX-License-Identifier: MIT +############################### function resolve_pkgsource!(package::SpdxPackageV2, packagedata::Pkg.API.PackageInfo, registrydata::Union{Nothing, Missing, PackageRegistryInfo}) # The location of the SPDX package's source code depend on whether Pkg is tracking the package via: # 1) A package registry @@ -42,7 +43,7 @@ function resolve_pkgsource!(package::SpdxPackageV2, packagedata::Pkg.API.Package return nothing end - +############################### function resolve_pkgsource!(package::SpdxPackageV2, artifact::Dict{String, Any}) platform_keys= setdiff(keys(artifact), Set(["download", "git-tree-sha1", "lazy"])) if length(platform_keys) > 0 @@ -82,4 +83,86 @@ function resolve_pkgsource!(package::SpdxPackageV2, artifact::Dict{String, Any}) package.HomePage= "NOASSERTION" return nothing +end + +############################### +function resolve_pkglicense!(package::SpdxPackageV2, packagepath::AbstractString, packageInstructions, licenseScan::Bool) + package.LicenseConcluded= SpdxLicenseExpressionV2("NOASSERTION") + + if ismissing(packageInstructions) + if false == licenseScan + package.LicenseDeclared= SpdxLicenseExpressionV2("NOASSERTION") + else + scanresults= scan_for_licenses(packagepath) # Returns an array of found license files in top level of 
packagepath with scanner results + if isempty(scanresults) + package.LicenseDeclared= SpdxLicenseExpressionV2("NOASSERTION") + else + # If multiple licenses exist, pick the first one as declared and log the rest + # As long as it exists at the top of the pkg + if splitdir(scanresults[1].license_filename)[1] == packagepath + package.LicenseDeclared= SpdxLicenseExpressionV2(scanresults[1].licenses_found[1]) + @logmsg Logging.LogLevel(-50) "Declared License:" LicenseDeclared= package.LicenseDeclared LicenseFile= scanresults[1].license_filename + else + package.LicenseDeclared= SpdxLicenseExpressionV2("NOASSERTION") + @logmsg Logging.LogLevel(-50) "Declared License cannot be determined" + end + package.LicenseInfoFromFiles= [SpdxLicenseExpressionV2(license) for f in scanresults for license in f.licenses_found] + @logmsg Logging.LogLevel(-75) "License data found in:" licenselist= [(a.license_filename, a.licenses_found) for a in scanresults] + package.LicenseInfoFromFiles= unique(package.LicenseInfoFromFiles) # Remove duplicates + end + end + else + package.LicenseDeclared= packageInstructions.declaredLicense + end +end + +############################### +function resolve_pkglicense!(package::SpdxPackageV2, artifact::Dict{String, Any}, licenseScan::Bool) + package.LicenseConcluded= SpdxLicenseExpressionV2("NOASSERTION") + + if false == licenseScan + package.LicenseDeclared= SpdxLicenseExpressionV2("NOASSERTION") + else + artifact_src= artifact_path(Base.SHA1(artifact["git-tree-sha1"])) + scanresults= scan_for_licenses(artifact_src) + if isempty(scanresults) + package.LicenseDeclared= SpdxLicenseExpressionV2("NOASSERTION") + else + # If multiple licenses exist, pick the first one at the top or the first one in the share/licenses directory + declared_licenses= filter(lic -> contains(splitdir(lic.license_filename)[1], joinpath(artifact_src, "share", "licenses")) + || (splitdir(lic.license_filename)[1] == artifact_src) + ,scanresults) + if isempty(declared_licenses) + 
package.LicenseDeclared= SpdxLicenseExpressionV2("NOASSERTION") + @logmsg Logging.LogLevel(-50) "Declared License cannot be determined" + else + package.LicenseDeclared= SpdxLicenseExpressionV2(declared_licenses[1].licenses_found[1]) + @logmsg Logging.LogLevel(-50) "Declared License:" LicenseDeclared= package.LicenseDeclared LicenseFile= declared_licenses[1].license_filename + end + package.LicenseInfoFromFiles= [SpdxLicenseExpressionV2(license) for f in scanresults for license in f.licenses_found] + @logmsg Logging.LogLevel(-75) "License data found in:" licenselist= [(a.license_filename, a.licenses_found) for a in scanresults] + package.LicenseInfoFromFiles= unique(package.LicenseInfoFromFiles) # Remove duplicates + end + end +end + +############################### +function scan_for_licenses(dir::AbstractString) + licenses_list= Vector{NamedTuple{(:license_filename, :licenses_found, :license_file_percent_covered), Tuple{String, Vector{String}, Float64}}}() + for dirdata in walkdir(dir) + root= dirdata[1] + files= dirdata[3] + files= [f for f in files if isfile(joinpath(root, f))] # Remove anything that isn't an actual file, i.e. a broken symlink or symlinks to directories + licenses_found= find_licenses_by_bruteforce(root; files=files) + + # If not empty, rebuild the licenses_found list with the complete path + licenses_fullpath= typeof(licenses_list)() + for lic in licenses_found + push!(licenses_fullpath, (license_filename= joinpath(root, lic.license_filename), licenses_found= lic.licenses_found, license_file_percent_covered= lic.license_file_percent_covered)) + end + + licenses_list= isempty(licenses_fullpath) ? 
licenses_list : vcat(licenses_list, licenses_fullpath) + end + + return licenses_list end \ No newline at end of file diff --git a/src/spdxBuild.jl b/src/spdxBuild.jl index 616726b..bcc79a6 100644 --- a/src/spdxBuild.jl +++ b/src/spdxBuild.jl @@ -2,11 +2,24 @@ export generateSPDX +############################### +""" + generateSPDX(docData::spdxCreationData= spdxCreationData(), sbomRegistries::Vector{<:AbstractString}= ["General"]) + +Generate a software BOM in the SPDX format. By default, the SBOM will describe all the packages and artifacts in the active environment using the General registry to retrieve download information. + +If you would like to use a different registry or search multiple registries, you just call `generateSPDX` with two arguments. + +For example to create a User Environment SBOM using the General registry and another registry called "PrivateRegistry", type: +```julia-repl +sbom= generateSPDX(spdxCreationData(), ["PrivateRegistry", "General"]); +``` +""" function generateSPDX(docData::spdxCreationData= spdxCreationData(), sbomRegistries::Vector{<:AbstractString}= ["General"], envpkgs::Dict{Base.UUID, Pkg.API.PackageInfo}= Pkg.dependencies()) # Query the registries for package information registry_packages= registry_packagequery(envpkgs, sbomRegistries) - packagebuilddata= spdxPackageData(targetplatform= docData.TargetPlatform, packages= envpkgs, registrydata= registry_packages, packageInstructions= docData.packageInstructions) + packagebuilddata= spdxPackageData(targetplatform= docData.TargetPlatform, packages= envpkgs, registrydata= registry_packages, packageInstructions= docData.packageInstructions, licenseScan= docData.licenseScan) # Create the SPDX Document spdxDoc= SpdxDocumentV2() @@ -50,6 +63,7 @@ function generateSPDX(docData::spdxCreationData= spdxCreationData(), sbomRegistr return spdxDoc end +############################### ## Building an SPDX Package for a Julia package function buildSPDXpackage!(spdxDoc::SpdxDocumentV2, 
uuid::UUID, builddata::spdxPackageData) packagedata= builddata.packages[uuid] @@ -62,16 +76,16 @@ function buildSPDXpackage!(spdxDoc::SpdxDocumentV2, uuid::UUID, builddata::spdxP # Check if it's a standard library is_stdlib(uuid) && return nothing + + @logmsg Logging.LogLevel(-50) "******* Entering package $(packagedata.name) *******" package.Name= packagedata.name package.Version= string(packagedata.version) package.Supplier= SpdxCreatorV2("NOASSERTION") # TODO: That would be the person/org who hosts package server?. Julialang would be the supplier for General registry but how would that be determined in generic case package.Originator= ismissing(packageInstructions) ? SpdxCreatorV2("NOASSERTION") : packageInstructions.originator # TODO: Use the person or group that hosts the repo on Github. Is there an API to query? resolve_pkgsource!(package, packagedata, registrydata) + resolve_pkglicense!(package, packagedata.source, packageInstructions, builddata.licenseScan) package.VerificationCode= spdxpkgverifcode(packagedata.source, packageInstructions) - package.LicenseConcluded= SpdxLicenseExpressionV2("NOASSERTION") - push!(package.LicenseInfoFromFiles, SpdxLicenseExpressionV2("NOASSERTION")) - package.LicenseDeclared= ismissing(packageInstructions) ? SpdxLicenseExpressionV2("NOASSERTION") : packageInstructions.declaredLicense # TODO: Scan source for licenses and/or query Github API package.Copyright= ismissing(packageInstructions) ? "NOASSERTION" : packageInstructions.copyright # TODO: Scan license files for the first line that says "Copyright"? That would about work. package.Summary= "This is a Julia package, written in the Julia language." 
@@ -109,6 +123,7 @@ function buildSPDXpackage!(spdxDoc::SpdxDocumentV2, uuid::UUID, builddata::spdxP return package.SPDXID end +############################### ## Building an SPDX Package for a Julia artifact function buildSPDXpackage!(spdxDoc::SpdxDocumentV2, artifact_name::AbstractString, artifact::Dict{String, Any}, builddata::spdxPackageData) git_tree_sha1= artifact["git-tree-sha1"] @@ -118,13 +133,13 @@ function buildSPDXpackage!(spdxDoc::SpdxDocumentV2, artifact_name::AbstractStrin # TODO: Maybe add additional names to the name field? That would require searching for the package and then parsing (git_tree_sha1 in builddata.artifactsinsbom) && (return package.SPDXID) + @logmsg Logging.LogLevel(-50) "******* Entering artifact $(artifact_name) *******" + package.Name= artifact_name package.Supplier= SpdxCreatorV2("NOASSERTION") package.Originator= SpdxCreatorV2("NOASSERTION") # TODO: Should there be instructions like for packages? resolve_pkgsource!(package, artifact) - package.LicenseConcluded= SpdxLicenseExpressionV2("NOASSERTION") - push!(package.LicenseInfoFromFiles, SpdxLicenseExpressionV2("NOASSERTION")) - package.LicenseDeclared= SpdxLicenseExpressionV2("NOASSERTION") + resolve_pkglicense!(package, artifact, builddata.licenseScan) package.Copyright= "NOASSERTION" # TODO: Should there be instructions like for packages? Scan license files for the first line that says "Copyright"? That would about work. package.Summary= "This is a Julia artifact. \nAn artifact is a binary runtime or other data store not written in the Julia language that is used by a Julia package." 
if haskey(artifact, ["lazy"]) && artifact["lazy"] == true @@ -138,6 +153,7 @@ function buildSPDXpackage!(spdxDoc::SpdxDocumentV2, artifact_name::AbstractStrin return package.SPDXID end +############################### function spdxpkgverifcode(source::AbstractString, packageInstructions::Union{Missing, spdxPackageInstructions}) if ismissing(packageInstructions) packageInstructions= spdxPackageInstructions(name= "") # go with the defaults diff --git a/test/runtests.jl b/test/runtests.jl index efe859e..8bbcbc9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -24,7 +24,7 @@ using Base.BinaryPlatforms sbom = generateSPDX() # The SBOM is too big and complex to check everything, but we can check some things root_relationships= filter(r -> r.RelationshipType=="DESCRIBES", sbom.Relationships) - @test issetequal(getproperty.(root_relationships, :RelatedSPDXID), ["SPDXRef-PkgToSoftwareBOM-6254a0f9-6143-4104-aa2e-fd339a2830a6", "SPDXRef-SPDX-47358f48-d834-4249-91f5-f6185eb3d540", "SPDXRef-RegistryInstances-2792f1a3-b283-48e8-9a74-f99dce5104f3", "SPDXRef-Reexport-189a3867-3050-52da-a836-e630ba90ab69"]) + @test issetequal(getproperty.(root_relationships, :RelatedSPDXID), ["SPDXRef-PkgToSoftwareBOM-6254a0f9-6143-4104-aa2e-fd339a2830a6", "SPDXRef-SPDX-47358f48-d834-4249-91f5-f6185eb3d540", "SPDXRef-RegistryInstances-2792f1a3-b283-48e8-9a74-f99dce5104f3", "SPDXRef-Reexport-189a3867-3050-52da-a836-e630ba90ab69", "SPDXRef-LicenseCheck-726dbf0d-6eb6-41af-b36c-cd770e0f00cc"]) @test !isempty(filter(p -> p.SPDXID == "SPDXRef-PkgToSoftwareBOM-6254a0f9-6143-4104-aa2e-fd339a2830a6", sbom.Packages)) @test !isempty(filter(p -> p.SPDXID == "SPDXRef-SPDX-47358f48-d834-4249-91f5-f6185eb3d540", sbom.Packages)) @test !isempty(filter(isequal(SpdxRelationshipV2("SPDXRef-SPDX-47358f48-d834-4249-91f5-f6185eb3d540 DEPENDENCY_OF SPDXRef-PkgToSoftwareBOM-6254a0f9-6143-4104-aa2e-fd339a2830a6")), sbom.Relationships)) @@ -139,8 +139,8 @@ using Base.BinaryPlatforms @test 
all(getproperty.(sbom.Packages, :FilesAnalyzed)) @test all(isempty.(getproperty.(sbom.Packages, :Checksums))) @test all(isequal.(getproperty.(sbom.Packages, :LicenseConcluded), [SpdxSimpleLicenseExpressionV2("NOASSERTION")])) - @test all(isequal.(getproperty.(sbom.Packages, :LicenseInfoFromFiles), [[SpdxSimpleLicenseExpressionV2("NOASSERTION")]])) - @test all(isequal.(getproperty.(sbom.Packages, :LicenseDeclared), [SpdxSimpleLicenseExpressionV2("NOASSERTION")])) + @test all(isequal.(getproperty.(sbom.Packages, :LicenseInfoFromFiles), [[SpdxSimpleLicenseExpressionV2("MIT")]])) + @test all(isequal.(getproperty.(sbom.Packages, :LicenseDeclared), [SpdxSimpleLicenseExpressionV2("MIT")])) @test all(ismissing.(getproperty.(sbom.Packages, :LicenseComments))) @test all(isequal.(getproperty.(sbom.Packages, :Copyright), "NOASSERTION")) @test all(isequal.(getproperty.(sbom.Packages, :Summary), "This is a Julia package, written in the Julia language."))