Skip to content

Commit ce04e78

Browse files
Add SPDX generation using spdx-tools
This is set up to produce the same output as the current spdx generation module while utilising the spdx-tools library. The goal is to replace the current module with this new one, which will allow easy migration to more SPDX formats as well as SPDXv3. Signed-off-by: Armin Tänzer <armin.taenzer@tngtech.com>
1 parent 62507ed commit ce04e78

File tree

14 files changed

+913
-0
lines changed

14 files changed

+913
-0
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,5 @@ GitPython~=3.1
1818
prettytable~=3.6
1919
packageurl-python>=0.10.4
2020
license-expression>=30.1
21+
spdx-tools>=0.8.0a3
2122

setup.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ tern.formats =
5252
yaml = tern.formats.yaml.generator:YAML
5353
html = tern.formats.html.generator:HTML
5454
cyclonedxjson = tern.formats.cyclonedx.cyclonedxjson.generator:CycloneDXJSON
55+
spdxjson_new = tern.formats.spdx_new.spdxjson.generator:SpdxJSON
56+
spdxtagvalue_new = tern.formats.spdx_new.spdxtagvalue.generator:SpdxTagValue
5557
tern.extensions =
5658
cve_bin_tool = tern.extensions.cve_bin_tool.executor:CveBinTool
5759
scancode = tern.extensions.scancode.executor:Scancode

tern/formats/spdx_new/__init__.py

Whitespace-only changes.

tern/formats/spdx_new/constants.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from spdx_tools.spdx.model import Version
2+
3+
# Fixed values and string templates shared by the SPDX generators.
# Entries containing {placeholders} are templates; presumably callers fill
# them in with str.format -- the call sites are not visible in this module.
DOCUMENT_ID = 'SPDXRef-DOCUMENT'
# template field: {image_name}
DOCUMENT_NAME = 'Tern report for {image_name}'
SPDX_VERSION = 'SPDX-2.2'
DATA_LICENSE = 'CC0-1.0'
DOCUMENT_COMMENT = 'This document was generated by ' \
                   'the Tern Project: https://github.com/tern-tools/tern'
# template fields: {version}, {image}, {uuid}
DOCUMENT_NAMESPACE = 'https://spdx.org/spdxdocs/tern-' \
                     'report-{version}-{image}-{uuid}'
# spdx-tools Version object built from (3, 20)
LICENSE_LIST_VERSION = Version(3, 20)
# template field: {version}
CREATOR_NAME = 'tern-{version}'
DOCUMENT_NAME_SNAPSHOT = 'Tern SPDX JSON SBoM' # TODO: different name here that is not specific to JSON
# template fields: {timestamp}, {uuid}
DOCUMENT_NAMESPACE_SNAPSHOT = 'https://spdx.org/spdxdocs/tern-report-' \
                              '{timestamp}-{uuid}'

tern/formats/spdx_new/file_helpers.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
4+
# SPDX-License-Identifier: BSD-2-Clause
5+
6+
"""
7+
File level helpers for SPDX document generator
8+
"""
9+
from datetime import datetime
10+
from typing import List
11+
12+
from spdx_tools.spdx.model import File as SpdxFile, SpdxNone, SpdxNoAssertion, Checksum, ChecksumAlgorithm
13+
14+
from tern.classes.file_data import FileData
15+
from tern.classes.image import Image
16+
from tern.classes.image_layer import ImageLayer
17+
from tern.classes.template import Template
18+
from tern.formats.spdx_new.layer_helpers import get_layer_checksum
19+
from tern.formats.spdx_new.general_helpers import get_package_license_declared, get_file_spdxref
20+
21+
22+
def get_layer_files_list(layer_obj: ImageLayer, template: Template, timestamp: datetime) -> List[SpdxFile]:
    """Build the SPDX File entries for the files of a single layer.

    The layer's id is not known at this point, so the string form of
    ``timestamp`` stands in for it when the file references are computed.
    A file whose SPDX reference was already emitted is skipped."""
    layer_id = str(timestamp)
    emitted_refs = set()
    result: List[SpdxFile] = []
    for entry in layer_obj.files:
        ref = get_file_spdxref(entry, layer_id)
        if ref in emitted_refs:
            continue
        emitted_refs.add(ref)
        result.append(get_file_dict(entry, template, layer_id))
    return result
34+
35+
36+
def get_files_list(image_obj: Image, template: Template) -> List[SpdxFile]:
    """Collect one SPDX File object per distinct file reference across all
    layers of the image whose ``files_analyzed`` flag is set.

    The value of each layer's checksum serves as the layer id, so the same
    file showing up in two layers yields two different references."""
    seen_refs = set()
    collected: List[SpdxFile] = []
    for layer in image_obj.layers:
        if not layer.files_analyzed:
            continue
        # the layer checksum doubles as the layer id
        layer_id = get_layer_checksum(layer).value
        for entry in layer.files:
            ref = get_file_spdxref(entry, layer_id)
            if ref in seen_refs:
                continue
            seen_refs.add(ref)
            collected.append(get_file_dict(entry, template, layer_id))
    return collected
55+
56+
57+
def get_file_dict(filedata: FileData, template: Template, layer_id: str) -> SpdxFile:
    """Convert a FileData object into its SPDX File representation.

    ``layer_id`` is passed on to the SPDX reference generator so that copies
    of the same file found in different places of the image get distinct
    ids. Notice, comment and contributors are set to None when empty."""
    mapping = filedata.to_dict(template)

    if filedata.licenses:
        # one entry per distinct license string; each becomes either a
        # parsed SPDX expression or a LicenseRef
        license_info_in_file = [
            get_package_license_declared(lic)
            for lic in set(filedata.licenses)
        ]
    else:
        license_info_in_file = [SpdxNone()]

    notice_text = get_file_notice(filedata)
    comment_text = get_file_comment(filedata)
    contributor_names = get_file_contributors(filedata)

    file_type = mapping['FileType']

    return SpdxFile(
        spdx_id=get_file_spdxref(filedata, layer_id),
        name=mapping['FileName'],
        checksums=[get_file_checksum(filedata)],
        license_concluded=SpdxNoAssertion(),  # we don't provide this
        copyright_text=SpdxNoAssertion(),  # we don't know this
        file_types=[file_type] if file_type else None,
        license_info_in_file=license_info_in_file,
        notice=notice_text or None,
        comment=comment_text or None,
        contributors=contributor_names or None,
    )
89+
90+
91+
def get_file_checksum(filedata: FileData) -> Checksum:
    """Wrap the file's SHA1 digest in an SPDX Checksum object.
    SHA1 is the algorithm the SPDX spec currently asks for."""
    sha1_digest = filedata.get_checksum('sha1')
    return Checksum(ChecksumAlgorithm.SHA1, sha1_digest)
95+
96+
97+
def get_file_notice(filedata: FileData) -> str:
    """Join every copyright string found in the file, each followed by a
    newline. The result is an empty string when there are none."""
    return ''.join(cp + '\n' for cp in filedata.copyrights)
104+
105+
106+
def get_file_comment(filedata: FileData) -> str:
    """Render all file level notices as one comment string.

    Each origin contributes a header line ``<origin_str>:`` followed by one
    ``<level>: <message>`` line per notice. Returns an empty string when the
    file has no origins."""
    lines = []
    for origin in filedata.origins.origins:
        lines.append('{}:'.format(origin.origin_str) + '\n')
        lines.extend(
            '{}: {}'.format(note.level, note.message) + '\n'
            for note in origin.notices
        )
    return ''.join(lines)
116+
117+
118+
def get_file_contributors(filedata: FileData) -> List[str]:
    """Return the authors recorded for the file as a new list, for use as
    the optional SPDX file contributors field. The list is empty when no
    authors were found."""
    return list(filedata.authors)
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
4+
# SPDX-License-Identifier: BSD-2-Clause
5+
6+
"""
7+
General helpers for SPDX document generator
8+
"""
9+
import datetime
10+
import hashlib
11+
import io
12+
import re
13+
import uuid
14+
from datetime import datetime
15+
from typing import Union, Callable, IO, Tuple
16+
17+
from license_expression import get_spdx_licensing, LicenseExpression, Licensing
18+
from spdx_tools.spdx.model import SpdxNone, Document
19+
20+
from tern.classes.file_data import FileData
21+
from tern.classes.image import Image
22+
from tern.classes.image_layer import ImageLayer
23+
from tern.classes.package import Package
24+
25+
26+
def get_uuid() -> str:
    """Return a freshly generated random (version 4) UUID as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
28+
29+
30+
def get_current_timestamp() -> datetime:
    """Return the current UTC time truncated to whole seconds.

    The returned datetime is naive (``tzinfo`` is None), exactly as the
    previous ``datetime.utcnow()`` based implementation produced."""
    # datetime.utcnow() is deprecated since Python 3.12, so take an aware
    # "now" in UTC and strip the tzinfo to keep the return value naive.
    from datetime import timezone
    return datetime.now(timezone.utc).replace(tzinfo=None, microsecond=0)
32+
33+
34+
def get_string_id(string: str) -> str:
    """Return a short identifier for the given string: the last seven hex
    characters of the SHA-256 digest of its UTF-8 encoding."""
    digest = hashlib.sha256()
    digest.update(string.encode('utf-8'))
    return digest.hexdigest()[-7:]
37+
38+
39+
def get_license_ref(license_string: str) -> str:
    """Return the SPDX ``LicenseRef-<id>`` string for a license text, where
    ``<id>`` is the short hash id of the text's string form."""
    license_id = get_string_id(str(license_string))
    return 'LicenseRef-{}'.format(license_id)
42+
43+
44+
def replace_invalid_chars_in_license_expression(license_string: str) -> str:
    """Substitute characters that commonly make a license string an invalid
    SPDX expression: ``,`` -> `` and``, ``/`` -> ``-``, ``;`` -> ``.`` and
    ``&`` -> ``and``. Strings without any of these are returned unchanged."""
    # None of the replacement texts contains a character that is itself
    # replaced, so a single translation pass is enough.
    substitutions = str.maketrans({
        ',': ' and',
        '/': '-',
        ';': '.',
        '&': 'and',
    })
    return license_string.translate(substitutions)
54+
55+
56+
def is_valid_license_expression(license_string: str) -> bool:
    """Tell whether the string validates as an SPDX license expression,
    i.e. validation against the SPDX licensing reports no errors."""
    spdx_licensing = get_spdx_licensing()
    try:
        problems = spdx_licensing.validate(license_string).errors
    except AttributeError:
        # per the original author's note, this is how invalid license
        # characters surface during validation
        return False
    return problems == []
63+
64+
65+
def get_package_license_declared(package_license_declared: str) -> Union[LicenseExpression, SpdxNone]:
    """Turn the declared license string of a package or file into an SPDX
    license value.

    Common invalid characters are first substituted using
    replace_invalid_chars_in_license_expression(). If the substituted string
    is a valid SPDX license expression, the parsed expression is returned.
    Otherwise a parsed LicenseRef derived from the ORIGINAL declared string
    is returned. A blank string (or None) yields SpdxNone."""
    if not package_license_declared:
        return SpdxNone()

    cleaned = replace_invalid_chars_in_license_expression(package_license_declared)
    if is_valid_license_expression(cleaned):
        return Licensing().parse(cleaned)

    # Hash the untouched input, not the cleaned copy: the extracted licensing
    # info for the image derives its LicenseRef ids from the original text,
    # and the two ids have to match for the reference to resolve.
    return Licensing().parse(get_license_ref(package_license_declared))
79+
80+
81+
def get_serialized_document_string(spdx_document: Document, writer_function: Callable[[Document, IO[str]], str]) -> str:
    """Serialize the SPDX document into a string.

    ``writer_function`` is called with the document, an in-memory text
    stream and ``validate=False``; whatever it wrote to the stream is
    returned."""
    buffer = io.StringIO()
    try:
        writer_function(spdx_document, buffer, validate=False)
        return buffer.getvalue()
    finally:
        buffer.close()
85+
86+
87+
###########################################################################################
88+
# central place for SPDXRef-generators to avoid circular imports as these are widely used #
89+
###########################################################################################
90+
91+
def get_image_spdxref(image_obj: Image) -> str:
    """Return the SPDX reference ID of an image, built from the image's
    human readable id."""
    readable_id = image_obj.get_human_readable_id()
    return 'SPDXRef-{}'.format(readable_id)
95+
96+
97+
def get_package_spdxref(package_obj: Package) -> Tuple[str, str]:
    """Return the SPDX reference IDs for a package as a tuple
    ``(binary_ref, source_ref)``.

    ``binary_ref`` is built from the package name and version. ``source_ref``
    is built from the source name and source version with a ``-src`` suffix
    to tell it apart from the binary ref; it is an empty string when the
    package has no source name."""
    # SPDX ids may only contain letters, numbers, "." and "-"; these
    # characters are replaced with "-". The same set is applied to both refs
    # (the source ref previously let "_" through).
    invalid_chars = r'[:+~_/]'

    pkg_ref = f"{package_obj.name}-{package_obj.version}"
    clean_pkg_ref = re.sub(invalid_chars, r'-', pkg_ref)

    if not package_obj.src_name:
        return f'SPDXRef-{clean_pkg_ref}', ''

    # differentiate between binary and source package refs
    src_ver = package_obj.src_version + "-src"
    src_ref = f"{package_obj.src_name}-{src_ver}"
    clean_src_ref = re.sub(invalid_chars, r'-', src_ref)
    return f'SPDXRef-{clean_pkg_ref}', f'SPDXRef-{clean_src_ref}'
113+
114+
115+
def get_layer_spdxref(layer_obj: ImageLayer) -> str:
    """Return the SPDX reference ID of a layer, built from the first ten
    characters of the layer's diff_id."""
    short_diff_id = layer_obj.diff_id[:10]
    return 'SPDXRef-{}'.format(short_diff_id)
119+
120+
121+
def get_file_spdxref(filedata: FileData, layer_id: str) -> str:
    """Return the ``SPDXRef-<id>`` identifier of a file for the SPDX
    document.

    ``<id>`` is the short hash id of the file path, the first seven
    characters of the file checksum and the layer id, concatenated in that
    order."""
    short_checksum = filedata.checksum[:7]
    hash_input = filedata.path + short_checksum + layer_id
    return 'SPDXRef-{}'.format(get_string_id(hash_input))
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
4+
# SPDX-License-Identifier: BSD-2-Clause
5+
6+
"""
7+
Image level helpers for SPDX document generator
8+
Images for SPDX act like a Package
9+
"""
10+
from typing import List
11+
12+
from spdx_tools.spdx.model import ExtractedLicensingInfo, Package as SpdxPackage, \
13+
SpdxNoAssertion
14+
15+
from tern.classes.image import Image
16+
from tern.classes.template import Template
17+
from tern.formats.spdx_new.layer_helpers import get_layer_licenses
18+
from tern.formats.spdx_new.general_helpers import get_license_ref, get_uuid, is_valid_license_expression, \
19+
get_image_spdxref
20+
from tern.utils.general import get_git_rev_or_version
21+
22+
23+
def get_image_extracted_licenses(image_obj: Image) -> List[ExtractedLicensingInfo]:
    """Return the extracted licensing info of an image: one entry, carrying
    a LicenseRef id and the plain license text, for every distinct file or
    package license string that is not a valid SPDX license expression."""
    license_texts = set()
    for layer in image_obj.layers:
        # file level licenses of the layer
        license_texts.update(get_layer_licenses(layer))
        # package level licenses not collected yet
        for pkg in layer.packages:
            if pkg.pkg_license:
                license_texts.add(pkg.pkg_license)
            if pkg.pkg_licenses:
                # debian licenses from the copyright text count as one license
                license_texts.add(", ".join(pkg.pkg_licenses))

    return [
        ExtractedLicensingInfo(license_id=get_license_ref(text), extracted_text=text)
        for text in license_texts
        if not is_valid_license_expression(text)
    ]
45+
46+
47+
def get_image_dict(image_obj: Image, template: Template) -> SpdxPackage:  # TODO: these kind of functions don't produce dicts anymore, rename them
    """Build the SPDX Package that represents the image.

    Name and version come from the image's template mapping. Download
    location, supplier, licenses and copyright are NOASSERTION, and
    ``files_analyzed`` is False."""
    mapping = image_obj.to_dict(template)
    image_ref = get_image_spdxref(image_obj)
    package_name = mapping["PackageName"]
    package_version = mapping["PackageVersion"]
    return SpdxPackage(
        spdx_id=image_ref,
        name=package_name,
        download_location=SpdxNoAssertion(),
        version=package_version,
        supplier=SpdxNoAssertion(),
        files_analyzed=False,
        license_concluded=SpdxNoAssertion(),
        license_declared=SpdxNoAssertion(),
        copyright_text=SpdxNoAssertion(),
    )
62+
63+
64+
def get_document_namespace(image_obj: Image) -> str:
    """Return a unique SPDX document URI for the image, composed of the
    second element returned by get_git_rev_or_version(), the image name and
    a fresh uuid."""
    tool_version = get_git_rev_or_version()[1]
    return 'https://spdx.org/spdxdocs/tern-report-{}-{}-{}'.format(
        tool_version, image_obj.name, get_uuid())

0 commit comments

Comments
 (0)