Skip to content

Commit

Permalink
Add from-file attribute to matches with origin
Browse files Browse the repository at this point in the history
Adds a new `from_file` attribute to license matches that records the path
of the file where each match was originally detected. For de-referenced
matches, this tells us which file they came from, and it also lets us
distinguish matches that originate in the same file from those brought in
from a different file.

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Dec 7, 2023
1 parent 8de1e90 commit 3f78833
Show file tree
Hide file tree
Showing 789 changed files with 4,833 additions and 48 deletions.
29 changes: 28 additions & 1 deletion src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,7 @@ def from_dict(cls, license_match_mapping):
matched_text = license_match_mapping.get("matched_text") or None

return cls(
from_file=license_match_mapping["from_file"],
start_line=license_match_mapping["start_line"],
end_line=license_match_mapping["end_line"],
match_score=license_match_mapping["score"],
Expand Down Expand Up @@ -654,6 +655,7 @@ def to_dict(

# Detection Level Information
result['score'] = self.score()
result['from_file'] = self.from_file
result['start_line'] = self.start_line
result['end_line'] = self.end_line
result['matched_length'] = self.len()
Expand All @@ -671,6 +673,21 @@ def to_dict(
return result


def populate_matches_with_path(matches, path):
    """
    Given `matches`, a list of license match mappings, set the `from_file`
    entry of each mapping to `path`, which is the path of the origin file
    for that license match, unless the mapping already has a non-empty
    `from_file` value.

    The mappings are updated in place and nothing is returned. A mapping
    with no `from_file` key at all is treated the same as one whose
    `from_file` is empty.
    """
    for match in matches:
        # If `from_file` is already populated, the match was brought over
        # from another file and its origin must be preserved. If it is empty
        # (or missing, as in mappings created before this attribute existed),
        # the match is from the original resource, so record that path.
        if not match.get("from_file"):
            match["from_file"] = path


def collect_license_detections(codebase, include_license_clues=True):
"""
Return a list of LicenseDetectionFromResult object rehydrated from
Expand All @@ -680,7 +697,10 @@ def collect_license_detections(codebase, include_license_clues=True):
according to their license detections. This is required because package fields
are populated in package plugin, which runs before the license plugin, and thus
the license plugin step where unknown references to other files are dereferenced
does not show up automatically in package attributes.
does not show up automatically in package attributes.
Also populate from_file attributes with resource paths for matches which have
origin in the same file.
"""
has_packages = hasattr(codebase.root, 'package_data')
has_licenses = hasattr(codebase.root, 'license_detections')
Expand All @@ -692,7 +712,11 @@ def collect_license_detections(codebase, include_license_clues=True):
resource_license_detections = []
if has_licenses:
license_detections = getattr(resource, 'license_detections', []) or []
for detection in license_detections:
populate_matches_with_path(matches=detection["matches"], path=resource.path)
license_clues = getattr(resource, 'license_clues', []) or []
populate_matches_with_path(matches=license_clues, path=resource.path)
codebase.save_resource(resource)

if license_detections:
license_detection_objects = detections_from_license_detection_mappings(
Expand Down Expand Up @@ -729,6 +753,9 @@ def collect_license_detections(codebase, include_license_clues=True):

package_license_detections = package["license_detections"]
if package_license_detections:
for detection in package_license_detections:
populate_matches_with_path(matches=detection["matches"], path=resource.path)
modified = True
package_license_detection_mappings.extend(package_license_detections)
detection_is_same, license_expression = verify_package_license_expression(
license_detection_mappings=package_license_detections,
Expand Down
15 changes: 14 additions & 1 deletion src/licensedcode/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,17 @@ class LicenseMatch(object):
metadata=dict(help='match end line, 1-based')
)

from_file = attr.ib(
default=None,
metadata=dict(
help='File path where this LicenseMatch was originally detected. '
'This needs to be stored as we bring over LicenseMatches from '
'other files into LicenseDetection objects now, and we need '
'to track the origin for these to be able to determine easily '
'which are native to that file.'
)
)

query = attr.ib(
default=None,
metadata=dict(help='Query object for this match')
Expand Down Expand Up @@ -722,7 +733,7 @@ def matched_text(
highlight=True,
highlight_matched='{}',
highlight_not_matched='[{}]',
_usecache=True
_usecache=True,
):
"""
Return the matched text for this match or an empty string if no query
Expand Down Expand Up @@ -763,6 +774,7 @@ def to_dict(
include_text=False,
license_text_diagnostics=False,
whole_lines=True,
file_path=None,
):
"""
Return a "result" scan data built from a LicenseMatch object.
Expand All @@ -783,6 +795,7 @@ def to_dict(
result['score'] = self.score()
result['start_line'] = self.start_line
result['end_line'] = self.end_line
result['from_file'] = file_path
result['matched_length'] = self.len()
result['match_coverage'] = self.coverage()
result['matcher'] = self.matcher
Expand Down
12 changes: 8 additions & 4 deletions src/licensedcode/plugin_license.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from licensedcode.cache import build_spdx_license_expression, get_cache
from licensedcode.detection import collect_license_detections
from licensedcode.detection import populate_matches_with_path
from licensedcode.detection import find_referenced_resource
from licensedcode.detection import get_detected_license_expression
from licensedcode.detection import get_matches_from_detection_mappings
Expand Down Expand Up @@ -279,11 +280,14 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):
modified = True
detection_modified = True
detections_added.extend(referenced_resource.license_detections)
license_match_mappings.extend(
get_matches_from_detection_mappings(
license_detections=referenced_resource.license_detections
)
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_resource.license_detections
)
populate_matches_with_path(
matches=matches_to_extend,
path=referenced_resource.path
)
license_match_mappings.extend(matches_to_extend)

if not detection_modified:
continue
Expand Down
23 changes: 19 additions & 4 deletions src/packagedcode/licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from licensedcode.detection import find_referenced_resource
from licensedcode.detection import detect_licenses
from licensedcode.detection import LicenseDetectionFromResult
from licensedcode.detection import populate_matches_with_path
from licensedcode.spans import Span
from licensedcode import query

Expand Down Expand Up @@ -113,11 +114,16 @@ def add_referenced_license_matches_for_package(resource, codebase):
if referenced_license_detections:
modified = True
detection_modified = True
license_match_mappings.extend(
get_matches_from_detection_mappings(
license_detections=referenced_license_detections
)
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_license_detections
)
# For LicenseMatches with different resources as origin, add the
# resource path to these matches as origin info
populate_matches_with_path(
matches=matches_to_extend,
path=referenced_resource.path
)
license_match_mappings.extend(matches_to_extend)

if not detection_modified:
continue
Expand Down Expand Up @@ -231,6 +237,10 @@ def add_referenced_license_detection_from_package(resource, codebase):
for pkg_detection in pkg_detections:
modified = True
detection_modified = True
populate_matches_with_path(
matches=pkg_detection["matches"],
path=resource.path
)
license_match_mappings.extend(pkg_detection["matches"])
detections_added.append(pkg_detection)
analysis = DetectionCategory.UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE.value
Expand Down Expand Up @@ -347,6 +357,11 @@ def get_license_detections_from_sibling_file(resource, codebase):

license_detections = []
for sibling in siblings:
for detection in sibling.license_detections:
populate_matches_with_path(
matches=detection["matches"],
path=sibling.path
)
license_detections.extend(sibling.license_detections)

if not license_detections:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"score": 96.07,
"start_line": 7,
"end_line": 70,
"from_file": "LICENSE",
"matched_length": 367,
"match_coverage": 100.0,
"matcher": "3-seq",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"score": 100.0,
"start_line": 7,
"end_line": 22,
"from_file": "LICENSE2",
"matched_length": 145,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 47,
"from_file": "LICENSE3",
"matched_length": 303,
"match_coverage": 100.0,
"matcher": "1-hash",
Expand Down
20 changes: 20 additions & 0 deletions tests/formattedcode/data/common/manifests-expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": null,
"matched_length": 8,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand All @@ -47,6 +48,7 @@
"score": 100.0,
"start_line": 2,
"end_line": 2,
"from_file": null,
"matched_length": 7,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand Down Expand Up @@ -125,6 +127,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": null,
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "1-hash",
Expand All @@ -144,6 +147,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": null,
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "1-hash",
Expand Down Expand Up @@ -218,6 +222,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": null,
"matched_length": 1,
"match_coverage": 100.0,
"matcher": "1-spdx-id",
Expand Down Expand Up @@ -640,6 +645,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": "manifests/maven/persistence-api-1.0.pom",
"matched_length": 8,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand All @@ -653,6 +659,7 @@
"score": 100.0,
"start_line": 2,
"end_line": 2,
"from_file": "manifests/maven/persistence-api-1.0.pom",
"matched_length": 7,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand Down Expand Up @@ -697,6 +704,7 @@
"score": 16.0,
"start_line": 17,
"end_line": 19,
"from_file": "manifests/maven/persistence-api-1.0.pom",
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand All @@ -709,6 +717,7 @@
"score": 82.35,
"start_line": 18,
"end_line": 20,
"from_file": "manifests/maven/persistence-api-1.0.pom",
"matched_length": 14,
"match_coverage": 82.35,
"matcher": "3-seq",
Expand Down Expand Up @@ -829,6 +838,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": "manifests/npm-license-mapping/package.json",
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "1-hash",
Expand All @@ -848,6 +858,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": "manifests/npm-license-mapping/package.json",
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "1-hash",
Expand Down Expand Up @@ -991,6 +1002,7 @@
"score": 100.0,
"start_line": 6,
"end_line": 6,
"from_file": "manifests/npm-license-mapping/package.json",
"matched_length": 4,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand All @@ -1009,6 +1021,7 @@
"score": 100.0,
"start_line": 20,
"end_line": 20,
"from_file": "manifests/npm-license-mapping/package.json",
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand Down Expand Up @@ -1133,6 +1146,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": "manifests/npm-license-string/package.json",
"matched_length": 1,
"match_coverage": 100.0,
"matcher": "1-spdx-id",
Expand Down Expand Up @@ -1226,6 +1240,7 @@
"score": 100.0,
"start_line": 4,
"end_line": 4,
"from_file": "manifests/npm-license-string/package.json",
"matched_length": 2,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand Down Expand Up @@ -1354,6 +1369,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": "manifests/pypi/bluepyopt_setup.py",
"matched_length": 1,
"match_coverage": 100.0,
"matcher": "1-hash",
Expand All @@ -1373,6 +1389,7 @@
"score": 100.0,
"start_line": 1,
"end_line": 1,
"from_file": "manifests/pypi/bluepyopt_setup.py",
"matched_length": 10,
"match_coverage": 100.0,
"matcher": "1-hash",
Expand Down Expand Up @@ -1524,6 +1541,7 @@
"score": 100.0,
"start_line": 9,
"end_line": 20,
"from_file": "manifests/pypi/bluepyopt_setup.py",
"matched_length": 106,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand All @@ -1542,6 +1560,7 @@
"score": 100.0,
"start_line": 65,
"end_line": 65,
"from_file": "manifests/pypi/bluepyopt_setup.py",
"matched_length": 2,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand All @@ -1560,6 +1579,7 @@
"score": 100.0,
"start_line": 74,
"end_line": 75,
"from_file": "manifests/pypi/bluepyopt_setup.py",
"matched_length": 10,
"match_coverage": 100.0,
"matcher": "2-aho",
Expand Down
Loading

0 comments on commit 3f78833

Please sign in to comment.