Add new attribute matched_text_diagnostics

This commit adds a new attribute for license text diagnostics, which is added when the CLI option `--license-text-diagnostics` is used. This is in contrast to the earlier behaviour, where the diagnostics matched text overwrote the text in `matched_text`. It also makes sure top-level license/package summarizations correctly include matched text and diagnostics only when the respective CLI options are used. Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
aboutcode-org · Dec 13, 2023 · 2ddb31c · 2ddb31c
1 parent d96e69e
commit 2ddb31c
Show file tree

Hide file tree

Showing 107 changed files with 11,529 additions and 10,343 deletions.
diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py
@@ -606,6 +606,12 @@ class LicenseMatchFromResult(LicenseMatch):
             help='Text which was matched')
     )
 
+    matched_text_diagnostics = attr.ib(
+        default=None,
+        metadata=dict(
+            help='Text which was matched, with extra diagnostics information.')
+    )
+
     def score(self):
         return self.match_score
 
@@ -631,6 +637,7 @@ def from_dict(cls, license_match_mapping):
         """
         rule = Rule.from_match_data(license_match_mapping)
         matched_text = license_match_mapping.get("matched_text") or None
+        matched_text_diagnostics = license_match_mapping.get("matched_text_diagnostics") or None
 
         return cls(
             from_file=license_match_mapping["from_file"],
@@ -641,6 +648,7 @@ def from_dict(cls, license_match_mapping):
             match_coverage=license_match_mapping["match_coverage"],
             matcher=license_match_mapping["matcher"],
             text=matched_text,
+            matched_text_diagnostics=matched_text_diagnostics,
             rule=rule,
             qspan=None,
             ispan=None,
@@ -664,10 +672,6 @@ def to_dict(
         """
         Return a "result" scan data built from a LicenseMatch object.
         """
-        matched_text = None
-        if include_text:
-            matched_text = self.matched_text
-
         result = {}
 
         result['license_expression'] = self.rule.license_expression
@@ -689,8 +693,10 @@ def to_dict(
         if rule_details:
             result["rule_notes"] = self.rule.notes
             result["referenced_filenames"] = self.rule.referenced_filenames
-        if include_text:
-            result['matched_text'] = matched_text
+        if include_text and self.matched_text:
+            result['matched_text'] = self.matched_text
+        if license_text_diagnostics and self.matched_text_diagnostics:
+            result['matched_text_diagnostics'] = self.matched_text_diagnostics
         if rule_details:
             result["rule_text"] = self.rule.text
 
@@ -929,7 +935,11 @@ def get_unique_detections(cls, license_detections):
 
         return unique_license_detections
 
-    def to_dict(self, license_diagnostics):
+    def to_dict(self,
+        include_text=False,
+        license_text_diagnostics=False,
+        license_diagnostics=False,
+    ):
 
         def dict_fields(attr, value):
 
@@ -946,7 +956,10 @@ def dict_fields(attr, value):
 
         detection_mapping = attr.asdict(self, filter=dict_fields)
         detection_mapping["sample_matches"] = [
-            match.to_dict(include_text=True)
+            match.to_dict(
+                include_text=include_text,
+                license_text_diagnostics=license_text_diagnostics,
+            )
             for match in self.matches
         ]
         return detection_mapping

diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
@@ -773,7 +773,7 @@ def to_dict(
         spdx_license_url=SPDX_LICENSE_URL,
         include_text=False,
         license_text_diagnostics=False,
-        whole_lines=True,
+        whole_lines=False,
         file_path=None,
     ):
         """
@@ -785,11 +785,11 @@ def to_dict(
         if include_text:
             if license_text_diagnostics:
                 matched_text_diagnostics = self.matched_text(whole_lines=False, highlight=True)
+
+            if whole_lines:
+                matched_text = self.matched_text(whole_lines=True, highlight=False)
             else:
-                if whole_lines:
-                    matched_text = self.matched_text(whole_lines=True, highlight=False)
-                else:
-                    matched_text = self.matched_text(whole_lines=False, highlight=False)
+                matched_text = self.matched_text(whole_lines=False, highlight=False)
 
         result = {}
 
@@ -808,8 +808,8 @@ def to_dict(
 
         if include_text:
             result['matched_text'] = matched_text
-            if license_text_diagnostics:
-               result['matched_text_diagnostics'] = matched_text_diagnostics
+        if license_text_diagnostics:
+            result['matched_text_diagnostics'] = matched_text_diagnostics
         return result
 
     def get_highlighted_text(self, trace=TRACE_HIGHLIGHTED_TEXT):

diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py
@@ -170,7 +170,7 @@ def get_scanner(
             unknown_licenses=unknown_licenses,
         )
 
-    def process_codebase(self, codebase, license_diagnostics, **kwargs):
+    def process_codebase(self, codebase, license_text=False, license_diagnostics=False, license_text_diagnostics=False, **kwargs):
         """
         Post-process ``codebase`` to follow referenced filenames to license
         matches in other files.
@@ -231,7 +231,11 @@ def process_codebase(self, codebase, license_diagnostics, **kwargs):
             )
 
         unsorted_license_detections = [
-            unique_detection.to_dict(license_diagnostics=license_diagnostics)
+            unique_detection.to_dict(
+                include_text=license_text,
+                license_diagnostics=license_diagnostics,
+                license_text_diagnostics=license_text_diagnostics,
+            )
             for unique_detection in unique_license_detections
         ]
         codebase.attributes.license_detections.extend(

diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py
@@ -185,7 +185,7 @@ def get_scanner(self, package=True, system_package=False, **kwargs):
             system=system_package,
         )
 
-    def process_codebase(self, codebase, strip_root=False, **kwargs):
+    def process_codebase(self, codebase, strip_root=False, license_text=False, license_diagnostics=False, license_text_diagnostics=False, **kwargs):
         """
         Populate the ``codebase`` top level ``packages`` and ``dependencies``
         with package and dependency instances, assembling parsed package data

diff --git a/src/summarycode/todo.py b/src/summarycode/todo.py
@@ -100,6 +100,22 @@ def process_codebase(self, codebase, **kwargs):
         if hasattr(codebase.root, 'license_detections'):
             has_licenses = True
 
+            license_diagnostics = kwargs.get("license_diagnostics")
+            license_text = kwargs.get("license_text")
+            license_text_diagnostics = kwargs.get("license_text_diagnostics")
+            if not license_diagnostics or not license_text or not license_text_diagnostics:
+                usage_suggestion_message = (
+                    "The --review option, whe paired with --license option should be used with the folowing "
+                    "additional CLI options for maximum benifit: [`--license-text`, `--license-text-diagnostics`,"
+                    "--license-diagnostics`] as these show additional diagnostic information to help review the issues."
+                )
+                warnings.simplefilter('always', ToDoPluginUsageWarning)
+                warnings.warn(
+                    usage_suggestion_message,
+                    ToDoPluginUsageWarning,
+                    stacklevel=2,
+                )
+
         if not has_packages and not has_licenses:
             usage_suggestion_message = (
                 "The --review option should be used with atleast one of the license [`--license`], "
@@ -323,11 +339,13 @@ def dict_fields(attr, value):
             matches_with_details = []
             for license_match in detection_mapping["detection"]["matches"]:
                 license_match_obj = LicenseMatchFromResult.from_dict(license_match)
-                matches_with_details.append(license_match_obj.to_dict(
-                    include_text=True,
-                    license_text_diagnostics=True,
-                    rule_details=True,
-                ))
+                matches_with_details.append(
+                    license_match_obj.to_dict(
+                        include_text=True,
+                        license_text_diagnostics=True,
+                        rule_details=True,
+                    )
+                )
             detection_mapping["detection"]["matches"] = matches_with_details
 
         return detection_mapping

diff --git a/tests/cluecode/data/plugin_filter_clues/filtered-expected.json b/tests/cluecode/data/plugin_filter_clues/filtered-expected.json
@@ -7,19 +7,18 @@
       "detection_count": 1,
       "sample_matches": [
         {
-          "score": 96.07,
+          "license_expression": "apache-1.1",
+          "license_expression_spdx": "Apache-1.1",
           "from_file": "LICENSE",
           "start_line": 7,
           "end_line": 70,
+          "matcher": "3-seq",
+          "score": 96.07,
           "matched_length": 367,
           "match_coverage": 100.0,
-          "matcher": "3-seq",
-          "license_expression": "apache-1.1",
-          "license_expression_spdx": "Apache-1.1",
-          "rule_identifier": "apache-1.1_63.RULE",
           "rule_relevance": 100,
-          "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-1.1_63.RULE",
-          "matched_text": null
+          "rule_identifier": "apache-1.1_63.RULE",
+          "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-1.1_63.RULE"
         }
       ]
     }
@@ -52,17 +51,17 @@
           "license_expression_spdx": "Apache-1.1",
           "matches": [
             {
-              "score": 96.07,
+              "license_expression": "apache-1.1",
+              "spdx_license_expression": "Apache-1.1",
+              "from_file": "LICENSE",
               "start_line": 7,
               "end_line": 70,
-              "from_file": "LICENSE",
+              "matcher": "3-seq",
+              "score": 96.07,
               "matched_length": 367,
               "match_coverage": 100.0,
-              "matcher": "3-seq",
-              "license_expression": "apache-1.1",
-              "spdx_license_expression": "Apache-1.1",
-              "rule_identifier": "apache-1.1_63.RULE",
               "rule_relevance": 100,
+              "rule_identifier": "apache-1.1_63.RULE",
               "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-1.1_63.RULE"
             }
           ],

diff --git a/tests/cluecode/data/plugin_filter_clues/filtered-expected2.json b/tests/cluecode/data/plugin_filter_clues/filtered-expected2.json
@@ -7,19 +7,18 @@
       "detection_count": 1,
       "sample_matches": [
         {
-          "score": 100.0,
+          "license_expression": "pygres-2.2",
+          "license_expression_spdx": "LicenseRef-scancode-pygres-2.2",
           "from_file": "LICENSE2",
           "start_line": 7,
           "end_line": 22,
+          "matcher": "2-aho",
+          "score": 100.0,
           "matched_length": 145,
           "match_coverage": 100.0,
-          "matcher": "2-aho",
-          "license_expression": "pygres-2.2",
-          "license_expression_spdx": "LicenseRef-scancode-pygres-2.2",
-          "rule_identifier": "pygres-2.2_2.RULE",
           "rule_relevance": 100,
-          "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/pygres-2.2_2.RULE",
-          "matched_text": null
+          "rule_identifier": "pygres-2.2_2.RULE",
+          "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/pygres-2.2_2.RULE"
         }
       ]
     }
@@ -52,17 +51,17 @@
           "license_expression_spdx": "LicenseRef-scancode-pygres-2.2",
           "matches": [
             {
-              "score": 100.0,
+              "license_expression": "pygres-2.2",
+              "spdx_license_expression": "LicenseRef-scancode-pygres-2.2",
+              "from_file": "LICENSE2",
               "start_line": 7,
               "end_line": 22,
-              "from_file": "LICENSE2",
+              "matcher": "2-aho",
+              "score": 100.0,
               "matched_length": 145,
               "match_coverage": 100.0,
-              "matcher": "2-aho",
-              "license_expression": "pygres-2.2",
-              "spdx_license_expression": "LicenseRef-scancode-pygres-2.2",
-              "rule_identifier": "pygres-2.2_2.RULE",
               "rule_relevance": 100,
+              "rule_identifier": "pygres-2.2_2.RULE",
               "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/pygres-2.2_2.RULE"
             }
           ],

diff --git a/tests/cluecode/data/plugin_filter_clues/filtered-expected3.json b/tests/cluecode/data/plugin_filter_clues/filtered-expected3.json
@@ -7,19 +7,18 @@
       "detection_count": 1,
       "sample_matches": [
         {
-          "score": 100.0,
+          "license_expression": "pcre",
+          "license_expression_spdx": "LicenseRef-scancode-pcre",
           "from_file": "LICENSE3",
           "start_line": 1,
           "end_line": 47,
+          "matcher": "1-hash",
+          "score": 100.0,
           "matched_length": 303,
           "match_coverage": 100.0,
-          "matcher": "1-hash",
-          "license_expression": "pcre",
-          "license_expression_spdx": "LicenseRef-scancode-pcre",
-          "rule_identifier": "pcre.LICENSE",
           "rule_relevance": 100,
-          "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/pcre.LICENSE",
-          "matched_text": null
+          "rule_identifier": "pcre.LICENSE",
+          "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/pcre.LICENSE"
         }
       ]
     }
@@ -52,17 +51,17 @@
           "license_expression_spdx": "LicenseRef-scancode-pcre",
           "matches": [
             {
-              "score": 100.0,
+              "license_expression": "pcre",
+              "spdx_license_expression": "LicenseRef-scancode-pcre",
+              "from_file": "LICENSE3",
               "start_line": 1,
               "end_line": 47,
-              "from_file": "LICENSE3",
+              "matcher": "1-hash",
+              "score": 100.0,
               "matched_length": 303,
               "match_coverage": 100.0,
-              "matcher": "1-hash",
-              "license_expression": "pcre",
-              "spdx_license_expression": "LicenseRef-scancode-pcre",
-              "rule_identifier": "pcre.LICENSE",
               "rule_relevance": 100,
+              "rule_identifier": "pcre.LICENSE",
               "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/pcre.LICENSE"
             }
           ],