Merge branch 'master' into feature-meson-subprojects

fsfe · May 13, 2022 · bcf248b · bcf248b
2 parents 4cd9f25 + ab74c67
commit bcf248b
Show file tree

Hide file tree

Showing 12 changed files with 634 additions and 26 deletions.
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -50,6 +50,8 @@ Contributors
 
 - Nico Rikken <nico.rikken@fsfe.org>
 
+- Florian Snow <florian@familysnow.net>
+
 Translators
 -----------
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,6 +41,9 @@ The versions follow [semantic versioning](https://semver.org).
 
 - Recommendations for installation/run methods: package managers and pipx (#457)
 - Docker images for AArch64 (#478)
+- Added the ability to ignore parts of a file when running `reuse lint`. Simply
+  add `REUSE-IgnoreStart` and `REUSE-IgnoreEnd` as comments and all lines
+  between the two will be ignored by the next run of `reuse lint` (#463).
 - [Meson subprojects](https://mesonbuild.com/Subprojects.html) are now ignored
   by default (#496)
 
@@ -59,6 +62,12 @@ The versions follow [semantic versioning](https://semver.org).
 
   - V-Lang (#432)
 
+- Ignore all SPDX files with their typical formats and extensions (#494).
+
+- Add support for merging copyright lines based on copyright statement,
+  transforming multiple lines with a single year into a single line with a
+  range. (#328)
+
 ### Changed
 
 - Use `setuptools` instead of the deprecated `distutils` which will be removed
@@ -78,6 +87,9 @@ The versions follow [semantic versioning](https://semver.org).
 
 ### Removed
 
+- `JsxCommentStyle` in favor of using `CCommentStyle` directly (see section
+  `Fixed`)
+
 ### Fixed
 
 - Better support for unary "+" operator in license identifiers. For example, if
@@ -90,6 +102,10 @@ The versions follow [semantic versioning](https://semver.org).
 
 - Cleaned up internal string manipulation. (#477)
 
+- JSX (`.jsx` and `.tsx`) actually uses C comment syntax as JSX blocks never
+  stand at the beginning of the file where the licensing info needs to go.
+  (#406)
+
 ### Security
 
 ## 0.14.0 - 2021-12-27

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -83,6 +83,47 @@ With the argument ``--copyright-style`` it is possible to change the default
 
 Shebangs are always preserved at the top of the file.
 
+Merging Statements
+------------------
+
+When the tool parses copyright headers, ``reuse`` can be configured to
+automatically merge copyright lines based on the statement element.
+This effectively transforms multiple lines with a single year into a single line
+with a range.
+
+Starting with the following header,
+
+.. code-block:: python
+
+   # SPDX-FileCopyrightText: 2016 Jane Doe
+   # SPDX-FileCopyrightText: 2018 John Doe
+   #
+   # SPDX-License-Identifier: GPL-2.0
+
+The standard tool options would produce the following:
+
+.. code-block:: console
+
+   $ reuse addheader --year 2018 --license GPL-2.0 --copyright="Jane Doe" file.py
+
+.. code-block:: python
+
+   # SPDX-FileCopyrightText: 2016 Jane Doe
+   # SPDX-FileCopyrightText: 2018 John Doe
+   # SPDX-FileCopyrightText: 2018 Jane Doe
+   #
+   # SPDX-License-Identifier: GPL-2.0
+
+Running the same command with the ``--merge-copyrights`` option will instead
+produce the following:
+
+.. code-block:: python
+
+   # SPDX-FileCopyrightText: 2016 - 2018 Jane Doe
+   # SPDX-FileCopyrightText: 2018 John Doe
+   #
+   # SPDX-License-Identifier: GPL-2.0
+
 Comment styles
 --------------
 
@@ -287,3 +328,53 @@ are the methods:
 
 If a file is found that does not have copyright and/or license information
 associated with it, then the project is not compliant.
+
+Ignoring parts of a file
+------------------------
+
+You can easily ignore parts of a file that will always cause problems for
+``reuse lint``. Suppose you have the following bash script:
+
+.. code:: bash
+
+   #!/usr/bin/env bash
+   # SPDX-FileCopyrightText: 2021 John Doe
+   #
+   # SPDX-License-Identifier: CC0-1.0
+
+   echo "SPDX-FileCopyrightText: $(date +'%Y') Jane Doe" > file.txt
+   echo "SPDX-License-Identifier: MIT" > file.txt
+
+   exit 0
+
+This will lead to the following error message despite the file having the
+correct licensing info in the header:
+
+.. code:: text
+
+  $ reuse lint
+  reuse._util - ERROR - Could not parse 'MIT" > file.txt'
+  reuse.project - ERROR - 'foobar.sh' holds an SPDX expression that cannot be parsed, skipping the file
+  # MISSING COPYRIGHT AND LICENSING INFORMATION
+
+  The following files have no copyright and licensing information:
+  * foobar.sh
+  [...]
+
+To avoid this error message, you can simply amend the file as follows:
+
+.. code:: bash
+
+  #!/usr/bin/env bash
+  # SPDX-FileCopyrightText: 2021 John Doe
+  #
+  # SPDX-License-Identifier: CC0-1.0
+
+  # REUSE-IgnoreStart
+  echo "SPDX-FileCopyrightText: $(date +'%Y') Jane Doe" > file.txt
+  echo "SPDX-License-Identifier: MIT" > file.txt
+  # REUSE-IgnoreEnd
+
+  exit 0
+
+Now, ``reuse lint`` will not report any problems with this file anymore.
diff --git a/src/reuse/__init__.py b/src/reuse/__init__.py
@@ -57,12 +57,21 @@
     re.compile(r"^\.gitkeep$"),
     re.compile(r"^\.hgtags$"),
     re.compile(r".*\.license$"),
-    re.compile(r".*\.spdx$"),
     # Workaround for /~https://github.com/fsfe/reuse-tool/issues/229
     re.compile(r"^CAL-1.0(-Combined-Work-Exception)?(\..+)?$"),
     re.compile(r"^SHL-2.1(\..+)?$"),
 ]
 
+_IGNORE_SPDX_PATTERNS = [
+    # SPDX files from
+    # https://spdx.github.io/spdx-spec/conformance/#44-standard-data-format-requirements
+    re.compile(r".*\.spdx$"),
+    # The dot before the serialisation suffix is escaped so that only real
+    # double extensions such as ".spdx.json" match, not e.g. ".spdx-json".
+    re.compile(r".*\.spdx\.(rdf|json|xml|ya?ml)$"),
+]
+
+# Combine SPDX patterns into file patterns to ease default ignore usage.
+# This extends the _IGNORE_FILE_PATTERNS list in place.
+_IGNORE_FILE_PATTERNS.extend(_IGNORE_SPDX_PATTERNS)
+
 #: Simple structure for holding SPDX information.
 #:
 #: The two iterables MUST be sets.

diff --git a/src/reuse/_comment.py b/src/reuse/_comment.py
@@ -362,14 +362,6 @@ class JinjaCommentStyle(CommentStyle):
     MULTI_LINE = ("{#", "", "#}")
 
 
-class JsxCommentStyle(CommentStyle):
-    """JSX comment style."""
-
-    _shorthand = "jsx"
-
-    MULTI_LINE = ("{/*", "", "*/}")
-
-
 class LispCommentStyle(CommentStyle):
     """Lisp comment style."""
 
@@ -536,7 +528,7 @@ class VimCommentStyle(CommentStyle):
     ".jinja2": JinjaCommentStyle,
     ".js": CCommentStyle,
     ".json": UncommentableCommentStyle,
-    ".jsx": JsxCommentStyle,
+    ".jsx": CCommentStyle,
     ".jy": PythonCommentStyle,
     ".ksh": PythonCommentStyle,
     ".kt": CCommentStyle,
@@ -626,7 +618,7 @@ class VimCommentStyle(CommentStyle):
     ".toc": TexCommentStyle,
     ".toml": PythonCommentStyle,
     ".ts": CCommentStyle,
-    ".tsx": JsxCommentStyle,
+    ".tsx": CCommentStyle,
     ".ttl": PythonCommentStyle,  # Turtle/RDF
     ".v": CCommentStyle,  # V-Lang source code
     ".vala": CCommentStyle,

diff --git a/src/reuse/_util.py b/src/reuse/_util.py
@@ -1,7 +1,9 @@
 # SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
 # SPDX-FileCopyrightText: © 2020 Liferay, Inc. <https://liferay.com>
 # SPDX-FileCopyrightText: 2020 Tuomas Siipola <tuomas@zpl.fi>
+# SPDX-FileCopyrightText: 2022 Nico Rikken <nico.rikken@fsfe.org>
 # SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
+# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <carmenbianca@fsfe.org>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -20,7 +22,7 @@
 from hashlib import sha1
 from os import PathLike
 from pathlib import Path
-from typing import BinaryIO, List, Optional
+from typing import BinaryIO, List, Optional, Set
 
 from boolean.boolean import Expression, ParseError
 from debian.copyright import Copyright
@@ -33,6 +35,9 @@
 GIT_EXE = shutil.which("git")
 HG_EXE = shutil.which("hg")
 
+# Markers delimiting a block of text that filter_ignore_block() removes before
+# the text is searched for copyright and licensing information.
+REUSE_IGNORE_START = "REUSE-IgnoreStart"
+REUSE_IGNORE_END = "REUSE-IgnoreEnd"
+
 _LOGGER = logging.getLogger(__name__)
 _LICENSING = Licensing()
 
@@ -51,9 +56,21 @@
     r"SPDX" "-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
 )
 _COPYRIGHT_PATTERNS = [
-    re.compile(r"(SPDX" "-FileCopyrightText:[ \t]+.*?)" + _END_PATTERN),
-    re.compile(r"(Copyright .*?)" + _END_PATTERN),
-    re.compile(r"(© .*?)" + _END_PATTERN),
+    re.compile(
+        r"(?P<copyright>(?P<prefix>SPDX-FileCopyrightText:)\s+"
+        r"((?P<year>\d{4} - \d{4}|\d{4}),?\s+)?"
+        r"(?P<statement>.*)?)" + _END_PATTERN
+    ),
+    re.compile(
+        r"(?P<copyright>(?P<prefix>Copyright(\s\([cC]\))?)\s+"
+        r"((?P<year>\d{4} - \d{4}|\d{4}),?\s+)?"
+        r"(?P<statement>.*)?)" + _END_PATTERN
+    ),
+    re.compile(
+        r"(?P<copyright>(?P<prefix>©)\s+"
+        r"((?P<year>\d{4} - \d{4}|\d{4}),?\s+)?"
+        r"(?P<statement>.*)?)" + _END_PATTERN
+    ),
 ]
 
 _COPYRIGHT_STYLES = {
@@ -175,12 +192,76 @@ def _copyright_from_dep5(path: PathLike, dep5_copyright: Copyright) -> SpdxInfo:
     )
 
 
+def _parse_copyright_year(year: Optional[str]) -> list:
+    """Parse a copyright year or year range and return it as a list of ints.
+
+    :param year: the year part of a copyright line, either a single four-digit
+        year (``"2018"``) or a range (``"2016 - 2018"``). May be ``None`` or
+        empty when the copyright line carries no year.
+    :return: ``[year]`` for a single year, ``[first, last]`` for a range, and
+        an empty list when there is no year or it is in neither format.
+    """
+    # An unmatched optional regex group yields None; handing that to re.match
+    # would raise TypeError, so bail out before any matching is attempted.
+    if not year:
+        return []
+    if re.match(r"\d{4}$", year):
+        return [int(year)]
+    if re.match(r"\d{4} - \d{4}$", year):
+        return [int(year[:4]), int(year[-4:])]
+    # Unrecognised format: return "no years" instead of leaving the result
+    # unbound.
+    return []
+
+
+def merge_copyright_lines(copyright_lines: Set[str]) -> List[str]:
+    """Merge copyright lines that share a statement into one line each.
+
+    Every line is parsed with the first pattern of ``_COPYRIGHT_PATTERNS`` that
+    matches it. Lines with identical statements are combined: their years are
+    collapsed into a single year or a ``<min> - <max>`` range. If the same
+    statement uses multiple prefixes, only the most frequent one is used.
+
+    :param copyright_lines: the copyright lines to merge.
+    :return: a list holding one copyright line per distinct statement.
+    """
+    # Group the parsed (prefix, years) pairs by statement.
+    by_statement = {}
+    for line in copyright_lines:
+        for pattern in _COPYRIGHT_PATTERNS:
+            match = pattern.search(line)
+            if match is None:
+                continue
+            fields = match.groupdict()
+            # The year group is optional and therefore None when absent;
+            # normalise it to "" so the year parser only ever sees a string.
+            years = _parse_copyright_year(fields["year"] or "")
+            by_statement.setdefault(fields["statement"], []).append(
+                (fields["prefix"], years)
+            )
+            # Stop at the first matching pattern, as extract_spdx_info does.
+            # Otherwise a line that several patterns can match (e.g. an SPDX
+            # tag whose statement starts with "©") is recorded several times.
+            break
+
+    copyright_out = []
+    for statement, entries in by_statement.items():
+        prefixes = [prefix for prefix, _ in entries]
+
+        # Get the style of the most common prefix, falling back to "spdx" when
+        # the prefix is not one of the known styles.
+        prefix = max(prefixes, key=prefixes.count)
+        style = "spdx"
+        for name, style_prefix in _COPYRIGHT_STYLES.items():
+            if prefix == style_prefix:
+                style = name
+                break
+
+        # Get the year range, if any.
+        years = [year for _, parsed in entries for year in parsed]
+        if not years:
+            year = None
+        elif min(years) == max(years):
+            year = min(years)
+        else:
+            year = f"{min(years)} - {max(years)}"
+
+        copyright_out.append(make_copyright_line(statement, year, style))
+    return copyright_out
+
+
 def extract_spdx_info(text: str) -> SpdxInfo:
     """Extract SPDX information from comments in a string.
 
     :raises ExpressionError: if an SPDX expression could not be parsed
     :raises ParseError: if an SPDX expression could not be parsed
     """
+    text = filter_ignore_block(text)
     expression_matches = set(map(str.strip, _IDENTIFIER_PATTERN.findall(text)))
     expressions = set()
     copyright_matches = set()
@@ -198,12 +279,36 @@ def extract_spdx_info(text: str) -> SpdxInfo:
         for pattern in _COPYRIGHT_PATTERNS:
             match = pattern.search(line)
             if match is not None:
-                copyright_matches.add(match.groups()[0])
+                copyright_matches.add(match.groupdict()["copyright"])
                 break
 
     return SpdxInfo(expressions, copyright_matches)
 
 
+def filter_ignore_block(text: str) -> str:
+    """Remove every block delimited by REUSE_IGNORE_START and REUSE_IGNORE_END
+    so that its contents are not treated as copyright and licensing
+    information.
+
+    A start marker without a later end marker drops everything from that start
+    marker to the end of the text. An end marker that is not preceded by a
+    start marker is left in place.
+
+    :param text: the text to filter.
+    :return: *text* with all ignore blocks, markers included, removed.
+    """
+    kept = []
+    position = 0
+    while True:
+        # Compare against -1 rather than testing truthiness: a start marker at
+        # index 0 is a valid hit and must not be mistaken for "not found".
+        start = text.find(REUSE_IGNORE_START, position)
+        if start == -1:
+            kept.append(text[position:])
+            break
+        kept.append(text[position:start])
+        # Only an end marker after the start marker closes the block.
+        end = text.find(REUSE_IGNORE_END, start + len(REUSE_IGNORE_START))
+        if end == -1:
+            break
+        position = end + len(REUSE_IGNORE_END)
+    return "".join(kept)
+
+
 def contains_spdx_info(text: str) -> bool:
     """The text contains SPDX info."""
     try:
-Original file line number
+Diff line change
@@ Expand Up / @@ -50,6 +50,8 @@ Contributors @@
     - Nico Rikken <nico.rikken@fsfe.org>
+    - Florian Snow <florian@familysnow.net>
     Translators
     -----------
@@ Expand Down @@