From f43167b41a32729d50f83cb29ce904ec3b9812e0 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jonathan.manning@seqera.io>
Date: Tue, 13 Feb 2024 10:20:20 +0000
Subject: [PATCH] Umitools prepareforrsem (#4897)

* Add umitools/prepareforrsem

* Remove trailing newlines

* Fix software env

* appease eclint

* fix linting

* fix comment

* Don't snapshot python version

* Readd snapshot file

* Apply suggestions from code review

Co-authored-by: Maxime U Garcia <max.u.garcia@gmail.com>

---------

Co-authored-by: Maxime U Garcia <max.u.garcia@gmail.com>
---
 .../umitools/prepareforrsem/environment.yml   |   7 +
 .../nf-core/umitools/prepareforrsem/main.nf   |  34 ++
 .../nf-core/umitools/prepareforrsem/meta.yml  |  53 +++
 .../templates/prepare-for-rsem.py             | 313 ++++++++++++++++++
 .../prepareforrsem/tests/main.nf.test         |  62 ++++
 .../prepareforrsem/tests/main.nf.test.snap    |  62 ++++
 .../umitools/prepareforrsem/tests/tags.yml    |   2 +
 7 files changed, 533 insertions(+)
 create mode 100644 modules/nf-core/umitools/prepareforrsem/environment.yml
 create mode 100644 modules/nf-core/umitools/prepareforrsem/main.nf
 create mode 100644 modules/nf-core/umitools/prepareforrsem/meta.yml
 create mode 100755 modules/nf-core/umitools/prepareforrsem/templates/prepare-for-rsem.py
 create mode 100644 modules/nf-core/umitools/prepareforrsem/tests/main.nf.test
 create mode 100644 modules/nf-core/umitools/prepareforrsem/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/umitools/prepareforrsem/tests/tags.yml

diff --git a/modules/nf-core/umitools/prepareforrsem/environment.yml b/modules/nf-core/umitools/prepareforrsem/environment.yml
new file mode 100644
index 000000000000..a6cc37774d43
--- /dev/null
+++ b/modules/nf-core/umitools/prepareforrsem/environment.yml
@@ -0,0 +1,7 @@
+name: umitools_prepareforrsem
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::umi_tools=1.1.4
diff --git a/modules/nf-core/umitools/prepareforrsem/main.nf b/modules/nf-core/umitools/prepareforrsem/main.nf
new file mode 100644
index 000000000000..b556474b8249
--- /dev/null
+++ b/modules/nf-core/umitools/prepareforrsem/main.nf
@@ -0,0 +1,34 @@
+process UMITOOLS_PREPAREFORRSEM {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' :
+        'biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }"
+
+    input:
+    tuple val(meta), path(bam), path(bai)
+
+    output:
+    tuple val(meta), path('*.bam'), emit: bam
+    tuple val(meta), path('*.log'), emit: log
+    path  "versions.yml"          , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    template 'prepare-for-rsem.py'
+
+    stub:
+    """
+    touch ${meta.id}.bam
+    touch ${meta.id}.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        umitools: \$( umi_tools --version | sed '/version:/!d; s/.*: //' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/umitools/prepareforrsem/meta.yml b/modules/nf-core/umitools/prepareforrsem/meta.yml
new file mode 100644
index 000000000000..8b85366cc4ad
--- /dev/null
+++ b/modules/nf-core/umitools/prepareforrsem/meta.yml
@@ -0,0 +1,53 @@
+name: umitools_prepareforrsem
+description: Make the output from umi_tools dedup or group compatible with RSEM
+keywords:
+  - umitools
+  - rsem
+  - salmon
+  - dedup
+tools:
+  - umi_tools:
+      description: >
+        UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) and single cell RNA-Seq cell barcodes
+      documentation: https://umi-tools.readthedocs.io/en/latest/
+      license: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: |
+        BAM file containing reads to be deduplicated via UMIs.
+      pattern: "*.{bam}"
+  - bai:
+      type: file
+      description: |
+        BAM index files corresponding to the input BAM file.
+      pattern: "*.{bai}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Prepared BAM file.
+      pattern: "*.{bam}"
+  - log:
+      type: file
+      description: File with logging information
+      pattern: "*.{log}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@pinin4fjords"
+maintainers:
+  - "@drpatelh"
+  - "@pinin4fjords"
diff --git a/modules/nf-core/umitools/prepareforrsem/templates/prepare-for-rsem.py b/modules/nf-core/umitools/prepareforrsem/templates/prepare-for-rsem.py
new file mode 100755
index 000000000000..c1c2fc2b8186
--- /dev/null
+++ b/modules/nf-core/umitools/prepareforrsem/templates/prepare-for-rsem.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+
+"""
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Credits
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This script is a clone of the "prepare-for-rsem.py" script written by
+Ian Sudbury, Tom Smith and other contributors to the UMI-tools package:
+/~https://github.com/CGATOxford/UMI-tools
+
+It has been included here to address problems encountered with
+Salmon quant and RSEM as discussed in the issue below:
+/~https://github.com/CGATOxford/UMI-tools/issues/465
+
+When the "umi_tools prepare-for-rsem" command becomes available in an official
+UMI-tools release this template script will be replaced with the native
+umi_tools command.
+
+Commit:
+/~https://github.com/CGATOxford/UMI-tools/blob/bf8608d6a172c5ca0dcf33c126b4e23429177a72/umi_tools/prepare-for-rsem.py
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+prepare_for_rsem - make the output from dedup or group compatible with RSEM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The SAM format specification states that the mnext and mpos fields should point
+to the primary alignment of a read's mate. However, not all aligners adhere to
+this standard. In addition, the RSEM software requires that the mate of a read1
+appears directly after it in its input BAM. This requires that there is exactly
+one read1 alignment for every read2 and vice versa.
+
+In general (except in a few edge cases) UMI tools outputs only the read2 to that
+corresponds to the read specified in the mnext and mpos positions of a selected
+read1, and only outputs this read once, even if multiple read1s point to it.
+This makes UMI-tools outputs incompatible with RSEM. This script takes the output
+from dedup or groups and ensures that each read1 has exactly one read2 (and vice
+versa), that read2 always appears directly after read1,and that pairs point to
+each other (note this is technically not valid SAM format). Copy any specified
+tags from read1 to read2 if they are present (by default, UG and BX, the unique
+group and correct UMI tags added by _group_)
+
+Input must to name sorted.
+
+
+https://raw.githubusercontent.com/CGATOxford/UMI-tools/master/LICENSE
+
+"""
+
+from umi_tools import Utilities as U
+from collections import defaultdict, Counter
+import platform
+import pysam
+import shlex
+import sys
+
+usage = """
+prepare_for_rsem - make output from dedup or group compatible with RSEM
+
+Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
+
+    note: If --stdout is omited, standard out is output. To
+    generate a valid BAM file on standard out, please
+        redirect log with --log=LOGFILE or --log2stderr """
+
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+    """Formats a dictionary to a YAML-like string.
+
+    Args:
+        data (dict): The dictionary to format.
+        indent (int): The current indentation level.
+
+    Returns:
+        str: A string formatted as YAML.
+    """
+    yaml_str = ""
+    for key, value in data.items():
+        spaces = "  " * indent
+        if isinstance(value, dict):
+            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+        else:
+            yaml_str += f"{spaces}{key}: {value}\\n"
+    return yaml_str
+
+def chunk_bam(bamfile):
+    """Take in a iterator of pysam.AlignmentSegment entries and yield
+    lists of reads that all share the same name"""
+
+    last_query_name = None
+    output_buffer = list()
+
+    for read in bamfile:
+        if last_query_name is not None and last_query_name != read.query_name:
+            yield (output_buffer)
+            output_buffer = list()
+
+        last_query_name = read.query_name
+        output_buffer.append(read)
+
+    yield (output_buffer)
+
+
+def copy_tags(tags, read1, read2):
+    """Given a  list of tags, copies the values of these tags from read1
+    to read2, if the tag is set"""
+
+    for tag in tags:
+        try:
+            read1_tag = read1.get_tag(tag, with_value_type=True)
+            read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1])
+        except KeyError:
+            pass
+
+    return read2
+
+
+def pick_mate(read, template_dict, mate_key):
+    """Find the mate of read in the template dict using key. It will retrieve
+    all reads at that key, and then scan to pick the one that refers to _read_
+    as it's mate. If there is no such read, it picks a first one it comes to"""
+
+    mate = None
+
+    # get a list of secondary reads at the correct alignment position
+    potential_mates = template_dict[not read.is_read1][mate_key]
+
+    # search through one at a time to find a read that points to the current read
+    # as its mate.
+    for candidate_mate in potential_mates:
+        if (
+            candidate_mate.next_reference_name == read.reference_name
+            and candidate_mate.next_reference_start == read.pos
+        ):
+            mate = candidate_mate
+
+    # if no such read is found, then pick any old secondary alignment at that position
+    # note: this happens when UMI-tools outputs the wrong read as something's pair.
+    if mate is None and len(potential_mates) > 0:
+        mate = potential_mates[0]
+
+    return mate
+
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv
+
+    # setup command line parser
+    parser = U.OptionParser(version="%prog version: \$Id\$", usage=usage, description=globals()["__doc__"])
+    group = U.OptionGroup(parser, "RSEM preparation specific options")
+
+    group.add_option(
+        "--tags",
+        dest="tags",
+        type="string",
+        default="UG,BX",
+        help="Comma-separated list of tags to transfer from read1 to read2",
+    )
+    group.add_option(
+        "--sam", dest="sam", action="store_true", default=False, help="input and output SAM rather than BAM"
+    )
+
+    parser.add_option_group(group)
+
+    # add common options (-h/--help, ...) and parse command line
+    (options, args) = U.Start(
+        parser, argv=argv, add_group_dedup_options=False, add_umi_grouping_options=False, add_sam_options=False
+    )
+
+    skipped_stats = Counter()
+
+    if options.stdin != sys.stdin:
+        in_name = options.stdin.name
+        options.stdin.close()
+    else:
+        in_name = "-"
+
+    if options.sam:
+        mode = ""
+    else:
+        mode = "b"
+
+    inbam = pysam.AlignmentFile(in_name, "r" + mode)
+
+    if options.stdout != sys.stdout:
+        out_name = options.stdout.name
+        options.stdout.close()
+    else:
+        out_name = "-"
+
+    outbam = pysam.AlignmentFile(out_name, "w" + mode, template=inbam)
+
+    options.tags = options.tags.split(",")
+
+    for template in chunk_bam(inbam):
+        assert len(set(r.query_name for r in template)) == 1
+        current_template = {True: defaultdict(list), False: defaultdict(list)}
+
+        for read in template:
+            key = (read.reference_name, read.pos, not read.is_secondary)
+            current_template[read.is_read1][key].append(read)
+
+        output = set()
+
+        for read in template:
+            mate = None
+
+            # if this read is a non_primary alignment, we first want to check if it has a mate
+            # with the non-primary alignment flag set.
+
+            mate_key_primary = True
+            mate_key_secondary = (read.next_reference_name, read.next_reference_start, False)
+
+            # First look for a read that has the same primary/secondary status
+            # as read (i.e. secondary mate for secondary read, and primary mate
+            # for primary read)
+            mate_key = (read.next_reference_name, read.next_reference_start, read.is_secondary)
+            mate = pick_mate(read, current_template, mate_key)
+
+            # If none was found then look for the opposite (primary mate of secondary
+            # read or seconadary mate of primary read)
+            if mate is None:
+                mate_key = (read.next_reference_name, read.next_reference_start, not read.is_secondary)
+                mate = pick_mate(read, current_template, mate_key)
+
+            # If we still don't have a mate, then their can't be one?
+            if mate is None:
+                skipped_stats["no_mate"] += 1
+                U.warn(
+                    "Alignment {} has no mate -- skipped".format(
+                        "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
+                    )
+                )
+                continue
+
+            # because we might want to make changes to the read, but not have those changes reflected
+            # if we need the read again,we copy the read. This is only way I can find to do this.
+            read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header)
+            mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header)
+
+            # Make it so that if our read is secondary, the mate is also secondary. We don't make the
+            # mate primary if the read is primary because we would otherwise end up with mulitple
+            # primary alignments.
+            if read.is_secondary:
+                mate.is_secondary = True
+
+            # In a situation where there is already one mate for each read, then we will come across
+            # each pair twice - once when we scan read1 and once when we scan read2. Thus we need
+            # to make sure we don't output something already output.
+            if read.is_read1:
+                mate = copy_tags(options.tags, read, mate)
+                output_key = str(read) + str(mate)
+
+                if output_key not in output:
+                    output.add(output_key)
+                    outbam.write(read)
+                    outbam.write(mate)
+                    skipped_stats["pairs_output"] += 1
+
+            elif read.is_read2:
+                read = copy_tags(options.tags, mate, read)
+                output_key = str(mate) + str(read)
+
+                if output_key not in output:
+                    output.add(output_key)
+                    outbam.write(mate)
+                    outbam.write(read)
+                    skipped_stats["pairs_output"] += 1
+
+            else:
+                skipped_stats["skipped_not_read12"] += 1
+                U.warn(
+                    "Alignment {} is neither read1 nor read2 -- skipped".format(
+                        "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
+                    )
+                )
+                continue
+
+    if not out_name == "-":
+        outbam.close()
+
+    U.info(
+        "Total pairs output: {}, Pairs skipped - no mates: {},"
+        " Pairs skipped - not read1 or 2: {}".format(
+            skipped_stats["pairs_output"], skipped_stats["no_mate"], skipped_stats["skipped_not_read12"]
+        )
+    )
+    U.Stop()
+
+
+if __name__ == "__main__":
+    if '${task.ext.prefix}' != "null":
+        prefix = "${task.ext.prefix}."
+    elif '$meta.id' != "null":
+        prefix = '${meta.id}.'
+    else:
+        prefix = ''
+
+    command_line = f"--stdin=$bam --stdout={prefix}bam --log={prefix}prepare_for_rsem.log $task.ext.args"
+
+    # Convert the string into a list mimicking sys.argv
+    argv = ["script_name"] + shlex.split(command_line)
+
+    # Run the script and record error code
+    exit_code = main(argv)
+
+    # Write the versions
+    versions_this_module = {}
+    versions_this_module["${task.process}"] = {"umitools": U.__version__}
+    with open("versions.yml", "w") as f:
+        f.write(format_yaml_like(versions_this_module))
+
+    # Exit with correct error code
+    sys.exit(exit_code)
diff --git a/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test b/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test
new file mode 100644
index 000000000000..d78158baeacb
--- /dev/null
+++ b/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test
@@ -0,0 +1,62 @@
+nextflow_process {
+
+    name "Test Process UMITOOLS_PREPAREFORRSEM"
+    script "../main.nf"
+    process "UMITOOLS_PREPAREFORRSEM"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "umitools"
+    tag "umitools/prepareforrsem"
+
+    test("sarscov2 - bam") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
+                    file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.bam).match('bam_sarscov2_bam') },
+                { assert snapshot(process.out.versions).match('versions_sarscov2_bam') }
+            )
+        }
+
+    }
+
+    test("sarscov2 - bam - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
+                    file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.bam).match('bam_sarscov2_bam - stub') },
+                { assert snapshot(process.out.versions).match('versions_sarscov2_bam - stub') }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test.snap b/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test.snap
new file mode 100644
index 000000000000..5e8be480585e
--- /dev/null
+++ b/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test.snap
@@ -0,0 +1,62 @@
+{
+    "versions_sarscov2_bam": {
+        "content": [
+            [
+                "versions.yml:md5,c0e2a933c2844e29e894e7d02c0e9a8b"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-02-12T17:33:02.077139331"
+    },
+    "bam_sarscov2_bam": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test",
+                        "single_end": false
+                    },
+                    "test.bam:md5,fe4b8302182615651e4e7784ec67c819"
+                ]
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-02-12T17:33:02.066537672"
+    },
+    "bam_sarscov2_bam - stub": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test",
+                        "single_end": false
+                    },
+                    "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e"
+                ]
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-02-12T17:33:39.36279218"
+    },
+    "versions_sarscov2_bam - stub": {
+        "content": [
+            [
+                "versions.yml:md5,780af62fd633c145b279660eb9c3bd2b"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-02-12T17:33:39.3706508"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/umitools/prepareforrsem/tests/tags.yml b/modules/nf-core/umitools/prepareforrsem/tests/tags.yml
new file mode 100644
index 000000000000..804c113dd5a1
--- /dev/null
+++ b/modules/nf-core/umitools/prepareforrsem/tests/tags.yml
@@ -0,0 +1,2 @@
+umitools/prepareforrsem:
+  - "modules/nf-core/umitools/prepareforrsem/**"