From f43167b41a32729d50f83cb29ce904ec3b9812e0 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 13 Feb 2024 10:20:20 +0000 Subject: [PATCH] Umitools prepareforrsem (#4897) * Add umitools/prepareforrsem * Remove trailing newlines * Fix software env * appease eclint * fix linting * fix comment * Don't snapshot python version * Readd snapshot file * Apply suggestions from code review Co-authored-by: Maxime U Garcia --------- Co-authored-by: Maxime U Garcia --- .../umitools/prepareforrsem/environment.yml | 7 + .../nf-core/umitools/prepareforrsem/main.nf | 34 ++ .../nf-core/umitools/prepareforrsem/meta.yml | 53 +++ .../templates/prepare-for-rsem.py | 313 ++++++++++++++++++ .../prepareforrsem/tests/main.nf.test | 62 ++++ .../prepareforrsem/tests/main.nf.test.snap | 62 ++++ .../umitools/prepareforrsem/tests/tags.yml | 2 + 7 files changed, 533 insertions(+) create mode 100644 modules/nf-core/umitools/prepareforrsem/environment.yml create mode 100644 modules/nf-core/umitools/prepareforrsem/main.nf create mode 100644 modules/nf-core/umitools/prepareforrsem/meta.yml create mode 100755 modules/nf-core/umitools/prepareforrsem/templates/prepare-for-rsem.py create mode 100644 modules/nf-core/umitools/prepareforrsem/tests/main.nf.test create mode 100644 modules/nf-core/umitools/prepareforrsem/tests/main.nf.test.snap create mode 100644 modules/nf-core/umitools/prepareforrsem/tests/tags.yml diff --git a/modules/nf-core/umitools/prepareforrsem/environment.yml b/modules/nf-core/umitools/prepareforrsem/environment.yml new file mode 100644 index 000000000000..a6cc37774d43 --- /dev/null +++ b/modules/nf-core/umitools/prepareforrsem/environment.yml @@ -0,0 +1,7 @@ +name: umitools_prepareforrsem +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::umi_tools=1.1.4 diff --git a/modules/nf-core/umitools/prepareforrsem/main.nf b/modules/nf-core/umitools/prepareforrsem/main.nf new file mode 100644 index 000000000000..b556474b8249 --- /dev/null +++ b/modules/nf-core/umitools/prepareforrsem/main.nf @@ -0,0 +1,34 @@ +process UMITOOLS_PREPAREFORRSEM { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : + 'biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path('*.bam'), emit: bam + tuple val(meta), path('*.log'), emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'prepare-for-rsem.py' + + stub: + """ + touch ${meta.id}.bam + touch ${meta.id}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umitools: \$( umi_tools --version | sed '/version:/!d; s/.*: //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/umitools/prepareforrsem/meta.yml b/modules/nf-core/umitools/prepareforrsem/meta.yml new file mode 100644 index 000000000000..8b85366cc4ad --- /dev/null +++ b/modules/nf-core/umitools/prepareforrsem/meta.yml @@ -0,0 +1,53 @@ +name: umitools_prepareforrsem +description: Make the output from umi_tools dedup or group compatible with RSEM +keywords: + - umitools + - rsem + - salmon + - dedup +tools: + - umi_tools: + description: > + UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) and single cell RNA-Seq cell barcodes + documentation: https://umi-tools.readthedocs.io/en/latest/ + license: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + BAM file containing reads to be deduplicated via UMIs. + pattern: "*.{bam}" + - bai: + type: file + description: | + BAM index files corresponding to the input BAM file. + pattern: "*.{bai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Prepared BAM file. + pattern: "*.{bam}" + - log: + type: file + description: File with logging information + pattern: "*.{log}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@pinin4fjords" +maintainers: + - "@drpatelh" + - "@pinin4fjords" diff --git a/modules/nf-core/umitools/prepareforrsem/templates/prepare-for-rsem.py b/modules/nf-core/umitools/prepareforrsem/templates/prepare-for-rsem.py new file mode 100755 index 000000000000..c1c2fc2b8186 --- /dev/null +++ b/modules/nf-core/umitools/prepareforrsem/templates/prepare-for-rsem.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 + +""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Credits +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This script is a clone of the "prepare-for-rsem.py" script written by +Ian Sudbury, Tom Smith and other contributors to the UMI-tools package: +/~https://github.com/CGATOxford/UMI-tools + +It has been included here to address problems encountered with +Salmon quant and RSEM as discussed in the issue below: +/~https://github.com/CGATOxford/UMI-tools/issues/465 + +When the "umi_tools prepare-for-rsem" command becomes available in an official +UMI-tools release this template script will be replaced with the native +umi_tools command. + +Commit: +/~https://github.com/CGATOxford/UMI-tools/blob/bf8608d6a172c5ca0dcf33c126b4e23429177a72/umi_tools/prepare-for-rsem.py + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +prepare_for_rsem - make the output from dedup or group compatible with RSEM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The SAM format specification states that the mnext and mpos fields should point +to the primary alignment of a read's mate. However, not all aligners adhere to +this standard. In addition, the RSEM software requires that the mate of a read1 +appears directly after it in its input BAM. This requires that there is exactly +one read1 alignment for every read2 and vice versa. + +In general (except in a few edge cases) UMI tools outputs only the read2 to that +corresponds to the read specified in the mnext and mpos positions of a selected +read1, and only outputs this read once, even if multiple read1s point to it. +This makes UMI-tools outputs incompatible with RSEM. This script takes the output +from dedup or groups and ensures that each read1 has exactly one read2 (and vice +versa), that read2 always appears directly after read1,and that pairs point to +each other (note this is technically not valid SAM format). Copy any specified +tags from read1 to read2 if they are present (by default, UG and BX, the unique +group and correct UMI tags added by _group_) + +Input must to name sorted. + + +https://raw.githubusercontent.com/CGATOxford/UMI-tools/master/LICENSE + +""" + +from umi_tools import Utilities as U +from collections import defaultdict, Counter +import platform +import pysam +import shlex +import sys + +usage = """ +prepare_for_rsem - make output from dedup or group compatible with RSEM + +Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is omited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr """ + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +def chunk_bam(bamfile): + """Take in a iterator of pysam.AlignmentSegment entries and yield + lists of reads that all share the same name""" + + last_query_name = None + output_buffer = list() + + for read in bamfile: + if last_query_name is not None and last_query_name != read.query_name: + yield (output_buffer) + output_buffer = list() + + last_query_name = read.query_name + output_buffer.append(read) + + yield (output_buffer) + + +def copy_tags(tags, read1, read2): + """Given a list of tags, copies the values of these tags from read1 + to read2, if the tag is set""" + + for tag in tags: + try: + read1_tag = read1.get_tag(tag, with_value_type=True) + read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1]) + except KeyError: + pass + + return read2 + + +def pick_mate(read, template_dict, mate_key): + """Find the mate of read in the template dict using key. It will retrieve + all reads at that key, and then scan to pick the one that refers to _read_ + as it's mate. If there is no such read, it picks a first one it comes to""" + + mate = None + + # get a list of secondary reads at the correct alignment position + potential_mates = template_dict[not read.is_read1][mate_key] + + # search through one at a time to find a read that points to the current read + # as its mate. + for candidate_mate in potential_mates: + if ( + candidate_mate.next_reference_name == read.reference_name + and candidate_mate.next_reference_start == read.pos + ): + mate = candidate_mate + + # if no such read is found, then pick any old secondary alignment at that position + # note: this happens when UMI-tools outputs the wrong read as something's pair. + if mate is None and len(potential_mates) > 0: + mate = potential_mates[0] + + return mate + + +def main(argv=None): + if argv is None: + argv = sys.argv + + # setup command line parser + parser = U.OptionParser(version="%prog version: \$Id\$", usage=usage, description=globals()["__doc__"]) + group = U.OptionGroup(parser, "RSEM preparation specific options") + + group.add_option( + "--tags", + dest="tags", + type="string", + default="UG,BX", + help="Comma-separated list of tags to transfer from read1 to read2", + ) + group.add_option( + "--sam", dest="sam", action="store_true", default=False, help="input and output SAM rather than BAM" + ) + + parser.add_option_group(group) + + # add common options (-h/--help, ...) and parse command line + (options, args) = U.Start( + parser, argv=argv, add_group_dedup_options=False, add_umi_grouping_options=False, add_sam_options=False + ) + + skipped_stats = Counter() + + if options.stdin != sys.stdin: + in_name = options.stdin.name + options.stdin.close() + else: + in_name = "-" + + if options.sam: + mode = "" + else: + mode = "b" + + inbam = pysam.AlignmentFile(in_name, "r" + mode) + + if options.stdout != sys.stdout: + out_name = options.stdout.name + options.stdout.close() + else: + out_name = "-" + + outbam = pysam.AlignmentFile(out_name, "w" + mode, template=inbam) + + options.tags = options.tags.split(",") + + for template in chunk_bam(inbam): + assert len(set(r.query_name for r in template)) == 1 + current_template = {True: defaultdict(list), False: defaultdict(list)} + + for read in template: + key = (read.reference_name, read.pos, not read.is_secondary) + current_template[read.is_read1][key].append(read) + + output = set() + + for read in template: + mate = None + + # if this read is a non_primary alignment, we first want to check if it has a mate + # with the non-primary alignment flag set. + + mate_key_primary = True + mate_key_secondary = (read.next_reference_name, read.next_reference_start, False) + + # First look for a read that has the same primary/secondary status + # as read (i.e. secondary mate for secondary read, and primary mate + # for primary read) + mate_key = (read.next_reference_name, read.next_reference_start, read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If none was found then look for the opposite (primary mate of secondary + # read or seconadary mate of primary read) + if mate is None: + mate_key = (read.next_reference_name, read.next_reference_start, not read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If we still don't have a mate, then their can't be one? + if mate is None: + skipped_stats["no_mate"] += 1 + U.warn( + "Alignment {} has no mate -- skipped".format( + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) + ) + ) + continue + + # because we might want to make changes to the read, but not have those changes reflected + # if we need the read again,we copy the read. This is only way I can find to do this. + read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header) + mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header) + + # Make it so that if our read is secondary, the mate is also secondary. We don't make the + # mate primary if the read is primary because we would otherwise end up with mulitple + # primary alignments. + if read.is_secondary: + mate.is_secondary = True + + # In a situation where there is already one mate for each read, then we will come across + # each pair twice - once when we scan read1 and once when we scan read2. Thus we need + # to make sure we don't output something already output. + if read.is_read1: + mate = copy_tags(options.tags, read, mate) + output_key = str(read) + str(mate) + + if output_key not in output: + output.add(output_key) + outbam.write(read) + outbam.write(mate) + skipped_stats["pairs_output"] += 1 + + elif read.is_read2: + read = copy_tags(options.tags, mate, read) + output_key = str(mate) + str(read) + + if output_key not in output: + output.add(output_key) + outbam.write(mate) + outbam.write(read) + skipped_stats["pairs_output"] += 1 + + else: + skipped_stats["skipped_not_read12"] += 1 + U.warn( + "Alignment {} is neither read1 nor read2 -- skipped".format( + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) + ) + ) + continue + + if not out_name == "-": + outbam.close() + + U.info( + "Total pairs output: {}, Pairs skipped - no mates: {}," + " Pairs skipped - not read1 or 2: {}".format( + skipped_stats["pairs_output"], skipped_stats["no_mate"], skipped_stats["skipped_not_read12"] + ) + ) + U.Stop() + + +if __name__ == "__main__": + if '${task.ext.prefix}' != "null": + prefix = "${task.ext.prefix}." + elif '$meta.id' != "null": + prefix = '${meta.id}.' + else: + prefix = '' + + command_line = f"--stdin=$bam --stdout={prefix}bam --log={prefix}prepare_for_rsem.log $task.ext.args" + + # Convert the string into a list mimicking sys.argv + argv = ["script_name"] + shlex.split(command_line) + + # Run the script and record error code + exit_code = main(argv) + + # Write the versions + versions_this_module = {} + versions_this_module["${task.process}"] = {"umitools": U.__version__} + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions_this_module)) + + # Exit with correct error code + sys.exit(exit_code) diff --git a/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test b/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test new file mode 100644 index 000000000000..d78158baeacb --- /dev/null +++ b/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process UMITOOLS_PREPAREFORRSEM" + script "../main.nf" + process "UMITOOLS_PREPAREFORRSEM" + + tag "modules" + tag "modules_nfcore" + tag "umitools" + tag "umitools/prepareforrsem" + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.bam).match('bam_sarscov2_bam') }, + { assert snapshot(process.out.versions).match('versions_sarscov2_bam') } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.bam).match('bam_sarscov2_bam - stub') }, + { assert snapshot(process.out.versions).match('versions_sarscov2_bam - stub') } + ) + } + + } + +} diff --git a/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test.snap b/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test.snap new file mode 100644 index 000000000000..5e8be480585e --- /dev/null +++ b/modules/nf-core/umitools/prepareforrsem/tests/main.nf.test.snap @@ -0,0 +1,62 @@ +{ + "versions_sarscov2_bam": { + "content": [ + [ + "versions.yml:md5,c0e2a933c2844e29e894e7d02c0e9a8b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-12T17:33:02.077139331" + }, + "bam_sarscov2_bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,fe4b8302182615651e4e7784ec67c819" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-12T17:33:02.066537672" + }, + "bam_sarscov2_bam - stub": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-12T17:33:39.36279218" + }, + "versions_sarscov2_bam - stub": { + "content": [ + [ + "versions.yml:md5,780af62fd633c145b279660eb9c3bd2b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-12T17:33:39.3706508" + } +} \ No newline at end of file diff --git a/modules/nf-core/umitools/prepareforrsem/tests/tags.yml b/modules/nf-core/umitools/prepareforrsem/tests/tags.yml new file mode 100644 index 000000000000..804c113dd5a1 --- /dev/null +++ b/modules/nf-core/umitools/prepareforrsem/tests/tags.yml @@ -0,0 +1,2 @@ +umitools/prepareforrsem: + - "modules/nf-core/umitools/prepareforrsem/**"