Skip to content

Commit

Permalink
fix: properly template-coordinate sort and duplicate mark secondary a…
Browse files Browse the repository at this point in the history
…nd supplementary reads

Secondary and supplementary reads must use the coordinates of the
primary alignments within the template, otherwise they will not
guaranteed to be next the primary alignments in the file.  Therefore,
we've added the "rp" and "mp" tags to store the SA-tag equivalent
information for the primary alignment.  This keeps information about the
primary alignments with the secondary and supplementary alignments.
  • Loading branch information
nh13 committed Jan 31, 2024
1 parent 73a6b67 commit 1b5753c
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 36 deletions.
49 changes: 45 additions & 4 deletions src/main/scala/com/fulcrumgenomics/bam/Bams.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.alignment.Cigar
import com.fulcrumgenomics.bam.api.SamOrder.Queryname
import com.fulcrumgenomics.bam.api._
import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator}
Expand All @@ -41,6 +42,36 @@ import htsjdk.samtools.util.{CloserUtil, CoordMath, Murmur3, SequenceUtil}
import java.io.Closeable
import scala.math.{max, min}



case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) {
def negativeStrand: Boolean = !positiveStrand
def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex

def end: Int = start + cigar.lengthOnTarget - 1

Check warning on line 51 in src/main/scala/com/fulcrumgenomics/bam/Bams.scala

View check run for this annotation

Codecov / codecov/patch

src/main/scala/com/fulcrumgenomics/bam/Bams.scala#L51

Added line #L51 was not covered by tests
def unclippedStart: Int = {
SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar)
}

def unclippedEnd: Int = {
SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar)

Check warning on line 57 in src/main/scala/com/fulcrumgenomics/bam/Bams.scala

View check run for this annotation

Codecov / codecov/patch

src/main/scala/com/fulcrumgenomics/bam/Bams.scala#L57

Added line #L57 was not covered by tests
}
}

object Supplementary {
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
def toString(rec: SamRecord): String = {
val strand = if (rec.positiveStrand) '+' else '-'
f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}"
}


def apply(sa: String): Supplementary = {
val parts = sa.split(",")
Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
}
}

/**
* Class that represents all reads from a template within a BAM file.
*/
Expand Down Expand Up @@ -107,11 +138,21 @@ case class Template(r1: Option[SamRecord],

/** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */
def fixMateInfo(): Unit = {
for (primary <- r1; supp <- r2Supplementals) {
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
// Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to
// add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag!
val r1NonPrimary = r1Supplementals ++ r1Secondaries
val r2NonPrimary = r2Supplementals ++ r2Secondaries
for (primary <- r1; nonPrimary <- r2NonPrimary) {
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
nonPrimary(SAMTag.MQ.name()) = primary.mapq
nonPrimary("mp") = Supplementary.toString(primary)
r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
}
for (primary <- r2; supp <- r1Supplementals) {
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
for (primary <- r2; nonPrimary <- r1NonPrimary) {
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
nonPrimary(SAMTag.MQ.name()) = primary.mapq
nonPrimary("mp") = Supplementary.toString(primary)
r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
}
for (first <- r1; second <- r2) {
SamPairUtil.setMateInfo(first.asSam, second.asSam, true)
Expand Down
65 changes: 47 additions & 18 deletions src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,15 @@

package com.fulcrumgenomics.bam.api

import com.fulcrumgenomics.bam.{Bams, Supplementary}
import com.fulcrumgenomics.umi.ConsensusTags
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.util.Murmur3
import htsjdk.samtools.{SAMFileHeader, SAMUtils}
import org.apache.commons.math3.genetics.RandomKey

import scala.reflect.runtime.universe.Template


/** Trait for specifying BAM orderings. */
sealed trait SamOrder extends Product {
Expand Down Expand Up @@ -175,24 +178,50 @@ object SamOrder {
override val groupOrder: GroupOrder = GroupOrder.query
override val subSort: Option[String] = Some("template-coordinate")
override val sortkey: SamRecord => A = rec => {
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
val readNeg = rec.negativeStrand
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
val index: Int = m.lastIndexOf('/')
if (index >= 0) m.substring(0, index) else m
}.getOrElse("")

if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
}
else {
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
// For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary
// alignments, use the info in the pa/pm tags.
if (!rec.secondary && !rec.supplementary) {
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
val readNeg = rec.negativeStrand
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
val index: Int = m.lastIndexOf('/')
if (index >= 0) m.substring(0, index) else m
}.getOrElse("")

if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
}
else {
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
}
} else {
val primary = Supplementary(rec[String]("rp"))
val mate = Supplementary(rec[String]("mp"))
val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header)
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header)
val readNeg = primary.negativeStrand
val mateNeg = if (rec.paired) mate.negativeStrand else false
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
val index: Int = m.lastIndexOf('/')
if (index >= 0) m.substring(0, index) else m
}.getOrElse("")

if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
}
else {
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,7 @@ class GroupReadsByUmi

// Then output the records in the right order (assigned tag, read name, r1, r2)
templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => {
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => {
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => {
out += rec
})
})
Expand Down
17 changes: 13 additions & 4 deletions src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
package com.fulcrumgenomics.bam.api

import java.util.Random

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.Bams
import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator}
import com.fulcrumgenomics.commons.util.SimpleCounter
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.util.Io
import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator}
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.util.Murmur3
Expand Down Expand Up @@ -228,11 +230,18 @@ class SamOrderTest extends UnitSpec {
s2.supplementary = true
s2.properlyPaired = true
exp += s2
// primary read pairs for q1, that map to different contigs, but earlier that q1
// primary read pairs for q2, that map to different contigs, but earlier that q1
exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S")

val expected = List("q1/2:sup", "q1/1", "q1/2", "q2/1", "q2/2")
val actual = exp.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)
// Fix the mate information. Note: sorting here to get a template-iterator will write the records to disk first,
// so we cannot use the records in builder/exp.
val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template =>
template.fixMateInfo()
template.allReads
}.toList

val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup")
val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)

actual should contain theSameElementsInOrderAs expected
}
Expand Down
33 changes: 24 additions & 9 deletions src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,14 @@
*/
package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.Template
import com.fulcrumgenomics.bam.api.SamOrder
import com.fulcrumgenomics.bam.{Bams, Template}
import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter}
import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate
import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.umi.GroupReadsByUmi._
import com.fulcrumgenomics.util.Io
import org.scalatest.{OptionValues, PrivateMethodTester}

import java.nio.file.Files
Expand Down Expand Up @@ -339,29 +340,43 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
}

it should "mark duplicates on supplementary reads" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.TemplateCoordinate))
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
// primary read pairs for q1, that map to different contigs
builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT"))
// supplementary R2, which maps to the same chromosome as the primary R1
val Seq(s1, s2) = builder.addPair("q1", contig = 1, contig2 = Some(1), start1 = 66, start2 = 66, cigar1 = "60M40S", strand2 = Minus, attrs = Map("RX" -> "ACT"))
s1.supplementary = true
s2.supplementary = true
s2.properlyPaired = true
// primary read pairs for q1, that map to different contigs, but earlier that q1
// primary read pairs for q2, that map to different contigs, but earlier that q1
builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
// primary read pairs for q3, that are duplicates of q2
builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))

// Fix the mate information and write the input. Note: sorting here to get a template-iterator will write the
// records to disk first, so we cannot use the records in builder and therefore must write to the input file
// directly.
val in = Files.createTempFile("raw_reads", ".sam")
val writer = SamWriter(in, builder.header, sort = builder.sort)
Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template =>
template.fixMateInfo()
writer ++= template.allReads
}
writer.close()

val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true, includeSupplementary = Some(true))
val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true)

gr.markDuplicates shouldBe true
gr.execute()

val recs = readBamRecs(out)
recs.length shouldBe 4
recs.filter(_.name.equals("q1")).forall(_.duplicate == true) shouldBe true
recs.length shouldBe 8
recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true
recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true
recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true
}

it should "correctly group reads with the paired assigner when the two UMIs are the same" in {
Expand Down

0 comments on commit 1b5753c

Please sign in to comment.