Skip to content

Commit

Permalink
Update to v0.12.10
Browse files Browse the repository at this point in the history
  • Loading branch information
ctsa committed Feb 24, 2025
1 parent 3f5c3d4 commit 7ea9b2c
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 69 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Change Log

## v0.12.10 - 2025-02-24

### Fixed

- Fix handling of chromosome names with colons, eg. 'HLA-DRB1*10:01:01' (github #11)

## v0.12.9 - 2025-01-10

### Added
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sawfish"
version = "0.12.9"
version = "0.12.10"
authors = ["Chris Saunders <csaunders@pacificbiosciences.com>"]
description = "Structural variant analysis for mapped PacBio HiFi reads"
edition = "2021"
Expand Down
2 changes: 1 addition & 1 deletion LICENSE-THIRDPARTY.json
Original file line number Diff line number Diff line change
Expand Up @@ -1702,7 +1702,7 @@
},
{
"name": "sawfish",
"version": "0.12.9",
"version": "0.12.10",
"authors": "Chris Saunders <csaunders@pacificbiosciences.com>",
"repository": null,
"license": null,
Expand Down
2 changes: 1 addition & 1 deletion docs/user_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ SV haplotype contig alignments are output to `${OUTPUT_DIR}/contig.alignment.bam
this file can be viewed in alignment browsers such as IGV.

Aligned contigs are provided for all single breakpoint SV calls. To find the contig for a given SV, locate the SV's VCF ID field, such as `sawfish:0:2803:1:2`,
and take the prefix from this ID that includes the first three digits, in this case `sawfish:0:2803:1` -- this is `QNAME` value of the corresponding SV
and take the prefix from this ID that includes the first three digits, in this case `sawfish:0:2803:1`. This is the `QNAME` value of the corresponding SV
haplotype alignment(s) in the contig alignment BAM file.

Contigs are not available for multi-breakpoint events such as inversions. However, contigs are available for each individual breakpoint
Expand Down
8 changes: 4 additions & 4 deletions src/genome_regions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use log::info;
use rust_vc_utils::ChromList;
use unwrap::unwrap;

use crate::genome_segment::samtools_region_string_splitter;
use crate::genome_segment::parse_samtools_region_string;

/// A set of chromosome regions which can be efficiently queried
///
Expand Down Expand Up @@ -144,10 +144,10 @@ impl GenomeRegions {
) -> Self {
let mut regions = Self::new(overlaps_allowed);
for target_region in target_regions {
let (_chrom_index, chrom_label, start, end) =
samtools_region_string_splitter(chrom_list, target_region);
let (chrom_index, start, end) = parse_samtools_region_string(chrom_list, target_region);

regions.add_region(&chrom_label, start, end);
let chrom_label = &chrom_list.data[chrom_index].label;
regions.add_region(chrom_label, start, end);
}
regions
}
Expand Down
152 changes: 101 additions & 51 deletions src/genome_segment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ impl GenomeSegment {
///
#[allow(dead_code)]
pub fn from_region_str(chrom_list: &ChromList, str: &str) -> Self {
let (chrom_index, _chrom_label, start, end) =
samtools_region_string_splitter(chrom_list, str);
let (chrom_index, start, end) = parse_samtools_region_string(chrom_list, str);
Self {
chrom_index,
range: IntRange::from_pair(start, end),
Expand Down Expand Up @@ -109,43 +108,58 @@ pub fn get_segment_dir_distance(gs1: &GenomeSegment, gs2: &GenomeSegment) -> Opt
}
}

/// Convert from a string in 'samtools' region format (e.g. chr20:100-200) to a tuple of
/// (chrom_index, chrom_label, start, end)
/// ...where start and end are converted to the zero-indexed half-open convention used for bed
/// Parse the chromosome string out of a samtools-style region string
///
/// Commas will be stripped out of coordinates if present
/// Return the index of the chromosome from the expected chromosome list, and
/// an optional position string following the chromosome name
///
pub fn samtools_region_string_splitter(
fn parse_chrom_index_from_samtools_region_string<'a>(
chrom_list: &ChromList,
str: &str,
) -> (usize, String, i64, i64) {
let s1 = str.split(':').collect::<Vec<_>>();
str: &'a str,
) -> (usize, Option<&'a str>) {
// Note that rsplitn orders words in reverse order compared to how they appear in the string:
let s1 = str.rsplitn(2, ':').collect::<Vec<_>>();
let s1l = s1.len();
assert!(
s1l > 0 && s1l < 3,
"Unexpected format in genome region string {}",
str
"Unexpected format in genome region string '{str}'"
);
let chrom = s1[0].to_string();
let chrom_index = match chrom_list.label_to_index.get(s1[0]) {
Some(x) => *x,
None => {
panic!("Can't find chromosome '{}' in bam file header", chrom);
}
};
let chrom_size = chrom_list.data[chrom_index].length as i64;
let (start, end) = if s1l == 1 {
(0, chrom_size)
let chrom = *s1.last().unwrap();
if let Some(&chrom_index) = chrom_list.label_to_index.get(chrom) {
let pos_string = if s1l == 2 { Some(s1[0]) } else { None };
(chrom_index, pos_string)
} else if let Some(&chrom_index) = chrom_list.label_to_index.get(str) {
(chrom_index, None)
} else {
let s2 = s1[1].split('-').collect::<Vec<_>>();
let msg = if str != chrom {
format!("Unexpected format in genome region string '{str}': can't find chromosome name '{chrom}' or '{str}' in bam file header")
} else {
format!("Unexpected format in genome region string '{str}': can't find chromosome '{chrom}' in bam file header")
};
panic!("{}", msg);
}
}

/// Parse position range from samtools-style genomic interval string, return
/// start-end coordinate in bedtools zero-index half-open format.
///
/// In the samtools-style string, "100-300" would return (99,300). Just "100"
/// should retunr (99, chrom_length)
///
/// # Arguments
/// * `region_str` - Only used to improve error messages
///
fn parse_samtools_pos_range(
region_str: &str,
pos_range_str: Option<&str>,
chrom_size: i64,
) -> (i64, i64) {
if let Some(pos_range_str) = pos_range_str {
let s2 = pos_range_str.split('-').collect::<Vec<_>>();
let s2l = s2.len();
assert!(
s2l > 0 && s2l < 3,
"Unexpected format in genome region string {}",
str
);
// Strip any commas out of the number field (don't know if samtools does this but just a
// nice ease of use bonus:
assert!(s2l <= 2, "Unexpected format in position range '{pos_range_str}' from genome region string {region_str}");

// Strip any commas out of the number field (same as tabix cmdline behavior)
let s2 = s2
.into_iter()
.map(|s| {
Expand All @@ -155,14 +169,33 @@ pub fn samtools_region_string_splitter(
})
.collect::<Vec<_>>();
let start = s2[0].parse::<i64>().unwrap() - 1;
if s2l == 1 {
(start, chrom_size)
let end = if s2l == 1 {
chrom_size
} else {
let end = s2[1].parse::<i64>().unwrap();
(start, end)
}
};
(chrom_index, chrom, start, end)
s2[1].parse::<i64>().unwrap()
};
(start, end)
} else {
(0, chrom_size)
}
}

/// Convert from a string in 'samtools' region format (e.g. chr20:100-200) to a tuple of
/// (chrom_index, chrom_label, start, end)
/// ...where start and end are converted to the zero-indexed half-open convention used for bed
///
/// Commas will be stripped out of coordinates if present
///
/// This parser makes a 'best-effort' to parse contig names with colons in them, such as HLA alleles
/// like "HLA-DRB1*10:01:01". Given that samtools region format already has an optinoal colon, it may
/// be impossible to resolve some cases.
///
pub fn parse_samtools_region_string(chrom_list: &ChromList, region_str: &str) -> (usize, i64, i64) {
let (chrom_index, pos_str) =
parse_chrom_index_from_samtools_region_string(chrom_list, region_str);
let chrom_size = chrom_list.data[chrom_index].length as i64;
let (start, end) = parse_samtools_pos_range(region_str, pos_str, chrom_size);
(chrom_index, start, end)
}

#[allow(dead_code)]
Expand Down Expand Up @@ -235,37 +268,54 @@ mod tests {

#[test]
fn test_samtools_region_string_splitter() {
let mut chrom_list = ChromList::default();
chrom_list.add_chrom("chr1", 10000);
chrom_list.add_chrom("chr2", 10000);
chrom_list.add_chrom("chr3", 10000);
let chrom_list = chrom_list;
let chrom_list = {
let mut x = ChromList::default();
x.add_chrom("chr1", 10000);
x.add_chrom("chr2", 10000);
x.add_chrom("chr3", 10000);
x
};

// A simple case
let s = "chr2:1000-2000";
let (chrom_index, chrom_label, start, end) =
samtools_region_string_splitter(&chrom_list, s);
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
assert_eq!(chrom_index, 1);
assert_eq!(chrom_label, "chr2");
assert_eq!(start, 999);
assert_eq!(end, 2000);

// Simple case with commas
let s = "chr2:1,000-2,000";
let (chrom_index, chrom_label, start, end) =
samtools_region_string_splitter(&chrom_list, s);
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
assert_eq!(chrom_index, 1);
assert_eq!(chrom_label, "chr2");
assert_eq!(start, 999);
assert_eq!(end, 2000);

// No end
let s = "chr2:1,000";
let (chrom_index, chrom_label, start, end) =
samtools_region_string_splitter(&chrom_list, s);
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
assert_eq!(chrom_index, 1);
assert_eq!(chrom_label, "chr2");
assert_eq!(start, 999);
assert_eq!(end, 10000);
}

#[test]
fn test_samtools_region_string_splitter_hla() {
let chrom_list = {
let mut x = ChromList::default();
x.add_chrom("HLA-DRB1*10:01:01", 10000);
x
};

let s = "HLA-DRB1*10:01:01:1000-2000";
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
assert_eq!(chrom_index, 0);
assert_eq!(start, 999);
assert_eq!(end, 2000);

let s = "HLA-DRB1*10:01:01";
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
assert_eq!(chrom_index, 0);
assert_eq!(start, 0);
assert_eq!(end, 10000);
}
}
88 changes: 78 additions & 10 deletions src/joint_call/get_refined_svs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,13 @@ struct BndAltInfo {
insert_seq: Vec<u8>,
}

/// Parse a VCF breakend record alt-allele string into component parts describing the breakpoint
///
/// Example BND alt allele string is "TCT]chr1:500]", see VCF spec for full description.
///
/// # Arguments
/// * homlen - this is used to correct the remote breakend into a left-shifted position when the breakpoint is inverted
///
fn parse_bnd_alt_allele(chrom_list: &ChromList, alt_allele: &[u8], homlen: i64) -> BndAltInfo {
let alt_allele = std::str::from_utf8(alt_allele).unwrap();

Expand All @@ -224,17 +231,21 @@ fn parse_bnd_alt_allele(chrom_list: &ChromList, alt_allele: &[u8], homlen: i64)
alt_allele
);

let mate_location = words[1].split(':').collect::<Vec<_>>();
assert_eq!(
mate_location.len(),
2,
"Unexpected BND alt allele format `{}`",
alt_allele
);
let (breakend2_chrom_index, breakend2_raw_pos) = {
// Note that rsplitn orders words in reverse order compared to how they appear in the string:
let mate_location = words[1].rsplitn(2, ':').collect::<Vec<_>>();
assert_eq!(
mate_location.len(),
2,
"Unexpected BND alt allele format `{}`",
alt_allele
);

let breakend2_chrom_name = mate_location[0];
let breakend2_chrom_index = *chrom_list.label_to_index.get(breakend2_chrom_name).unwrap();
let breakend2_raw_pos = mate_location[1].parse::<i64>().unwrap() - 1;
let breakend2_chrom_name = mate_location[1];
let breakend2_chrom_index = *chrom_list.label_to_index.get(breakend2_chrom_name).unwrap();
let breakend2_raw_pos = mate_location[0].parse::<i64>().unwrap() - 1;
(breakend2_chrom_index, breakend2_raw_pos)
};

let breakend1_direction = if !words[0].is_empty() && words[2].is_empty() {
BreakendDirection::LeftAnchor
Expand Down Expand Up @@ -838,3 +849,60 @@ pub fn get_sample_sv_groups(

sv_groups
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_parse_bnd_alt_allele() {
let chrom_list = {
let mut x = ChromList::default();
x.add_chrom("chr1", 1000);
x
};

let alt_allele = b"TCT]chr1:500]";

let homlen = 10;
let bnd_alt_info = parse_bnd_alt_allele(&chrom_list, alt_allele, homlen);

assert_eq!(
bnd_alt_info.breakend1_direction,
BreakendDirection::LeftAnchor
);
assert_eq!(
bnd_alt_info.breakend2_direction,
BreakendDirection::LeftAnchor
);
assert_eq!(bnd_alt_info.breakend2_chrom_index, 0);
assert_eq!(bnd_alt_info.breakend2_pos, 489);
assert_eq!(bnd_alt_info.insert_seq, b"CT");
}

#[test]
fn test_parse_bnd_alt_allele_hla() {
let chrom_list = {
let mut x = ChromList::default();
x.add_chrom("HLA-DRB1*10:01:01", 1000);
x
};

let alt_allele = b"TCT]HLA-DRB1*10:01:01:500]";

let homlen = 10;
let bnd_alt_info = parse_bnd_alt_allele(&chrom_list, alt_allele, homlen);

assert_eq!(
bnd_alt_info.breakend1_direction,
BreakendDirection::LeftAnchor
);
assert_eq!(
bnd_alt_info.breakend2_direction,
BreakendDirection::LeftAnchor
);
assert_eq!(bnd_alt_info.breakend2_chrom_index, 0);
assert_eq!(bnd_alt_info.breakend2_pos, 489);
assert_eq!(bnd_alt_info.insert_seq, b"CT");
}
}

0 comments on commit 7ea9b2c

Please sign in to comment.