Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Version 0.3 #26

Merged
merged 20 commits into from
Sep 12, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 2
jobs:
test:
docker:
- image: circleci/rust:1.31.1-stretch
- image: circleci/rust:1.37-stretch
steps:
- checkout
- run:
Expand Down
5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "needletail"
version = "0.2.3"
version = "0.3.0"
authors = ["Roderick Bovee <roderick@onecodex.com>"]
description = "FASTX parsing and k-mer methods"
keywords = ["FASTA", "FASTQ", "kmer", "bioinformatics"]
Expand All @@ -12,13 +12,12 @@ edition = "2018"

[features]
default = ["compression"]
compression = ["bzip2", "flate2", "xz2", "zip"]
compression = ["bzip2", "flate2", "xz2"]

[dependencies]
flate2 = { version="1.0.6", optional=true }
bzip2 = { version="0.3.3", optional=true }
xz2 = { version="0.1.6", optional=true }
zip = { version="0.5.0", optional=true }
memchr = "2.2.0"

[dev-dependencies]
Expand Down
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@ Needletail's goal is to be as fast as the [readfq](/~https://github.com/lh3/readfq
```rust
extern crate needletail;
use std::env;
use needletail::{fastx};
use std::fs::File;
use needletail::parse_sequences;

fn main() {
let filename: String = env::args().nth(1).unwrap();

let mut n_bases = 0;
let mut n_valid_kmers = 0;
fastx::fastx_cli(&filename[..], |_| {}, |seq| {
parse_sequences(File::open(filename).expect("missing file"), |_| {}, |seq| {
// seq.id is the name of the record
// seq.seq is the base sequence
// seq.qual is an optional quality score
Expand All @@ -35,7 +36,7 @@ fn main() {
n_valid_kmers += 1;
}
}
});
}).expect("parsing failed");
println!("There are {} bases in your file.", n_bases);
println!("There are {} AAAAs in your file.", n_valid_kmers);
}
Expand All @@ -49,7 +50,7 @@ Please use either your local package manager (`homebrew`, `apt-get`, `pacman`, e
Once you have Rust set up, you can include needletail in your `Cargo.toml` file like:
```shell
[dependencies]
needletail = "^0.2.0"
needletail = "^0.3.0"
```

To install needletail itself for development:
Expand Down
66 changes: 35 additions & 31 deletions benches/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,26 @@ extern crate bencher;
extern crate needletail;

use bencher::Bencher;
use needletail::{bitkmer, fastx, kmer};
use needletail::fastx;
use std::fs::File;
use std::io::Read;
use std::io::{Cursor, Read};

// from Bio.SeqIO import parse
// n_total = sum([len([k for k in slid_win(i.seq, 31) if set(k).issubset({'A', 'C', 'G', 'T'})]) for i in SeqIO.parse('./tests/data/28S.fasta', 'fasta')])

fn bench_kmer_speed(bench: &mut Bencher) {
let filename = String::from("tests/data/28S.fasta");
let filename = "tests/data/28S.fasta";
let ksize = 31;

bench.iter(|| {
let mut n_total = 0;
let mut n_canonical = 0;
fastx::fastx_cli(
&filename[..],
let file = File::open(filename).unwrap();
fastx::parse_sequences(
file,
|_| {},
|seq| {
for (_, kmer, was_rc) in seq.normalize(true).kmers(ksize, true) {
for (_, _kmer, was_rc) in seq.normalize(true).kmers(ksize, true) {
if !was_rc {
n_canonical += 1;
}
Expand All @@ -36,17 +37,18 @@ fn bench_kmer_speed(bench: &mut Bencher) {
}

fn bench_bitkmer_speed(bench: &mut Bencher) {
let filename = String::from("tests/data/28S.fasta");
let filename = "tests/data/28S.fasta";
let ksize = 31;

bench.iter(|| {
let mut n_total = 0;
let mut n_canonical = 0;
fastx::fastx_cli(
&filename[..],
let file = File::open(filename).unwrap();
fastx::parse_sequences(
file,
|_| {},
|seq| {
for (_, k, was_rc) in seq.bit_kmers(ksize, true) {
for (_, _kmer, was_rc) in seq.bit_kmers(ksize, true) {
if !was_rc {
n_canonical += 1;
}
Expand All @@ -63,33 +65,33 @@ fn bench_bitkmer_speed(bench: &mut Bencher) {
benchmark_group!(kmers, bench_kmer_speed, bench_bitkmer_speed);

fn bench_fastq_bytes(bench: &mut Bencher) {
let filename = String::from("tests/data/PRJNA271013_head.fq");
let ksize = 31;

let mut data: Vec<u8> = vec![];
let mut f = File::open(filename).unwrap();
f.read_to_end(&mut data);
let mut f = File::open("tests/data/PRJNA271013_head.fq").unwrap();
let _ = f.read_to_end(&mut data);

bench.iter(|| {
let mut n_bases = 0;
fastx::fastx_bytes(&data, |seq| {
n_bases += seq.seq.len();
})
fastx::parse_sequences(
Cursor::new(&data),
|_| {},
|seq| {
n_bases += seq.seq.len();
},
)
.unwrap();
assert_eq!(250000, n_bases);
})
}

fn bench_fastq_file(bench: &mut Bencher) {
let filename = String::from("tests/data/PRJNA271013_head.fq");
let ksize = 31;
let filename = "tests/data/PRJNA271013_head.fq";

// warming up the cache doesn't seem to make the timings more repeatable?
// fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap();
bench.iter(|| {
let mut n_bases = 0;
fastx::fastx_cli(
&filename[..],
fastx::parse_sequences(
File::open(filename).unwrap(),
|_| {},
|seq| {
n_bases += seq.seq.len();
Expand All @@ -102,32 +104,34 @@ fn bench_fastq_file(bench: &mut Bencher) {

fn bench_fasta_bytes(bench: &mut Bencher) {
let filename = String::from("tests/data/28S.fasta");
let ksize = 31;

let mut data: Vec<u8> = vec![];
let mut f = File::open(filename).unwrap();
f.read_to_end(&mut data);
let _ = f.read_to_end(&mut data);

bench.iter(|| {
let mut n_bases = 0;
fastx::fastx_bytes(&data, |seq| {
n_bases += seq.seq.len();
})
fastx::parse_sequences(
Cursor::new(&data),
|_| {},
|seq| {
n_bases += seq.seq.len();
},
)
.unwrap();
assert_eq!(738580, n_bases);
})
}

fn bench_fasta_file(bench: &mut Bencher) {
let filename = String::from("tests/data/28S.fasta");
let ksize = 31;
let filename = "tests/data/28S.fasta";

// warming up the cache doesn't seem to make the timings more repeatable?
// fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap();
bench.iter(|| {
let mut n_bases = 0;
fastx::fastx_cli(
&filename[..],
fastx::parse_sequences(
File::open(filename).unwrap(),
|_| {},
|seq| {
n_bases += seq.seq.len();
Expand Down
4 changes: 4 additions & 0 deletions fuzz/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

target
corpus
artifacts
26 changes: 26 additions & 0 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

[package]
name = "needletail-fuzz"
version = "0.0.1"
authors = ["Automatically generated"]
publish = false

[package.metadata]
cargo-fuzz = true

[dependencies.needletail]
path = ".."
[dependencies.libfuzzer-sys]
git = "/~https://github.com/rust-fuzz/libfuzzer-sys.git"

# Prevent this from interfering with workspaces
[workspace]
members = ["."]

[[bin]]
name = "parse_fasta"
path = "fuzz_targets/parse_fasta.rs"

[[bin]]
name = "parse_fastq"
path = "fuzz_targets/parse_fastq.rs"
10 changes: 10 additions & 0 deletions fuzz/fuzz_targets/parse_fasta.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#![no_main]
#[macro_use] extern crate libfuzzer_sys;
extern crate needletail;

use std::io::Cursor;

// Fuzz the FASTA path of the parser: every fuzzer-provided byte slice is
// turned into a candidate FASTA stream by prefixing the record marker '>'.
fuzz_target!(|data: &[u8]| {
    // Build ">" + data explicitly; parse errors are expected and ignored —
    // the fuzzer is only hunting for panics/crashes, not Err results.
    let mut input = Vec::with_capacity(data.len() + 1);
    input.push(b'>');
    input.extend_from_slice(data);
    let _ = needletail::parse_sequences(Cursor::new(input), |_ftype| {}, |_seq| {});
});
10 changes: 10 additions & 0 deletions fuzz/fuzz_targets/parse_fastq.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#![no_main]
#[macro_use] extern crate libfuzzer_sys;
extern crate needletail;

use std::io::Cursor;

// Fuzz the FASTQ path of the parser: every fuzzer-provided byte slice is
// turned into a candidate FASTQ stream by prefixing the record marker '@'.
fuzz_target!(|data: &[u8]| {
    // Build "@" + data explicitly; parse errors are expected and ignored —
    // the fuzzer is only hunting for panics/crashes, not Err results.
    let mut input = Vec::with_capacity(data.len() + 1);
    input.push(b'@');
    input.extend_from_slice(data);
    let _ = needletail::parse_sequences(Cursor::new(input), |_ftype| {}, |_seq| {});
});
53 changes: 26 additions & 27 deletions src/bitkmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@ pub type BitKmer = (BitKmerSeq, u8);

/// Takes a BitKmer and adds a new base on the end, optionally loping off the
/// first base if the resulting kmer is too long.
fn extend_kmer(kmer: &mut BitKmer, new_char: &u8) -> bool {
fn extend_kmer(kmer: &mut BitKmer, new_char: u8) -> bool {
let new_char_int;
match new_char {
&b'A' | &b'a' => new_char_int = 0 as BitKmerSeq,
&b'C' | &b'c' => new_char_int = 1 as BitKmerSeq,
&b'G' | &b'g' => new_char_int = 2 as BitKmerSeq,
&b'T' | &b't' => new_char_int = 3 as BitKmerSeq,
b'A' | b'a' => new_char_int = 0 as BitKmerSeq,
b'C' | b'c' => new_char_int = 1 as BitKmerSeq,
b'G' | b'g' => new_char_int = 2 as BitKmerSeq,
b'T' | b't' => new_char_int = 3 as BitKmerSeq,
_ => return false,
};
let new_kmer = (kmer.0 << 2) + new_char_int;

// mask out any overflowed bits
kmer.0 = new_kmer & (BitKmerSeq::pow(2, (2 * kmer.1) as u32) - 1) as BitKmerSeq;
kmer.0 = new_kmer & (BitKmerSeq::pow(2, u32::from(2 * kmer.1)) - 1) as BitKmerSeq;
true
}

Expand All @@ -30,16 +30,15 @@ fn update_position(
return false;
}

let mut kmer_len = (kmer.1 - 1) as usize;
let mut stop_len = kmer.1 as usize;
if initial {
kmer_len = 0;
stop_len = (kmer.1 - 1) as usize;
}
let (mut kmer_len, stop_len) = if initial {
(0, (kmer.1 - 1) as usize)
} else {
((kmer.1 - 1) as usize, kmer.1 as usize)
};

let mut cur_kmer = kmer;
while kmer_len < stop_len {
if extend_kmer(&mut cur_kmer, &buffer[*start_pos + kmer_len]) {
if extend_kmer(&mut cur_kmer, buffer[*start_pos + kmer_len]) {
kmer_len += 1;
} else {
kmer_len = 0;
Expand Down Expand Up @@ -67,10 +66,10 @@ impl<'a> BitNuclKmer<'a> {
update_position(&mut start_pos, &mut kmer, slice, true);

BitNuclKmer {
start_pos: start_pos,
start_pos,
cur_kmer: kmer,
buffer: slice,
canonical: canonical,
canonical,
}
}
}
Expand Down Expand Up @@ -159,15 +158,15 @@ pub fn reverse_complement(kmer: BitKmer) -> BitKmer {
// inspired from https://www.biostars.org/p/113640/
let mut new_kmer = kmer.0;
// reverse it
new_kmer = (new_kmer >> 2 & 0x3333333333333333) | (new_kmer & 0x3333333333333333) << 2;
new_kmer = (new_kmer >> 4 & 0x0F0F0F0F0F0F0F0F) | (new_kmer & 0x0F0F0F0F0F0F0F0F) << 4;
new_kmer = (new_kmer >> 8 & 0x00FF00FF00FF00FF) | (new_kmer & 0x00FF00FF00FF00FF) << 8;
new_kmer = (new_kmer >> 16 & 0x0000FFFF0000FFFF) | (new_kmer & 0x0000FFFF0000FFFF) << 16;
new_kmer = (new_kmer >> 32 & 0x00000000FFFFFFFF) | (new_kmer & 0x00000000FFFFFFFF) << 32;
new_kmer = (new_kmer >> 2 & 0x3333_3333_3333_3333) | (new_kmer & 0x3333_3333_3333_3333) << 2;
new_kmer = (new_kmer >> 4 & 0x0F0F_0F0F_0F0F_0F0F) | (new_kmer & 0x0F0F_0F0F_0F0F_0F0F) << 4;
new_kmer = (new_kmer >> 8 & 0x00FF_00FF_00FF_00FF) | (new_kmer & 0x00FF_00FF_00FF_00FF) << 8;
new_kmer = (new_kmer >> 16 & 0x0000_FFFF_0000_FFFF) | (new_kmer & 0x0000_FFFF_0000_FFFF) << 16;
new_kmer = (new_kmer >> 32 & 0x0000_0000_FFFF_FFFF) | (new_kmer & 0x0000_0000_FFFF_FFFF) << 32;
// complement it
new_kmer ^= 0xFFFFFFFFFFFFFFFF;
new_kmer ^= 0xFFFF_FFFF_FFFF_FFFF;
// shift it to the right size
new_kmer = new_kmer >> (2 * (32 - kmer.1));
new_kmer >>= 2 * (32 - kmer.1);
(new_kmer, kmer.1)
}

Expand All @@ -194,8 +193,8 @@ pub fn canonical(kmer: BitKmer) -> (BitKmer, bool) {
pub fn minimizer(kmer: BitKmer, minmer_size: u8) -> BitKmer {
let mut new_kmer = kmer.0;
let mut lowest = !(0 as BitKmerSeq);
let bitmask = (BitKmerSeq::pow(2, (2 * minmer_size) as u32) - 1) as BitKmerSeq;
for _ in 0..(kmer.1 - minmer_size + 1) {
let bitmask = (BitKmerSeq::pow(2, u32::from(2 * minmer_size)) - 1) as BitKmerSeq;
for _ in 0..=(kmer.1 - minmer_size) {
let cur = bitmask & new_kmer;
if cur < lowest {
lowest = cur;
Expand Down Expand Up @@ -224,8 +223,8 @@ pub fn bitmer_to_bytes(kmer: BitKmer) -> Vec<u8> {
// math to figure out where they start (this helps us just pop the bases on the end
// of the working buffer as we read them off "left to right")
let offset = (kmer.1 - 1) * 2;
let bitmask =
BitKmerSeq::pow(2, (2 * kmer.1 - 1) as u32) + BitKmerSeq::pow(2, (2 * kmer.1 - 2) as u32);
let bitmask = BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 1))
+ BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 2));

for _ in 0..kmer.1 {
let new_char = (new_kmer & bitmask) >> offset;
Expand Down Expand Up @@ -253,7 +252,7 @@ pub fn bytes_to_bitmer(kmer: &[u8]) -> BitKmer {

let mut bit_kmer = (0u64, k);
for i in 0..k {
extend_kmer(&mut bit_kmer, &kmer[i as usize]);
extend_kmer(&mut bit_kmer, kmer[i as usize]);
}
bit_kmer
}
Expand Down
Loading