diff --git a/CHANGELOG.md b/CHANGELOG.md index b548fa2..1bc8cc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# v1.0.0 +## Changes +- Source code is included in the GitHub repository +- `LICENSE.md` has been updated to reflect terms and conditions of source code usage + # v0.14.2 ## Fixed - Fixed an issue where a CYP2D6 graph with no edges would lead to a panic during SVG graph visualization diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..f23af09 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,2590 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aho-corasick" +version = "1.0.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "6748e8def348ed4d14996fa801f4122cd763fff530258cdc03f64b25f89d3a5a" +dependencies = [ + "memchr", +] + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.5.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b1f58811cfac344940f1a400b6e6231ce35171f614f26439e80f8c1465c5cc0c" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon 2.1.0", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "0.6.13" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon 3.0.2", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.6" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" + +[[package]] +name = "anstyle-parse" +version = "0.2.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "2.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd" +dependencies = [ + "anstyle", + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = 
"1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + +[[package]] +name = "assert_approx_eq" +version = "1.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3c07dab4369547dbe5114677b33fbbf724971019f3818172d59a97a61c774ffd" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.21.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "414dcefbc63d77c526a76b3afcf6fbb9b5e2791c19c3aa2297733208750c6e53" + +[[package]] +name = "bio" +version = "1.5.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "25dccfc5babf5a4f505ab5bdda0e18d4b5fc1600c222677c54992203632cbdf5" +dependencies = [ + "anyhow", + "approx", + "bio-types", + "bit-set", + "bv", + "bytecount", + "csv", + "custom_derive", + "editdistancek", + "enum-map", + "fxhash", + "getset", + "itertools 0.11.0", + "itertools-num", + "lazy_static", + "multimap", + "ndarray", + "newtype_derive", + "num-integer", + "num-traits", + "ordered-float", + "petgraph", + "rand", + "regex", + "serde", + "serde_derive", + "statrs", + "strum 0.25.0", + "strum_macros 0.25.2", + "thiserror", + "triple_accel", + "vec_map", +] + +[[package]] +name = "bio-types" +version = "1.0.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "9d45749b87f21808051025e9bf714d14ff4627f9d8ca967eade6946ea769aa4a" +dependencies = [ + "derive-new", + "lazy_static", + "regex", + "strum_macros 0.25.2", + "thiserror", +] + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "2.4.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" + +[[package]] +name = "bumpalo" +version = "3.13.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = 
"a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" + +[[package]] +name = "bv" +version = "0.11.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8834bb1d8ee5dc048ee3124f2c7c1afcc6bc9aed03f11e9dfd8c69470a5db340" +dependencies = [ + "feature-probe", + "serde", +] + +[[package]] +name = "bytecount" +version = "0.6.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" + +[[package]] +name = "bytelines" +version = "2.2.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "791e4e40d13e1463dee537b254225c12c46ec7328f1817c6264873bc166f615f" + +[[package]] +name = "bytemuck" +version = "1.14.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "1.4.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "jobserver", + "libc", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.26" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "time 0.1.45", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "clap" +version = "4.4.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7c8d502cbaec4595d2e7d5f61e318f05417bd2b66fdc3809498f0d3fdf0bea27" +dependencies = [ + "clap_builder", + "clap_derive", + "once_cell", +] + +[[package]] +name = "clap_builder" +version = "4.4.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5891c7bc0edb3e1c2204fc5e94009affabeb1821c9e5fdc3959536c5c0bb984d" +dependencies = [ + "anstream 0.5.0", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c9fd1a5729c4548118d7d70ff234a44868d00489a4b6597b0b020918a0e91a1a" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 2.0.74", +] + +[[package]] +name = "clap_lex" +version = "0.5.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961" + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = 
"acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "csv" +version = "1.2.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "custom_derive" +version = "0.1.7" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" + +[[package]] +name = "darling" +version = "0.14.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.14.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "derive-new" +version = "0.5.9" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder" +version = "0.13.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8f59169f400d8087f238c5c0c7db6a28af18681717f3b623227d92f397e938c7" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.13.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a4ec317cc3e7ef0928b0ca6e4a634a4d6c001672ae210438cf114a83e56b018d" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + 
+[[package]] +name = "derive_builder_macro" +version = "0.13.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "870368c3fb35b8031abb378861d4460f573b92238ec2152c927a21f77e3e0127" +dependencies = [ + "derive_builder_core", + "syn 1.0.109", +] + +[[package]] +name = "editdistancek" +version = "1.0.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3e02df23d5b1c6f9e69fa603b890378123b93073df998a21e6e33b9db0a32613" + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "enum-map" +version = "1.1.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e893a7ba6116821058dec84a6fb14fb2a97cd8ce5fd0f85d5a4e760ecd7329d9" +dependencies = [ + "enum-map-derive", +] + +[[package]] +name = "enum-map-derive" +version = "0.6.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "84278eae0af6e34ff6c1db44c11634a694aafac559ff3080e4db4e4ac35907aa" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "env_filter" +version = "0.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.9.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "env_logger" +version = "0.10.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "env_logger" +version = "0.11.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +dependencies = [ + "anstream 0.6.13", + "anstyle", + "env_filter", + "humantime", + "log", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "exitcode" +version = "1.1.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "de853764b47027c2e862a995c34978ffa63c1501f2e15f987ba11bd4f9bba193" + +[[package]] +name = "feature-probe" +version = "0.1.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" + +[[package]] +name = "fffx" +version = "0.1.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = 
"f33c20b97a5cfd9d56d67e15677c1dae66c52846ee51801af32b94b70438f626" +dependencies = [ + "bytelines", + "flate2", + "simdutf8", + "static_assertions", +] + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "libz-sys", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.2.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs-utils" +version = "1.1.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "6fc7a9dc005c944c98a935e7fd626faf5bf7e5a609f94bc13e42fc4a02e52593" +dependencies = [ + "quick-error", +] + +[[package]] +name = "futures-channel" +version = "0.3.28" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.28" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" + +[[package]] +name = "futures-io" +version = "0.3.28" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" + +[[package]] +name = "futures-macro" +version = "0.3.28" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.74", +] + +[[package]] +name = "futures-sink" +version = "0.3.28" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" + +[[package]] +name = "futures-task" +version = "0.3.28" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" + +[[package]] +name = "futures-util" +version = "0.3.28" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +dependencies = [ + "futures-core", + "futures-io", + "futures-macro", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getset" 
+version = "0.1.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "gimli" +version = "0.28.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "h2" +version = "0.3.21" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "91fc23aa11be92976ef4729127f1a74adf36d8436f7816b185d18df956790833" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 1.9.3", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hermit-abi" +version = "0.3.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" + +[[package]] +name = "hiphase" +version = "1.2.1" +source = "git+/~https://github.com/PacificBiosciences/HiPhase.git?tag=v1.2.1#dff3c47ad39c5c3325e91afdebdaa21ad3e459bc" +dependencies = [ + "bio", + "bit-vec", + "chrono", + "clap", + "cpu-time", + "csv", + "env_logger 0.9.3", + "exitcode", + "flate2", + "lazy_static", + "log", + "priority-queue", + "rust-htslib 0.39.5", + "rustc-hash", + "serde", + "simple-error 0.2.3", + "threadpool", + "vergen", +] + +[[package]] +name = "hts-sys" +version = "2.1.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "deebfb779c734d542e7f14c298597914b9b5425e4089aef482eacb5cab941915" +dependencies = [ + "cc", + "fs-utils", + "glob", + "libz-sys", +] + +[[package]] +name = "http" +version = "0.2.9" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" +source = 
"registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.27" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.4.9", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8d78e1e73ec14cf7375674f74d7dde185c8206fd9dea6fb6295e8a98098aaa97" +dependencies = [ + "futures-util", + "http", + "hyper", + "rustls", + "tokio", + "tokio-rustls", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.57" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.4.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "ieee754" +version = "0.2.6" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "9007da9cacbd3e6343da136e98b0d2df013f553d35bdec8b518f07bea768e19c" + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.0.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +dependencies = [ + "equivalent", + "hashbrown 0.14.0", +] + +[[package]] +name = "ipnet" +version = "2.8.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" + +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +dependencies = [ + "hermit-abi 0.3.2", + "rustix", + "windows-sys 0.48.0", +] + 
+[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itertools-num" +version = "0.1.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a872a22f9e6f7521ca557660adb96dd830e54f0f490fa115bb55dd69d38b27e7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "jobserver" +version = "0.1.26" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.64" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "layout-rs" +version = "0.1.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "84deb28a3a6c839ca42a7341664f32281416d69e2f29deb85aec5cc0243fdea8" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.152" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "libm" +version = "0.2.7" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" + +[[package]] +name = "libz-sys" +version = "1.1.12" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +dependencies = [ + "cc", + "cmake", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linear-map" +version = "1.2.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "bfae20f6b19ad527b550c223fddc3077a547fc70cda94b9b566575423fd303ee" + +[[package]] +name = "linux-raw-sys" +version = "0.4.12" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "matrixmultiply" +version = "0.3.7" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "090126dc04f95dc0d1c1c91f61bdd474b3930ca064c1edc8a849da2c6cbe1e77" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.6.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "76fc44e2588d5b436dbc3c6cf62aef290f90dab6235744a93dfe1cc18f451e2c" + +[[package]] +name = "mime" +version = 
"0.3.17" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimap2" +version = "0.1.16+minimap2.2.26" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "1a6cb7c4c240401901de9c5bdfbebbcd8a43a2beb980a00c3b7c558d922ec089" +dependencies = [ + "fffx", + "flate2", + "libc", + "minimap2-sys", + "simdutf8", +] + +[[package]] +name = "minimap2-sys" +version = "0.1.16+minimap2.2.26" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b3116fd091e0b499cd370475c6d03f8c333aa956b1769140dfd07e1a42101c8a" +dependencies = [ + "cc", + "libz-sys", + "pkg-config", +] + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.8" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +dependencies = [ + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.48.0", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +dependencies = [ + "serde", +] + +[[package]] +name = "nalgebra" +version = "0.29.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d506eb7e08d6329505faa8a3a00a5dcc6de9f76e0c77e4b75763ae3c770831ff" +dependencies = [ + "approx", + "matrixmultiply", + "nalgebra-macros", + "num-complex", + "num-rational", + "num-traits", + "rand", + "rand_distr", + "simba", + "typenum", +] + +[[package]] +name = "nalgebra-macros" +version = "0.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "01fcc0b8149b4632adc89ac3b7b31a12fb6099a0317a4eb2ebff574ef7de7218" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "newtype_derive" +version = "0.1.6" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = 
"0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.16" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 0.3.2", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "object" +version = "0.32.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "77ac5bbd07aea88c60a577a1ce218075ffd59208b2d7ca97adf9bfc5aeb21ebe" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "ordered-float" +version = "3.9.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2a54938017eacd63036332b4ae5c8a49fc8c0c1d6d629893057e4f13609edd06" +dependencies = [ + "num-traits", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "pbstarphase" +version = "1.0.0" +dependencies = [ + "assert_approx_eq", + "bio", + "chrono", + "clap", + "csv", + "env_logger 0.10.0", + "exitcode", + "flate2", + "hiphase", + "itertools 0.12.1", + "layout-rs", + "lazy_static", + "log", + "minimap2", + "quick-xml", + "regex", + "reqwest", + "rust-htslib 0.44.1", + "rust-lib-reference-genome", + "rustc-hash", + "serde", + "serde_json", + "simple-error 0.3.1", + "statrs", + "strum 0.26.2", + "strum_macros 0.26.4", + "thiserror", + "vergen", + "waffle_con", + "zip", +] + +[[package]] +name = "percent-encoding" +version = "2.3.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" + +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap 2.0.0", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = 
"439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "priority-queue" +version = "1.3.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "fff39edfcaec0d64e8d0da38564fad195d2d51b680940295fcc307366e101e61" +dependencies = [ + "autocfg", + "indexmap 1.9.3", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quick-xml" +version = "0.36.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "96a05e2e8efddfa51a84ca47cec303fac86c8541b686d37cac5efc0e094417bc" +dependencies = [ + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "regex" +version = "1.9.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] 
+name = "regex-automata" +version = "0.3.8" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "reqwest" +version = "0.11.20" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-rustls", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", + "winreg", +] + +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi", +] + +[[package]] +name = "rust-htslib" +version = "0.39.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "239ef7334dbf59acd56b7a6fa62a525ed7e36d6239a686ed4ff61bc794108e53" +dependencies = [ + "bio-types", + "byteorder", + "custom_derive", + "derive-new", + "hts-sys", + "ieee754", + "lazy_static", + "libc", + "linear-map", + "newtype_derive", + "regex", + "thiserror", + "url", +] + +[[package]] +name = "rust-htslib" +version = "0.44.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7c7eb0f29fce64a4e22578905efef3d72389058016023279a58b282eb5c0c467" +dependencies = [ + "bio-types", + "byteorder", + "custom_derive", + "derive-new", + "hts-sys", + "ieee754", + "lazy_static", + "libc", + "linear-map", + "newtype_derive", + "regex", + "thiserror", + "url", +] + +[[package]] +name = "rust-lib-reference-genome" +version = "0.2.0" +source = "git+/~https://github.com/holtjma/rust-lib-reference-genome.git?tag=v0.2.0#6d6925118653654ef4d38b86f23b55079dad6f6c" +dependencies = [ + "bio", + "flate2", + "log", + "rustc-hash", + "simple-error 0.3.1", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.1.7" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.30" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version 
= "0.21.7" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "cd8d6c9f025a446bc4d18ad9632e69aec8f287aa84499ee335599fabd20c3fd8" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2" +dependencies = [ + "base64", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7d93931baf2d282fff8d3a532bbfd7653f734643161b87e3e01e59a04439bf0d" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "ryu" +version = "1.0.15" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" + +[[package]] +name = "safe_arch" +version = "0.7.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f398075ce1e6a179b46f51bd88d0598b92b00d3551f1a2d4ac49e771b56ac354" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "sct" +version = "0.7.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "semver" +version = "0.1.20" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" + +[[package]] +name = "serde" +version = "1.0.188" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.188" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.74", +] + +[[package]] +name = "serde_json" +version = "1.0.105" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "693151e1ac27563d6dbcec9dee9fbd5da8539b20fa14ad3752b2e6d363ace360" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "simba" +version = "0.6.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f0b7840f121a46d63066ee7a99fc81dcabbc6105e437cae43528cea199b5a05f" +dependencies = [ + "approx", + "num-complex", + "num-traits", + "paste", + "wide", +] + +[[package]] +name = "simdutf8" +version = "0.1.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" + +[[package]] +name = "simple-error" +version = "0.2.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "cc47a29ce97772ca5c927f75bac34866b16d64e07f330c3248e2d7226623901b" + +[[package]] +name = 
"simple-error" +version = "0.3.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7e2accd2c41a0e920d2abd91b2badcfa1da784662f54fbc47e0e3a51f1e2e1cf" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "socket2" +version = "0.5.3" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "statrs" +version = "0.16.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2d08e5e1748192713cc281da8b16924fb46be7b0c2431854eadc785823e5696e" +dependencies = [ + "approx", + "lazy_static", + "nalgebra", + "num-traits", + "rand", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" + +[[package]] +name = "strum" +version = "0.26.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" + +[[package]] +name = "strum_macros" +version = "0.25.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ad8d03b598d3d0fff69bf533ee3ef19b8eeb342729596df84bcc7e1f96ec4059" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.74", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.74", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.74" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.2.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +dependencies = [ + 
"winapi-util", +] + +[[package]] +name = "thiserror" +version = "1.0.63" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.63" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.74", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "time" +version = "0.1.45" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.36" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +dependencies = [ + "deranged", + "itoa", + "libc", + "num-conv", + "num_threads", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + +[[package]] +name = "time-macros" +version = "0.2.18" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.32.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "pin-project-lite", + "socket2 0.5.3", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.8" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", + "tracing", +] + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.37" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = 
"8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +dependencies = [ + "cfg-if", + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.31" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "triple_accel" +version = "0.4.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" + +[[package]] +name = "try-lock" +version = "0.2.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" + +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + +[[package]] +name = "unicode-bidi" +version = "0.3.13" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" + +[[package]] +name = "unicode-ident" +version = "1.0.11" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + +[[package]] +name = "url" +version = "2.4.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "143b538f18257fac9cad154828a57c6bf5157e1aa604d4816b5995bf6de87ae5" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +dependencies = [ + "serde", +] + +[[package]] +name = "vergen" +version = "8.2.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "bbc5ad0d9d26b2c49a5ab7da76c3e79d3ee37e7821799f8223fcb8f2f391a2e7" +dependencies = [ + "anyhow", + "rustversion", + "time 0.3.36", +] + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "waffle_con" +version = "0.4.2" +source = "git+/~https://github.com/PacificBiosciences/waffle_con.git?tag=v0.4.2#7997a5939271988b0ed57029af4c6637cebe81e2" +dependencies = [ + "derive_builder", + "env_logger 0.11.2", + "itertools 0.12.1", + "log", + "priority-queue", + "rand", + "rustc-hash", + "simple-error 0.3.1", 
+] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.87" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.87" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.74", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.37" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.87" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.87" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.74", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.87" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" + +[[package]] +name = "web-sys" +version = "0.3.64" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "0.25.2" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" + +[[package]] +name = "wide" +version = "0.7.11" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "aa469ffa65ef7e0ba0f164183697b89b854253fd31aeb92358b7b6155177d62f" +dependencies = [ + "bytemuck", + "safe_arch", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = 
"ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = 
"registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+/~https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "byteorder", + "crc32fast", + "crossbeam-utils", + "flate2", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..c9de61f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "pbstarphase" +version = "1.0.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[build-dependencies] +vergen = { version = "8.2.4", features = ["git", "gitcl"] } + +[dependencies] +bio = "1.5.0" +chrono = { version = "0.4.26", features = ["serde"] } +clap = { version = "4.4.1", features = ["derive"] } +csv = "1.2.2" +env_logger = "0.10.0" +exitcode = "1.1.2" +flate2 = "1.0.28" +hiphase = { tag = "v1.2.1", git = "/~https://github.com/PacificBiosciences/HiPhase.git" } +itertools = "0.12.1" +layout-rs = "0.1.2" +lazy_static = "1.4.0" +log = "0.4.20" +minimap2 = "0.1.16" +quick-xml = "0.36.1" +regex = "1.9.5" +reqwest = { version = "0.11.20", default-features = false, features = ["blocking", "rustls-tls"] } +rust-htslib = { version = "0.44.1", default-features = false, features = ["static"] } +rustc-hash = "1.1.0" +rust-lib-reference-genome = { tag = "v0.2.0", git = "/~https://github.com/holtjma/rust-lib-reference-genome.git" } +serde = { version = "1.0.188", features = ["derive"] } +serde_json = "1.0.105" +simple-error = "0.3.0" +statrs = "0.16.0" +strum = "0.26.2" +strum_macros = "0.26.2" +thiserror = "1.0.63" +waffle_con = { tag = "v0.4.2", git = 
"/~https://github.com/PacificBiosciences/waffle_con.git" } +zip = { version = "0.6.6", default-features = false, features = ["deflate"] } + +[dev-dependencies] +assert_approx_eq = "1.1.0" diff --git a/LICENSE b/LICENSE deleted file mode 100644 index af8f28f..0000000 --- a/LICENSE +++ /dev/null @@ -1,34 +0,0 @@ -Copyright (c) 2022, Pacific Biosciences of California, Inc. - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted (subject to the limitations in the -disclaimer below) provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of Pacific Biosciences nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -SUCH DAMAGE. diff --git a/LICENSE-THIRDPARTY.json b/LICENSE-THIRDPARTY.json new file mode 100644 index 0000000..4ddf920 --- /dev/null +++ b/LICENSE-THIRDPARTY.json @@ -0,0 +1,2486 @@ +[ + { + "name": "addr2line", + "version": "0.21.0", + "authors": null, + "repository": "/~https://github.com/gimli-rs/addr2line", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A cross-platform symbolication library written in Rust, using `gimli`" + }, + { + "name": "adler", + "version": "1.0.2", + "authors": "Jonas Schievink ", + "repository": "/~https://github.com/jonas-schievink/adler.git", + "license": "0BSD OR Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple clean-room implementation of the Adler-32 checksum" + }, + { + "name": "aho-corasick", + "version": "1.0.4", + "authors": "Andrew Gallant ", + "repository": "/~https://github.com/BurntSushi/aho-corasick", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Fast multiple substring searching." 
+ }, + { + "name": "android-tzdata", + "version": "0.1.1", + "authors": "RumovZ", + "repository": "/~https://github.com/RumovZ/android-tzdata", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parser for the Android-specific tzdata file" + }, + { + "name": "android_system_properties", + "version": "0.1.5", + "authors": "Nicolas Silva ", + "repository": "/~https://github.com/nical/android_system_properties", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Minimal Android system properties wrapper" + }, + { + "name": "anstream", + "version": "0.5.0", + "authors": null, + "repository": "/~https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple cross platform library for writing colored text to a terminal." + }, + { + "name": "anstream", + "version": "0.6.13", + "authors": null, + "repository": "/~https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple cross platform library for writing colored text to a terminal." + }, + { + "name": "anstyle", + "version": "1.0.6", + "authors": null, + "repository": "/~https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "ANSI text styling" + }, + { + "name": "anstyle-parse", + "version": "0.2.1", + "authors": null, + "repository": "/~https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parse ANSI Style Escapes" + }, + { + "name": "anstyle-query", + "version": "1.0.0", + "authors": null, + "repository": "/~https://github.com/rust-cli/anstyle", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Look up colored console capabilities" + }, + { + "name": "anstyle-wincon", + "version": "2.1.0", + "authors": null, + "repository": "/~https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Styling legacy Windows terminals" + }, + { + "name": "anstyle-wincon", + "version": "3.0.2", + "authors": null, + "repository": "/~https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Styling legacy Windows terminals" + }, + { + "name": "anyhow", + "version": "1.0.75", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/anyhow", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Flexible concrete Error type built on std::error::Error" + }, + { + "name": "approx", + "version": "0.5.1", + "authors": "Brendan Zabarauskas ", + "repository": "/~https://github.com/brendanzab/approx", + "license": "Apache-2.0", + "license_file": null, + "description": "Approximate floating point equality comparisons and assertions." 
+ }, + { + "name": "assert_approx_eq", + "version": "1.1.0", + "authors": "Ashley Williams ", + "repository": "/~https://github.com/ashleygwilliams/assert_approx_eq.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "assert approximately equal" + }, + { + "name": "atty", + "version": "0.2.14", + "authors": "softprops ", + "repository": "/~https://github.com/softprops/atty", + "license": "MIT", + "license_file": null, + "description": "A simple interface for querying atty" + }, + { + "name": "autocfg", + "version": "1.1.0", + "authors": "Josh Stone ", + "repository": "/~https://github.com/cuviper/autocfg", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Automatic cfg for Rust compiler features" + }, + { + "name": "backtrace", + "version": "0.3.69", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-lang/backtrace-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A library to acquire a stack trace (backtrace) at runtime in a Rust program." + }, + { + "name": "base64", + "version": "0.21.3", + "authors": "Alice Maz |Marshall Pierce ", + "repository": "/~https://github.com/marshallpierce/rust-base64", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "encodes and decodes base64 as bytes or utf8" + }, + { + "name": "bio", + "version": "1.5.0", + "authors": "Johannes Köster ", + "repository": "/~https://github.com/rust-bio/rust-bio", + "license": "MIT", + "license_file": null, + "description": "A bioinformatics library for Rust. This library provides implementations of many algorithms and data structures that are useful for bioinformatics, but also in other fields." + }, + { + "name": "bio-types", + "version": "1.0.1", + "authors": "Johannes Köster ", + "repository": "/~https://github.com/rust-bio/rust-bio-types", + "license": "MIT", + "license_file": "LICENSE.md", + "description": "A collection of common biomedical types for use in rust-bio and rust-htslib." + }, + { + "name": "bit-set", + "version": "0.5.3", + "authors": "Alexis Beingessner ", + "repository": "/~https://github.com/contain-rs/bit-set", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A set of bits" + }, + { + "name": "bit-vec", + "version": "0.6.3", + "authors": "Alexis Beingessner ", + "repository": "/~https://github.com/contain-rs/bit-vec", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A vector of bits" + }, + { + "name": "bitflags", + "version": "2.4.0", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/bitflags/bitflags", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A macro to generate structures which behave like bitflags." + }, + { + "name": "bumpalo", + "version": "3.13.0", + "authors": "Nick Fitzgerald ", + "repository": "/~https://github.com/fitzgen/bumpalo", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A fast bump allocation arena for Rust." + }, + { + "name": "bv", + "version": "0.11.1", + "authors": "Jesse A. 
Tov ", + "repository": "/~https://github.com/tov/bv-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bit-vectors and bit-slices" + }, + { + "name": "bytecount", + "version": "0.6.3", + "authors": "Andre Bogus |Joshua Landau ", + "repository": "/~https://github.com/llogiq/bytecount", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "count occurrences of a given byte, or the number of UTF-8 code points, in a byte slice, fast" + }, + { + "name": "bytelines", + "version": "2.2.2", + "authors": "Isaac Whitfield ", + "repository": "/~https://github.com/whitfin/bytelines", + "license": "MIT", + "license_file": null, + "description": "Read input lines as byte slices for high efficiency" + }, + { + "name": "bytemuck", + "version": "1.14.0", + "authors": "Lokathor ", + "repository": "/~https://github.com/Lokathor/bytemuck", + "license": "Apache-2.0 OR MIT OR Zlib", + "license_file": null, + "description": "A crate for mucking around with piles of bytes." + }, + { + "name": "byteorder", + "version": "1.4.3", + "authors": "Andrew Gallant ", + "repository": "/~https://github.com/BurntSushi/byteorder", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Library for reading/writing numbers in big-endian and little-endian." + }, + { + "name": "bytes", + "version": "1.4.0", + "authors": "Carl Lerche |Sean McArthur ", + "repository": "/~https://github.com/tokio-rs/bytes", + "license": "MIT", + "license_file": null, + "description": "Types and traits for working with bytes" + }, + { + "name": "cc", + "version": "1.0.83", + "authors": "Alex Crichton ", + "repository": "/~https://github.com/rust-lang/cc-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A build-time dependency for Cargo build scripts to assist in invoking the native C compiler to compile native C code into a static archive to be linked into Rust code." + }, + { + "name": "cfg-if", + "version": "1.0.0", + "authors": "Alex Crichton ", + "repository": "/~https://github.com/alexcrichton/cfg-if", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A macro to ergonomically define an item depending on a large number of #[cfg] parameters. Structured like an if-else chain, the first matching branch is the item that gets emitted." + }, + { + "name": "chrono", + "version": "0.4.26", + "authors": null, + "repository": "/~https://github.com/chronotope/chrono", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Date and time library for Rust" + }, + { + "name": "clap", + "version": "4.4.1", + "authors": null, + "repository": "/~https://github.com/clap-rs/clap", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple to use, efficient, and full-featured Command Line Argument Parser" + }, + { + "name": "clap_builder", + "version": "4.4.1", + "authors": null, + "repository": "/~https://github.com/clap-rs/clap", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple to use, efficient, and full-featured Command Line Argument Parser" + }, + { + "name": "clap_derive", + "version": "4.4.0", + "authors": null, + "repository": "/~https://github.com/clap-rs/clap/tree/master/clap_derive", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parse command line argument by defining a struct, derive crate." 
+ }, + { + "name": "clap_lex", + "version": "0.5.1", + "authors": null, + "repository": "/~https://github.com/clap-rs/clap/tree/master/clap_lex", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Minimal, flexible command line parser" + }, + { + "name": "cmake", + "version": "0.1.50", + "authors": "Alex Crichton ", + "repository": "/~https://github.com/rust-lang/cmake-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A build dependency for running `cmake` to build a native library" + }, + { + "name": "colorchoice", + "version": "1.0.0", + "authors": null, + "repository": "/~https://github.com/rust-cli/anstyle", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Global override of color control" + }, + { + "name": "core-foundation-sys", + "version": "0.8.4", + "authors": "The Servo Project Developers", + "repository": "/~https://github.com/servo/core-foundation-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bindings to Core Foundation for macOS" + }, + { + "name": "cpu-time", + "version": "1.0.0", + "authors": "Paul Colomiets ", + "repository": null, + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Small crate that provides CPU time measurement." + }, + { + "name": "crc32fast", + "version": "1.3.2", + "authors": "Sam Rijs |Alex Crichton ", + "repository": "/~https://github.com/srijs/rust-crc32fast", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Fast, SIMD-accelerated CRC32 (IEEE) checksum computation" + }, + { + "name": "crossbeam-utils", + "version": "0.8.19", + "authors": null, + "repository": "/~https://github.com/crossbeam-rs/crossbeam", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Utilities for concurrent programming" + }, + { + "name": "csv", + "version": "1.2.2", + "authors": "Andrew Gallant ", + "repository": "/~https://github.com/BurntSushi/rust-csv", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Fast CSV parsing with support for serde." + }, + { + "name": "csv-core", + "version": "0.1.10", + "authors": "Andrew Gallant ", + "repository": "/~https://github.com/BurntSushi/rust-csv", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Bare bones CSV parsing with no_std support." + }, + { + "name": "custom_derive", + "version": "0.1.7", + "authors": "Daniel Keep ", + "repository": "/~https://github.com/DanielKeep/rust-custom-derive/tree/custom_derive-master", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "(Note: superseded by `macro-attr`) This crate provides a macro that enables the use of custom derive attributes." + }, + { + "name": "darling", + "version": "0.14.4", + "authors": "Ted Driggs ", + "repository": "/~https://github.com/TedDriggs/darling", + "license": "MIT", + "license_file": null, + "description": "A proc-macro library for reading attributes into structs when implementing custom derives." + }, + { + "name": "darling_core", + "version": "0.14.4", + "authors": "Ted Driggs ", + "repository": "/~https://github.com/TedDriggs/darling", + "license": "MIT", + "license_file": null, + "description": "Helper crate for proc-macro library for reading attributes into structs when implementing custom derives. Use https://crates.io/crates/darling in your code." 
+ }, + { + "name": "darling_macro", + "version": "0.14.4", + "authors": "Ted Driggs ", + "repository": "/~https://github.com/TedDriggs/darling", + "license": "MIT", + "license_file": null, + "description": "Internal support for a proc-macro library for reading attributes into structs when implementing custom derives. Use https://crates.io/crates/darling in your code." + }, + { + "name": "deranged", + "version": "0.3.11", + "authors": "Jacob Pratt ", + "repository": "/~https://github.com/jhpratt/deranged", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Ranged integers" + }, + { + "name": "derive-new", + "version": "0.5.9", + "authors": "Nick Cameron ", + "repository": "/~https://github.com/nrc/derive-new", + "license": "MIT", + "license_file": null, + "description": "`#[derive(new)]` implements simple constructor functions for structs and enums." + }, + { + "name": "derive_builder", + "version": "0.13.1", + "authors": "Colin Kiegel |Pascal Hertleif |Jan-Erik Rediger |Ted Driggs ", + "repository": "/~https://github.com/colin-kiegel/rust-derive-builder", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rust macro to automatically implement the builder pattern for arbitrary structs." + }, + { + "name": "derive_builder_core", + "version": "0.13.1", + "authors": "Colin Kiegel |Pascal Hertleif |Jan-Erik Rediger |Ted Driggs ", + "repository": "/~https://github.com/colin-kiegel/rust-derive-builder", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Internal helper library for the derive_builder crate." + }, + { + "name": "derive_builder_macro", + "version": "0.13.1", + "authors": "Colin Kiegel |Pascal Hertleif |Jan-Erik Rediger |Ted Driggs ", + "repository": "/~https://github.com/colin-kiegel/rust-derive-builder", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rust macro to automatically implement the builder pattern for arbitrary structs." + }, + { + "name": "editdistancek", + "version": "1.0.2", + "authors": "Nikolai Karpov ", + "repository": "/~https://github.com/nkkarpov/editdistancek", + "license": "MIT", + "license_file": null, + "description": "Fast algorithm for computing edit distance" + }, + { + "name": "either", + "version": "1.9.0", + "authors": "bluss", + "repository": "/~https://github.com/bluss/either", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "The enum `Either` with variants `Left` and `Right` is a general purpose sum type with two cases." 
+ }, + { + "name": "encoding_rs", + "version": "0.8.33", + "authors": "Henri Sivonen ", + "repository": "/~https://github.com/hsivonen/encoding_rs", + "license": "(Apache-2.0 OR MIT) AND BSD-3-Clause", + "license_file": null, + "description": "A Gecko-oriented implementation of the Encoding Standard" + }, + { + "name": "enum-map", + "version": "1.1.1", + "authors": "Konrad Borowski ", + "repository": "https://gitlab.com/KonradBorowski/enum-map", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A map with C-like enum keys represented internally as an array" + }, + { + "name": "enum-map-derive", + "version": "0.6.0", + "authors": "Konrad Borowski ", + "repository": "https://gitlab.com/KonradBorowski/enum-map", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Macros 1.1 implementation of #[derive(Enum)]" + }, + { + "name": "env_filter", + "version": "0.1.0", + "authors": null, + "repository": "/~https://github.com/rust-cli/env_logger", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Filter log events using environment variables" + }, + { + "name": "env_logger", + "version": "0.9.3", + "authors": null, + "repository": "/~https://github.com/env-logger-rs/env_logger/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A logging implementation for `log` which is configured via an environment variable." + }, + { + "name": "env_logger", + "version": "0.10.0", + "authors": null, + "repository": "/~https://github.com/rust-cli/env_logger/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A logging implementation for `log` which is configured via an environment variable." + }, + { + "name": "env_logger", + "version": "0.11.2", + "authors": null, + "repository": "/~https://github.com/rust-cli/env_logger", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A logging implementation for `log` which is configured via an environment variable." + }, + { + "name": "equivalent", + "version": "1.0.1", + "authors": null, + "repository": "/~https://github.com/cuviper/equivalent", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Traits for key comparison in maps." + }, + { + "name": "errno", + "version": "0.3.8", + "authors": "Chris Wong ", + "repository": "/~https://github.com/lambda-fairy/rust-errno", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Cross-platform interface to the `errno` variable." + }, + { + "name": "exitcode", + "version": "1.1.2", + "authors": "Ben Wilber ", + "repository": "/~https://github.com/benwilber/exitcode", + "license": "Apache-2.0", + "license_file": null, + "description": "Preferred system exit codes as defined by sysexits.h" + }, + { + "name": "feature-probe", + "version": "0.1.1", + "authors": "Jesse A. Tov ", + "repository": "/~https://github.com/tov/feature-probe-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Probe for rustc features from build.rs" + }, + { + "name": "fffx", + "version": "0.1.3", + "authors": "Joseph Guhlin ", + "repository": "/~https://github.com/jguhlin/fffx", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Another fasta/q/x file format parser. Well fuzzed." 
+ }, + { + "name": "fixedbitset", + "version": "0.4.2", + "authors": "bluss", + "repository": "/~https://github.com/petgraph/fixedbitset", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "FixedBitSet is a simple bitset collection" + }, + { + "name": "flate2", + "version": "1.0.28", + "authors": "Alex Crichton |Josh Triplett ", + "repository": "/~https://github.com/rust-lang/flate2-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "DEFLATE compression and decompression exposed as Read/BufRead/Write streams. Supports miniz_oxide and multiple zlib implementations. Supports zlib, gzip, and raw deflate streams." + }, + { + "name": "fnv", + "version": "1.0.7", + "authors": "Alex Crichton ", + "repository": "/~https://github.com/servo/rust-fnv", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Fowler–Noll–Vo hash function" + }, + { + "name": "form_urlencoded", + "version": "1.2.0", + "authors": "The rust-url developers", + "repository": "/~https://github.com/servo/rust-url", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parser and serializer for the application/x-www-form-urlencoded syntax, as used by HTML forms." + }, + { + "name": "fs-utils", + "version": "1.1.4", + "authors": "Sebastian Thiel ", + "repository": "/~https://github.com/Byron/fs-utils-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Utilities to help working with the filesytem" + }, + { + "name": "futures-channel", + "version": "0.3.28", + "authors": null, + "repository": "/~https://github.com/rust-lang/futures-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Channels for asynchronous communication using futures-rs." + }, + { + "name": "futures-core", + "version": "0.3.28", + "authors": null, + "repository": "/~https://github.com/rust-lang/futures-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "The core traits and types in for the `futures` library." + }, + { + "name": "futures-io", + "version": "0.3.28", + "authors": null, + "repository": "/~https://github.com/rust-lang/futures-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "The `AsyncRead`, `AsyncWrite`, `AsyncSeek`, and `AsyncBufRead` traits for the futures-rs library." + }, + { + "name": "futures-macro", + "version": "0.3.28", + "authors": null, + "repository": "/~https://github.com/rust-lang/futures-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "The futures-rs procedural macro implementations." + }, + { + "name": "futures-sink", + "version": "0.3.28", + "authors": null, + "repository": "/~https://github.com/rust-lang/futures-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "The asynchronous `Sink` trait for the futures-rs library." + }, + { + "name": "futures-task", + "version": "0.3.28", + "authors": null, + "repository": "/~https://github.com/rust-lang/futures-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Tools for working with tasks." + }, + { + "name": "futures-util", + "version": "0.3.28", + "authors": null, + "repository": "/~https://github.com/rust-lang/futures-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Common utilities and extension traits for the futures-rs library." 
+ }, + { + "name": "fxhash", + "version": "0.2.1", + "authors": "cbreeden ", + "repository": "/~https://github.com/cbreeden/fxhash", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A fast, non-secure, hashing algorithm derived from an internal hasher used in FireFox and Rustc." + }, + { + "name": "getrandom", + "version": "0.2.10", + "authors": "The Rand Project Developers", + "repository": "/~https://github.com/rust-random/getrandom", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A small cross-platform library for retrieving random data from system source" + }, + { + "name": "getset", + "version": "0.1.2", + "authors": "Ana Hobden ", + "repository": "/~https://github.com/Hoverbear/getset", + "license": "MIT", + "license_file": null, + "description": "Getset, we're ready to go! A procedural macro for generating the most basic getters and setters on fields." + }, + { + "name": "gimli", + "version": "0.28.0", + "authors": null, + "repository": "/~https://github.com/gimli-rs/gimli", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A library for reading and writing the DWARF debugging format." + }, + { + "name": "glob", + "version": "0.3.1", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-lang/glob", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Support for matching file paths against Unix shell style patterns." + }, + { + "name": "h2", + "version": "0.3.21", + "authors": "Carl Lerche |Sean McArthur ", + "repository": "/~https://github.com/hyperium/h2", + "license": "MIT", + "license_file": null, + "description": "An HTTP/2 client and server" + }, + { + "name": "hashbrown", + "version": "0.12.3", + "authors": "Amanieu d'Antras ", + "repository": "/~https://github.com/rust-lang/hashbrown", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A Rust port of Google's SwissTable hash map" + }, + { + "name": "hashbrown", + "version": "0.14.0", + "authors": "Amanieu d'Antras ", + "repository": "/~https://github.com/rust-lang/hashbrown", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A Rust port of Google's SwissTable hash map" + }, + { + "name": "heck", + "version": "0.4.1", + "authors": "Without Boats ", + "repository": "/~https://github.com/withoutboats/heck", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "heck is a case conversion library." + }, + { + "name": "heck", + "version": "0.5.0", + "authors": null, + "repository": "/~https://github.com/withoutboats/heck", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "heck is a case conversion library." + }, + { + "name": "hermit-abi", + "version": "0.1.19", + "authors": "Stefan Lankes", + "repository": "/~https://github.com/hermitcore/libhermit-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "hermit-abi is small interface to call functions from the unikernel RustyHermit. It is used to build the target `x86_64-unknown-hermit`." + }, + { + "name": "hermit-abi", + "version": "0.3.2", + "authors": "Stefan Lankes", + "repository": "/~https://github.com/hermitcore/rusty-hermit", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Hermit system calls definitions." + }, + { + "name": "hiphase", + "version": "1.2.1", + "authors": "J. 
Matthew Holt ", + "repository": null, + "license": null, + "license_file": "LICENSE.md", + "description": "A tool for jointly phasing small, structural, and tandem repeat variants for PacBio sequencing data" + }, + { + "name": "hts-sys", + "version": "2.1.1", + "authors": "Christopher Schröder |Johannes Köster ", + "repository": "/~https://github.com/samtools/htslib.git", + "license": "MIT", + "license_file": null, + "description": "This library provides HTSlib bindings." + }, + { + "name": "http", + "version": "0.2.9", + "authors": "Alex Crichton |Carl Lerche |Sean McArthur ", + "repository": "/~https://github.com/hyperium/http", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A set of types for representing HTTP requests and responses." + }, + { + "name": "http-body", + "version": "0.4.5", + "authors": "Carl Lerche |Lucio Franco |Sean McArthur ", + "repository": "/~https://github.com/hyperium/http-body", + "license": "MIT", + "license_file": null, + "description": "Trait representing an asynchronous, streaming, HTTP request or response body." + }, + { + "name": "httparse", + "version": "1.8.0", + "authors": "Sean McArthur ", + "repository": "/~https://github.com/seanmonstar/httparse", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A tiny, safe, speedy, zero-copy HTTP/1.x parser." + }, + { + "name": "httpdate", + "version": "1.0.3", + "authors": "Pyfisch ", + "repository": "/~https://github.com/pyfisch/httpdate", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "HTTP date parsing and formatting" + }, + { + "name": "humantime", + "version": "2.1.0", + "authors": "Paul Colomiets ", + "repository": "/~https://github.com/tailhook/humantime", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A parser and formatter for std::time::{Duration, SystemTime}" + }, + { + "name": "hyper", + "version": "0.14.27", + "authors": "Sean McArthur ", + "repository": "/~https://github.com/hyperium/hyper", + "license": "MIT", + "license_file": null, + "description": "A fast and correct HTTP library." + }, + { + "name": "hyper-rustls", + "version": "0.24.1", + "authors": null, + "repository": "/~https://github.com/rustls/hyper-rustls", + "license": "Apache-2.0 OR ISC OR MIT", + "license_file": null, + "description": "Rustls+hyper integration for pure rust HTTPS" + }, + { + "name": "iana-time-zone", + "version": "0.1.57", + "authors": "Andrew Straw |René Kijewski |Ryan Lopopolo ", + "repository": "/~https://github.com/strawlab/iana-time-zone", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "get the IANA time zone for the current system" + }, + { + "name": "iana-time-zone-haiku", + "version": "0.1.2", + "authors": "René Kijewski ", + "repository": "/~https://github.com/strawlab/iana-time-zone", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "iana-time-zone support crate for Haiku OS" + }, + { + "name": "ident_case", + "version": "1.0.1", + "authors": "Ted Driggs ", + "repository": "/~https://github.com/TedDriggs/ident_case", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Utility for applying case rules to Rust identifiers." + }, + { + "name": "idna", + "version": "0.4.0", + "authors": "The rust-url developers", + "repository": "/~https://github.com/servo/rust-url/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "IDNA (Internationalizing Domain Names in Applications) and Punycode." 
+ }, + { + "name": "ieee754", + "version": "0.2.6", + "authors": "Huon Wilson ", + "repository": "/~https://github.com/huonw/ieee754", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Low-level manipulations of IEEE754 floating-point numbers." + }, + { + "name": "indexmap", + "version": "1.9.3", + "authors": null, + "repository": "/~https://github.com/bluss/indexmap", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A hash table with consistent order and fast iteration." + }, + { + "name": "indexmap", + "version": "2.0.0", + "authors": null, + "repository": "/~https://github.com/bluss/indexmap", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A hash table with consistent order and fast iteration." + }, + { + "name": "ipnet", + "version": "2.8.0", + "authors": "Kris Price ", + "repository": "/~https://github.com/krisprice/ipnet", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Provides types and useful methods for working with IPv4 and IPv6 network addresses, commonly called IP prefixes. The new `IpNet`, `Ipv4Net`, and `Ipv6Net` types build on the existing `IpAddr`, `Ipv4Addr`, and `Ipv6Addr` types already provided in Rust's standard library and align to their design to stay consistent. The module also provides useful traits that extend `Ipv4Addr` and `Ipv6Addr` with methods for `Add`, `Sub`, `BitAnd`, and `BitOr` operations. The module only uses stable feature so it is guaranteed to compile using the stable toolchain." + }, + { + "name": "is-terminal", + "version": "0.4.9", + "authors": "softprops |Dan Gohman ", + "repository": "/~https://github.com/sunfishcode/is-terminal", + "license": "MIT", + "license_file": null, + "description": "Test whether a given stream is a terminal" + }, + { + "name": "itertools", + "version": "0.11.0", + "authors": "bluss", + "repository": "/~https://github.com/rust-itertools/itertools", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Extra iterator adaptors, iterator methods, free functions, and macros." + }, + { + "name": "itertools", + "version": "0.12.1", + "authors": "bluss", + "repository": "/~https://github.com/rust-itertools/itertools", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Extra iterator adaptors, iterator methods, free functions, and macros." + }, + { + "name": "itertools-num", + "version": "0.1.3", + "authors": "bluss", + "repository": "/~https://github.com/bluss/itertools-num", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Numerical iterator tools. Extra iterators and iterator methods and functions." 
+ }, + { + "name": "itoa", + "version": "1.0.9", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/itoa", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Fast integer primitive to string conversion" + }, + { + "name": "jobserver", + "version": "0.1.26", + "authors": "Alex Crichton ", + "repository": "/~https://github.com/alexcrichton/jobserver-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "An implementation of the GNU make jobserver for Rust" + }, + { + "name": "js-sys", + "version": "0.3.64", + "authors": "The wasm-bindgen Developers", + "repository": "/~https://github.com/rustwasm/wasm-bindgen/tree/master/crates/js-sys", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bindings for all JS global objects and functions in all JS environments like Node.js and browsers, built on `#[wasm_bindgen]` using the `wasm-bindgen` crate." + }, + { + "name": "layout-rs", + "version": "0.1.2", + "authors": "Nadav Rotem ", + "repository": "/~https://github.com/nadavrot/layout", + "license": "MIT", + "license_file": null, + "description": "A graph visualization program" + }, + { + "name": "lazy_static", + "version": "1.4.0", + "authors": "Marvin Löbel ", + "repository": "/~https://github.com/rust-lang-nursery/lazy-static.rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A macro for declaring lazily evaluated statics in Rust." + }, + { + "name": "libc", + "version": "0.2.152", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-lang/libc", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Raw FFI bindings to platform libraries like libc." + }, + { + "name": "libm", + "version": "0.2.7", + "authors": "Jorge Aparicio ", + "repository": "/~https://github.com/rust-lang/libm", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "libm in pure Rust" + }, + { + "name": "libz-sys", + "version": "1.1.12", + "authors": "Alex Crichton |Josh Triplett |Sebastian Thiel ", + "repository": "/~https://github.com/rust-lang/libz-sys", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Low-level bindings to the system libz library (also known as zlib)." + }, + { + "name": "linear-map", + "version": "1.2.0", + "authors": "Andrew Paseltiner |Tobias Bucher ", + "repository": "/~https://github.com/contain-rs/linear-map", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A map implemented by searching linearly in a vector." + }, + { + "name": "linux-raw-sys", + "version": "0.4.12", + "authors": "Dan Gohman ", + "repository": "/~https://github.com/sunfishcode/linux-raw-sys", + "license": "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", + "license_file": null, + "description": "Generated bindings for Linux's userspace API" + }, + { + "name": "log", + "version": "0.4.20", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-lang/log", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A lightweight logging facade for Rust" + }, + { + "name": "matrixmultiply", + "version": "0.3.7", + "authors": "bluss|R. Janis Goldschmidt", + "repository": "/~https://github.com/bluss/matrixmultiply/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "General matrix multiplication for f32 and f64 matrices. 
Operates on matrices with general layout (they can use arbitrary row and column stride). Detects and uses AVX or SSE2 on x86 platforms transparently for higher performance. Uses a microkernel strategy, so that the implementation is easy to parallelize and optimize. Supports multithreading." + }, + { + "name": "memchr", + "version": "2.6.0", + "authors": "Andrew Gallant |bluss", + "repository": "/~https://github.com/BurntSushi/memchr", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Safe interface to memchr." + }, + { + "name": "mime", + "version": "0.3.17", + "authors": "Sean McArthur ", + "repository": "/~https://github.com/hyperium/mime", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Strongly Typed Mimes" + }, + { + "name": "minimap2", + "version": "0.1.16+minimap2.2.26", + "authors": "Joseph Guhlin ", + "repository": "/~https://github.com/jguhlin/minimap2-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bindings to libminimap2" + }, + { + "name": "minimap2-sys", + "version": "0.1.16+minimap2.2.26", + "authors": "Joseph Guhlin ", + "repository": "/~https://github.com/jguhlin/minimap2-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bindings to libminimap2" + }, + { + "name": "miniz_oxide", + "version": "0.7.1", + "authors": "Frommi |oyvindln ", + "repository": "/~https://github.com/Frommi/miniz_oxide/tree/master/miniz_oxide", + "license": "Apache-2.0 OR MIT OR Zlib", + "license_file": null, + "description": "DEFLATE compression and decompression library rewritten in Rust based on miniz" + }, + { + "name": "mio", + "version": "0.8.8", + "authors": "Carl Lerche |Thomas de Zeeuw |Tokio Contributors ", + "repository": "/~https://github.com/tokio-rs/mio", + "license": "MIT", + "license_file": null, + "description": "Lightweight non-blocking I/O." + }, + { + "name": "multimap", + "version": "0.8.3", + "authors": "Håvar Nøvik ", + "repository": "/~https://github.com/havarnov/multimap", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A multimap implementation." + }, + { + "name": "nalgebra", + "version": "0.29.0", + "authors": "Sébastien Crozet ", + "repository": "/~https://github.com/dimforge/nalgebra", + "license": "BSD-3-Clause", + "license_file": null, + "description": "General-purpose linear algebra library with transformations and statically-sized or dynamically-sized matrices." + }, + { + "name": "nalgebra-macros", + "version": "0.1.0", + "authors": "Andreas Longva|Sébastien Crozet ", + "repository": "/~https://github.com/dimforge/nalgebra", + "license": "Apache-2.0", + "license_file": null, + "description": "Procedural macros for nalgebra" + }, + { + "name": "ndarray", + "version": "0.15.6", + "authors": "Ulrik Sverdrup \"bluss\"|Jim Turner", + "repository": "/~https://github.com/rust-ndarray/ndarray", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "An n-dimensional array for general elements and for numerics. Lightweight array views and slicing; views support chunking and splitting." + }, + { + "name": "newtype_derive", + "version": "0.1.6", + "authors": "Daniel Keep ", + "repository": "/~https://github.com/DanielKeep/rust-custom-derive", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "This crate provides macros for deriving common traits for newtype structures." 
+ }, + { + "name": "num-complex", + "version": "0.4.4", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-num/num-complex", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Complex numbers implementation for Rust" + }, + { + "name": "num-conv", + "version": "0.1.0", + "authors": "Jacob Pratt ", + "repository": "/~https://github.com/jhpratt/num-conv", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "`num_conv` is a crate to convert between integer types without using `as` casts. This provides better certainty when refactoring, makes the exact behavior of code more explicit, and allows using turbofish syntax." + }, + { + "name": "num-integer", + "version": "0.1.45", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-num/num-integer", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Integer traits and functions" + }, + { + "name": "num-rational", + "version": "0.4.1", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-num/num-rational", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rational numbers implementation for Rust" + }, + { + "name": "num-traits", + "version": "0.2.16", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-num/num-traits", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Numeric traits for generic mathematics" + }, + { + "name": "num_cpus", + "version": "1.16.0", + "authors": "Sean McArthur ", + "repository": "/~https://github.com/seanmonstar/num_cpus", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Get the number of CPUs on a machine." + }, + { + "name": "num_threads", + "version": "0.1.6", + "authors": "Jacob Pratt ", + "repository": "/~https://github.com/jhpratt/num_threads", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A minimal library that determines the number of running threads for the current process." + }, + { + "name": "object", + "version": "0.32.0", + "authors": null, + "repository": "/~https://github.com/gimli-rs/object", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A unified interface for reading and writing object file formats." + }, + { + "name": "once_cell", + "version": "1.18.0", + "authors": "Aleksey Kladov ", + "repository": "/~https://github.com/matklad/once_cell", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Single assignment cells and lazy values." 
+ }, + { + "name": "ordered-float", + "version": "3.9.1", + "authors": "Jonathan Reem |Matt Brubeck ", + "repository": "/~https://github.com/reem/rust-ordered-float", + "license": "MIT", + "license_file": null, + "description": "Wrappers for total ordering on floats" + }, + { + "name": "paste", + "version": "1.0.14", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/paste", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Macros for all your token pasting needs" + }, + { + "name": "pbstarphase", + "version": "1.0.0", + "authors": null, + "repository": null, + "license": null, + "license_file": null, + "description": null + }, + { + "name": "percent-encoding", + "version": "2.3.0", + "authors": "The rust-url developers", + "repository": "/~https://github.com/servo/rust-url/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Percent encoding and decoding" + }, + { + "name": "petgraph", + "version": "0.6.4", + "authors": "bluss|mitchmindtree", + "repository": "/~https://github.com/petgraph/petgraph", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Graph data structure library. Provides graph types and graph algorithms." + }, + { + "name": "pin-project-lite", + "version": "0.2.13", + "authors": null, + "repository": "/~https://github.com/taiki-e/pin-project-lite", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A lightweight version of pin-project written with declarative macros." + }, + { + "name": "pin-utils", + "version": "0.1.0", + "authors": "Josef Brandl ", + "repository": "/~https://github.com/rust-lang-nursery/pin-utils", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Utilities for pinning" + }, + { + "name": "pkg-config", + "version": "0.3.27", + "authors": "Alex Crichton ", + "repository": "/~https://github.com/rust-lang/pkg-config-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A library to run the pkg-config system tool at build time in order to be used in Cargo build scripts." + }, + { + "name": "powerfmt", + "version": "0.2.0", + "authors": "Jacob Pratt ", + "repository": "/~https://github.com/jhpratt/powerfmt", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "`powerfmt` is a library that provides utilities for formatting values. This crate makes it significantly easier to support filling to a minimum width with alignment, avoid heap allocation, and avoid repetitive calculations." + }, + { + "name": "ppv-lite86", + "version": "0.2.17", + "authors": "The CryptoCorrosion Contributors", + "repository": "/~https://github.com/cryptocorrosion/cryptocorrosion", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Implementation of the crypto-simd API for x86" + }, + { + "name": "priority-queue", + "version": "1.3.2", + "authors": "Gianmarco Garrisi ", + "repository": "/~https://github.com/garro95/priority-queue", + "license": "LGPL-3.0 OR MPL-2.0", + "license_file": null, + "description": "A Priority Queue implemented as a heap with a function to efficiently change the priority of an item." 
+ }, + { + "name": "proc-macro-error", + "version": "1.0.4", + "authors": "CreepySkeleton ", + "repository": "https://gitlab.com/CreepySkeleton/proc-macro-error", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Almost drop-in replacement to panics in proc-macros" + }, + { + "name": "proc-macro-error-attr", + "version": "1.0.4", + "authors": "CreepySkeleton ", + "repository": "https://gitlab.com/CreepySkeleton/proc-macro-error", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Attribute macro for proc-macro-error crate" + }, + { + "name": "proc-macro2", + "version": "1.0.86", + "authors": "David Tolnay |Alex Crichton ", + "repository": "/~https://github.com/dtolnay/proc-macro2", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A substitute implementation of the compiler's `proc_macro` API to decouple token-based libraries from the procedural macro use case." + }, + { + "name": "quick-error", + "version": "1.2.3", + "authors": "Paul Colomiets |Colin Kiegel ", + "repository": "http://github.com/tailhook/quick-error", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A macro which makes error types pleasant to write." + }, + { + "name": "quick-xml", + "version": "0.36.1", + "authors": null, + "repository": "/~https://github.com/tafia/quick-xml", + "license": "MIT", + "license_file": null, + "description": "High performance xml reader and writer" + }, + { + "name": "quote", + "version": "1.0.36", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/quote", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Quasi-quoting macro quote!(...)" + }, + { + "name": "rand", + "version": "0.8.5", + "authors": "The Rand Project Developers|The Rust Project Developers", + "repository": "/~https://github.com/rust-random/rand", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Random number generators and other randomness functionality." + }, + { + "name": "rand_chacha", + "version": "0.3.1", + "authors": "The Rand Project Developers|The Rust Project Developers|The CryptoCorrosion Contributors", + "repository": "/~https://github.com/rust-random/rand", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "ChaCha random number generator" + }, + { + "name": "rand_core", + "version": "0.6.4", + "authors": "The Rand Project Developers|The Rust Project Developers", + "repository": "/~https://github.com/rust-random/rand", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Core random number generator traits and tools for implementation." + }, + { + "name": "rand_distr", + "version": "0.4.3", + "authors": "The Rand Project Developers", + "repository": "/~https://github.com/rust-random/rand", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Sampling from random number distributions" + }, + { + "name": "rawpointer", + "version": "0.2.1", + "authors": "bluss", + "repository": "/~https://github.com/bluss/rawpointer/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Extra methods for raw pointers and `NonNull`. For example `.post_inc()` and `.pre_dec()` (c.f. `ptr++` and `--ptr`), `offset` and `add` for `NonNull`, and the function `ptrdistance`." 
+ }, + { + "name": "regex", + "version": "1.9.5", + "authors": "The Rust Project Developers|Andrew Gallant ", + "repository": "/~https://github.com/rust-lang/regex", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "An implementation of regular expressions for Rust. This implementation uses finite automata and guarantees linear time matching on all inputs." + }, + { + "name": "regex-automata", + "version": "0.3.8", + "authors": "The Rust Project Developers|Andrew Gallant ", + "repository": "/~https://github.com/rust-lang/regex/tree/master/regex-automata", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Automata construction and matching using regular expressions." + }, + { + "name": "regex-syntax", + "version": "0.7.5", + "authors": "The Rust Project Developers|Andrew Gallant ", + "repository": "/~https://github.com/rust-lang/regex/tree/master/regex-syntax", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A regular expression parser." + }, + { + "name": "reqwest", + "version": "0.11.20", + "authors": "Sean McArthur ", + "repository": "/~https://github.com/seanmonstar/reqwest", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "higher level HTTP client library" + }, + { + "name": "ring", + "version": "0.16.20", + "authors": "Brian Smith ", + "repository": "/~https://github.com/briansmith/ring", + "license": null, + "license_file": "LICENSE", + "description": "Safe, fast, small crypto using Rust." + }, + { + "name": "rust-htslib", + "version": "0.39.5", + "authors": "Christopher Schröder |Johannes Köster ", + "repository": "/~https://github.com/rust-bio/rust-htslib.git", + "license": "MIT", + "license_file": null, + "description": "This library provides HTSlib bindings and a high level Rust API for reading and writing BAM files." + }, + { + "name": "rust-htslib", + "version": "0.44.1", + "authors": "Christopher Schröder |Johannes Köster ", + "repository": "/~https://github.com/rust-bio/rust-htslib.git", + "license": "MIT", + "license_file": null, + "description": "This library provides HTSlib bindings and a high level Rust API for reading and writing BAM files." + }, + { + "name": "rust-lib-reference-genome", + "version": "0.2.0", + "authors": null, + "repository": null, + "license": null, + "license_file": null, + "description": null + }, + { + "name": "rustc-demangle", + "version": "0.1.23", + "authors": "Alex Crichton ", + "repository": "/~https://github.com/alexcrichton/rustc-demangle", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rust compiler symbol demangling." 
+ }, + { + "name": "rustc-hash", + "version": "1.1.0", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-lang-nursery/rustc-hash", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "speed, non-cryptographic hash used in rustc" + }, + { + "name": "rustc_version", + "version": "0.1.7", + "authors": "Marvin Löbel ", + "repository": "/~https://github.com/Kimundi/rustc-version-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A library for querying the version of a installed rustc compiler" + }, + { + "name": "rustix", + "version": "0.38.30", + "authors": "Dan Gohman |Jakub Konka ", + "repository": "/~https://github.com/bytecodealliance/rustix", + "license": "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", + "license_file": null, + "description": "Safe Rust bindings to POSIX/Unix/Linux/Winsock-like syscalls" + }, + { + "name": "rustls", + "version": "0.21.7", + "authors": null, + "repository": "/~https://github.com/rustls/rustls", + "license": "Apache-2.0 OR ISC OR MIT", + "license_file": null, + "description": "Rustls is a modern TLS library written in Rust." + }, + { + "name": "rustls-pemfile", + "version": "1.0.3", + "authors": null, + "repository": "/~https://github.com/rustls/pemfile", + "license": "Apache-2.0 OR ISC OR MIT", + "license_file": null, + "description": "Basic .pem file parser for keys and certificates" + }, + { + "name": "rustls-webpki", + "version": "0.101.4", + "authors": null, + "repository": "/~https://github.com/rustls/webpki", + "license": "ISC", + "license_file": null, + "description": "Web PKI X.509 Certificate Verification." + }, + { + "name": "rustversion", + "version": "1.0.14", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/rustversion", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Conditional compilation according to rustc compiler version" + }, + { + "name": "ryu", + "version": "1.0.15", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/ryu", + "license": "Apache-2.0 OR BSL-1.0", + "license_file": null, + "description": "Fast floating point to string conversion" + }, + { + "name": "safe_arch", + "version": "0.7.1", + "authors": "Lokathor ", + "repository": "/~https://github.com/Lokathor/safe_arch", + "license": "Apache-2.0 OR MIT OR Zlib", + "license_file": null, + "description": "Crate that exposes `core::arch` safely via `#[cfg()]`." + }, + { + "name": "sct", + "version": "0.7.0", + "authors": "Joseph Birr-Pixton ", + "repository": "/~https://github.com/ctz/sct.rs", + "license": "Apache-2.0 OR ISC OR MIT", + "license_file": null, + "description": "Certificate transparency SCT verification library" + }, + { + "name": "semver", + "version": "0.1.20", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/rust-lang/semver", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Semantic version parsing and comparison." 
+ }, + { + "name": "serde", + "version": "1.0.188", + "authors": "Erick Tryzelaar |David Tolnay ", + "repository": "/~https://github.com/serde-rs/serde", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A generic serialization/deserialization framework" + }, + { + "name": "serde_derive", + "version": "1.0.188", + "authors": "Erick Tryzelaar |David Tolnay ", + "repository": "/~https://github.com/serde-rs/serde", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Macros 1.1 implementation of #[derive(Serialize, Deserialize)]" + }, + { + "name": "serde_json", + "version": "1.0.105", + "authors": "Erick Tryzelaar |David Tolnay ", + "repository": "/~https://github.com/serde-rs/json", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A JSON serialization file format" + }, + { + "name": "serde_urlencoded", + "version": "0.7.1", + "authors": "Anthony Ramine ", + "repository": "/~https://github.com/nox/serde_urlencoded", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "`x-www-form-urlencoded` meets Serde" + }, + { + "name": "simba", + "version": "0.6.0", + "authors": "sebcrozet ", + "repository": "/~https://github.com/dimforge/simba", + "license": "Apache-2.0", + "license_file": null, + "description": "SIMD algebra for Rust" + }, + { + "name": "simdutf8", + "version": "0.1.4", + "authors": "Hans Kratz ", + "repository": "/~https://github.com/rusticstuff/simdutf8", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "SIMD-accelerated UTF-8 validation." + }, + { + "name": "simple-error", + "version": "0.2.3", + "authors": "Wangshan Lu ", + "repository": "/~https://github.com/WiSaGaN/simple-error.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple error type backed by a string" + }, + { + "name": "simple-error", + "version": "0.3.1", + "authors": null, + "repository": "/~https://github.com/WiSaGaN/simple-error.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple error type backed by a string" + }, + { + "name": "slab", + "version": "0.4.9", + "authors": "Carl Lerche ", + "repository": "/~https://github.com/tokio-rs/slab", + "license": "MIT", + "license_file": null, + "description": "Pre-allocated storage for a uniform data type" + }, + { + "name": "socket2", + "version": "0.4.9", + "authors": "Alex Crichton |Thomas de Zeeuw ", + "repository": "/~https://github.com/rust-lang/socket2", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Utilities for handling networking sockets with a maximal amount of configuration possible intended." + }, + { + "name": "socket2", + "version": "0.5.3", + "authors": "Alex Crichton |Thomas de Zeeuw ", + "repository": "/~https://github.com/rust-lang/socket2", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Utilities for handling networking sockets with a maximal amount of configuration possible intended." + }, + { + "name": "spin", + "version": "0.5.2", + "authors": "Mathijs van de Nes |John Ericson ", + "repository": "/~https://github.com/mvdnes/spin-rs.git", + "license": "MIT", + "license_file": null, + "description": "Synchronization primitives based on spinning. They may contain data, are usable without `std`, and static initializers are available." 
+ }, + { + "name": "static_assertions", + "version": "1.1.0", + "authors": "Nikolai Vazquez", + "repository": "/~https://github.com/nvzqz/static-assertions-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Compile-time assertions to ensure that invariants are met." + }, + { + "name": "statrs", + "version": "0.16.0", + "authors": "Michael Ma", + "repository": "/~https://github.com/boxtown/statrs", + "license": "MIT", + "license_file": null, + "description": "Statistical computing library for Rust" + }, + { + "name": "strsim", + "version": "0.10.0", + "authors": "Danny Guo ", + "repository": "/~https://github.com/dguo/strsim-rs", + "license": "MIT", + "license_file": null, + "description": "Implementations of string similarity metrics. Includes Hamming, Levenshtein, OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice." + }, + { + "name": "strum", + "version": "0.25.0", + "authors": "Peter Glotfelty ", + "repository": "/~https://github.com/Peternator7/strum", + "license": "MIT", + "license_file": null, + "description": "Helpful macros for working with enums and strings" + }, + { + "name": "strum", + "version": "0.26.2", + "authors": "Peter Glotfelty ", + "repository": "/~https://github.com/Peternator7/strum", + "license": "MIT", + "license_file": null, + "description": "Helpful macros for working with enums and strings" + }, + { + "name": "strum_macros", + "version": "0.25.2", + "authors": "Peter Glotfelty ", + "repository": "/~https://github.com/Peternator7/strum", + "license": "MIT", + "license_file": null, + "description": "Helpful macros for working with enums and strings" + }, + { + "name": "strum_macros", + "version": "0.26.4", + "authors": "Peter Glotfelty ", + "repository": "/~https://github.com/Peternator7/strum", + "license": "MIT", + "license_file": null, + "description": "Helpful macros for working with enums and strings" + }, + { + "name": "syn", + "version": "1.0.109", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/syn", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parser for Rust source code" + }, + { + "name": "syn", + "version": "2.0.74", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/syn", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parser for Rust source code" + }, + { + "name": "termcolor", + "version": "1.2.0", + "authors": "Andrew Gallant ", + "repository": "/~https://github.com/BurntSushi/termcolor", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "A simple cross platform library for writing colored text to a terminal." + }, + { + "name": "thiserror", + "version": "1.0.63", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/thiserror", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "derive(Error)" + }, + { + "name": "thiserror-impl", + "version": "1.0.63", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/thiserror", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Implementation detail of the `thiserror` crate" + }, + { + "name": "threadpool", + "version": "1.8.1", + "authors": "The Rust Project Developers|Corey Farwell |Stefan Schindler ", + "repository": "/~https://github.com/rust-threadpool/rust-threadpool", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A thread pool for running a number of jobs on a fixed set of worker threads." 
+ }, + { + "name": "time", + "version": "0.1.45", + "authors": "The Rust Project Developers", + "repository": "/~https://github.com/time-rs/time", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Utilities for working with time-related functions in Rust." + }, + { + "name": "time", + "version": "0.3.36", + "authors": "Jacob Pratt |Time contributors", + "repository": "/~https://github.com/time-rs/time", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Date and time library. Fully interoperable with the standard library. Mostly compatible with #![no_std]." + }, + { + "name": "time-core", + "version": "0.1.2", + "authors": "Jacob Pratt |Time contributors", + "repository": "/~https://github.com/time-rs/time", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "This crate is an implementation detail and should not be relied upon directly." + }, + { + "name": "time-macros", + "version": "0.2.18", + "authors": "Jacob Pratt |Time contributors", + "repository": "/~https://github.com/time-rs/time", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Procedural macros for the time crate. This crate is an implementation detail and should not be relied upon directly." + }, + { + "name": "tinyvec", + "version": "1.6.0", + "authors": "Lokathor ", + "repository": "/~https://github.com/Lokathor/tinyvec", + "license": "Apache-2.0 OR MIT OR Zlib", + "license_file": null, + "description": "`tinyvec` provides 100% safe vec-like data structures." + }, + { + "name": "tinyvec_macros", + "version": "0.1.1", + "authors": "Soveu ", + "repository": "/~https://github.com/Soveu/tinyvec_macros", + "license": "Apache-2.0 OR MIT OR Zlib", + "license_file": null, + "description": "Some macros for tiny containers" + }, + { + "name": "tokio", + "version": "1.32.0", + "authors": "Tokio Contributors ", + "repository": "/~https://github.com/tokio-rs/tokio", + "license": "MIT", + "license_file": null, + "description": "An event-driven, non-blocking I/O platform for writing asynchronous I/O backed applications." + }, + { + "name": "tokio-rustls", + "version": "0.24.1", + "authors": null, + "repository": "/~https://github.com/rustls/tokio-rustls", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Asynchronous TLS/SSL streams for Tokio using Rustls." + }, + { + "name": "tokio-util", + "version": "0.7.8", + "authors": "Tokio Contributors ", + "repository": "/~https://github.com/tokio-rs/tokio", + "license": "MIT", + "license_file": null, + "description": "Additional utilities for working with Tokio." + }, + { + "name": "tower-service", + "version": "0.3.2", + "authors": "Tower Maintainers ", + "repository": "/~https://github.com/tower-rs/tower", + "license": "MIT", + "license_file": null, + "description": "Trait representing an asynchronous, request / response based, client or server." + }, + { + "name": "tracing", + "version": "0.1.37", + "authors": "Eliza Weisman |Tokio Contributors ", + "repository": "/~https://github.com/tokio-rs/tracing", + "license": "MIT", + "license_file": null, + "description": "Application-level tracing for Rust." + }, + { + "name": "tracing-core", + "version": "0.1.31", + "authors": "Tokio Contributors ", + "repository": "/~https://github.com/tokio-rs/tracing", + "license": "MIT", + "license_file": null, + "description": "Core primitives for application-level tracing." 
+ }, + { + "name": "triple_accel", + "version": "0.4.0", + "authors": "c0deb0t ", + "repository": "/~https://github.com/Daniel-Liu-c0deb0t/triple_accel", + "license": "MIT", + "license_file": null, + "description": "Rust edit distance routines accelerated using SIMD. Supports fast Hamming, Levenshtein, restricted Damerau-Levenshtein, etc. distance calculations and string search." + }, + { + "name": "try-lock", + "version": "0.2.4", + "authors": "Sean McArthur ", + "repository": "/~https://github.com/seanmonstar/try-lock", + "license": "MIT", + "license_file": null, + "description": "A lightweight atomic lock." + }, + { + "name": "typenum", + "version": "1.16.0", + "authors": "Paho Lurie-Gregg |Andre Bogus ", + "repository": "/~https://github.com/paholg/typenum", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Typenum is a Rust library for type-level numbers evaluated at compile time. It currently supports bits, unsigned integers, and signed integers. It also provides a type-level array of type-level numbers, but its implementation is incomplete." + }, + { + "name": "unicode-bidi", + "version": "0.3.13", + "authors": "The Servo Project Developers", + "repository": "/~https://github.com/servo/unicode-bidi", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Implementation of the Unicode Bidirectional Algorithm" + }, + { + "name": "unicode-ident", + "version": "1.0.11", + "authors": "David Tolnay ", + "repository": "/~https://github.com/dtolnay/unicode-ident", + "license": "(MIT OR Apache-2.0) AND Unicode-DFS-2016", + "license_file": null, + "description": "Determine whether characters have the XID_Start or XID_Continue properties according to Unicode Standard Annex #31" + }, + { + "name": "unicode-normalization", + "version": "0.1.22", + "authors": "kwantam |Manish Goregaokar ", + "repository": "/~https://github.com/unicode-rs/unicode-normalization", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "This crate provides functions for normalization of Unicode strings, including Canonical and Compatible Decomposition and Recomposition, as described in Unicode Standard Annex #15." + }, + { + "name": "untrusted", + "version": "0.7.1", + "authors": "Brian Smith ", + "repository": "/~https://github.com/briansmith/untrusted", + "license": "ISC", + "license_file": null, + "description": "Safe, fast, zero-panic, zero-crashing, zero-allocation parsing of untrusted inputs in Rust." + }, + { + "name": "url", + "version": "2.4.1", + "authors": "The rust-url developers", + "repository": "/~https://github.com/servo/rust-url", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "URL library for Rust, based on the WHATWG URL Standard" + }, + { + "name": "utf8parse", + "version": "0.2.1", + "authors": "Joe Wilm |Christian Duerr ", + "repository": "/~https://github.com/alacritty/vte", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Table-driven UTF-8 parser" + }, + { + "name": "vcpkg", + "version": "0.2.15", + "authors": "Jim McGrath ", + "repository": "/~https://github.com/mcgoo/vcpkg-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A library to find native dependencies in a vcpkg tree at build time in order to be used in Cargo build scripts." 
+ }, + { + "name": "vec_map", + "version": "0.8.2", + "authors": "Alex Crichton |Jorge Aparicio |Alexis Beingessner |Brian Anderson <>|tbu- <>|Manish Goregaokar <>|Aaron Turon |Adolfo Ochagavía <>|Niko Matsakis <>|Steven Fackler <>|Chase Southwood |Eduard Burtescu <>|Florian Wilkens <>|Félix Raimundo <>|Tibor Benke <>|Markus Siemens |Josh Branchaud |Huon Wilson |Corey Farwell |Aaron Liblong <>|Nick Cameron |Patrick Walton |Felix S Klock II <>|Andrew Paseltiner |Sean McArthur |Vadim Petrochenkov <>", + "repository": "/~https://github.com/contain-rs/vec-map", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple map based on a vector for small integer keys" + }, + { + "name": "vergen", + "version": "8.2.4", + "authors": "Jason Ozias ", + "repository": "/~https://github.com/rustyhorde/vergen", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Generate 'cargo:rustc-env' instructions via 'build.rs' for use in your code via the 'env!' macro" + }, + { + "name": "version_check", + "version": "0.9.4", + "authors": "Sergio Benitez ", + "repository": "/~https://github.com/SergioBenitez/version_check", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Tiny crate to check the version of the installed/running rustc." + }, + { + "name": "waffle_con", + "version": "0.4.2", + "authors": null, + "repository": null, + "license": null, + "license_file": null, + "description": null + }, + { + "name": "want", + "version": "0.3.1", + "authors": "Sean McArthur ", + "repository": "/~https://github.com/seanmonstar/want", + "license": "MIT", + "license_file": null, + "description": "Detect when another Future wants a result." + }, + { + "name": "wasi", + "version": "0.10.0+wasi-snapshot-preview1", + "authors": "The Cranelift Project Developers", + "repository": "/~https://github.com/bytecodealliance/wasi", + "license": "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", + "license_file": null, + "description": "Experimental WASI API bindings for Rust" + }, + { + "name": "wasi", + "version": "0.11.0+wasi-snapshot-preview1", + "authors": "The Cranelift Project Developers", + "repository": "/~https://github.com/bytecodealliance/wasi", + "license": "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", + "license_file": null, + "description": "Experimental WASI API bindings for Rust" + }, + { + "name": "wasm-bindgen", + "version": "0.2.87", + "authors": "The wasm-bindgen Developers", + "repository": "/~https://github.com/rustwasm/wasm-bindgen", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Easy support for interacting between JS and Rust." 
+ }, + { + "name": "wasm-bindgen-backend", + "version": "0.2.87", + "authors": "The wasm-bindgen Developers", + "repository": "/~https://github.com/rustwasm/wasm-bindgen/tree/master/crates/backend", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Backend code generation of the wasm-bindgen tool" + }, + { + "name": "wasm-bindgen-futures", + "version": "0.4.37", + "authors": "The wasm-bindgen Developers", + "repository": "/~https://github.com/rustwasm/wasm-bindgen/tree/master/crates/futures", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bridging the gap between Rust Futures and JavaScript Promises" + }, + { + "name": "wasm-bindgen-macro", + "version": "0.2.87", + "authors": "The wasm-bindgen Developers", + "repository": "/~https://github.com/rustwasm/wasm-bindgen/tree/master/crates/macro", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Definition of the `#[wasm_bindgen]` attribute, an internal dependency" + }, + { + "name": "wasm-bindgen-macro-support", + "version": "0.2.87", + "authors": "The wasm-bindgen Developers", + "repository": "/~https://github.com/rustwasm/wasm-bindgen/tree/master/crates/macro-support", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "The part of the implementation of the `#[wasm_bindgen]` attribute that is not in the shared backend crate" + }, + { + "name": "wasm-bindgen-shared", + "version": "0.2.87", + "authors": "The wasm-bindgen Developers", + "repository": "/~https://github.com/rustwasm/wasm-bindgen/tree/master/crates/shared", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Shared support between wasm-bindgen and wasm-bindgen cli, an internal dependency." + }, + { + "name": "web-sys", + "version": "0.3.64", + "authors": "The wasm-bindgen Developers", + "repository": "/~https://github.com/rustwasm/wasm-bindgen/tree/master/crates/web-sys", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bindings for all Web APIs, a procedurally generated crate from WebIDL" + }, + { + "name": "webpki-roots", + "version": "0.25.2", + "authors": null, + "repository": "/~https://github.com/rustls/webpki-roots", + "license": "MPL-2.0", + "license_file": null, + "description": "Mozilla's CA root certificates for use with webpki" + }, + { + "name": "wide", + "version": "0.7.11", + "authors": "Lokathor ", + "repository": "/~https://github.com/Lokathor/wide", + "license": "Apache-2.0 OR MIT OR Zlib", + "license_file": null, + "description": "A crate to help you go wide." + }, + { + "name": "winapi", + "version": "0.3.9", + "authors": "Peter Atashian ", + "repository": "/~https://github.com/retep998/winapi-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Raw FFI bindings for all of Windows API." + }, + { + "name": "winapi-i686-pc-windows-gnu", + "version": "0.4.0", + "authors": "Peter Atashian ", + "repository": "/~https://github.com/retep998/winapi-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import libraries for the i686-pc-windows-gnu target. Please don't use this crate directly, depend on winapi instead." + }, + { + "name": "winapi-util", + "version": "0.1.5", + "authors": "Andrew Gallant ", + "repository": "/~https://github.com/BurntSushi/winapi-util", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "A dumping ground for high level safe wrappers over winapi." 
+ }, + { + "name": "winapi-x86_64-pc-windows-gnu", + "version": "0.4.0", + "authors": "Peter Atashian ", + "repository": "/~https://github.com/retep998/winapi-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import libraries for the x86_64-pc-windows-gnu target. Please don't use this crate directly, depend on winapi instead." + }, + { + "name": "windows", + "version": "0.48.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rust for Windows" + }, + { + "name": "windows-sys", + "version": "0.48.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rust for Windows" + }, + { + "name": "windows-sys", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rust for Windows" + }, + { + "name": "windows-targets", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import libs for Windows" + }, + { + "name": "windows-targets", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import libs for Windows" + }, + { + "name": "windows_aarch64_gnullvm", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_aarch64_gnullvm", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_aarch64_msvc", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_aarch64_msvc", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_i686_gnu", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_i686_gnu", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_i686_msvc", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_i686_msvc", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + 
"description": "Import lib for Windows" + }, + { + "name": "windows_x86_64_gnu", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_x86_64_gnu", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_x86_64_gnullvm", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_x86_64_gnullvm", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_x86_64_msvc", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_x86_64_msvc", + "version": "0.52.0", + "authors": "Microsoft", + "repository": "/~https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "winreg", + "version": "0.50.0", + "authors": "Igor Shaula ", + "repository": "/~https://github.com/gentoo90/winreg-rs", + "license": "MIT", + "license_file": null, + "description": "Rust bindings to MS Windows Registry API" + }, + { + "name": "zip", + "version": "0.6.6", + "authors": "Mathijs van de Nes |Marli Frost |Ryan Levick ", + "repository": "/~https://github.com/zip-rs/zip.git", + "license": "MIT", + "license_file": null, + "description": "Library to support the reading and writing of zip files." + } +] diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..c86d710 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,15 @@ +# Pacific Biosciences Software License Agreement +1. **Introduction and Acceptance.** This Software License Agreement (this “**Agreement**”) is a legal agreement between you (either an individual or an entity) and Pacific Biosciences of California, Inc. (“**PacBio**”) regarding the use of the PacBio software accompanying this Agreement, which includes documentation provided in “online” or electronic form (together, the “**Software**”). PACBIO PROVIDES THE SOFTWARE SOLELY ON THE TERMS AND CONDITIONS SET FORTH IN THIS AGREEMENT AND ON THE CONDITION THAT YOU ACCEPT AND COMPLY WITH THEM. BY DOWNLOADING, DISTRIBUTING, MODIFYING OR OTHERWISE USING THE SOFTWARE, YOU (A) ACCEPT THIS AGREEMENT AND AGREE THAT YOU ARE LEGALLY BOUND BY ITS TERMS; AND (B) REPRESENT AND WARRANT THAT: (I) YOU ARE OF LEGAL AGE TO ENTER INTO A BINDING AGREEMENT; AND (II) IF YOU REPRESENT A CORPORATION, GOVERNMENTAL ORGANIZATION OR OTHER LEGAL ENTITY, YOU HAVE THE RIGHT, POWER AND AUTHORITY TO ENTER INTO THIS AGREEMENT ON BEHALF OF SUCH ENTITY AND BIND SUCH ENTITY TO THESE TERMS. IF YOU DO NOT AGREE TO THE TERMS OF THIS AGREEMENT, PACBIO WILL NOT AND DOES NOT LICENSE THE SOFTWARE TO YOU AND YOU MUST NOT DOWNLOAD, INSTALL OR OTHERWISE USE THE SOFTWARE OR DOCUMENTATION. +2. 
**Grant of License.** Subject to your compliance with the restrictions set forth in this Agreement, PacBio hereby grants to you a non-exclusive, non-transferable license during the Term to install, copy, use, distribute in binary form only, and host the Software. If you received the Software from PacBio in source code format, you may also modify and/or compile the Software.
+3. **License Restrictions.** You may not remove or destroy any copyright notices or other proprietary markings. You may only use the Software to process or analyze data generated on a PacBio instrument or otherwise provided to you by PacBio. Any use, modification, translation, or compilation of the Software not expressly authorized in Section 2 is prohibited. You may not use, modify, host, or distribute the Software so that any part of the Software becomes subject to any license that requires, as a condition of use, modification, hosting, or distribution, that (a) the Software, in whole or in part, be disclosed or distributed in source code form or (b) any third party have the right to modify the Software, in whole or in part.
+4. **Ownership.** The license granted to you in Section 2 is not a transfer or sale of PacBio’s ownership rights in or to the Software. Except for the license granted in Section 2, PacBio retains all right, title and interest (including all intellectual property rights) in and to the Software. The Software is protected by applicable intellectual property laws, including United States copyright laws and international treaties.
+5. **Third Party Materials.** The Software may include software, content, data or other materials, including related documentation and open source software, that are owned by one or more third parties and that are subject to separate license terms (“**Third-Party Licenses**”). A list of all such materials, if any, can be found in the documentation for the Software. You acknowledge and agree that such third party materials subject to Third-Party Licenses are not licensed to you pursuant to the provisions of this Agreement and that this Agreement shall not be construed to grant any such right and/or license. You shall have only such rights and/or licenses, if any, to use such third party materials as set forth in the applicable Third-Party Licenses.
+6. **Feedback.** If you provide any feedback to PacBio concerning the functionality and performance of the Software, including identifying potential errors and improvements (“**Feedback**”), such Feedback shall be owned by PacBio. You hereby assign to PacBio all right, title, and interest in and to the Feedback, and PacBio is free to use the Feedback without any payment or restriction.
+7. **Confidentiality.** You must hold in the strictest confidence the Software and any related materials or information including, but not limited to, any Feedback, technical data, research, product plans, or know-how provided by PacBio to you, directly or indirectly in writing, orally or by inspection of tangible objects (“**Confidential Information**”). You will not disclose any Confidential Information to third parties, including any of your employees who do not have a need to know such information, and you will take reasonable measures to protect the secrecy of, and to avoid disclosure and unauthorized use of, the Confidential Information. You will immediately notify PacBio in the event of any unauthorized or suspected use or disclosure of the Confidential Information. To protect the Confidential Information contained in the Software, you may not reverse engineer, decompile, or disassemble the Software, except to the extent the foregoing restriction is expressly prohibited by applicable law.
+8. **Termination.** This Agreement will terminate upon the earlier of: (a) your failure to comply with any term of this Agreement; or (b) return, destruction, or deletion of all copies of the Software in your possession. PacBio’s rights and your obligations will survive the termination of this Agreement. The “**Term**” means the period beginning when this Agreement becomes effective and ending upon the termination of this Agreement. Upon termination of this Agreement for any reason, you will delete from all of your computer libraries or storage devices or otherwise destroy all copies of the Software and derivatives thereof.
+9. **NO OTHER WARRANTIES.** THE SOFTWARE IS PROVIDED ON AN “AS IS” BASIS. YOU ASSUME ALL RESPONSIBILITIES FOR SELECTION OF THE SOFTWARE TO ACHIEVE YOUR INTENDED RESULTS, AND FOR THE INSTALLATION OF, USE OF, AND RESULTS OBTAINED FROM THE SOFTWARE. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, PACBIO DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY, QUALITY, ACCURACY, TITLE, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE WITH RESPECT TO THE SOFTWARE AND THE ACCOMPANYING WRITTEN MATERIALS. THERE IS NO WARRANTY AGAINST INTERFERENCE WITH THE ENJOYMENT OF THE SOFTWARE OR AGAINST INFRINGEMENT. THERE IS NO WARRANTY THAT THE SOFTWARE OR PACBIO’S EFFORTS WILL FULFILL ANY OF YOUR PARTICULAR PURPOSES OR NEEDS.
+10. **LIMITATION OF LIABILITY.** UNDER NO CIRCUMSTANCES WILL PACBIO BE LIABLE FOR ANY CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, LOSS OF DATA OR OTHER SUCH PECUNIARY LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THE SOFTWARE, EVEN IF PACBIO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL PACBIO’S AGGREGATE LIABILITY FOR DAMAGES ARISING OUT OF THIS AGREEMENT EXCEED $5. THE FOREGOING EXCLUSIONS AND LIMITATIONS OF LIABILITY AND DAMAGES WILL NOT APPLY TO CONSEQUENTIAL DAMAGES FOR PERSONAL INJURY.
+11. **Indemnification.** You will indemnify, hold harmless, and defend PacBio (including all of its officers, employees, directors, subsidiaries, representatives, affiliates, and agents) and PacBio’s suppliers from and against any damages (including attorney’s fees and expenses), claims, and lawsuits that arise or result from your use of the Software.
+12. **Trademarks.** Certain of the product and PacBio names used in this Agreement and the Software may constitute trademarks of PacBio or third parties. You are not authorized to use any such trademarks.
+13. **Export Restrictions.** YOU UNDERSTAND AND AGREE THAT THE SOFTWARE IS SUBJECT TO UNITED STATES AND OTHER APPLICABLE EXPORT-RELATED LAWS AND REGULATIONS AND THAT YOU MAY NOT EXPORT, RE-EXPORT OR TRANSFER THE SOFTWARE OR ANY DIRECT PRODUCT OF THE SOFTWARE EXCEPT AS PERMITTED UNDER THOSE LAWS. WITHOUT LIMITING THE FOREGOING, EXPORT, RE-EXPORT, OR TRANSFER OF THE SOFTWARE TO CUBA, IRAN, NORTH KOREA, SYRIA, RUSSIA, BELARUS, AND THE REGIONS OF CRIMEA, LNR, AND DNR OF UKRAINE IS PROHIBITED.
+14. **General.** This Agreement is governed by the laws of the State of California, without reference to its conflict of laws principles. This Agreement is the entire agreement between you and PacBio and supersedes any other communications with respect to the Software. If any provision of this Agreement is held invalid or unenforceable, the remainder of this Agreement will continue in full force and effect.
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..c47c31d
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,17 @@
+use std::error::Error;
+use vergen::EmitBuilder;
+
+fn main() -> Result<(), Box<dyn Error>> {
+    EmitBuilder::builder()
+        .fail_on_error()
+        .all_git()
+        .git_describe(true, false, Some("ThisPatternShouldNotMatchAnythingEver"))
+        .emit()?;
+
+    // the emit call handles the git configuration and build.rs itself, but we also need to track the toml and src folder
+    let rerun_if_changed = "cargo:rerun-if-changed=Cargo.toml
+cargo:rerun-if-changed=src";
+    println!("{rerun_if_changed}");
+
+    Ok(())
+}
\ No newline at end of file
diff --git a/src/build_database.rs b/src/build_database.rs
new file mode 100644
index 0000000..10c4fd2
--- /dev/null
+++ b/src/build_database.rs
@@ -0,0 +1,599 @@
+
+use bio::io::fasta;
+use log::{debug, info, trace, warn};
+use rustc_hash::FxHashMap as HashMap;
+use serde::Deserialize;
+use std::collections::BTreeMap;
+use std::collections::hash_map::Entry::{Occupied, Vacant};
+use std::io::Read;
+use simple_error::bail;
+
+use crate::data_types::alleles::{AlleleDefinition, VariantDefinition};
+use crate::data_types::cpic_api_results::CpicAlleleDefinition;
+use crate::data_types::database::PgxDatabase;
+use crate::hla::alleles::HlaAlleleDefinition;
+
+// CPIC API quickstart: /~https://github.com/cpicpgx/cpic-data/wiki
+// CPIC API full book: https://documenter.getpostman.com/view/1446428/Szt78VUJ?version=latest
+// Useful postgrest reference: https://postgrest.org/en/v7.0.0/api.html#horizontal-filtering-rows
+
+/// Base API addresses
+const CPIC_API_URL: &str = "https://api.cpicpgx.org/v1";
+
+// fortunately, HLA is version controlled on GitHub, which makes life a little better for versioning
+const HLA_REPO_LOOKUP: &str = "https://api.github.com/repos/ANHIG/IMGTHLA/releases/latest";
+const HLA_GITHUB_PREFIX: &str = "https://raw.githubusercontent.com/ANHIG/IMGTHLA";
+const HLA_GENOME_FASTA: &str = "fasta/hla_gen.fasta";
+const HLA_GENOME_FASTA_ZIP: &str = "fasta/hla_gen.fasta.zip"; // started with v3.57.0-alpha
+const HLA_CDNA_FASTA: &str = "fasta/hla_nuc.fasta";
+
+// PharmVar API: https://www.pharmvar.org/documentation
+// PharmVar gene information with useful download: https://www.pharmvar.org/gene/CYP2D6
+// from there, you can get this zip file: https://www.pharmvar.org/get-download-file?name={gene}&refSeq=ALL&fileType=zip&version={version}
+// version can be "current" or numbered like "6.0.8"
+// PharmVar API link: https://www.pharmvar.org/api-service/alleles?exclude-sub-alleles=false&include-reference-variants=false&include-retired-alleles=false&include-retired-reference-sequences=false&reference-sequence=NC_000022.11
+// I do not think we need this though
+
+/// This is the primary call to build out our database locally via CPIC API queries.
+/// # Errors
+/// * if there are errors retrieving the CPIC gene list
+/// * if there are errors retrieving allele definitions for a gene
+pub fn pull_database_cpic_api() -> Result<PgxDatabase, Box<dyn std::error::Error>> {
+    // first get all the CPIC genes
+    info!("Starting CPIC API queries...");
+    let all_genes: HashMap<String, String> = get_all_genes()?;
+
+    // If testing, you can limit this to a particular gene
+    let query_limit = None; //Some("CACNA1S");
+
+    // get the alleles for the gene
+    let alleles: Vec<CpicAlleleDefinition> = query_gene_cpic_api(query_limit)?;
+
+    // now we need to pull down HLA data as well
+    info!("Starting HLA queries...");
+    let latest_hla_version: String = get_latest_hla_tag()?;
+    info!("Found latest HLA version: {latest_hla_version}");
+    let hla_data: BTreeMap<String, HlaAlleleDefinition> = get_hla_sequences(&latest_hla_version)?;
+
+    // finally, get the PharmVar CYP2D6 data
+    // let (pharmvar_version, cyp2d6_data) = get_pharmvar_sequences("CYP2D6", "current")?;
+    let (pharmvar_version, cyp2d6_data) = get_pharmvar_variants("CYP2D6", "current")?;
+    info!("Found latest PharmVar version: {pharmvar_version}");
+
+    // now build our database and ship it back
+    let full_database: PgxDatabase = PgxDatabase::new(
+        &all_genes,
+        &alleles,
+        latest_hla_version,
+        hla_data,
+        pharmvar_version,
+        cyp2d6_data
+    )?;
+    Ok(full_database)
+}
+
+/// This pulls the list of genes that are available from CPIC and stores useful metadata like chromosome
+/// # Errors
+/// * if the URL request has issues connecting or converting to JSON
+/// * if duplicate gene names are detected while parsing
+/// * if required entries are missing or fail to parse
+fn get_all_genes() -> Result<HashMap<String, String>, Box<dyn std::error::Error>> {
+    // this endpoint gets the list of genes, ordered by symbol, where the URL field is not empty
+    // this tends to correlate with genes that have allele definitions
+    // if we ever find that does not hold, we can remove the url= filter component and just accept extra queries downstream
+    let gene_url: String = format!("{CPIC_API_URL}/gene?url=not.eq.null&order=symbol");
+    info!("\tQuerying gene list via {gene_url}");
+
+    // hit the end point so we can parse it
+    let result: String = reqwest::blocking::get(gene_url)?.text()?;
+    debug!("Response received.");
+
+    // now parse it via serde
+    // we are using a generic Value here because we really just need one field right now
+    let parsed: Vec<serde_json::Value> = serde_json::from_str(&result)?;
+    debug!("Parsing complete.");
+
+    // now pull out the chromosome for each gene we care about
+    let mut ret: HashMap<String, String> = Default::default();
+    for gene_entry in parsed.iter() {
+        // make sure we get a gene name and a chromosome
+        let gene_name: String = match gene_entry["symbol"].as_str() {
+            Some(s) => s.to_string(),
+            None => bail!("Error while parsing field \"symbol\" as a string")
+        };
+        let chromosome: String = match gene_entry["chr"].as_str() {
+            Some(s) => s.to_string(),
+            None => {
+                warn!("Error while parsing field \"chr\" for {gene_name}, ignoring: {:?}", gene_entry["chr"]);
+                continue;
+            }
+        };
+
+        // the clippy warning here is far less readable IMO, disabling it
+        #[allow(clippy::map_entry)]
+        if ret.contains_key(&gene_name) {
+            bail!("Detected duplicate gene name during parsing: {gene_name}");
+        } else {
+            debug!("\t\t{gene_name} -> {chromosome}");
+            ret.insert(gene_name, chromosome);
+        }
+    }
+
+    Ok(ret)
+}
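+
+// Editor's illustration (assumed shape, inferred from the parsing above and the test below,
+// not an official CPIC response contract): each element of the returned gene array looks
+// roughly like {"symbol": "CACNA1S", "chr": "chr1", ...}, and only the "symbol" and "chr"
+// fields are consumed here; everything else in the entry is ignored.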
+
+/// This will pull all the CPIC allele definitions via a single API query.
+/// # Arguments
+/// * `gene` - the gene name to query; if None, then all definitions are pulled
+/// # Errors
+/// * if the URL fails to get
+/// * if the response fails to parse into JSON or our allele definition
+fn query_gene_cpic_api(gene: Option<&str>) -> Result<Vec<CpicAlleleDefinition>, Box<dyn std::error::Error>> {
+    // this will pull allele definitions as well as the variants that go with them
+    // let definition_url: String = format!("{CPIC_API_URL}/allele_definition?genesymbol=eq.{gene}&select=*,%20allele_location_value(*,%20sequence_location(*))&order=name");
+    let definition_url: String = match gene {
+        Some(g) => format!("{CPIC_API_URL}/allele_definition?genesymbol=eq.{g}&select=*,%20allele_location_value(*,%20sequence_location(*))&order=name"),
+        None => format!("{CPIC_API_URL}/allele_definition?select=*,%20allele_location_value(*,%20sequence_location(*))&order=name")
+    };
+    let label = gene.unwrap_or("all_genes");
+    info!("\tQuerying \"{label}\" via {definition_url}");
+
+    // hit the end point so we can parse it
+    let result: String = reqwest::blocking::get(definition_url)?.text()?;
+    debug!("Response received.");
+
+    // now parse it via serde
+    let parsed: Vec<CpicAlleleDefinition> = serde_json::from_str(&result)?;
+    debug!("Parsing complete.");
+    Ok(parsed)
+}
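+
+// Editor's note (an assumption based on the PostgREST reference linked at the top of this
+// file): the select=*,allele_location_value(*,sequence_location(*)) clause appears to use
+// PostgREST resource embedding, so each allele definition arrives together with its nested
+// variant locations in a single response instead of requiring one query per allele.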
+
+/// Gets the latest version tag of the HLA sequences
+/// # Errors
+/// * if the URL request fails
+/// * if "tag_name" is not present in the response or cannot be converted into a string
+fn get_latest_hla_tag() -> Result<String, Box<dyn std::error::Error>> {
+    // we need the User Agent specified for GitHub queries, set it to our tool name
+    let client = reqwest::blocking::Client::builder()
+        .user_agent(env!("CARGO_PKG_NAME"))
+        .build()?;
+
+    // hit the end point so we can parse it
+    info!("\tQuerying latest HLA tag via {HLA_REPO_LOOKUP}");
+    let result: String = client.get(HLA_REPO_LOOKUP)
+        .send()?
+        .error_for_status()?
+        .text()?;
+    debug!("Response received.");
+
+    // now parse it via serde
+    let parsed: serde_json::Value = serde_json::from_str(&result)?;
+    match parsed.get("tag_name") {
+        Some(v) => {
+            match v.as_str() {
+                Some(str_form) => {
+                    Ok(str_form.to_string())
+                },
+                None => {
+                    bail!("Key \"tag_name\" could not be converted to a String.");
+                }
+            }
+        },
+        None => {
+            bail!("Key \"tag_name\" was not found in GitHub latest response for HLA repository.");
+        }
+    }
+}
+
+/// This will pull the files for the corresponding version string and get them ready
+/// # Arguments
+/// * `version` - the version we are pulling out of GitHub, must be a valid tag
+/// # Errors
+/// * if the URLs requested have an error
+/// * if we cannot convert the response into FASTA sequences
+/// * if we get an error while collapsing the DNA and cDNA entries together
+fn get_hla_sequences(version: &str) -> Result<BTreeMap<String, HlaAlleleDefinition>, Box<dyn std::error::Error>> {
+    // hit the DNA end point
+    let dna_url: String = format!("{HLA_GITHUB_PREFIX}/{version}/{HLA_GENOME_FASTA_ZIP}");
+    info!("\tQuerying HLA DNA sequences via {dna_url}");
+    let client = reqwest::blocking::Client::builder()
+        .timeout(std::time::Duration::from_secs(300))
+        .build()?;
+    let dna_result: String = match client.get(dna_url).send()?.error_for_status() {
+        Ok(r) => {
+            let result = r.bytes()?;
+
+            // convert into a cursor for the archive and open it
+            let cursor_result = std::io::Cursor::new(result.to_vec());
+            let mut archive = zip::ZipArchive::new(cursor_result)?;
+
+            // get the exact file we want and read into the string
+            let mut zip_file = archive.by_name("hla_gen.fasta")?;
+            let mut text_form: String = Default::default();
+            zip_file.read_to_string(&mut text_form)?;
+            text_form
+        },
+        Err(e) => {
+            debug!("\tFailed to find zipped HLA fasta, error: {e}");
+            let dna_url_unzip = format!("{HLA_GITHUB_PREFIX}/{version}/{HLA_GENOME_FASTA}");
+            info!("\tQuerying HLA DNA sequences via backup URL: {dna_url_unzip}");
+            let unzip_result = client.get(dna_url_unzip).send()?.error_for_status()?;
+            unzip_result.text()?
+        }
+    };
+
+    debug!("Response received.");
+    let dna_data: HashMap<String, (String, String)> = convert_fasta_str_to_map(&dna_result, false)?;
+    debug!("Parsing complete.");
+
+    // now hit the cDNA (e.g., exon) end point
+    let cdna_url: String = format!("{HLA_GITHUB_PREFIX}/{version}/{HLA_CDNA_FASTA}");
+    info!("\tQuerying HLA cDNA sequences via {cdna_url}");
+    let response = client.get(cdna_url).send()?.error_for_status()?;
+    let cdna_result: String = response.text()?;
+    debug!("Response received.");
+    let cdna_data: HashMap<String, (String, String)> = convert_fasta_str_to_map(&cdna_result, false)?;
+    debug!("Parsing complete.");
+
+    let collapsed_lookup = collapse_hla_lookup(dna_data, cdna_data)?;
+    Ok(collapsed_lookup)
+}
+
+/// This will take a loaded FASTA file as a string and convert it into a HashMap, performing some minor checks.
+/// The key of this HashMap is an HLA identifier and the value is the (star-allele, sequence).
+/// # Arguments
+/// * `raw_fasta` - the raw FASTA sequence to convert
+/// * `reversed_ids` - if true, then we need to swap the "id" and "star_allele" in the lookups
+/// # Errors
+/// * if the FASTA is in an invalid format
+/// * if the sequence cannot be converted from UTF-8
+/// * if a duplicate entry is detected in the FASTA
+fn convert_fasta_str_to_map(raw_fasta: &str, reversed_ids: bool) -> Result<HashMap<String, (String, String)>, Box<dyn std::error::Error>> {
+    let mut ret: HashMap<String, (String, String)> = Default::default();
+    let reader = fasta::Reader::new(raw_fasta.as_bytes());
+    for result in reader.records() {
+        let record = result?;
+        let id: String = record.id().to_string();
+        let seq: String = String::from_utf8(record.seq().to_vec())?;
+
+        // this converts from desc() = Some("A*01:01:01:01 3503 bp") -> "A*01:01:01:01"
+        let star_allele: String = record.desc().unwrap_or_default()
+            .split_whitespace().next().unwrap_or_default()
+            .to_string();
+
+        // handle a swap if necessary
+        let (id, star_allele) = if reversed_ids {
+            (star_allele, id)
+        } else {
+            (id, star_allele)
+        };
+
+        trace!("Found record {}({}) with length {}", id, star_allele, seq.len());
+
+        // apparently there are some duplicates in the FASTA file for some reason
+        match ret.entry(id.clone()) {
+            Occupied(entry) => {
+                // make sure the entry is just a duplicate
+                if entry.get() != &(star_allele, seq) {
+                    bail!("FASTA record with multiple IDs/sequences detected: {id}");
+                }
+            },
+            Vacant(entry) => {
+                // normal path, we insert the new entry
+                entry.insert((star_allele, seq));
+            }
+        };
+    }
+    Ok(ret)
+}
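+
+// Editor's illustration (header values taken from the HLA-A record exercised in the tests
+// below): a FASTA entry ">HLA:HLA00001 A*01:01:01:01 3503 bp" yields id = "HLA:HLA00001"
+// and star_allele = "A*01:01:01:01" when reversed_ids is false; with reversed_ids = true
+// the two strings are swapped.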
+
+/// Helper function that collapses the DNA and cDNA entries into a single HlaAlleleDefinition for each allele.
+/// # Arguments
+/// * `dna_data` - a map from HLA ID to (star allele ID, DNA sequence)
+/// * `cdna_data` - a map from HLA ID to (star allele ID, cDNA sequence)
+/// # Errors
+/// * if the key sets for each HashMap do not match
+/// * if the star alleles for an HLA ID are different in the two maps
+/// * if there is an error parsing an HLA allele definition
+fn collapse_hla_lookup(dna_data: HashMap<String, (String, String)>, cdna_data: HashMap<String, (String, String)>)
+    -> Result<BTreeMap<String, HlaAlleleDefinition>, Box<dyn std::error::Error>> {
+    // every DNA has a cDNA, but not all cDNAs have a DNA; count them before we drain the `cdna_data`
+    let mut missed_dna: usize = 0;
+    for hla_id in dna_data.keys() {
+        if !cdna_data.contains_key(hla_id) {
+            missed_dna += 1;
+        }
+    }
+    if missed_dna > 0 {
+        warn!("Detected {missed_dna} DNA entries that do not have a cDNA, ignoring them.");
+    }
+
+    // now we can build up all the usable entries
+    let mut ret: BTreeMap<String, HlaAlleleDefinition> = Default::default();
+    let mut ignored_alleles = 0;
+    for (hla_id, (cdna_desc, cdna_seq)) in cdna_data.into_iter() {
+        // future note: this is similar to a .map() function, but we do it this way so we can propagate the bail! correctly
+        let opt_dna_seq = match dna_data.get(&hla_id) {
+            Some((dna_desc, dna_seq)) => {
+                if dna_desc != &cdna_desc {
+                    bail!("{hla_id} has description \"{dna_desc}\" for DNA and \"{cdna_desc}\" for cDNA.");
+                };
+                Some(dna_seq.clone())
+            },
+            None => None
+        };
+
+        // checks out so far, make the allele and insert it
+        let new_allele = HlaAlleleDefinition::new(
+            hla_id.clone(), &cdna_desc, opt_dna_seq, cdna_seq
+        )?;
+
+        // restrict our database to the alleles we plan to match; we can relax this if we ever do an update to HLA
+        if new_allele.gene_name() == "HLA-A" || new_allele.gene_name() == "HLA-B" {
+            ret.insert(hla_id, new_allele);
+        } else {
+            ignored_alleles += 1;
+        }
+    }
+    debug!("Removed {ignored_alleles} alleles that are not HLA-A or HLA-B.");
+
+    Ok(ret)
+}
+
+/// Gets a PharmVar zip file from the website and returns a tuple with the version and the allele definitions
+/// # Arguments
+/// * `gene` - the gene to retrieve
+/// * `version` - the version to fetch; can be "current" to get the latest
+/// # Errors
+/// * if the URL fetch fails
+/// * if parsing the ZIP archive fails
+/// * if the file containing the fasta sequence cannot be found
+fn get_pharmvar_variants(gene: &str, version: &str) -> Result<(String, BTreeMap<String, AlleleDefinition>), Box<dyn std::error::Error>> {
+    // the URL is pretty standard
+    let gene_url = format!("https://www.pharmvar.org/get-download-file?name={gene}&refSeq=ALL&fileType=zip&version={version}");
+    info!("Querying PharmVar({gene}, {version}) via {gene_url}");
+
+    // hit the end point so we can parse it; returns as bytes::Bytes
+    let client = reqwest::blocking::Client::builder()
+        .timeout(std::time::Duration::from_secs(300))
+        .build()?;
+    let response = client.get(gene_url).send()?.error_for_status()?;
+    let result = response.bytes()?;
+    debug!("Response received.");
+
+    // convert into a cursor for the archive
+    let cursor_result = std::io::Cursor::new(result.to_vec());
+    // now open up the archive
+    let mut archive = zip::ZipArchive::new(cursor_result)?;
+    let mut version: Option<String> = None;
+    let coordinate_version = "GRCh38";
+
+    // time to populate the alleles
+    let mut ret: BTreeMap<String, AlleleDefinition> = Default::default();
+    for i in 0..archive.len() {
+        let mut file = archive.by_index(i)?;
+        let outpath = match file.enclosed_name() {
+            Some(path) => path.to_owned(),
+            None => continue,
+        };
+        if outpath.is_dir() {
+            // ignore directories
+            continue;
+        }
+
+        // make sure version is consistent
+        let parent_split: Vec<&str> = outpath.components()
+            .map(|c| c.as_os_str().to_str().unwrap())
+            .collect();
+
+        // we only care about files that are two folders deep
+        if parent_split.len() != 3 {
+            continue;
+        }
+
+        // root folder should be "{gene}-{version}"
+        let root_folder = parent_split[0];
+        assert!(root_folder.starts_with(gene));
+        let file_version = root_folder[gene.len()+1..].to_string();
+        match version.as_ref() {
+            Some(v) => {
+                if v != &file_version {
+                    bail!("Found mismatched versions in ZIP file: {v}, {file_version}");
+                }
+            },
+            None => {
+                trace!("file_version={file_version}");
+                version = Some(file_version);
+            }
+        };
+
+        let filename = outpath.file_name().unwrap().to_str().unwrap();
+
+        // structure is "{gene}-{version}/{coordinate_version}/{gene}_{allele}.vcf"
+        if parent_split[1] == coordinate_version && filename.ends_with(".vcf") {
+            // pull out the actual allele name from the filename
+            assert_eq!(&filename[..gene.len()], gene);
+            let allele = filename[gene.len()+1..filename.len()-4].to_string();
+
+            // now we need to actually parse the contained VCF file
+            let mut vcf_content: String = Default::default();
Default::default(); + let _bytes = file.read_to_string(&mut vcf_content)?; + let start_index = vcf_content.find("#CHROM").unwrap(); + + // now put this into a VCF reader for ease of use + let variants = load_vcf_from_bytes(vcf_content[start_index..].as_bytes())?; + + // create an allele definition using the full star ID; we do not get the special identifiers with this approach + let full_star = format!("{gene}*{allele}"); + let allele_def = AlleleDefinition::new(None, &full_star, variants)?; + + // insert it using the ID; make sure the previous value was empty + assert!(ret.insert(allele_def.id().to_string(), allele_def).is_none()); + } + } + + // we need to add in the reference key here at the end + let ref_allele = format!("{gene}*1.001"); + let allele_def = AlleleDefinition::new(None, &ref_allele, vec![])?; + assert!(ret.insert(allele_def.id().to_string(), allele_def).is_none()); + + if let Some(v) = version { + Ok((v, ret)) + } else { + bail!("No files or version identified in ZIP file"); + } +} + +/// This assists in parsing the CSV reader of the VCF +#[derive(Debug, Deserialize)] +struct VcfRow { + #[serde(alias = "#CHROM")] + pub chrom: String, + #[serde(alias = "POS")] + pub pos: usize, + #[serde(alias = "ID")] + pub id: String, + #[serde(alias = "REF")] + pub reference: String, + #[serde(alias = "ALT")] + pub alternate: String, + // #[serde(alias = "QUAL")] + // quality: String, + // #[serde(alias = "FILTER")] + // filter: String, + #[serde(alias = "INFO")] + pub info: String +} + +/// This is a wrapper function to the load the VCF files for CYP2D6 from memory (e.g., from the ZIP in memory). +/// In theory, you wouldn't normally need this, but rust_htslib does not have a function to parse VCFs from bytes in memory. +/// # Arguments +/// * `vcf_content` - the bytes that make up the VCF starting at "#CHROM" (i.e., the header is skipped) +/// # Errors +/// * if the deserializing throws errors +/// # Panics +/// * we baked in some specific assumptions about VCF content and those currently panic if they fail +fn load_vcf_from_bytes(vcf_content: &[u8]) -> Result, Box> { + /* + rust_htslib does not have a way to read from memory, I guess we can do one of these: + 1. write the files out and read them as normal - I'd like to avoid creating temp files if possible + 2. read them via noodles_bcf which can load from anything implementing Read - turns out noodles does not like these file formats + 3. parse it out ourselves via csv - "easiest" for now, could have scaling issues if VCFs get complicated + */ + // open the tab-delimited reader + let mut reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .from_reader(vcf_content); + let mut ret = vec![]; + + // go through each row and build a variant entry from it + for row in reader.deserialize() { + let record: VcfRow = row?; + let mut extras: BTreeMap = Default::default(); + for key_value in record.info.split(';') { + if !key_value.is_empty() && key_value != "." { + // get the key value extra here + let kv_split: Vec<&str> = key_value.split('=').collect(); + assert_eq!(kv_split.len(), 2); + + // insert and make sure we do not have that key already + let k = kv_split[0].to_string(); + let v = kv_split[1].to_string(); + assert!(extras.insert(k, v).is_none()); + } + } + + // set the record ID if it's not empty + let record_id = if record.id != "." 
{ + Some(record.id.clone()) + } else { + None + }; + + // now make the variant and save it + let var_def = VariantDefinition::new( + record_id, record.chrom.clone(), record.pos - 1, //POS is 1-based inside the VCF file, so shift it to 0-based + record.reference.clone(), record.alternate.clone(), + extras + )?; + ret.push(var_def); + } + + Ok(ret) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_all_genes() { + let all_genes: HashMap = get_all_genes().unwrap(); + + // this list can change, but presumably things will not get removed + assert!(all_genes.len() >= 24); + + // check a gene + assert_eq!(all_genes.get("CACNA1S").unwrap(), "chr1"); + } + + #[test] + fn test_get_latest_hla_tag() { + let latest_tag = get_latest_hla_tag(); + assert!(latest_tag.is_ok()); + } + + #[test] + fn test_get_hla_sequences() { + // note to future self: this particular version appear to be malformed; most others would be v3.54.0-alpha + let fixed_version = "v3.54.0-alpha"; + let hla_db = get_hla_sequences(fixed_version).unwrap(); + // assert_eq!(hla_db.len(), 38408); // this was before we restricted to just A and B + assert_eq!(hla_db.len(), 17585); + + let first_entry = hla_db.get("HLA:HLA00001").unwrap(); + assert_eq!(first_entry.hla_id(), "HLA:HLA00001"); + assert_eq!(first_entry.gene_name(), "HLA-A"); + assert_eq!(first_entry.star_allele(), vec!["01".to_string(); 4]); + assert_eq!(first_entry.dna_sequence().unwrap().len(), 3503); + assert_eq!(first_entry.cdna_sequence().len(), 1098); + } + + #[test] + fn test_zip_hla_sequences() { + // note to future self: this particular version appear to be malformed; most others would be v3.54.0-alpha + let fixed_version = "v3.57.0-alpha"; + let hla_db = get_hla_sequences(fixed_version).unwrap(); + // assert_eq!(hla_db.len(), 38408); // this was before we restricted to just A and B + assert_eq!(hla_db.len(), 18461); + + let first_entry = hla_db.get("HLA:HLA00001").unwrap(); + assert_eq!(first_entry.hla_id(), "HLA:HLA00001"); + assert_eq!(first_entry.gene_name(), "HLA-A"); + assert_eq!(first_entry.star_allele(), vec!["01".to_string(); 4]); + assert_eq!(first_entry.dna_sequence().unwrap().len(), 3503); + assert_eq!(first_entry.cdna_sequence().len(), 1098); + } + + #[test] + fn test_get_pharmvar_sequences() { + let fixed_version = "6.0.8"; + let (version, cyp2d6_db) = get_pharmvar_variants("CYP2D6", fixed_version).unwrap(); + assert_eq!(&version, fixed_version); + assert_eq!(cyp2d6_db.len(), 510); // we got 511 when we did the FASTA based, may need to resolve that in the future + + // make sure we have the entry that is missing a VCF due to no variants + let first_entry = cyp2d6_db.get("CYP2D6*1.001").unwrap(); + assert_eq!(first_entry.id(), "CYP2D6*1.001"); + assert_eq!(first_entry.gene_name(), "CYP2D6"); + assert_eq!(first_entry.star_allele(), "1.001"); + assert_eq!(first_entry.variants().len(), 0); + + // check *2 also + let second_entry = cyp2d6_db.get("CYP2D6*2").unwrap(); + assert_eq!(second_entry.id(), "CYP2D6*2"); + assert_eq!(second_entry.gene_name(), "CYP2D6"); + assert_eq!(second_entry.star_allele(), "2"); + assert_eq!(second_entry.variants().len(), 2); + } +} \ No newline at end of file diff --git a/src/cli/core.rs b/src/cli/core.rs new file mode 100644 index 0000000..71b4aa3 --- /dev/null +++ b/src/cli/core.rs @@ -0,0 +1,85 @@ + +use clap::{Parser, Subcommand}; +use chrono::Datelike; +use lazy_static::lazy_static; +use log::error; +use std::path::Path; + +use crate::cli::diplotype::DiplotypeSettings; +use 
crate::cli::db_build::BuildSettings; + +lazy_static! { + /// Stores the full version string we plan to use, which is generated in build.rs + /// # Examples + /// * `0.11.0-6bb9635-dirty` - while on a dirty branch + /// * `0.11.0-6bb9635` - with a fresh commit + pub static ref FULL_VERSION: String = format!("{}-{}", env!("CARGO_PKG_VERSION"), env!("VERGEN_GIT_DESCRIBE")); + + /// Shared after help string containing the legalese. + pub static ref AFTER_HELP: String = format!("Copyright (C) 2004-{} Pacific Biosciences of California, Inc. +This program comes with ABSOLUTELY NO WARRANTY; it is intended for +Research Use Only and not for use in diagnostic procedures.", chrono::Utc::now().year()); +} + +#[derive(Parser)] +#[clap(author, + version = &**FULL_VERSION, + about, + after_help = &**AFTER_HELP)] +#[command(propagate_version = true)] +pub struct Cli { + #[command(subcommand)] + pub command: Commands +} + +// Here lies PharmGOAT, you were truly the greatest of all time. +// ,,~~--___---, +// / .~, +// / _,~ ) +// (_-(~) ~, ),,,( /' +// Z6 .~`' || \ | +// /_,/ || || +// ~~~~~~~~~~~~~~~W`~~~~~~W`~~~~~~~~~ +// PharmGOAT, a tool for diplotyping PGx genes from HiFi data. + +/// pb-StarPhase, a tool for diplotyping PGx genes from HiFi data. +/// Select a subcommand to see more usage information: +#[derive(Subcommand)] +pub enum Commands { + /// Download and build the database from CPIC + Build(Box), + /// Run the diplotyper on a dataset + Diplotype(Box) +} + +pub fn get_cli() -> Cli { + Cli::parse() +} + +/// Checks if a file exists and will otherwise exit +/// # Arguments +/// * `filename` - the file path to check for +/// * `label` - the label to use for error messages +pub fn check_required_filename(filename: &Path, label: &str) { + if !filename.exists() { + error!("{} does not exist: \"{}\"", label, filename.display()); + std::process::exit(exitcode::NOINPUT); + } else { + // file exists, we're good + } +} + +/// Checks if a file exists and will otherwise exit +/// # Arguments +/// * `filename` - the file path to check for +/// * `label` - the label to use for error messages +pub fn check_optional_filename(opt_filename: Option<&Path>, label: &str) { + if let Some(filename) = opt_filename { + if !filename.exists() { + error!("{} does not exist: \"{}\"", label, filename.display()); + std::process::exit(exitcode::NOINPUT); + } else { + // file exists, we're good + } + } +} \ No newline at end of file diff --git a/src/cli/db_build.rs b/src/cli/db_build.rs new file mode 100644 index 0000000..42b22fd --- /dev/null +++ b/src/cli/db_build.rs @@ -0,0 +1,32 @@ + + +use clap::Args; +use log::info; +use std::path::PathBuf; + +use crate::cli::core::AFTER_HELP; + +#[derive(Clone, Args)] +#[clap(author, about, + after_help = &**AFTER_HELP)] +pub struct BuildSettings { + /// Output database location (JSON) + #[clap(required = true)] + #[clap(short = 'o')] + #[clap(long = "output-db")] + #[clap(value_name = "JSON")] + #[clap(help_heading = Some("Input/Output"))] + pub output_database: PathBuf, + + /// Enable verbose output. 
+ #[clap(short = 'v')] + #[clap(long = "verbose")] + #[clap(action = clap::ArgAction::Count)] + pub verbosity: u8, +} + +pub fn check_build_settings(settings: BuildSettings) -> BuildSettings { + // dump stuff to the logger + info!("Output database: {:?}", settings.output_database); + settings +} diff --git a/src/cli/diplotype.rs b/src/cli/diplotype.rs new file mode 100644 index 0000000..859cbea --- /dev/null +++ b/src/cli/diplotype.rs @@ -0,0 +1,278 @@ + + +use clap::Args; +use log::{debug, info, warn}; +use simple_error::bail; +use std::path::PathBuf; + +use crate::cli::core::{AFTER_HELP, check_optional_filename, check_required_filename}; + +#[derive(Args, Clone, Default)] +#[clap(author, about, + after_help = &**AFTER_HELP)] +pub struct DiplotypeSettings { + /// Input database file (JSON) + #[clap(required = true)] + #[clap(short = 'd')] + #[clap(long = "database")] + #[clap(value_name = "JSON")] + #[clap(help_heading = Some("Input/Output"))] + pub input_database: PathBuf, + + /// Reference FASTA file + #[clap(required = true)] + #[clap(short = 'r')] + #[clap(long = "reference")] + #[clap(value_name = "FASTA")] + #[clap(help_heading = Some("Input/Output"))] + pub reference_filename: PathBuf, + + /// Input variant file in VCF format + #[clap(short = 'c')] + #[clap(long = "vcf")] + #[clap(value_name = "VCF")] + #[clap(help_heading = Some("Input/Output"))] + pub vcf_filename: Option, + + /// Input alignment file in BAM format, can be specified multiple times; required for HLA diplotyping + #[clap(short = 'b')] + #[clap(long = "bam")] + #[clap(value_name = "BAM")] + #[clap(help_heading = Some("Input/Output"))] + pub bam_filenames: Vec, + + /// Output diplotype call file (JSON) + #[clap(required = true)] + #[clap(short = 'o')] + #[clap(long = "output-calls")] + #[clap(value_name = "JSON")] + #[clap(help_heading = Some("Input/Output"))] + pub diplotype_filename: PathBuf, + + /// Output file that can be provided to PharmCAT for further call interpretation + #[clap(long = "pharmcat-tsv")] + #[clap(value_name = "TSV")] + #[clap(help_heading = Some("Input/Output"))] + pub pharmcat_tsv: Option, + + /// Optional file indicating the list of genes to include in diplotyping, one per line + #[clap(long = "include-set")] + #[clap(value_name = "TXT")] + #[clap(help_heading = Some("Input/Output"))] + pub include_fn: Option, + + /// Optional file indicating the list of genes to exclude from diplotyping, one per line + #[clap(long = "exclude-set")] + #[clap(value_name = "TXT")] + #[clap(help_heading = Some("Input/Output"))] + pub exclude_fn: Option, + + /// Optional output debug folder + #[clap(long = "output-debug")] + #[clap(value_name = "DIR")] + #[clap(help_heading = Some("Input/Output"))] + pub debug_folder: Option, + + /// Enables scoring by cDNA and tie-breaking with DNA + #[clap(hide = true)] + #[clap(long = "disable-cdna-scoring")] + #[clap(help_heading = Some("HLA calling"))] + pub disable_cdna_scoring: bool, + + /// Requires HLA alleles to have a DNA sequence definition + #[clap(long = "hla-require-dna")] + #[clap(help_heading = Some("HLA calling"))] + pub hla_require_dna: bool, + + /// The maximum error rate for a read to the HLA reference allele + #[clap(long = "max-error-rate")] + #[clap(value_name = "FLOAT")] + #[clap(default_value = "0.07")] + #[clap(help_heading = Some("HLA calling"))] + pub max_error_rate: f64, + + /// The minimum cumulative distribution function probability for a heterozygous call + #[clap(long = "min-cdf-prob")] + #[clap(value_name = "FLOAT")] + #[clap(default_value = 
"0.001")] + #[clap(help_heading = Some("HLA calling"))] + pub min_cdf: f64, + + /// Additional HLA targets for the debug BAM file + #[clap(hide = true)] + #[clap(long = "debug-hla-target")] + #[clap(value_name = "HLA_ID")] + #[clap(help_heading = Some("HLA debug"))] + pub debug_hla_targets: Vec, + + /// Allows us to skip HLA for the purpose of debugging quickly + #[clap(hide = true)] + #[clap(long = "debug-skip-hla")] + #[clap(help_heading = Some("HLA debug"))] + pub debug_skip_hla: bool, + + /// (Deprecated) Optional output realignment file for CYP2D6 consensus sequences + #[clap(hide = true)] + #[clap(long = "output-cyp2d6-bam")] + #[clap(value_name = "BAM")] + #[clap(help_heading = Some("CYP2D6 calling"))] + pub cyp2d6_bam_filename: Option, + + /// Enables inferrence of connected alleles based on population observations + #[clap(long = "infer-connections")] + #[clap(help_heading = Some("CYP2D6 calling"))] + pub infer_connections: bool, + + /// Disables normalizing coverage with D7 and hybrid alleles + #[clap(long = "normalize-d6-only")] + #[clap(help_heading = Some("CYP2D6 calling"))] + pub normalize_d6_only: bool, + + /// The minimum fraction of sequences required to split into multiple consensuses (e.g. MAF) + #[clap(long = "min-consensus-fraction")] + #[clap(value_name = "FLOAT")] + #[clap(default_value = "0.10")] + #[clap(help_heading = Some("Consensus (HLA and CYP2D6)"))] + pub min_consensus_fraction: f64, + + /// The minimum counts of sequences required to split into multiple consensuses + #[clap(long = "min-consensus-count")] + #[clap(value_name = "COUNT")] + #[clap(default_value = "3")] + #[clap(help_heading = Some("Consensus (HLA and CYP2D6)"))] + pub min_consensus_count: u64, + + /// The edit distance delta threshold to stop tracking divergent sequences (efficiency heuristic) + #[clap(long = "dual-max-ed-delta")] + #[clap(value_name = "COUNT")] + #[clap(default_value = "100")] + #[clap(help_heading = Some("Consensus (HLA and CYP2D6)"))] + pub dual_max_ed_delta: usize, + + /// Number of threads to use for phasing. + #[clap(hide = true)] + #[clap(short = 't')] + #[clap(long = "threads")] + #[clap(value_name = "THREADS")] + #[clap(default_value = "1")] + pub threads: usize, + + /// Enable verbose output. 
+ #[clap(short = 'v')] + #[clap(long = "verbose")] + #[clap(action = clap::ArgAction::Count)] + pub verbosity: u8, +} + +pub fn check_diplotype_settings(mut settings: DiplotypeSettings) -> Result> { + info!("Inputs:"); + + // check for all the required input files + check_required_filename(&settings.input_database, "Database JSON"); + check_required_filename(&settings.reference_filename, "Reference FASTA"); + check_optional_filename(settings.vcf_filename.as_deref(), "VCF file"); + + // these are optional, but make sure that any specified exist + for bam_fn in settings.bam_filenames.iter() { + check_required_filename(bam_fn, "Alignment file"); + } + + // dump stuff to the logger + info!("\tDatabase: {:?}", settings.input_database); + info!("\tReference: {:?}", &settings.reference_filename); + if let Some(vcf_fn) = settings.vcf_filename.as_ref() { + info!("\tVCF: {:?}", vcf_fn); + } else { + warn!("\tVCF: No variant call files provided, all variant-based diplotyping is disabled") + } + if settings.bam_filenames.is_empty() { + warn!("\tBAM: No alignment files provided, HLA and CYP2D6 diplotyping is disabled"); + } else { + // these are optional, but make sure that any specified exist + for bam_fn in settings.bam_filenames.iter() { + info!("\tBAM: {:?}", bam_fn); + } + } + + if settings.vcf_filename.is_none() && settings.bam_filenames.is_empty() { + // user didn't provide any data, bail out + bail!("Must provide a VCF file and/or aligned BAM file to perform diplotyping."); + } + + if settings.include_fn.is_some() && settings.exclude_fn.is_some() { + bail!("Only one of --exclude-set and --include-set can be specified."); + } + if let Some(ifn) = settings.include_fn.as_ref() { + check_required_filename(ifn, "Include set"); + info!("\tInclude set file: {ifn:?}"); + } + if let Some(efn) = settings.exclude_fn.as_ref() { + check_required_filename(efn, "Exclude set"); + info!("\tExclude set file: {efn:?}"); + } + + // outputs + info!("Outputs:"); + info!("\tDiplotype calls: {:?}", settings.diplotype_filename); + if let Some(filename) = settings.pharmcat_tsv.as_ref() { + info!("\tPharmCAT TSV: {:?}", filename); + } + if let Some(debug_folder) = settings.debug_folder.as_ref() { + debug!("\tDebug folder: {debug_folder:?}"); + } + + // miscellaneous settings + if !settings.bam_filenames.is_empty() { + info!("HLA settings:"); + if settings.disable_cdna_scoring { + info!("\tDisable cDNA scoring: {}", settings.disable_cdna_scoring); + if !settings.hla_require_dna { + settings.hla_require_dna = true; + info!("\tAutomatically enabling HLA DNA requirement") + } + } + + if settings.hla_require_dna { + info!("\tRequire DNA for HLA calls: {}", settings.hla_require_dna); + } + + if !(0.0..=1.0).contains(&settings.max_error_rate) { + bail!("--max-error-rate must be between 0.0 and 1.0"); + } + info!("\tMax read error rate: {}", settings.max_error_rate); + + if !(0.0..=1.0).contains(&settings.min_cdf) { + bail!("--min-cdf-prob must be between 0.0 and 1.0"); + } + info!("\tMinimum CDF probability: {}", settings.min_cdf); + + if settings.debug_folder.is_some() { + debug!("\tHLA debug targets: {:?}", settings.debug_hla_targets); + } + + if settings.cyp2d6_bam_filename.is_some() { + warn!("The --output-cyp2d6-bam option is deprecated, use --output-debug instead."); + } + + info!("CYP2D6 settings:"); + info!("\tConnection inferrence: {}", if settings.infer_connections { "ENABLED" } else { "DISABLED" }); + info!("\tNormalize D6 only: {}", if settings.normalize_d6_only { "ENABLED" } else { "DISABLED" }); + + 
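+        // note: the consensus parameters validated below are shared by the HLA and
+        // CYP2D6 callers (they live under the "Consensus (HLA and CYP2D6)" help heading above)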
info!("Consensus settings:"); + if !(0.0..=1.0).contains(&settings.min_consensus_fraction) { + bail!("--min-consensus-fraction must be between 0.0 and 1.0"); + } + info!("\tMinimum consensus fraction: {}", settings.min_consensus_fraction); + info!("\tMinimum consensus count: {}", settings.min_consensus_count); + info!("\tDual max edit distance delta: {}", settings.dual_max_ed_delta); + + if settings.threads == 0 { + settings.threads = 1; + } + if settings.threads != 1 { + warn!("Threads (deprecated): {}", settings.threads); + } + } + + Ok(settings) +} diff --git a/src/cli/mod.rs b/src/cli/mod.rs new file mode 100644 index 0000000..02bcd46 --- /dev/null +++ b/src/cli/mod.rs @@ -0,0 +1,7 @@ + +/// the main CLI module +pub mod core; +/// the diplotype CLI subcommand for calling the diplotypes +pub mod diplotype; +/// The build CLI subcommand for constructing our local database via CPIC API +pub mod db_build; diff --git a/src/cyp2d6/caller.rs b/src/cyp2d6/caller.rs new file mode 100644 index 0000000..a6e43d0 --- /dev/null +++ b/src/cyp2d6/caller.rs @@ -0,0 +1,949 @@ + +use log::{debug, error, info, warn}; +use itertools::Itertools; +use rust_htslib::bam::Read; +use rust_lib_reference_genome::reference_genome::ReferenceGenome; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use simple_error::bail; +use std::collections::BTreeMap; +use std::path::PathBuf; +use waffle_con::cdwfa_config::{CdwfaConfig, CdwfaConfigBuilder}; +use waffle_con::consensus::{Consensus, ConsensusDWFA}; +use waffle_con::multi_consensus::MultiConsensus; +use waffle_con::priority_consensus::{PriorityConsensus, PriorityConsensusDWFA}; + +use crate::cli::diplotype::DiplotypeSettings; +use crate::cyp2d6::chaining::{find_best_chain_pair, weight_sequence, ChainPenalties, SequenceWeights}; +use crate::cyp2d6::haplotyper::{AlleleMapping, Cyp2d6Extractor}; +use crate::cyp2d6::region_label::{Cyp2d6RegionLabel, Cyp2d6RegionType}; +use crate::cyp2d6::visualization::create_custom_cyp2d6_reference; +use crate::data_types::database::PgxDatabase; +use crate::data_types::pgx_diplotypes::{Diplotype, PgxGeneDetails, PgxMultiMappingDetails}; +use crate::util::file_io::save_fasta; +use crate::util::homopolymers::hpc_with_guide; +use crate::visualization::debug_bam_writer::{unmapped_record, DebugBamWriter}; +use crate::visualization::igv_session_writer::{IgvSessionWriter, CUSTOM_CONTIG}; + +/// This is the main function to call for CYP2D6 diplotyping from a BAM file. 
+/// # Arguments +/// * `database` - the pre-loaded database +/// * `bam_filenames` - list of BAM files containing reads to scan +/// * `reference_genome` - already loaded reference genome +/// * `cli_settings` - settings for diplotyping +/// # Errors +/// * if we cannot open or parse a BAM file correctly +pub fn diplotype_cyp2d6( + database: &PgxDatabase, + bam_filenames: &[PathBuf], reference_genome: &ReferenceGenome, + debug_bam_writer: Option<&mut DebugBamWriter>, + cli_settings: &DiplotypeSettings +) -> Result> { + info!("Solving CYP2D6..."); + + // load the D6 typing engine, it's complicated so we moved it into cyp2d6_typer.rs + let d6_typer = Cyp2d6Extractor::new( + database, + reference_genome + )?; + + // 4 - extract all the reads from our BAM file + // this will store the records of interest + let mut read_collection: HashMap = Default::default(); + // these are the corresponding sequences, which need to be cached until we are done + let mut read_sequences: HashMap = Default::default(); + + // prep all the bam readers + let mut bam_readers: Vec = vec![]; + for bam_fn in bam_filenames.iter() { + let mut b = rust_htslib::bam::IndexedReader::from_path(bam_fn)?; + b.set_reference(reference_genome.filename())?; + bam_readers.push(b); + } + + // get the extraction region from CYP2D6 + let bam_region = database.cyp2d6_config().extraction_region(); + debug!("Parsing reads in region: {bam_region:?}"); + + // iterate over each bam, and fetch the reads + for (bam_index, bam) in bam_readers.iter_mut().enumerate() { + // bam.fetch(coordinate.fetch_definition())?; + match bam.fetch(bam_region.fetch_definition()) { + Ok(()) => {}, + Err(e) => { + let filename = &bam_filenames[bam_index]; + warn!("Received error \"{e}\" while fetching {bam_region} in {filename:?}, assuming no reads for region."); + continue; + } + }; + + for read_entry in bam.records() { + let mut read = read_entry.unwrap(); + + // make sure we do not do the same read twice + let qname: String = std::str::from_utf8(read.qname())?.to_string(); + if read_collection.contains_key(&qname) { + continue; + } + + /* + // TODO: thus far, we haven't applied this filter; leaving it in place in case we want to think about this in the future + // make sure we care about the alignment + if filter_out_alignment_record(&read, min_mapq) { + continue; + } + */ + + //build out the cigar info + read.cache_cigar(); + + let sequence = String::from_utf8(read.seq().as_bytes())?; + + // insert, make sure it was not already inserted + assert!(read_collection.insert(qname.clone(), read).is_none()); + // assert!(read_sequences.insert(qname, filtered_sequence).is_none()); + assert!(read_sequences.insert(qname, sequence).is_none()); + } + } + + // These are candidate for CLI parameters, but I think keep them hidden until we have an obvious reason not to do so + // constants for removing matches that are too short from chaining and/or consensus steps + let min_chain_frac = 0.5; // requires this fraction to go into the chaining step + let min_consensus_frac = 0.5; // requires this fraction to go into the consensus step; we usually want this to be relatively high + let min_typing_frac = 0.9; // requires high fraction of the allele to get a type; otherwise assigned to "UNKNOWN" + + // derive these, which are what is actually used below + let max_missing_chain_frac = 1.0 - min_chain_frac; + let max_missing_consensus_frac = 1.0 - min_consensus_frac; + let max_missing_typing_frac = 1.0 - min_typing_frac; + assert!(max_missing_chain_frac >= 
max_missing_consensus_frac); // make sure we never break this assumption + + // 4? - identify all putative D6, D7, hybrid, and deletion regions + // we want these region results in a BTree because we need to traverse them consistently (i.e., sorted order) so we can map the downstream results. + let mut regions_of_interest: BTreeMap> = Default::default(); + for (read_id, record) in read_collection.iter() { + debug!("Searching {read_id} at {}", record.pos()); + // let read_sequence = String::from_utf8(record.seq().as_bytes())?; + let read_sequence = read_sequences.get(read_id).unwrap(); // we want the filtered sequence, not the original + let penalize_unmapped = false; // penalize is fine here, we just want the locations at this point + let initial_regions = d6_typer.find_base_type_in_sequence( + read_sequence, + penalize_unmapped, + max_missing_chain_frac + )?; + debug!("Found {} regions of interest.", initial_regions.len()); + regions_of_interest.insert(read_id.clone(), initial_regions); + } + + // TODO: many of these should likely be CLI options - depth and AF will be relevant when we get to targeted + // relevant question: what does `min_count` mean when our reads don't fully span a region anymore, should we weight by fraction covered? + // I think this will be more important for chaining that consensus + // prep the consensus algo + let offset_compare_length = 100; + let offset_window = 50; // we want it to look +-50 bp, but the config only lets us look before; so we have to shift things + let config_offset_window = 2 * offset_window; + let consensus_config = CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .min_count(cli_settings.min_consensus_count) + .min_af(cli_settings.min_consensus_fraction) + .dual_max_ed_delta(cli_settings.dual_max_ed_delta) + .allow_early_termination(true) + .weighted_by_ed(false) // currently, I'm not convinced on either approach + .consensus_cost(waffle_con::cdwfa_config::ConsensusCost::L1Distance) + .max_queue_size(20) + .max_capacity_per_size(10) + .offset_window(config_offset_window) + .offset_compare_length(offset_compare_length) + .build()?; + + // now prep the priority consensus runner - we have HPC as priority, then full length + let mut consensus_dwfa = PriorityConsensusDWFA::with_config(consensus_config.clone())?; + + // load all the corresponding sequences for the regions of interest into the consensus generator; these are in a fixed order + let mut raw_sequences: Vec<&str> = vec![]; // stores just the part of the read matching the region + let mut shifted_sequences: Vec = vec![]; // stores the part of the read matching the region AND any prefix wildcards to shift it + let mut hpc_shifted_sequences: Vec = vec![]; // stores the HPC string AND any prefix wildcards to HPC shift it; note, this prefix will be shorter than above due to re-anchoring + let mut base_offsets: Vec = vec![]; + let mut hpc_offsets: Vec = vec![]; + + let mut sequence_ids: Vec = vec![]; // an ID to reference the read, region, and allele name + let mut flattened_regions_of_interest: Vec<(String, AlleleMapping)> = Default::default(); // flattened version of the regions_of_interest for quick lookup + + for (read_id, regions) in regions_of_interest.iter() { + let read_sequence = read_sequences.get(read_id).unwrap().as_bytes(); + for region in regions.iter() { + if region.mapping_stats().custom_score(true).score() > max_missing_consensus_frac { + debug!("Ignoring {read_id}-{:?} for consensus generation: {}", region.region(), region.mapping_stats().custom_score_string(true)); + 
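+                // note: regions skipped here are only excluded from consensus building;
+                // the chaining step below still revisits every entry in regions_of_interest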
continue; + } + + // get the prefix and postfix components + let prefix = String::from_utf8(vec![b'*'; region.mapping_stats().clipped_start().unwrap()])?; + // let postfix = String::from_utf8(vec![b'*'; region.mapping_stats.clipped_end().unwrap()])?; + let seq = std::str::from_utf8(&read_sequence[region.region().clone()])?; + + // this is just the raw matching sequence + raw_sequences.push(seq); + + // generate the full target sequence + let full_sequence = prefix.clone() + seq;// + &postfix; + shifted_sequences.push(full_sequence); + base_offsets.push( if prefix.is_empty() { + 0 + } else { + prefix.len() + offset_window + }); // add in the offset window buffer since we want it to look +- the approx offset + + let guide_id = region.allele_label(); + let guide_seq = d6_typer.get_allele(guide_id).unwrap(); + let (hpc_sequence, prefix_offset) = hpc_with_guide(seq, guide_seq, prefix.len())?; + hpc_shifted_sequences.push(hpc_sequence); + hpc_offsets.push(if prefix_offset == 0 { + 0 + } else { + prefix_offset + offset_window + }); // add in the offset window buffer since we want it to look +- the approx offset + + // add a sequence ID for easy tracking + sequence_ids.push(format!("{read_id}_{}_{}_{}", region.region().start, region.region().end, region.allele_label())); + + // flatten the regions also + flattened_regions_of_interest.push((read_id.clone(), region.clone())); + } + } + + debug!("sequence_ids: {sequence_ids:?}"); + + // now add all of them to the DWFA + // for ((hpc_ss, ss), (_fr_read_id, fr_region)) in (hpc_shifted_sequences.iter().zip(shifted_sequences.iter())).zip(flattened_regions_of_interest.iter()) { + for (seq_index, hpc_ss) in hpc_shifted_sequences.iter().enumerate() { + // let ss = &shifted_sequences[seq_index]; + let raw_seq = raw_sequences[seq_index]; + let fr_region = &flattened_regions_of_interest[seq_index].1; + + // this basically forces all *5 into a separate grouping from the get-go + // we needed this because have three hyper-diverse alleles in one led to death spiral in one sample + let seed = match fr_region.allele_label().region_type() { + Cyp2d6RegionType::Cyp2d6Deletion => Some(0), + Cyp2d6RegionType::Rep6 => Some(1), + Cyp2d6RegionType::Rep7 => Some(2), + Cyp2d6RegionType::Spacer => Some(3), + Cyp2d6RegionType::LinkRegion => Some(4), + _ => None + }; + + // we chain first by HPC and second by the full sequence + let sequence_chain = vec![ + hpc_ss.as_bytes(), + raw_seq.as_bytes() + ]; + + // build the offset chains, set anything with offset 0 to just be auto-start + let hpc_off = if hpc_offsets[seq_index] == 0 { + None + } else { + Some(hpc_offsets[seq_index]) + }; + let base_off = if base_offsets[seq_index] == 0 { + None + } else { + Some(base_offsets[seq_index]) + }; + + let offset_chain = vec![ + hpc_off, base_off + ]; + consensus_dwfa.add_seeded_sequence_chain(sequence_chain, offset_chain, seed)?; + } + + // make sure we found some sequence + if consensus_dwfa.sequences().is_empty() { + warn!("No reads found for CYP2D6 consensus generation."); + + // finally lets build our results + let diplotypes = vec![Diplotype::new("NO_READS", "NO_READS")]; + debug!("Full diplotype for CYP2D6 => \"{}\"", diplotypes[0].diplotype()); + + let pgx_gene_details = PgxGeneDetails::new_from_multi_mappings( + diplotypes, None, vec![] + )?; + return Ok(pgx_gene_details); + } + + // set to true if you need to debug and print out some sequences in a fasta-like system + let debug_sequences: bool = false; + + // now solve the core consensus + let raw_consensus_result = 
consensus_dwfa.consensus()?; + debug!("Found {} raw consensus sequences", raw_consensus_result.consensuses().len()); + if debug_sequences { + for (i, c) in raw_consensus_result.consensuses().iter().enumerate() { + println!(">raw_con_{i}"); + println!("{}", std::str::from_utf8(c[0].sequence()).unwrap()); + } + } + + debug!("Found {} expanded consensus sequence", raw_consensus_result.consensuses().len()); + if debug_sequences { + for (i, c) in raw_consensus_result.consensuses().iter().enumerate() { + println!(">expanded_con_{i}"); + println!("{}", std::str::from_utf8(c[1].sequence()).unwrap()); + } + } + + // nowe we need to collapse those that are identical at HPC and map to same allele + let consensus_result = merge_consensus_results( + // &shifted_sequences, + &raw_sequences, + &base_offsets, + &consensus_config, + &raw_consensus_result, + &d6_typer, + max_missing_typing_frac + )?; + + debug!("Found {} final consensus sequences", consensus_result.consensuses().len()); + if debug_sequences { + for (i, c) in consensus_result.consensuses().iter().enumerate() { + println!(">final_con_{i}"); + println!("{}", std::str::from_utf8(c.sequence()).unwrap()); + } + } + + /* + // this is a test block that will compare HPC sequences + for (i, c) in consensus_result.consensuses().iter().enumerate() { + let hpc1 = hpc(std::str::from_utf8(c.sequence()).unwrap())?; + for (j, c2) in consensus_result.consensuses().iter().enumerate().skip(i+1) { + let ed = waffle_con::sequence_alignment::wfa_ed(c.sequence(), c2.sequence()); + let hpc2 = hpc(std::str::from_utf8(c2.sequence()).unwrap())?; + let hpc_ed = waffle_con::sequence_alignment::wfa_ed(hpc1.as_bytes(), hpc2.as_bytes()); + println!("{i} {j} => {ed} {hpc_ed}"); + } + } + todo!("inspect above"); + */ + + debug!("Sequence to consensus: {:?}", consensus_result.sequence_indices()); + assert_eq!(raw_sequences.len(), consensus_result.sequence_indices().len()); + + // figure out what each consensus haplotype is + let mut hap_labels = vec![]; + let mut sequences_labeled: HashSet = Default::default(); + for (i, c) in consensus_result.consensuses().iter().enumerate() { + let matches = consensus_result.sequence_indices().iter() + .filter(|&&con_index| con_index == i) + .count(); + + let sequence_to_type = std::str::from_utf8(c.sequence()).unwrap() + .trim_matches('*'); + + debug!("Typing consensus #{i} with {matches} matches, {} wildcards trimmed", c.sequence().len() - sequence_to_type.len()); + + // we are *expecting* full length sequences in our consensus + let force_assignment = true; // at this point, we need a label even if there is some ambiguity + let hap_label = match d6_typer.find_full_type_in_sequence(sequence_to_type, max_missing_typing_frac, force_assignment) { + Ok(hl) => hl, + Err(e) => { + let unknown = Cyp2d6RegionLabel::new_unknown(); + error!("Error while typing consensus #{i}, setting to {unknown}."); + error!("Typing error: {e}"); + unknown + } + }; + + // do a check to make sure we do not already have this sequence + let hap_label = if sequences_labeled.contains(sequence_to_type) { + // we somehow created two identical alleles and split them into two groups + // label this second one as a FalseAllele so we ignore it later + debug!("Detected duplicate allele in consensus {i}, marking as FalseAllele"); + Cyp2d6RegionLabel::new( + Cyp2d6RegionType::FalseAllele, + Some(hap_label.full_allele()) + ) + } else { + // new sequence, no changes needed + sequences_labeled.insert(sequence_to_type.to_string()); + hap_label + }; + + hap_labels.push(hap_label); + 
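+        // invariant: hap_labels stays index-aligned with consensus_result.consensuses(),
+        // so hap_labels[i] always describes consensus #i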
} + + // make the output BAM if requested + if let Some(dbw) = debug_bam_writer { + // create an unmapped record for each sequence that went into consensus + let mut unmapped_records = vec![]; + for (seq_id, (raw_seq, &phase_id)) in raw_sequences.iter().zip(consensus_result.sequence_indices().iter()).enumerate() { + // TODO: long-term, we probably want to trace this to a read if possible + let qname = format!("seq_{seq_id}"); + let sequence = raw_seq.to_string(); // rev-comp not necessary because these are sourced from BAM records + let tags = [ + ("HP".to_string(), format!("{phase_id}_{}", hap_labels[phase_id])) + ].into_iter().collect(); + + // add the record + match unmapped_record(&qname, &sequence, &tags) { + Ok(umr) => { + unmapped_records.push(umr); + }, + Err(e) => { + error!("Error while creating unmapped record: {e}"); + } + }; + } + + // this does all the alignment work for us + match dbw.map_records_to_region(&unmapped_records, &bam_region) { + Ok(()) => {}, + Err(e) => { + error!("Error while mappings records to debug BAM: {e}"); + } + }; + } + + // build up all the chains + let mut qname_chains: BTreeMap>> = Default::default(); + let mut unique_chains: HashSet> = Default::default(); + let mut best_allele_mapping_counts: Vec = vec![0; hap_labels.len()]; + let mut qname_chain_scores: BTreeMap> = Default::default(); + let mut multi_mapping_details = vec![]; + for (read_id, regions) in regions_of_interest.iter() { + if regions.is_empty() { + continue; + } + debug!("Labeling chains in {read_id}..."); + let read_sequence = read_sequences.get(read_id).unwrap().as_bytes(); + let mut putative_chains: Vec> = vec![vec![]]; + let mut weighted_chains: Vec = vec![]; + for (region_index, region) in regions.iter().enumerate() { + debug!("\tScanning {:?}", region.region()); + + // pull the sequence + let seq = std::str::from_utf8(&read_sequence[region.region().clone()])?; + + // score this sequence against each consensus + let weighted_scores = weight_sequence(seq, &consensus_result, &hap_labels)?; + if weighted_scores.is_empty() { + // all mappings were bad, we should skip this one + // this should only happen at the edges, print a warning if it doesn't + if region_index != 0 && region_index != regions.len()-1 { + warn!("\tRemoved putative allele mid-read due to no good matches, chaining may be impacted."); + } else { + debug!("\tRemoved putative allele at start/end due to no good matches."); + } + continue; + } + + let min_ed = weighted_scores.iter() + .min_by(|a, b| a.0.partial_cmp(&b.0).unwrap()) + .unwrap().0; + let num_minimum = weighted_scores.iter() + .filter(|a| a.0 == min_ed) + .count(); + + // generate new chains by extending the current ones + let mut new_pc = vec![]; + for pc in putative_chains.into_iter() { + for (ci, score) in weighted_scores.iter().enumerate() { + if score.0 == min_ed { + let mut new_chain = pc.clone(); + new_chain.push(ci); + new_pc.push(new_chain); + + if num_minimum == 1 { + // this is a unique best match + best_allele_mapping_counts[ci] += 1; + } + } + } + } + + // now overwrite the original + putative_chains = new_pc; + + // save the scores + weighted_chains.push(weighted_scores); + } + + if putative_chains.is_empty() || (putative_chains.len() == 1 && putative_chains[0].is_empty()) { + debug!("\tNo chains found."); + } else { + if putative_chains.len() == 1 { + if putative_chains[0].len() > 1 { + // we only add those with at least one pair to this list + // TODO: do we need to parse out if there are 3+ alleles into pairs? 
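+                    // e.g., an unambiguous chain [0, 1, 2] is recorded whole here and,
+                    // via the loop below, also as its adjacent pairs [0, 1] and [1, 2]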
+ unique_chains.insert(putative_chains[0].clone()); + + if putative_chains[0].len() > 2 { + for i in 0..(putative_chains[0].len()-1) { + let pair = vec![putative_chains[0][i], putative_chains[0][i+1]]; + unique_chains.insert(pair); + } + } + } + debug!("\tMost likely chain: {:?}", putative_chains[0]); + } else { + debug!("\tAmbig chains found: {:?}", putative_chains); + } + + qname_chains.insert(read_id.clone(), putative_chains.clone()); + qname_chain_scores.insert(read_id.clone(), weighted_chains); + } + } + + // remove any chains with non-unique sub-alleles + for (_qname, chain_set) in qname_chains.iter_mut() { + let new_chain_set: Vec> = chain_set.iter() + .filter(|chain| { + chain.iter() + .all(|&c_index| best_allele_mapping_counts[c_index] > 0) + }) + .cloned() + .collect(); + + if new_chain_set.is_empty() { + panic!("chain collapse: {chain_set:?} => {new_chain_set:?}"); + } + + *chain_set = new_chain_set; + + // make sure we didn't somehow remove all options + assert!(!chain_set.is_empty()); + } + + // count the chain frequencies + let mut single_frequency: BTreeMap = Default::default(); + let mut chain_frequency: BTreeMap, f64> = Default::default(); + + // also count the ambiguous ones, but these are down-weighted by the number of candidates + for (qname, chain_set) in qname_chains.iter() { + let weight = 1.0 / chain_set.len() as f64; + for chain in chain_set.iter() { + // increment the chain + let entry = chain_frequency.entry(chain.clone()).or_default(); + *entry += weight; + + // increment each singleton + for &c in chain.iter() { + let entry = single_frequency.entry(c).or_default(); + *entry += weight; + } + } + + if chain_set.len() == 1 { + for (&consensus_index, region) in chain_set[0].iter().zip(regions_of_interest.get(qname).unwrap().iter()) { + multi_mapping_details.push(PgxMultiMappingDetails::new( + qname.clone(), + region.region().clone(), + consensus_index, + hap_labels[consensus_index].full_allele() + )); + } + } + } + + // debug for unique counts + debug!("Uniquely assigned table:"); + for (con_index, &unique_count) in best_allele_mapping_counts.iter().enumerate() { + // if there are no unique reads; THEN we mark this as a FalseAllele + if unique_count == 0 && + hap_labels[con_index].region_type() != Cyp2d6RegionType::Unknown && + hap_labels[con_index].region_type() != Cyp2d6RegionType::FalseAllele { + // this is a false allele, nothing is uniquely mapping to it; retain the original label as the subtype + hap_labels[con_index] = Cyp2d6RegionLabel::new( + Cyp2d6RegionType::FalseAllele, + Some(hap_labels[con_index].full_allele()) + ); + } + debug!("\t{con_index}_{} => {unique_count}", hap_labels[con_index]); + } + + // debug output for the table + debug!("Allele count table:"); + for (&con_index, count) in single_frequency.iter() { + debug!("\t{con_index}_{} => {count}", hap_labels[con_index]); + } + + // print the uniquely assigned chains as well + debug!("Unique chains:"); + for chain in unique_chains.iter().sorted() { + let string_form: Vec = chain.iter() + .map(|&c_index| format!("{c_index}_{}", hap_labels[c_index])) + .collect(); + debug!("\t{string_form:?}"); + } + + // debug output for the table + debug!("Chain count table:"); + for (chain, count) in chain_frequency.iter() { + let string_form: Vec = chain.iter() + .map(|&c_index| format!("{c_index}_{}", hap_labels[c_index])) + .collect(); + debug!("\t{string_form:?} => {count}") + } + + if let Some(debug_folder) = cli_settings.debug_folder.as_ref() { + // make the output link graph if we have a debug folder + let 
out_graph_fn = debug_folder.join("cyp2d6_link_graph.svg"); + debug!("Generating CYP2D6 graph at {out_graph_fn:?}"); + if let Err(e) = crate::cyp2d6::visualization::generate_debug_graph(&hap_labels, &chain_frequency, &out_graph_fn) { + error!("Error while generating CYP2D6 debug graph: {e}"); + } + + // make the fasta output as well + let consensus_fn = debug_folder.join("consensus_CYP2D6.fa"); + debug!("Saving consensus for CYP2D6 to {consensus_fn:?}"); + let mut consensus_map: BTreeMap = Default::default(); + for (index, (label, consensus)) in hap_labels.iter().zip(consensus_result.consensuses().iter()).enumerate() { + let k = format!("{index}_{label}"); + let v = std::str::from_utf8(consensus.sequence())?.to_string(); + consensus_map.insert(k, v); + } + save_fasta(&consensus_map, &consensus_fn)?; + } + + // parameters that control chaining + let infer_connections = cli_settings.infer_connections; + let normalize_all_alleles = !cli_settings.normalize_d6_only; + let penalties: ChainPenalties = Default::default(); + let ignore_chain_label_limits = false; // this should always be false in prod; true is just for testing + let (best_result, chain_warnings) = find_best_chain_pair( + database.cyp2d6_config(), + &qname_chains, &qname_chain_scores, &hap_labels, + infer_connections, normalize_all_alleles, penalties, ignore_chain_label_limits + )?; + if !chain_warnings.is_empty() { + warn!("Chain warnings: {chain_warnings:?}"); + } + + if best_result.len() != 2 { + bail!("best_result has non-2 length: {best_result:?}"); + } + + debug!("Best_results:"); + for chain in best_result.iter() { + let string_form = chain.iter() + .map(|&c_index| format!("{c_index}_{}", hap_labels[c_index])) + .join(" -> "); + debug!("\t{string_form}"); + } + + if let Some(debug_folder) = cli_settings.debug_folder.as_ref() { + // we have all the data to build a custom session file now; first, we need to build our custom reference genome + let contig_key = CUSTOM_CONTIG.to_string(); + match create_custom_cyp2d6_reference( + reference_genome, database, + &consensus_result, &hap_labels, &best_result + ) { + Ok(cust_ref) => { + let custom_sequence = &cust_ref.sequence; + let custom_regions = &cust_ref.regions; + let mut custom_reference = ReferenceGenome::empty_reference(); + custom_reference.add_contig(contig_key, custom_sequence).unwrap(); + + // collect all our reads for re-mapping into the special D6 regions + let all_records = read_collection.into_iter() + .filter_map(|(k, v)| { + if !regions_of_interest.get(&k).unwrap().is_empty() { + // only keep a read if we found something meaningful inside it + Some(v) + } else { + None + } + }) + .collect(); + + // finally, put it all together + let session_folder = debug_folder.join("cyp2d6_igv_custom"); + let mut session_writer = IgvSessionWriter::new( + session_folder, + custom_reference, + custom_regions.clone(), + all_records + ); + if let Err(e) = session_writer.write_session() { + error!("Error while writing custom session file: {e}"); + }; + }, + Err(e) => { + error!("Error while creating custom CYP2D6 reference file: {e}"); + } + } + } + + // finally lets build our results + let cyp_translate = d6_typer.cyp2d6_config().cyp_translate(); + let hap1 = convert_chain_to_hap(&best_result[0], &hap_labels, true, cyp_translate); + let hap2 = convert_chain_to_hap(&best_result[1], &hap_labels, true, cyp_translate); + let diplotypes = vec![Diplotype::new(&hap1, &hap2)]; + debug!("Full diplotype for CYP2D6 => \"{}\"", diplotypes[0].diplotype()); + + let hap1_collapsed = 
convert_chain_to_hap(&best_result[0], &hap_labels, false, cyp_translate); + let hap2_collapsed = convert_chain_to_hap(&best_result[1], &hap_labels, false, cyp_translate); + let diplotypes_collapsed = vec![Diplotype::new(&hap1_collapsed, &hap2_collapsed)]; + debug!("Simple diplotype for CYP2D6 => \"{}\"", diplotypes_collapsed[0].diplotype()); + + // build the PGx details for D6 + // TODO: this is currently only capturing mapping info at the consensus stage; ultimately we would want to know some details from the chaining + // and also probably the full length D6 sequences somehow + // additionally, the ambig_chains is getting split out above; we can avoid the split by checking for .len() == 1 and then keep reads in order + // ideally, this would let us follow a read through the *whole* process and create a final BAM as well instead of the current intermediate + let pgx_gene_details = PgxGeneDetails::new_from_multi_mappings(diplotypes, Some(diplotypes_collapsed), multi_mapping_details)?; + Ok(pgx_gene_details) +} + +/// This will take a priority based consensus and collapse HPC & assignment identical alleles into single entries. +/// # Arguments +/// * `sequences` - the full length sequences that built the consensus, should correspond to the second entries in the PriorityConsensus +/// * `cdwfa_config` - configuration used to determine consensus +/// * `raw_consensus_result` - the original consensus +/// * `d6_typer` - utility for assigning labels to alleles +/// * `max_missing_consensus_frac` - the maximum allowed missing from a consensus for identification +fn merge_consensus_results( + sequences: &[&str], + offsets: &[usize], + cdwfa_config: &CdwfaConfig, + raw_consensus_result: &PriorityConsensus, + d6_typer: &Cyp2d6Extractor, + max_missing_consensus_frac: f64 +) -> Result> { + let mut consensus_set: BTreeMap<(String, String), Vec> = Default::default(); + let mut unknown_set: BTreeMap> = Default::default(); + for (i, consensus) in raw_consensus_result.consensuses().iter().enumerate() { + // pull out the two consensus sequences + let hpc_consensus: String = std::str::from_utf8(consensus[0].sequence())?.to_string(); + let full_consensus: &str = std::str::from_utf8(consensus[1].sequence())?; + + // figure out the label for this consensus + let sequence_to_type = full_consensus.trim_matches('*'); + debug!("Typing consensus #{i}, {} wildcards trimmed", full_consensus.len() - sequence_to_type.len()); + + // we are *expecting* full length sequences in our consensus + let force_assignment = false; // if we have label ambiguity, then the allele is incomplete and we want it to get merged if possible + let allele_label = match d6_typer.find_full_type_in_sequence(sequence_to_type, max_missing_consensus_frac, force_assignment) { + Ok(al) => al, + Err(e) => { + let unknown = Cyp2d6RegionLabel::new_unknown(); + error!("Error while typing consensus #{i}, setting to {unknown}."); + error!("Typing error: {e}"); + unknown + } + }; + + // reduce the label for merging + let detailed = true; // true means we keep sub-alleles such as "*4.001" + let reduced_label = allele_label.simplify_allele(detailed, d6_typer.cyp2d6_config().cyp_translate()); + debug!("Reduced {allele_label} to {reduced_label} for merging."); + + if !allele_label.is_allowed_label() { + // this one is not allowed, lets see if we can merge it into the parent later + let entry = unknown_set.entry(hpc_consensus).or_default(); + entry.push(i); + } + else { + // now save it to key (hpc sequence, allele label) + let entry = 
consensus_set.entry((hpc_consensus, reduced_label)).or_default(); + entry.push(i); + } + } + + // see if we can collapse the unknowns into a parent + let mut intentional_ignore: HashSet<(String, String)> = Default::default(); + for (hpc_consensus, entries) in unknown_set.into_iter() { + // figure out how many matches we have + let mut other_keys = vec![]; + for (merge_key, _merge_set) in consensus_set.iter() { + if hpc_consensus == merge_key.0 { + other_keys.push(merge_key.clone()); + } + } + + let hpc_count = other_keys.len(); + + // TODO: handling of multiple or no matches may need to be explored further in the future + match hpc_count { + 0 => { + // we have no HPC neighbors, so just collapse all of them into one big UNKNOWN pile + let unknown = Cyp2d6RegionLabel::new_unknown(); + assert!(consensus_set.insert((hpc_consensus, unknown.full_allele()), entries).is_none()); + }, + 1 => { + // we found exactly one parent option we can merge with, so lets do it! + let other_key = other_keys.pop().unwrap(); + let entry = consensus_set.get_mut(&other_key).unwrap(); + debug!("Collapsing entries {entries:?} into HPC relative {} ({entry:?})", other_key.1); + entry.extend(entries); + } + _n => { + // there are multiple, and we don't have logic to resolve that currently... + // for now, we will completely ignore the read set + // TODO: there may come a point where we want to somehow compare these groupings to the HPC relatives and force them into one + debug!("Multiple collapse options detected for entries {entries:?}, ignoring."); + let unknown = Cyp2d6RegionLabel::new_unknown(); + intentional_ignore.insert((hpc_consensus.clone(), unknown.full_allele())); + assert!(consensus_set.insert((hpc_consensus, unknown.full_allele()), entries).is_none()); + } + }; + } + + let mut consensuses = vec![]; + let mut sequence_indices = vec![usize::MAX; raw_consensus_result.sequence_indices().len()]; + for (masked_consensus, con_indices) in consensus_set.iter() { + let con_index = consensuses.len(); + debug!("Collapsing {con_indices:?} into {con_index}"); + let consensus: Consensus = if intentional_ignore.contains(masked_consensus) { + // fill in the si + let mut num_scored = 0; + for (i, si) in raw_consensus_result.sequence_indices().iter().enumerate() { + if con_indices.contains(si) { + sequence_indices[i] = con_index; + num_scored += 1; + } + } + + // this was a group we made but that we should really ignore + Consensus::new(vec![], waffle_con::cdwfa_config::ConsensusCost::L1Distance, vec![0; num_scored]) + } else if con_indices.len() == 1 { + // we can just copy the result, but we do still need to propagate the sequence index information + for (i, &si) in raw_consensus_result.sequence_indices().iter().enumerate() { + if si == con_indices[0] { + sequence_indices[i] = con_index; + } + } + + // return the copy of the full consensus (index 1) + raw_consensus_result.consensuses()[con_indices[0]][1].clone() + } else { + // we have a merge situation + let mut combined_consensus = ConsensusDWFA::with_config(cdwfa_config.clone())?; + for (si, (seq, offset)) in (sequences.iter().zip(offsets.iter())).enumerate() { + let assigned_consensus = raw_consensus_result.sequence_indices()[si]; + if con_indices.contains(&assigned_consensus) { + let opt_offset = if *offset == 0 { None } else { Some(*offset) }; + + // add the sequence + combined_consensus.add_sequence_offset(seq.as_bytes(), opt_offset)?; + // at the same time, mark this sequence index + sequence_indices[si] = con_index; + } + } + + // now run the consensus + let mut 
new_consensus_set = combined_consensus.consensus()?; + assert!(!new_consensus_set.is_empty()); + if new_consensus_set.len() > 1 { + warn!("Multiple consensuses found during collapse, picking first."); + } + new_consensus_set.remove(0) + }; + consensuses.push(consensus); + } + + // make sure all sequences have been re-assigned to a valid index + assert!(sequence_indices.iter().all(|&v| v < consensuses.len())); + + Ok(MultiConsensus::new( + consensuses, + sequence_indices + )) +} + +/// Given a particular chain, this will construct the user friendly visual of that chain. +/// E.g. [0, 0, 1] will become "*4x2 + *10" +/// # Arguments +/// * `chain` - the chains of alleles to tie together +/// * `hap_labels` - the haplotype labels for the internals, these can get converted to alleles where appropriate +/// * `detailed` - if False, then this will reduce any D6 subunits into their integer form; e.g., *4.001 -> *4 +/// * `cyp_translate` - a map from internal CYP name to user friendly name +pub fn convert_chain_to_hap(chain: &[usize], hap_labels: &[Cyp2d6RegionLabel], detailed: bool, cyp_translate: &BTreeMap) -> String { + // track the number of non-deletion alleles we identify + // this is robust to a potential *5x2 situation (which may also have reporting issue, but that's a future problem) + let mut num_non_deletion = 0; + + // first identify the reportable indices in the chain + let reportable_indices: Vec = chain.iter() + .rev() + .filter(|&&c_index| { + // only keep CYP2D alleles that are not CYP2D7 + let keep_allele = hap_labels[c_index].is_cyp2d() && hap_labels[c_index].region_type() != Cyp2d6RegionType::Cyp2d7; + if keep_allele && hap_labels[c_index].region_type() != Cyp2d6RegionType::Cyp2d6Deletion { + num_non_deletion += 1; + } + keep_allele + }) + .copied() + .collect(); + + // secondary filtering and translation into a string + reportable_indices.iter() + .filter_map(|&c_index| { + if hap_labels[c_index].region_type() == Cyp2d6RegionType::Cyp2d6Deletion && num_non_deletion > 0 { + // this will filter out *5 (deletion) alleles if something else is on the same chain; i.e. 
*5+*10 is not an allowed report (although it may happen biologically) + None + } else { + // passed all above secondary filtering + // convert the allele index into a human readable string + let string_label = hap_labels[c_index].simplify_allele(detailed, cyp_translate); + Some(string_label) + } + }) + .group_by(|v| v.clone()) // group them by the ID so we get adjacent counts + .into_iter() + .map(|(string_label, group)| { + let group_len = group.count(); + // add in any xN values + if group_len > 1 { + format!("{string_label}x{group_len}") + } else { + string_label + } + }) + .join(" + ") +} + +#[cfg(test)] +mod tests { + use crate::cyp2d6::definitions::Cyp2d6Config; + + use super::*; + + /* + TODO: tests that ideally exist but are difficult to implement: + - diplotype_cyp2d6 - this is basically end-to-end test; for now, we will rely on the pipeline since this is difficult to encode + - merge_consensus_results - also difficult to do, we need to have a D6 typer built for this, which means loading one from somewhere; pipeline will suffice for now + */ + + #[test] + fn test_convert_chain_to_hap() { + let hap_labels = vec![ + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d7, None), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("1.001".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("10".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("1.002".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("1.002".to_string())) + ]; + + // get the translator for testing + let cyp2d6_config = Cyp2d6Config::default(); + let cyp_translate = cyp2d6_config.cyp_translate(); + + // basic example + let chain = vec![2, 2, 1, 0]; + let hap = convert_chain_to_hap(&chain, &hap_labels, true, cyp_translate); + assert_eq!(&hap, "*1.001 + *10x2"); // remember these get reversed + + // eventually this will get collapsed instead of separate + let chain = vec![3, 1, 0]; + let hap = convert_chain_to_hap(&chain, &hap_labels, true, cyp_translate); + assert_eq!(&hap, "*1.001 + *1.002"); + + // test the collapsed version + let hap = convert_chain_to_hap(&chain, &hap_labels, false, cyp_translate); + assert_eq!(&hap, "*1x2"); + + // tests if there are two near-identical alleles with the same name get collapsed + let chain = vec![3, 4]; + let hap = convert_chain_to_hap(&chain, &hap_labels, true, cyp_translate); + assert_eq!(&hap, "*1.002x2"); + } +} diff --git a/src/cyp2d6/chaining.rs b/src/cyp2d6/chaining.rs new file mode 100644 index 0000000..0b151e2 --- /dev/null +++ b/src/cyp2d6/chaining.rs @@ -0,0 +1,1033 @@ + +use log::{debug, log_enabled, trace}; +use minimap2::Aligner; +use simple_error::bail; +use std::collections::BTreeMap; +use waffle_con::multi_consensus::MultiConsensus; + +use crate::cyp2d6::caller::convert_chain_to_hap; +use crate::cyp2d6::definitions::Cyp2d6Config; +use crate::cyp2d6::errors::{CallerError, CallerWarning}; +use crate::cyp2d6::region_label::{Cyp2d6RegionLabel, Cyp2d6RegionType}; +use crate::data_types::mapping::MappingStats; +use crate::util::stats::multinomial_ln_pmf; + +/// This is a wrapper that will be the same length as the number of identified consensus sequences. +/// Additionally, the scores are edit distance and the overlap score +pub type SequenceWeights = Vec<(usize, f64)>; + +/// Given a sequence and a multi-consensus, this will compare the sequence to each consensus and score it based on edits and overlaps. 
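+/// Each entry of the returned `SequenceWeights` is an (edit distance, overlap score) pair, index-aligned with `con_labels`.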
+/// This assumes that the full `sequence` will be used, so penalties for unmapped bases from `sequence` are included when scoring. +/// If the best found mapping has a score > 0.05, then all mappings are marked as equally bad. +/// This step is necessary to generate SequenceWeights prior to finding the best chain solution. +/// # Arguments +/// * `sequence` - the sequence to compare to the consensuses, we expect this to be fully represented +/// * `consensus` - the multi-consensus containing all allowed consensuses +/// * `con_labels` - human readable labels for the consensuses (e.g. star-alleles) +pub fn weight_sequence(sequence: &str, consensus: &MultiConsensus, con_labels: &[Cyp2d6RegionLabel]) -> Result<SequenceWeights, Box<dyn std::error::Error>> { + let dna_aligner: Aligner = Aligner::builder() + .map_hifi() + .with_cigar() + .with_seq(sequence.as_bytes())?; + let seq_len = sequence.len(); + + // we only need cigar and md for debugging + // other settings for mapping + let output_cigar: bool = true; + let output_md: bool = true; + let max_frag_len: Option<usize> = None; + let extra_flags = None; + + // default is the length of the sequence getting "deleted" with 0 overlap + let mut ret = vec![(seq_len, 0.0); con_labels.len()]; + + // track the minimum observed NM-only ED; if it's larger than the maximum then this read does not map well to anything + let penalize_unmapped = true; // we are mapping consensuses against a sequence from a read, penalize if it cannot fill out that sequence space + let maximum_allowed_ed: f64 = 0.05; // TODO: I have a feeling we _should_ lower this, maybe down to 2-3%? + let mut min_ed_frac: f64 = 1.0; + + for (con_index, (con, label)) in consensus.consensuses().iter().zip(con_labels.iter()).enumerate() { + let con_seq = con.sequence(); + + if !label.is_allowed_label() { + // we ignore all Unknown and FalseAlleles + continue; + } + + // first, map the sequence + let mappings = dna_aligner.map( + con_seq, + output_cigar, output_md, max_frag_len, extra_flags.clone() + )?; + + for m in mappings.iter() { + let con_len = con_seq.len(); + let nm = m.alignment.as_ref().unwrap().nm as usize; + let unmapped = seq_len - (m.target_end - m.target_start) as usize; + + // the amount clipped at the start is the amount into query that we start + let clipped_start = m.query_start as usize; + // the amount clipped at the end is the length minus the query end point + let clipped_end = con_len - m.query_end as usize; + + let mapping_stats = MappingStats::new_with_clippings( + seq_len, nm, unmapped, + clipped_start, clipped_end + ); + + // we are mapping consensuses against a sequence from a read, penalize if it cannot fill out that sequence space + // this means we count both the nm() and unmapped() *against* each mapping + let match_score = mapping_stats.nm() + mapping_stats.unmapped(); + let overlap_score = 1.0 - (clipped_start + clipped_end) as f64 / con_len as f64; + let mapping_score = (match_score, overlap_score); + + debug!("\t\t{con_index}_{label} => ({:.4}, {:.4}) => {}", mapping_score.0, mapping_score.1, mapping_stats.custom_score_string(penalize_unmapped)); + + // if the edit distance is less OR it's equal but the overlap score is higher + if mapping_score.0 < ret[con_index].0 || ( + mapping_score.0 == ret[con_index].0 && mapping_score.1 > ret[con_index].1 + ) { + ret[con_index] = mapping_score; + min_ed_frac = min_ed_frac.min(mapping_stats.custom_score(penalize_unmapped).score()); + } + } + } + + if min_ed_frac <= maximum_allowed_ed { + // we found something that is seemingly matching at least one sequence +
Ok(ret) + } else { + // all of the mappings are bad, send back empty vec for ignoring + Ok(vec![]) + } +} + +/// Contains the costs associated with each type of penalty +#[derive(Clone, Debug)] +pub struct ChainPenalties { + /// this is the penalty for duplicating an allele, increasing this may lead to under-estimation of CN + pub lasso_penalty: f64, + /// the log penalty for each edit + pub ln_ed_penalty: f64, + /// a penalty applied for an unexpected chain combination + pub unexpected_chain_penalty: f64, + /// penalty applied for each inferred edge + pub inferred_edge_penalty: f64 +} + +impl Default for ChainPenalties { + fn default() -> Self { + Self { + lasso_penalty: 4.0, + ln_ed_penalty: 2.0, // -(0.01_f64.ln()) + unexpected_chain_penalty: 10.0, + inferred_edge_penalty: 2.0 + } + } +} + +impl ChainPenalties { + /// Creates a new collection of chaining penalties with the given penalty values + pub fn new(lasso_penalty: f64, ln_ed_penalty: f64, unexpected_chain_penalty: f64, inferred_edge_penalty: f64) -> ChainPenalties { + ChainPenalties { + lasso_penalty, + ln_ed_penalty, + unexpected_chain_penalty, + inferred_edge_penalty + } + } +} + +/// Wrapper for chain scoring, switched to struct for ease of use and to prevent bugs. +/// API is lite because this is private. +struct ChainScore { + /// The lasso cost of having too many or too few alleles; i.e. raw dups and dels have a cost + pub allele_expected_penalty: f64, + /// The number of best observations that are not met by the chain pair + pub unmet_observations: u64, + /// The number of edits that are required to make all observations match + pub edit_distance: u64, + /// The log penalty for the edit distance value + pub ln_ed_penalty: f64, + /// Multi-nomial likelihood penalty + pub mn_llh_penalty: f64, + /// Penalty for any chain events that are unexpected + pub unexpected_chain_penalty: f64, + /// Penalty for including inferred connections + pub inferred_chain_penalty: f64, + /// For debug, the allele IDs for the corresponding reduced probs and reduced coverage + pub reduced_alleles: Vec<usize>, + /// For debug, the probability of observing each allele + pub reduced_probs: Vec<f64>, + /// For debug, the actual observed counts (rounded) + pub reduced_coverage: Vec<u64>, + /// Associated chain index 1 + pub chain_index1: usize, + /// Associated chain index 2 + pub chain_index2: usize, +} + +impl ChainScore { + /// The primary score metric for this ChainScore, represented as log likelihood. + fn primary_score(&self) -> f64 { + self.ln_ed_penalty + self.mn_llh_penalty + self.allele_expected_penalty + self.unexpected_chain_penalty + self.inferred_chain_penalty + } + + /// Handy score string for debugging + fn primary_score_string(&self) -> String { + format!("{:.2} ({} ed / {} reads) + {:.2} (MNLLH) + {:.2} (CN) + {:.2} (unexp) + {:.2} (infer)", + self.ln_ed_penalty, self.edit_distance, self.unmet_observations, + self.mn_llh_penalty, + self.allele_expected_penalty, + self.unexpected_chain_penalty, + self.inferred_chain_penalty + ) + } + + /// Simplified compare function now. + fn compare(&self, other: &ChainScore) -> std::cmp::Ordering { + // all of them are within the boundaries, so just return whichever has the best cumulative score + self.primary_score().partial_cmp(&other.primary_score()).unwrap() + } +} + +/// This will search through all the observed chaining counts and return the best combination of chains, as well as warnings if we encounter any dangling chains.
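+/// Candidate chain pairs are ranked by the summed penalties in ChainScore::primary_score (edit distance, multinomial likelihood, copy-number lasso, unexpected-chain, and inferred-edge terms); the pair with the lowest total is returned.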
+/// # Arguments +/// * `cyp2d6_config` - a generally static collection for the CYP2D6 configuration +/// * `obs_chains` - a map from sequence ID to a list of equally possible chains +/// * `chain_scores` - a map from sequence ID to scores for each consensus allele in the chain; this is a tuple of form (edit_distance, overlap_score), where lower edit distance is better and overlap_score ranges from 0.0 (worst) to 1.0 (best) +/// * `hap_labels` - a set of labels, primarily for debug output +/// * `infer_connections` - if True, then this will infer connections between alleles that do not have direct observations +/// * `normalize_all_alleles` - if True, all alleles are used to normalize coverage; otherwise, just those passing `is_normalizing_allele` will get used +/// * `penalties` - the set of penalties that describe the calculations we do for tie-breaking +/// * `ignore_chain_label_limits` - in prod, this should be false; but we use true for testing simplified chains +#[allow(clippy::type_complexity, clippy::too_many_arguments)] +pub fn find_best_chain_pair( + cyp2d6_config: &Cyp2d6Config, + obs_chains: &BTreeMap<String, Vec<Vec<usize>>>, chain_scores: &BTreeMap<String, Vec<SequenceWeights>>, + hap_labels: &[Cyp2d6RegionLabel], + infer_connections: bool, normalize_all_alleles: bool, + penalties: ChainPenalties, + ignore_chain_label_limits: bool +) -> Result<(Vec<Vec<usize>>, Vec<CallerWarning>), Box<dyn std::error::Error>> { + let mut caller_warnings: Vec<CallerWarning> = vec![]; + + if penalties.lasso_penalty < 0.0 { + bail!("Lasso penalty must be >= 0.0"); + } + + // first, identify all connections + let num_haps = hap_labels.len(); + let mut downstream_possible: Vec<Vec<bool>> = vec![vec![false; num_haps]; num_haps]; // 2D [upstream][downstream] -> edge present + for (_qname, putative_chains) in obs_chains.iter() { + for chain in putative_chains.iter() { + if chain.len() > 1 { + for i in 1..chain.len() { + let upstream = chain[i-1]; + let downstream = chain[i]; + + // add in the downstream possibility for the upstream + if hap_labels[upstream].is_allowed_label() && hap_labels[downstream].is_allowed_label() { + if ignore_chain_label_limits || hap_labels[upstream].is_allowed_label_pair(&hap_labels[downstream]) { + downstream_possible[upstream][downstream] = true; + } else { + debug!("Ignoring observed chain: {upstream}_{} -> {downstream}_{}", hap_labels[upstream], hap_labels[downstream]) + } + } + } + } + } + } + + // add any inferred connections + let mut inferred_possible: Vec<Vec<bool>> = vec![vec![false; num_haps]; num_haps]; // 2D [upstream][downstream] -> edge present + let cyp_translate = cyp2d6_config.cyp_translate(); + if infer_connections { + let detailed_inference = false; // we do not care if it is *4.001 or *4.002, just *4 works + + debug!("Inferred population connections:"); + let mut found_inference = false; + for (i, h1) in hap_labels.iter().enumerate() { + // old method goes from D6 -> D6 -> D7; we want to use the chain inferrences now though + let h1_mod = h1.simplify_allele(detailed_inference, cyp_translate); + + // Note: This was a version that we tested for fixing the erroneous connection of *3 + *68 / *4. + // The problem here is that it creates situations where copy number can spiral due to a lack of checking the core alleles. + // The final solution is likely a combination of this with some more intelligent parsing. + // In the short term, this is an extreme edge case that is partly correct, we will push this to a later patch.
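+ // Inference rule (sketch): an edge i -> j is only inferred when at least one endpoint has no observed connection, both labels are allowed, and the pairing itself is allowed; see the checks below.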
+ let downstream_no_link = !downstream_possible[i].iter().any(|&b| b); + + for (j, h2) in hap_labels.iter().enumerate() { + let upstream_no_link = !downstream_possible.iter().any(|v| v[j]); + + // simpler check, if the pairing is allowed, then we will infer it in the absence of any outbound edges + if (downstream_no_link || upstream_no_link) && // make sure at least one end of this has no observed connections + !downstream_possible[i][j] && // probably redundant check + hap_labels[i].is_allowed_label() && + hap_labels[j].is_allowed_label() && + hap_labels[i].is_allowed_label_pair(&hap_labels[j]) { + // the labels are allowed to be a pair, so we will infer it as possible + let h2_mod = h2.simplify_allele(detailed_inference, cyp_translate); + debug!("\t{i}_{h1} ({h1_mod}) => {j}_{h2} ({h2_mod}) = inferred"); + inferred_possible[i][j] = true; + found_inference = true; + } + } + } + + if !found_inference { + debug!("\tNone"); + } + } + + // figure out which regions can start a chain + let head_indices: Vec<usize> = hap_labels.iter().enumerate() + .filter_map(|(i, label)| { + if ignore_chain_label_limits || // if we are ignoring labels, then allow each thing to be a head index + label.is_candidate_chain_head(normalize_all_alleles) { // this is a valid candidate head + // add it to the list + Some(i) + } else { + // not valid, so filter it out + None + } + }).collect(); + + // make sure we have chain starts, otherwise everything else will fail + if head_indices.is_empty() { + return Err(Box::new(CallerError::NoChainingHead)); + } + + // build all possible chains that can come from the head indices + let mut remaining_chains: Vec<Vec<usize>> = head_indices.iter().map(|&start_index| vec![start_index]).collect(); + let mut possible_chains = vec![]; + let max_copy_number = 3; // TODO: this *could* be a CLI parameter in the future, we just need to be careful given that downstream applications may balk + while let Some(current_chain) = remaining_chains.pop() { + // push the chain IF it ends with a D6/D7 allele (we expect to always end in D7 unless we have dropout) + // let last_label = hap_labels[*current_chain.last().unwrap()].as_str(); + // if ignore_chain_label_limits || last_label.starts_with("CYP2D") { + // TODO: can we add restrictions back in at some point?
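+ // Each popped chain is screened for disallowed inferrences, recorded if it yields a valid non-empty haplotype, and then extended by one allele per allowed edge (capped at max_copy_number copies per allele).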
+ + let (is_allowed_inferrence, is_allowed_candidate) = check_chain_inferrences(cyp2d6_config, &current_chain, hap_labels, &inferred_possible); + if !is_allowed_inferrence { + // there is an inferrence happening that is not allowed, discard this candidate entirely + continue; + } + + let simplified_chain = convert_chain_to_hap(&current_chain, hap_labels, true, cyp_translate); + if ignore_chain_label_limits || (!simplified_chain.is_empty() && is_allowed_candidate) { + // only add a chain if we are ignoring labels OR + // (if the chain produces a non-empty haplotype AND + // it is not over-inferring something) + possible_chains.push(current_chain.clone()); + } + + let current_index = *current_chain.last().unwrap(); + for (extension_index, &extension_possible) in downstream_possible[current_index].iter().enumerate() { + if !extension_possible { + // extension with this isn't even allowed, so skip it + continue; + } + + let extension_count = current_chain.iter().filter(|&&v| v == extension_index).count(); + if extension_count >= max_copy_number { + // we already have the maximum allowed copies of this (somewhere), don't allow any more in the chain + // this allows for non-adjacency + continue; + } + + // these checks prevent infinite loops by capping the copy count + let mut new_chain = current_chain.clone(); + new_chain.push(extension_index); + remaining_chains.push(new_chain); + } + + if infer_connections { + // we didn't find a direct downstream possibility, let's see if we can find an inferred connection + // add an extension for each inferrence, they are filtered later + for (extension_index, &extension_possible) in inferred_possible[current_index].iter().enumerate() { + if !extension_possible { + // extension with this isn't even allowed, so skip it + continue; + } + + let extension_count = current_chain.iter().filter(|&&v| v == extension_index).count(); + if extension_count >= 3 { + // we already have three copies of this (somewhere), don't allow another copy at this time + // this allows for non-adjacency + continue; + } + + // these checks prevent infinite loops by capping the copy count + let mut new_chain = current_chain.clone(); + new_chain.push(extension_index); + remaining_chains.push(new_chain); + } + } + } + + if possible_chains.is_empty() { + // we did not find any valid chains, likely due to low coverage + return Err(Box::new(CallerError::NoChainsFound)); + } + + debug!("Possible chains (N={}):", possible_chains.len()); + for chain in possible_chains.iter() { + debug!("\t{chain:?}"); + } + + trace!("Multiple chain pairs detected, scoring them:"); + let mut score_sets: Vec<ChainScore> = vec![]; + // figure out the best by what is explained + for i in 0..possible_chains.len() { + for j in i..possible_chains.len() { + let mut read_combined_ed: usize = 0; + let mut hap_weights = vec![0.0; hap_labels.len()]; + + for (_qname, chain_weights) in chain_scores.iter() { + let (score, chain_match) = containment_score(&possible_chains[i], &possible_chains[j], chain_weights); + // accumulate the containment score - saturating is really only necessary for unit tests + read_combined_ed = read_combined_ed.saturating_add(score); + + // divide support between the matches + let split_frac = 1.0 / chain_match.len() as f64; + for (chain_offset, &con_index) in chain_match.iter() + .flat_map(|chain| chain.iter().enumerate()) { // convert each chain into offset + consensus index iterator + // the split fraction scales the chain weight for this part of the read (chain_offset) when compared to the given consensus index + // then we select
the overlap fraction which is the second value + hap_weights[con_index] += split_frac * chain_weights[chain_offset][con_index].1; + } + } + + // at this point we have the hap weights assigned, now divide by the allele counts + let mut hap_counts = vec![0; hap_labels.len()]; + for &con_index in possible_chains[i].iter().chain(possible_chains[j].iter()) { + hap_counts[con_index] += 1; + } + + // this applies a penalty factor for each deviation from 1.0 copies + let allele_expected_penalty: f64 = penalties.lasso_penalty * hap_labels.iter().zip(hap_counts.iter()) + .filter_map(|(label, hc)| { + if label.is_allowed_label() && // make sure the label is allowed + (ignore_chain_label_limits || // if we are ignoring label (usually debug only) OR + label.is_normalizing_allele(normalize_all_alleles) || // if the allele is for normalizing OR + label.is_reported_allele() // the allele is going to appear in the output + ) { + if *hc > 0 { + Some((*hc - 1) as f64) // we are expecting 1 copy, so add the delta + } else { + // this is an allowed allele, but we do not have any copies in our hap_count, so ignore it + // we used to count this as Some(1.0), indicating that an allele was expected but is missing + // now, we just let the edit penalty handle it + None + } + } else { + // this is not an allowed label, and we are not using it in normalization + None + } + }) + .sum::<f64>(); + + // count the number of chains that we cannot exactly find in this diplotype + let mut unmet_observations = 0; + for (_qname, putative_chains) in obs_chains.iter() { + let supported = putative_chains.iter() + .any(|chain| { + // returns true if the putative chain is a sub-chain of either possible chain we're currently looking at + is_sub(&possible_chains[i], chain) || + is_sub(&possible_chains[j], chain) + }); + + // if nothing supports the putative chains, then we have a read that is an unmet observation + if !supported { + unmet_observations += 1; + } + } + + let edit_distance = read_combined_ed as u64; + let ln_ed_penalty = (read_combined_ed as f64) * penalties.ln_ed_penalty; + + // now the multinomial calculation + let mut reduced_alleles: Vec<usize> = vec![]; + let mut reduced_counts: Vec<i32> = vec![]; + let mut reduced_coverage: Vec<u64> = vec![]; + + // one loop to populate all these vecs + for (hap_index, hl) in hap_labels.iter().enumerate() { + let hap_count = hap_counts[hap_index]; + if hap_count > 0 && (ignore_chain_label_limits || hl.is_normalizing_allele(normalize_all_alleles)) { + reduced_alleles.push(hap_index); + reduced_counts.push(hap_count); + let hap_weight = hap_weights[hap_index].round() as u64; + reduced_coverage.push(hap_weight); + } + } + + // now normalize the probabilities to 1.0 + let total_hap_count: i32 = reduced_counts.iter().sum(); + let reduced_probs: Vec<f64> = reduced_counts.into_iter() + .map(|c| (c as f64) / (total_hap_count as f64)) + .collect(); + + if reduced_probs.is_empty() || reduced_coverage.iter().sum::<u64>() == 0 { + // this happens when we have a haplotype with no actual D6 alleles, it's not a valid result + continue; + } + assert_eq!(reduced_probs.len(), reduced_coverage.len()); + + /* + // turns out Multinomial is not using all ln format under the hood so it overflows, we need a custom one + let total_coverage: u64 = reduced_coverage.iter().sum(); + let multinomial = Multinomial::new(&reduced_probs, total_coverage)?; + let mn_llh_penalty = -multinomial.ln_pmf(&reduced_coverage); + */ + let mn_llh_penalty = multinomial_ln_pmf(&reduced_probs, &reduced_coverage).abs(); + + // check if we have any unexpected
chain pairs at the CYP2D level + let expectation_mismatch = if ignore_chain_label_limits { 0 } + else { unexpected_count(&possible_chains[i], hap_labels, cyp2d6_config) + unexpected_count(&possible_chains[j], hap_labels, cyp2d6_config) }; + let unexpected_chain_penalty = (expectation_mismatch as f64) * penalties.unexpected_chain_penalty; + + // count up the number of edges that are from inferrence + let num_inferred_edges = if infer_connections { + let mut count = 0; + for chain in [&possible_chains[i], &possible_chains[j]] { + for window in chain.windows(2) { + let h1 = window[0]; + let h2 = window[1]; + if inferred_possible[h1][h2] { + count += 1; + } + } + } + count + } else { + 0 + }; + let inferred_chain_penalty = (num_inferred_edges as f64) * penalties.inferred_edge_penalty; + + // trace!("\t{:?} {:?} => {:?} + {} + {} + {} => {:?}", possible_chains[i], possible_chains[j], combined_explained, allele_delta_penalty, allele_expected_penalty, full_chain_penalty, allele_counts); + score_sets.push(ChainScore { + allele_expected_penalty, + unmet_observations, + edit_distance, + ln_ed_penalty, + mn_llh_penalty, + unexpected_chain_penalty, + inferred_chain_penalty, + reduced_alleles, + reduced_probs, + reduced_coverage, + chain_index1: i, + chain_index2: j, + }); + } + } + + score_sets.sort_by(|a, b| { + // check if the combined explanations are "close enough" for equality + // TODO: now that we have a unified score, we could probably convert this into proper compare with PartialCmp and Cmp; low priority + a.compare(b) + }); + score_sets.reverse(); + + if log_enabled!(log::Level::Debug) { + let num_shown = 50; + debug!("Scored chain pairs (best {num_shown}):"); + let skip_count = if score_sets.len() > num_shown { score_sets.len() - num_shown } else { 0 }; + for chain_score in score_sets.iter().skip(skip_count) { + debug!("\t{:?} {:?} => {:.4} ({})", + possible_chains[chain_score.chain_index1], + possible_chains[chain_score.chain_index2], + chain_score.primary_score(), + chain_score.primary_score_string() + ); + debug!("\t\talleles: {:?}, probs: {:?}; obs: {:?}", + chain_score.reduced_alleles, + chain_score.reduced_probs, + chain_score.reduced_coverage + ); + } + } + + if score_sets.is_empty() { + // this shouldn't be possible anymore with the prior checks, but let's put an error catch here just in case + return Err(Box::new(CallerError::NoScorePairs)); + } + + // we sorted them above from worst to best + let highest_score = score_sets.last().unwrap(); + let best_pair = (highest_score.chain_index1, highest_score.chain_index2); + + let mut best_chain_pair = vec![ + possible_chains[best_pair.0].clone(), + possible_chains[best_pair.1].clone() + ]; + best_chain_pair.sort(); + let best_chains = best_chain_pair; + + // build all the chains for each head index + let mut index_used: Vec<bool> = vec![false; num_haps]; + for chain in best_chains.iter() { + for &i in chain.iter() { + index_used[i] = true; + } + } + + // now go through the used indices and report any that were not used as dangling alleles (i.e., unchained to the result) + for (i, &b) in index_used.iter().enumerate() { + if !b { + // this one was not used + caller_warnings.push(CallerWarning::DanglingAllele { allele_name: format!("{}_{}", i, hap_labels[i]) }); + } + } + + Ok((best_chains, caller_warnings)) +} + +/// Checks a putative haplotype chain for correct inferrences.
+/// Returns a tuple of booleans of the form `(is_allowed_inferrence, is_allowed_candidate)` where +/// `is_allowed_inferrence` is only true if there are no inferrence issues in the most recent D6 pairing and +/// `is_allowed_candidate` is only true if it ends with an allowed CYP2D allele OR there are no inferrences since the most recent CYP2D allele. +/// # Arguments +/// * `cyp2d6_config` - a generally static collection for the CYP2D6 configuration +/// * `chain` - the putative chain to check +/// * `hap_labels` - the haplotype labels we need for interpreting the chain +/// * `inferred_connections` - contains the list of inferred chain links +fn check_chain_inferrences(cyp2d6_config: &Cyp2d6Config, chain: &[usize], hap_labels: &[Cyp2d6RegionLabel], inferred_connections: &[Vec<bool>]) -> (bool, bool) { + // sanity check + assert!(!chain.is_empty()); + + // config items + let cyp_translate = cyp2d6_config.cyp_translate(); + + // check if the last link is a CYP2D allele or not + let last_hap_index = *chain.last().unwrap(); + let last_is_cyp2d = hap_labels[last_hap_index].is_cyp2d(); + + // get the most recent CYP2D allele before last + let mut opt_index = None; + for (chain_index, &hap_index) in chain.iter().enumerate().rev().skip(1) { + if hap_labels[hap_index].is_cyp2d() { + opt_index = Some(chain_index); + break; + } + } + + // finally, check if we have an inferrence in between + let mut inferrence_detected = false; + for window in chain[opt_index.unwrap_or(0)..(chain.len())].windows(2) { + let h1 = window[0]; + let h2 = window[1]; + if inferred_connections[h1][h2] { + // this one was inferred + inferrence_detected = true; + } + } + + if inferrence_detected { + // we found an inferrence, we need to verify that it's an okay one + if last_is_cyp2d { + // the last allele in the chain is a D6 allele, verify that the previous D6 allele is valid + if let Some(previous_d6_index) = opt_index { + let detailed_inference = false; // we do not care if it is *4.001 or *4.002, just *4 works + let infer_d7_tail = true; // if true, then we infer connections from D6 to D7 + + // build out the allele labels + let previous_hap_index = chain[previous_d6_index]; + let h1 = &hap_labels[previous_hap_index]; + let h2 = &hap_labels[last_hap_index]; + let h1_mod = h1.simplify_allele(detailed_inference, cyp_translate); + let h2_mod = h2.simplify_allele(detailed_inference, cyp_translate); + + let connected = previous_hap_index != last_hap_index && // we do not allow inferred exact duplications + cyp2d6_config.inferred_connections().contains(&(h1_mod.clone(), h2_mod.clone())); // check if it is otherwise known + let d7_tail_connection = infer_d7_tail && + // must be a D7 link + h2.region_type() == Cyp2d6RegionType::Cyp2d7 && + // must be a non-D7 CYP2D allele + h1.region_type() != Cyp2d6RegionType::Cyp2d7 && + h1.region_type().is_cyp2d(); + + // allowed IF either it's a known inferred connection OR it's a D7 tail + let allowed = connected || d7_tail_connection; + (allowed, allowed) + } else { + // there is no previous D6 allele, so we should be okay on both counts + (true, true) + } + } else { + // we do not end with a CYP2D allele and we have an inferrence + // in this case, it is not breaking any rules, but it also should not be allowed to be a full chain for scoring + (true, false) + } + } else { + // we did not detect an inferrence, so we can return true on both counts + (true, true) + } +} + +/// This returns the best possible score for a collection of putative `chain_weights` when compared to two candidate chain sets.
+/// It also returns a list of the chains that can generate that best score. +/// This score is shifted relative to the absolute best score (if we ignored chaining), such that exactly matching the best will return a "0" score. +/// # Arguments +/// * `chain_set1` - the first chain set to search +/// * `chain_set2` - the second chain set to search +/// * `chain_weights` - an observed chain and the collected scores; each allele in the chain has a Vec of score tuples (edit distance, overlap); edit distance is raw integer and overlap is 1.0 max +fn containment_score(chain_set1: &[usize], chain_set2: &[usize], chain_weights: &[SequenceWeights]) -> (usize, Vec<Vec<usize>>) { + // calculate the *best* possible score + let optimum_weight: usize = chain_weights.iter() + .map(|scores| { + scores.iter() + .map(|(w, _o)| *w) + .min().unwrap() + }).sum(); + + // calculate the *worst* as well + let worst_weight: usize = chain_weights.iter() + .map(|scores| { + scores.iter() + .map(|(w, _o)| *w) + .max().unwrap() + }).sum(); + + // initialize to double the worst score for the sake of distinguishing + let mut best_score = 2*worst_weight; + let mut best_chains = vec![]; + let weight_len = chain_weights.len(); + + for other in [chain_set1, chain_set2] { + if other.len() < weight_len { + // this one is too short to ever represent the full chain + continue; + } + + for start_index in 0..(other.len() - weight_len+1) { + let total_weight: usize = other[start_index..(start_index+weight_len)].iter().zip(chain_weights.iter()) + .map(|(&i, weight_dict)| { + weight_dict[i].0 + }) + .sum(); + + if total_weight < best_score { + best_score = total_weight; + best_chains.clear(); + } + if total_weight == best_score { + best_chains.push(other[start_index..(start_index+weight_len)].to_vec()); + } + } + } + + assert!(best_score >= optimum_weight); + + (best_score - optimum_weight, best_chains) +} + +/// This will look at a chain with haplotype labels and report out the total number of chain pairs that are not expected. +/// Checks for chains that are empty or that do not start with CYP2D6. +/// Also checks for any chain links that are not a part of our inferred set (meaning, we probably are not expecting it).
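+/// For example (hypothetical), a reduced chain of ["*68"] counts one error because *68 is an unexpected singleton, and an adjacent pair like ("*10", "*4") counts another if it is not in the inferred connection set.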
+/// # Arguments +/// * `chain` - the chain to scan +/// * `hap_labels` - labels from index to String for each haplotype +/// * `cyp2d6_config` - the CYP2D6 configuration with the expected singletons and inferred connections +fn unexpected_count(chain: &[usize], hap_labels: &[Cyp2d6RegionLabel], cyp2d6_config: &Cyp2d6Config) -> u32 { + let reduced_chain: Vec<String> = chain.iter() + .filter_map(|&c_index| { + // remove all D7 alleles + if hap_labels[c_index].is_cyp2d() && hap_labels[c_index].region_type() != Cyp2d6RegionType::Cyp2d7 { + // convert the allele index into a human readable string + let string_label = hap_labels[c_index].simplify_allele(false, cyp2d6_config.cyp_translate()); + Some(string_label) + } else { + None + } + }).collect(); + + let mut errors_detected = 0; + if reduced_chain.is_empty() || // we do not expect something to be empty, that's for sure + !reduced_chain[0].starts_with('*') { // we expect a reduced chain to start with D6 + errors_detected += 1; + } else { + // chain is not empty AND the chain starts with a star-allele, so should be okay on this front + }; + + // check if there is an unexpected singleton + if reduced_chain.len() == 1 && cyp2d6_config.unexpected_singletons().contains(&reduced_chain[0]) { + // this is a single allele that we do not expect to find as a single allele under most circumstances + errors_detected += 1; + } + + // check for any aberrant chain pairs + for chain_pair in reduced_chain.windows(2) { + if !cyp2d6_config.inferred_connections().contains(&(chain_pair[0].clone(), chain_pair[1].clone())) { + // this is not a connection we would normally infer, so it's unexpected + errors_detected += 1; + } + } + + errors_detected +} + +/// From here: https://stackoverflow.com/questions/47043167/does-rust-contain-a-way-to-directly-check-whether-or-not-one-vector-is-a-substr +/// This is a simple function that will check if one slice (the haystack) contains another (the needle) +/// # Arguments +/// * `haystack` - the slice to search +/// * `needle` - the sub-slice to search for in the haystack +fn is_sub<T: PartialEq>(haystack: &[T], needle: &[T]) -> bool { + haystack.windows(needle.len()).any(|c| c == needle) +} + +#[cfg(test)] +mod tests { + use super::*; + + use waffle_con::cdwfa_config::ConsensusCost; + use waffle_con::consensus::Consensus; + + /// Wrapper function that will generate CYP2D6 chains for us as "reads" that always span just two regions of interest. + /// User provides a list of haplotypes and the observed chains, then scores are auto-generated to match. + /// This only generates constant values where the "match" is a perfect match and all others have a constant ED.
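+ /// For example (hypothetical input), chains = &[vec![0, 1, 2]] produces one read per adjacent pair: read_0 observing [0, 1] and read_1 observing [1, 2].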
+ /// # Arguments + /// * `hap_labels` - the labels for the haplotypes + /// * `chains` - all observed chains, these can just be expected diplotypes for ease of use + fn create_pairwise_chains(hap_labels: &[Cyp2d6RegionLabel], chains: &[Vec<usize>]) -> (BTreeMap<String, Vec<Vec<usize>>>, BTreeMap<String, Vec<SequenceWeights>>) { + // contains the best observed chains for each + let mut obs_chains: BTreeMap<String, Vec<Vec<usize>>> = Default::default(); + // contains scores against all alleles + let mut chain_scores: BTreeMap<String, Vec<SequenceWeights>> = Default::default(); + + let mut read_index: usize = 0; + for chain in chains.iter() { + assert!(chain.len() >= 2); + // create a read for each pair + for window in chain.windows(2) { + let seq_name = format!("read_{read_index}"); + obs_chains.insert(seq_name.clone(), vec![window.to_vec()]); + + // for each link in the chain, create penalties for all other comparators + let mut weights = vec![]; + for &hap_index in chain.iter() { + // set all scores to "bad"; 100 ED and fully overlapping + let mut all_scores = vec![(100, 1.0); hap_labels.len()]; + // set the one match to "good"; 0 ED + all_scores[hap_index] = (0, 1.0); + weights.push(all_scores); + } + chain_scores.insert(seq_name, weights); + read_index += 1; + } + } + + (obs_chains, chain_scores) + } + + #[test] + fn test_find_best_chain_pair() { + let hap_labels = vec![ + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("A".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("B".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("C".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("D".to_string())) + ]; + let obs_chains = [ + // 0 -> 1 -> 2 + ("seq_1".to_string(), vec![vec![0, 2]]), + ("seq_2".to_string(), vec![vec![1, 1]]) + ].into_iter().collect(); + let chain_scores = [ + ("seq_1".to_string(), vec![ + vec![(0, 1.0), (1, 1.0), (1, 1.0), (1, 1.0)], + vec![(1, 1.0), (1, 1.0), (0, 1.0), (1, 1.0)] + ]), + ("seq_2".to_string(), vec![ + vec![(1, 1.0), (0, 1.0), (1, 1.0), (1, 1.0)], + vec![(1, 1.0), (0, 1.0), (1, 1.0), (1, 1.0)] + ]) + ].into_iter().collect(); + let penalties = Default::default(); + let cyp2d6_config = Cyp2d6Config::default(); + let (chain_result, danglers) = find_best_chain_pair(&cyp2d6_config, &obs_chains, &chain_scores, &hap_labels, false, true, penalties, true).unwrap(); + assert_eq!(chain_result, vec![ + vec![0, 2], + vec![1, 1] + ]); + assert_eq!(danglers, vec![CallerWarning::DanglingAllele { allele_name: "3_CYP2D6*D".to_string() }]); + } + + #[test] + fn test_ambiguous_find_best_chain_pair() { + let hap_labels = vec![ + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("A".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("B".to_string())) + ]; + let obs_chains = [ + // B -> A -> A + ("seq_0".to_string(), vec![vec![1]]), + ("seq_1".to_string(), vec![vec![1, 0]]), + ("seq_2".to_string(), vec![vec![0, 0]]), + ("seq_3".to_string(), vec![vec![0]]), + // B -> A + ("seq_4".to_string(), vec![vec![1]]), + ("seq_5".to_string(), vec![vec![1, 0]]), + ("seq_6".to_string(), vec![vec![0]]) + ].into_iter().collect(); + + // scores are (edit distance, overlap score) + let chain_scores = [ + ("seq_0".to_string(), vec![ + vec![(10, 1.0), (0, 1.0)] + ]), + ("seq_1".to_string(), vec![ + vec![(10, 1.0), (0, 1.0)], + vec![(0, 1.0), (10, 1.0)] + ]), + ("seq_2".to_string(), vec![ + vec![(0, 1.0), (10, 1.0)], + vec![(0, 1.0), (10, 1.0)] + ]), + ("seq_3".to_string(), vec![ + vec![(0, 1.0), (10, 1.0)] + ]), + ("seq_4".to_string(), vec![ + vec![(10, 1.0), (0, 1.0)] + ]), + ("seq_5".to_string(), vec![ + vec![(10,
1.0), (0, 1.0)], + vec![(0, 1.0), (10, 1.0)] + ]), + ("seq_6".to_string(), vec![ + vec![(0, 1.0), (10, 1.0)] + ]) + ].into_iter().collect(); + + // no lasso penalty + let penalties = ChainPenalties::new(0.0, -(0.01_f64.ln()), 0.0, 2.0); + let cyp2d6_config = Cyp2d6Config::default(); + let (chain_result, danglers) = find_best_chain_pair(&cyp2d6_config, &obs_chains, &chain_scores, &hap_labels, false, true, penalties, true).unwrap(); + assert_eq!(chain_result, vec![ + vec![1], + vec![1, 0, 0, 0] // without lasso, it will just greedily add these + ]); + assert_eq!(danglers, vec![]); + + // now test with lasso penalty, it should drop the first 0 + let penalties = ChainPenalties::new(3.0, -(0.01_f64.ln()), 0.0, 2.0); + let (chain_result, danglers) = find_best_chain_pair(&cyp2d6_config, &obs_chains, &chain_scores, &hap_labels, false, true, penalties, true).unwrap(); + assert_eq!(chain_result, vec![ + vec![1], + vec![1, 0, 0] // with lasso, it restricts to what is observed + ]); + assert_eq!(danglers, vec![]); + } + + #[test] + fn test_weight_sequence() { + // I think this is doable, but we will see + let consensus = MultiConsensus::new( + vec![ + Consensus::new(b"AGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGAACATGGGATTATGGGCAAGGGTAACAGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAAC".to_vec(), ConsensusCost::L1Distance, vec![0]), + Consensus::new(b"AGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGCACATGGGATTATGGGCAAGGGTAACAGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAAC".to_vec(), ConsensusCost::L1Distance, vec![0]), + Consensus::new(b"AGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAACAGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAAC".to_vec(), ConsensusCost::L1Distance, vec![0]) + // ^change here + ], + vec![0] // these don't matter for the test + ); + let con_labels = vec![ + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("A".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("C".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("G".to_string())), + ]; + + // test mostly match, but best is the first entry + let sequence = "AGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGAACATGGGATTATGGGCAAGGGTAACAGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAAC"; + let score = weight_sequence(&sequence, &consensus, &con_labels).unwrap(); + assert_eq!(score.iter().min_by(|a, b| { + a.partial_cmp(b).unwrap() + }).unwrap(), &score[0]); + + // test an equal mismatch (N at the variable base) against each consensus + let sequence = "AGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGNACATGGGATTATGGGCAAGGGTAACAGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAAC"; + let score = weight_sequence(&sequence, &consensus, &con_labels).unwrap(); + assert_eq!(score[0], score[1]); + assert_eq!(score[0], score[2]); + } + + #[test] + fn test_inferred_alleles() { + let hap_labels = [ + // *3 -> D7 + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("3".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::LinkRegion, None), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Rep7, None), +
Cyp2d6RegionLabel::new(Cyp2d6RegionType::Spacer, None), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d7, None), + // *4 -> *68, but through the same link types + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some("4".to_string())), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Hybrid, Some("CYP2D6::CYP2D7::exon2".to_string())) + ]; + let chains = [ + // two components for *3 -> link_region; Rep7 -> D7 + vec![0, 1], + vec![2, 3, 4], + // two components for *4 -> link_region; Rep7 -> *68 + // inferrence will be required to connect them correctly + vec![5, 1], + vec![2, 3, 6] + ]; + + // create pairwise chains from the above + let (obs_chains, chain_scores) = create_pairwise_chains(&hap_labels, &chains); + + // first, test with no inferrence, which should just find *3 / *4 + let infer = false; + let penalties: ChainPenalties = Default::default(); + let cyp2d6_config = Cyp2d6Config::default(); + let (chain_result, danglers) = find_best_chain_pair(&cyp2d6_config, &obs_chains, &chain_scores, &hap_labels, infer, true, penalties.clone(), false).unwrap(); + assert_eq!(chain_result, vec![ + vec![0, 1], // *3 + vec![5, 1] // *4 + ]); + // no inferrence leads to a bunch of danglers + assert_eq!(danglers, vec![ + CallerWarning::DanglingAllele { allele_name: "2_REP7".to_string() }, + CallerWarning::DanglingAllele { allele_name: "3_spacer".to_string() }, + CallerWarning::DanglingAllele { allele_name: "4_CYP2D7".to_string() }, + CallerWarning::DanglingAllele { allele_name: "6_CYP2D6::CYP2D7::exon2".to_string() } + ]); + + // now add in inferrence and re-test + // this time, the inferred connections should complete both chains + let cyp2d6_config = Cyp2d6Config::default(); + let infer = true; + let (chain_result, danglers) = find_best_chain_pair(&cyp2d6_config, &obs_chains, &chain_scores, &hap_labels, infer, true, penalties, false).unwrap(); + assert_eq!(chain_result, vec![ + vec![0, 1, 2, 3, 4], // *3 + D7 + vec![5, 1, 2, 3, 6] // *4 + *68, this should get inferred as the correct solution here + ]); + assert!(danglers.is_empty()); // should be no danglers now + } + + #[test] + fn test_chaining_errors() { + // defaults that do not matter for this test + let cyp2d6_config = Cyp2d6Config::default(); + let obs_chains = Default::default(); + let chain_scores = Default::default(); + let infer = false; + let penalties = Default::default(); + + // simple case where we just give the algorithm a bunch of non-starting hap labels + let hap_labels = vec![ + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d7, None), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::LinkRegion, None), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Spacer, None), + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Unknown, None) + ]; + let result = find_best_chain_pair(&cyp2d6_config, &obs_chains, &chain_scores, &hap_labels, infer, true, penalties, false); + assert!(result.is_err()); + assert_eq!(result.err().unwrap().downcast::<CallerError>().unwrap().as_ref(), &CallerError::NoChainingHead); + + // possible future TODO: we have two other error types, I don't think we can actually reach them though + // we will lazily add if a user manages to do it somehow + } +} \ No newline at end of file diff --git a/src/cyp2d6/definitions.rs b/src/cyp2d6/definitions.rs new file mode 100644 index 0000000..276cf1a --- /dev/null +++ b/src/cyp2d6/definitions.rs @@ -0,0 +1,497 @@ + +use rust_lib_reference_genome::reference_genome::ReferenceGenome; +use rustc_hash::FxHashMap as HashMap; +use std::collections::{BTreeMap, BTreeSet}; +use serde::{Deserialize, Serialize}; +use
simple_error::{SimpleError, bail}; + +use crate::cyp2d6::region_label::{Cyp2d6RegionLabel, Cyp2d6RegionType}; +use crate::data_types::coordinates::Coordinates; + +// these are the fixed buffers around the *5 region that we search for +// in the future we may increase the pre-buffer, but this is basically the *minimum* to accurately find it in WGS +static STAR5_PRE_BUFFER: usize = 500; +static STAR5_POST_BUFFER: usize = 3000; + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct Cyp2d6Config { + /// High-level coordinates of the CYP2D regions + cyp_coordinates: BTreeMap<String, Coordinates>, + /// Specific subregions, like exons + cyp_regions: BTreeMap<String, BTreeMap<String, Coordinates>>, + /// Coordinates for *5 deleted region + cyp2d6_star5_del: Coordinates, + /// Translation from certain alleles to a known star-allele + cyp_translate: BTreeMap<String, String>, + /// Inferred connections we allow + inferred_connections: BTreeSet<(String, String)>, + /// List of alleles we do not expect to find alone + unexpected_singletons: BTreeSet<String> +} + +impl Cyp2d6Config { + /// This function should be called after loading a config to verify that everything required to run the algorithms is present. + pub fn validate_config(&self) -> Result<(), SimpleError> { + // make sure all expected regions are defined + let expected_cyp_coordinates = [ + "CYP2D6", "CYP2D7", "REP6", "REP7", "spacer", "link_region", "CYP2D6_wfa_backbone" + ]; + for &ecc in expected_cyp_coordinates.iter() { + if !self.cyp_coordinates.contains_key(ecc) { + bail!("Coordinates for \"{}\" were not found in provided cyp_coordinates.", ecc); + } + } + + // make sure all exon regions are defined for our hybrids + let expected_cyp_region_keys = [ + "CYP2D6", "CYP2D7" + ]; + for &ecrk in expected_cyp_region_keys.iter() { + if !self.cyp_regions.contains_key(ecrk) { + bail!("Data for \"{}\" was not found in provided cyp_regions.", ecrk); + } + let cr = self.cyp_regions.get(ecrk).unwrap(); + let expected_cyp_region_exons = 1..10; + for ecre in expected_cyp_region_exons { + let exon_label = format!("exon{ecre}"); + if !cr.contains_key(&exon_label) { + bail!("Data for \"{}\" is missing coordinates for \"{}\" in cyp_regions.", ecrk, exon_label); + } + } + } + + // cyp2d6_star5_del - just needs a definition + // cyp_translate - nothing required here to prevent crash + // inferred_connections - nothing required here to prevent crash + // unexpected_singletons - nothing required here to prevent crash + + Ok(()) + } + + /// This will return the target extraction region for CYP2D6 and CYP2D7 based on the database coordinates.
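+ /// The returned region spans from the minimum start to the maximum end across CYP2D6, CYP2D7, REP6, REP7, and the *5 deletion breakpoints padded by the fixed buffers.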
+ pub fn extraction_region(&self) -> Coordinates { + let cyp_coordinates = self.cyp_coordinates(); + let cyp2d6_star5_del = self.cyp2d6_star5_del(); + + // figure out the BAM mapping coordinates + let full_d6_region = cyp_coordinates.get("CYP2D6").unwrap(); + let full_d7_region = cyp_coordinates.get("CYP2D7").unwrap(); + let rep6_region = cyp_coordinates.get("REP6").unwrap(); + let rep7_region = cyp_coordinates.get("REP7").unwrap(); + // link_region and spacer are not necessary here + let bam_region = Coordinates::new( + full_d6_region.chrom().to_string(), + [ + full_d6_region.start(), + cyp2d6_star5_del.start() - STAR5_PRE_BUFFER as u64, + full_d7_region.start(), + rep6_region.start(), + rep7_region.start() + ].into_iter().min().unwrap(), + [ + full_d6_region.end(), + cyp2d6_star5_del.end() + STAR5_POST_BUFFER as u64, + full_d7_region.end(), + rep6_region.end(), + rep7_region.end() + ].into_iter().max().unwrap() + ); + bam_region + } + + // getters + pub fn cyp_coordinates(&self) -> &BTreeMap<String, Coordinates> { + &self.cyp_coordinates + } + + pub fn cyp_regions(&self) -> &BTreeMap<String, BTreeMap<String, Coordinates>> { + &self.cyp_regions + } + + pub fn cyp2d6_star5_del(&self) -> &Coordinates { + &self.cyp2d6_star5_del + } + + pub fn cyp_translate(&self) -> &BTreeMap<String, String> { + &self.cyp_translate + } + + pub fn inferred_connections(&self) -> &BTreeSet<(String, String)> { + &self.inferred_connections + } + + pub fn unexpected_singletons(&self) -> &BTreeSet<String> { + &self.unexpected_singletons + } +} + +impl Default for Cyp2d6Config { + fn default() -> Self { + let mut cyp_coordinates: BTreeMap<String, Coordinates> = Default::default(); + let preshift = 1; + let postshift = 0; + + // These were our original coordinates that DID NOT include the REP regions + // CYP2D6 + // these coordinates were generated by using a +-50bp buffer around the variants in our DB, we assert!
this below + let d6_start = 42126260 - preshift; + let d6_end = 42132424 - postshift; + cyp_coordinates.insert("CYP2D6".to_string(), Coordinates::new("chr22".to_string(), d6_start, d6_end)); + // CYP2D7 + // the coordinates were generated by mapping the above and looking at where it lands in D7 + let d7_start = 42139966 - preshift; + let d7_end = 42145903 - postshift; + cyp_coordinates.insert("CYP2D7".to_string(), Coordinates::new("chr22".to_string(), d7_start, d7_end)); + + /* + // These are new coordinates, D6 is recommended by Xiao (chr22:42123192-42132193) and then we used BLAT to find the equivalent in D7 + // end coordinate adjusted due to a variant in D6 + cyp_coordinates.insert("CYP2D6".to_string(), Coordinates::new("chr22".to_string(), 42123192 - preshift, 42132424 - postshift)); + cyp_coordinates.insert("CYP2D7".to_string(), Coordinates::new("chr22".to_string(), 42135344 - preshift, 42145903 - postshift)); + */ + + // regions upstream of D6 + let rep6_start = 42123192 - preshift; + let rep6_end = 42125963 - postshift; + cyp_coordinates.insert("REP6".to_string(), Coordinates::new("chr22".to_string(), rep6_start, rep6_end)); + // cyp_coordinates.insert("spacer_CYP2D6".to_string(), Coordinates::new("chr22".to_string(), 42125963 - preshift, 42125965 - postshift)); // spacer does not exist + // cyp_coordinates.insert("fiveprime_CYP2D6".to_string(), Coordinates::new("chr22".to_string(), 42125965 - preshift, 42126260 - postshift)); // region between spacer and start is very small + + // regions upstream of D7 and after link region; spacer starts where REP7 ends + let rep7_start = 42135344 - preshift; + let rep7_end = 42138115 - postshift; + cyp_coordinates.insert("REP7".to_string(), Coordinates::new("chr22".to_string(), rep7_start, rep7_end)); + let spacer_end = 42139679 - postshift; + cyp_coordinates.insert("spacer".to_string(), Coordinates::new("chr22".to_string(), rep7_end, spacer_end)); + // cyp_coordinates.insert("fiveprime_CYP2D7_spacer".to_string(), Coordinates::new("chr22".to_string(), 42139679 - preshift, 42139966 - postshift)); // region between spacer and start is very small + + // region between D6 and REP7 (typically); starts at the end of D6 and goes to start of REP7 + cyp_coordinates.insert("link_region".to_string(), Coordinates::new("chr22".to_string(), d6_end, rep7_start)); + + // these are the coordinates used for WFA realignment + cyp_coordinates.insert("CYP2D6_wfa_backbone".to_string(), Coordinates::new("chr22".to_string(), d6_start, d6_end)); + + // now save the exon-level information + let mut cyp_regions: BTreeMap<String, BTreeMap<String, Coordinates>> = Default::default(); + cyp_regions.insert("CYP2D6".to_string(), + { + // taken from RefSeq, remember these are on reverse strand + let mut regions: BTreeMap<String, Coordinates> = Default::default(); + regions.insert("exon1".to_string(), Coordinates::new("chr22".to_string(), 42130612 - preshift, 42130810 - postshift)); + regions.insert("exon2".to_string(), Coordinates::new("chr22".to_string(), 42129738 - preshift, 42129909 - postshift)); + regions.insert("exon3".to_string(), Coordinates::new("chr22".to_string(), 42129033 - preshift, 42129185 - postshift)); + regions.insert("exon4".to_string(), Coordinates::new("chr22".to_string(), 42128784 - preshift, 42128944 - postshift)); + regions.insert("exon5".to_string(), Coordinates::new("chr22".to_string(), 42128174 - preshift, 42128350 - postshift)); + regions.insert("exon6".to_string(), Coordinates::new("chr22".to_string(), 42127842 - preshift, 42127983 - postshift)); + regions.insert("exon7".to_string(),
Coordinates::new("chr22".to_string(), 42127447 - preshift, 42127634 - postshift)); + regions.insert("exon8".to_string(), Coordinates::new("chr22".to_string(), 42126851 - preshift, 42126992 - postshift)); + regions.insert("exon9".to_string(), Coordinates::new("chr22".to_string(), 42126499 - preshift, 42126752 - postshift)); + + // introns could be derived if we need them + // spacer region - should be "GGT" which is the dup ACC in the spacer ends; unclear if we need this annotated currently + // regions.insert("spacer".to_string(), Coordinates::new("chr22".to_string(), 42125963 - preshift, 42125965 - postshift)); + + regions + } + ); + cyp_regions.insert("CYP2D7".to_string(), + { + let mut regions: BTreeMap<String, Coordinates> = Default::default(); + regions.insert("exon1".to_string(), Coordinates::new("chr22".to_string(), 42144284 - preshift, 42144483 - postshift)); + regions.insert("exon2".to_string(), Coordinates::new("chr22".to_string(), 42143410 - preshift, 42143581 - postshift)); + regions.insert("exon3".to_string(), Coordinates::new("chr22".to_string(), 42142728 - preshift, 42142880 - postshift)); + regions.insert("exon4".to_string(), Coordinates::new("chr22".to_string(), 42142479 - preshift, 42142639 - postshift)); + regions.insert("exon5".to_string(), Coordinates::new("chr22".to_string(), 42141868 - preshift, 42142044 - postshift)); + regions.insert("exon6".to_string(), Coordinates::new("chr22".to_string(), 42141534 - preshift, 42141675 - postshift)); + regions.insert("exon7".to_string(), Coordinates::new("chr22".to_string(), 42141152 - preshift, 42141339 - postshift)); + regions.insert("exon8".to_string(), Coordinates::new("chr22".to_string(), 42140555 - preshift, 42140696 - postshift)); + // for some reason, RefSeq has a really big exon 9 for CYP2D7, we changed it to be same size as D6 using UCSC + // regions.insert("exon9".to_string(), Coordinates::new("chr22".to_string(), 42139576 - preshift, 42140456 - postshift)); + regions.insert("exon9".to_string(), Coordinates::new("chr22".to_string(), 42140203 - preshift, 42140456 - postshift)); + + // spacer region - should start and end with "GGT" + // currently we do not use this + // regions.insert("spacer".to_string(), Coordinates::new("chr22".to_string(), 42138115 - preshift, 42139679 - postshift)); + + regions + } + ); + + /* + These are no longer used, but knowing them may be useful in the future + // started encountering noise in this region, which was already documented by the paraph-rs config + pub static ref CYP_NOISY_REGIONS: Vec<Coordinates> = { + let preshift = 1; + let postshift = 0; + vec![ + // in paraphase config + Coordinates::new("chr22".to_string(), 42132023 - preshift, 42132051 - postshift), + // discovered via testing + Coordinates::new("chr22".to_string(), 42127650 - preshift, 42127655 - postshift), // poly-C, often extra C + Coordinates::new("chr22".to_string(), 42128657 - preshift, 42128662 - postshift), // poly-G, often extra G + ] + }; + */ + + // these are the coordinates of the deleted region, which looks like REP6 and spans over a gap to near the start of D7 + let star5_start = rep6_start; + let star5_end = 42135343 - postshift; + let cyp2d6_star5_del: Coordinates = Coordinates::new("chr22".to_string(), star5_start, star5_end); + + // this is a translator for the hybrids into the actual alleles + let cyp_translate: BTreeMap<String, String> = [ + // all of the D7::D6 hybrids can safely map to *13, but there are sub-alleles if we eventually want to get specific + ("CYP2D7::CYP2D6::intron1", "13"), + ("CYP2D7::CYP2D6::exon2", "13"), + ("CYP2D7::CYP2D6::intron2", "13"), + ("CYP2D7::CYP2D6::exon3", "13"), + ("CYP2D7::CYP2D6::intron3", "13"), + ("CYP2D7::CYP2D6::exon4", "13"), + ("CYP2D7::CYP2D6::intron4", "13"), + ("CYP2D7::CYP2D6::exon5", "13"), + ("CYP2D7::CYP2D6::intron5", "13"), + ("CYP2D7::CYP2D6::exon6", "13"), + ("CYP2D7::CYP2D6::intron6", "13"), + ("CYP2D7::CYP2D6::exon7", "13"), + ("CYP2D7::CYP2D6::intron7", "13"), + ("CYP2D7::CYP2D6::exon8", "13"), + ("CYP2D7::CYP2D6::intron8", "13"), + ("CYP2D7::CYP2D6::exon9", "13"), + // the ones we have seen so far are just *68 and *36; the rest are educated guesses + ("CYP2D6::CYP2D7::intron1", "68"), // intron 1; category A; found an example where we needed this for *68 + ("CYP2D6::CYP2D7::exon2", "68"), // intron 1; category A; this is our standard *68 + ("CYP2D6::CYP2D7::exon8", "61"), // intron 7; category B + ("CYP2D6::CYP2D7::intron8", "63") // exon 8; category B + // ("CYP2D6::CYP2D7::exon9", "") // this one becomes either *4.013, *36, or *83; all of which have full entries for D6 + ].iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + + // Any population inferred connections go here, this can include hybrid (e.g. *4 + *68) or duplication (e.g. *2 + *2). + // Allele pairs that are not on this list will get penalized during the chain assessment. + let inferred_connections: BTreeSet<(String, String)> = [ + // known dups + ("*1", "*1"), + ("*2", "*2"), + ("*3", "*3"), + ("*4", "*4"), + ("*6", "*6"), + ("*9", "*9"), + ("*10", "*10"), + ("*17", "*17"), + ("*28", "*28"), + ("*29", "*29"), + ("*35", "*35"), + ("*41", "*41"), + ("*43", "*43"), + ("*45", "*45"), + ("*146", "*146"), + // hybrid connections + ("*4", "*68"), + ("*10", "*36") + ].iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + + // This is a set of alleles that we expect to always find with something else; these are basically hybrids that don't fly solo + let unexpected_singletons: BTreeSet<String> = [ + "*36", "*68" + ].iter() + .map(|k| k.to_string()) + .collect(); + + /* + Notes on the complex alleles described here: https://a.storyblok.com/f/70677/x/ecb9681e8d/cyp2d6_structural-variation_v3-0.pdf + - D6-like and D7-like downstream is based on the absence/presence of the spacer block + - basic D6->D7 partial conversions are listed as normal alleles; I checked 35.002 and 82 + - ABSENT: *5, the deletion allele; breakpoints supposedly in the REP6/7 regions (homologous) and should have the spacer region still + - duplications - no specific allele, just represented as *4x2 if known, or xN if unknown; + to date, only *1, *2, *4, and *41 have been described with more than 2 alleles + - D7::D6 hybrids - start as D7 but transition to D6 + - grouped under *13 overall (not in database) + - must include the frame-shift in exon 1; this is basically the *13 overall definition + - none of the sub-alleles seem to have a database entry either, we would have to encode the Figure 8 description + - seemingly all of them have D6-like 3' ends without the spacer region + - D6::D7 hybrids - start as D6 but transition to D7, also sometimes back to D6 again + - Category A - transitions back to D6 in downstream + - Category B - stays as D7 in downstream + - apparently both A and B are viable, so we might not even be able to use that information at all other than flagging it + - some of these have entries already + - ones without entries: 61, 63, 68 + - Figure 10 seems to have a drawing of some of these + - Table 5 has a list of commonly co-occurring star alleles + - reference materials (we probably sequenced some of these) are in
+
+/// This will join together the reference version of CYP2D6 and CYP2D7 at each exon/intron boundary.
+/// Sequences are labeled based on the coding orientation, which is on the rev-comp strand relative to the reference genome.
+/// This matches the orientation in the PharmVar PDF describing D6::D7 hybrids.
+/// This function will also add the other "targets" that we search for, namely: D6 (full), D7 (full), CYP2D6*5 (deletion), and the upstream regions.
+/// # Arguments
+/// * `reference_genome` - the reference genome we are encoding
+/// # Errors
+/// * if we can't decode a reference slice into UTF-8 (we have bigger problems if this happens though)
+pub fn generate_cyp_hybrids(reference_genome: &ReferenceGenome, cyp2d6_config: &Cyp2d6Config) -> Result<HashMap<Cyp2d6RegionLabel, String>, Box<dyn std::error::Error>> {
+    let mut ret: HashMap<Cyp2d6RegionLabel, String> = Default::default();
+
+    let exon_count: usize = 9;
+    let gene1 = "CYP2D6";
+    let gene2 = "CYP2D7";
+
+    // these are in reverse
+    let cyp_coordinates = cyp2d6_config.cyp_coordinates();
+    let chrom = cyp_coordinates.get(gene1).unwrap().chrom();
+    let g1_start = cyp_coordinates.get(gene1).unwrap().start() as usize;
+    let g1_end = cyp_coordinates.get(gene1).unwrap().end() as usize;
+    let g2_start = cyp_coordinates.get(gene2).unwrap().start() as usize;
+    let g2_end = cyp_coordinates.get(gene2).unwrap().end() as usize;
+
+    // insert the full sequences first
+    ret.insert(
+        Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, None),
+        String::from_utf8(reference_genome.get_slice(chrom, g1_start, g1_end).to_vec())?
+    );
+    ret.insert(
+        Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d7, None),
+        String::from_utf8(reference_genome.get_slice(chrom, g2_start, g2_end).to_vec())?
+    );
+
+    // now let's insert our *5 target sequence, which is the breakpoint +/- the buffer on each side
+    // the buffer values below were basically the minimum before UCSC BLAT correctly split it; there's a lot of homology downstream of the split
+    let cyp2d6_star5_del = cyp2d6_config.cyp2d6_star5_del();
+    let star5_signature = String::from_utf8(
+        reference_genome.get_slice(
+            cyp2d6_star5_del.chrom(),
+            cyp2d6_star5_del.start() as usize - STAR5_PRE_BUFFER,
+            cyp2d6_star5_del.start() as usize
+        ).to_vec())?
+        + std::str::from_utf8(
+            reference_genome.get_slice(
+                cyp2d6_star5_del.chrom(),
+                cyp2d6_star5_del.end() as usize,
+                cyp2d6_star5_del.end() as usize + STAR5_POST_BUFFER
+            ))?;
+    // debug!("star5_signature: {star5_signature}");
+    ret.insert(
+        Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6Deletion, None),
+        star5_signature
+    );
+
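The *5 target is just the two flanks of the deleted interval joined end-to-end, so a read spanning the deletion junction aligns across the whole signature. Here is a toy version of that construction; the coordinates and buffer sizes are invented, and the real ones come from `cyp2d6_star5_del` and the `STAR5_*_BUFFER` constants.

```rust
// join the flanks of a deleted interval into a junction-spanning "signature"
fn deletion_signature(reference: &[u8], del_start: usize, del_end: usize, pre: usize, post: usize) -> String {
    let left = &reference[del_start - pre..del_start]; // bases immediately before the deletion
    let right = &reference[del_end..del_end + post];   // bases immediately after the deletion
    String::from_utf8([left, right].concat()).unwrap()
}

fn main() {
    let reference = b"AAAACCCCGGGGTTTT";
    // deleting CCCCGGGG (4..12) with 2 bp flanks on each side yields the junction sequence "AATT"
    assert_eq!(deletion_signature(reference, 4, 12, 2, 2), "AATT");
}
```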
+    // now loop through and add in all of the hybrids we can by looking at exon/intron boundaries and splicing them together
+    let cyp_regions = cyp2d6_config.cyp_regions();
+    for exon_index in 1..(exon_count+1) {
+        let current_exon = format!("exon{exon_index}");
+        let g1_exon = cyp_regions.get(gene1).unwrap().get(&current_exon).unwrap();
+        let g2_exon = cyp_regions.get(gene2).unwrap().get(&current_exon).unwrap();
+
+        // first exon will not have a breakpoint at the start, since that's just the full normal sequence
+        if exon_index != 1 {
+            // these breakpoints are at the start of the exon, which is "end()" here due to rev-comp
+            let breakpoint1 = g1_exon.end() as usize;
+            let breakpoint2 = g2_exon.end() as usize;
+
+            // D6::D7 splices - remember this means the early exons are D6, which will be the latter half of our 5'->3' sequence
+            let d6_d7_label = format!("{gene1}::{gene2}::exon{exon_index}");
+            let d6_d7_seq = String::from_utf8(reference_genome.get_slice(chrom, g2_start, breakpoint2).to_vec())? +
+                &String::from_utf8(reference_genome.get_slice(chrom, breakpoint1, g1_end).to_vec())?;
+            ret.insert(
+                Cyp2d6RegionLabel::new(Cyp2d6RegionType::Hybrid, Some(d6_d7_label)),
+                d6_d7_seq
+            );
+
+            // D7::D6 splices - remember this means the early exons are D7, which will be the latter half of our 5'->3' sequence
+            let d7_d6_label = format!("{gene2}::{gene1}::exon{exon_index}");
+            let d7_d6_seq = String::from_utf8(reference_genome.get_slice(chrom, g1_start, breakpoint1).to_vec())? +
+                &String::from_utf8(reference_genome.get_slice(chrom, breakpoint2, g2_end).to_vec())?;
+            ret.insert(
+                Cyp2d6RegionLabel::new(Cyp2d6RegionType::Hybrid, Some(d7_d6_label)),
+                d7_d6_seq
+            );
+        }
+
+        // last exon will not have a breakpoint at the end, since that's just the full normal sequence
+        if exon_index != exon_count {
+            // these breakpoints are at the end of the exon (start of intron), which is "start()" here due to rev-comp
+            let breakpoint1 = g1_exon.start() as usize;
+            let breakpoint2 = g2_exon.start() as usize;
+
+            // D6::D7 splices - remember this means the early exons are D6, which will be the latter half of our 5'->3' sequence
+            let d6_d7_label = format!("{gene1}::{gene2}::intron{exon_index}");
+            let d6_d7_seq = String::from_utf8(reference_genome.get_slice(chrom, g2_start, breakpoint2).to_vec())? +
+                &String::from_utf8(reference_genome.get_slice(chrom, breakpoint1, g1_end).to_vec())?;
+            ret.insert(
+                Cyp2d6RegionLabel::new(Cyp2d6RegionType::Hybrid, Some(d6_d7_label)),
+                d6_d7_seq
+            );
+
+            // D7::D6 splices - remember this means the early exons are D7, which will be the latter half of our 5'->3' sequence
+            let d7_d6_label = format!("{gene2}::{gene1}::intron{exon_index}");
+            let d7_d6_seq = String::from_utf8(reference_genome.get_slice(chrom, g1_start, breakpoint1).to_vec())? +
+                &String::from_utf8(reference_genome.get_slice(chrom, breakpoint2, g2_end).to_vec())?;
+            ret.insert(
+                Cyp2d6RegionLabel::new(Cyp2d6RegionType::Hybrid, Some(d7_d6_label)),
+                d7_d6_seq
+            );
+        }
+    }
+
+    // finally, add in the surrounding regions, which we need for chaining
+    let extras = [
+        ("REP6", Cyp2d6RegionType::Rep6),
+        ("REP7", Cyp2d6RegionType::Rep7),
+        ("spacer", Cyp2d6RegionType::Spacer),
+        ("link_region", Cyp2d6RegionType::LinkRegion)
+    ];
+    for (extra_region, region_type) in extras.into_iter() {
+        let extra_start = cyp_coordinates.get(extra_region).unwrap().start() as usize;
+        let extra_end = cyp_coordinates.get(extra_region).unwrap().end() as usize;
+        let sequence = String::from_utf8(reference_genome.get_slice(chrom, extra_start, extra_end).to_vec())?;
+        ret.insert(Cyp2d6RegionLabel::new(region_type, None), sequence);
+    }
+
+    Ok(ret)
+}
+
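Each hybrid target is built the same way: take the reference slice from one gene's start up to the breakpoint, then append the other gene's slice from its matching breakpoint to its end. A toy splice under invented coordinates (the real breakpoints come from `cyp_regions`, and the labels describe the coding strand, which is rev-comp here):

```rust
// splice two reference blocks at homologous breakpoints (toy coordinates)
fn splice(reference: &[u8], g2_start: usize, breakpoint2: usize, breakpoint1: usize, g1_end: usize) -> Vec<u8> {
    [&reference[g2_start..breakpoint2], &reference[breakpoint1..g1_end]].concat()
}

fn main() {
    // lowercase = "gene2" block, uppercase = "gene1" block, x/s = filler (toy layout)
    let reference = b"ddddDDDDxxxxssssSSSS";
    let hybrid = splice(reference, 0, 4, 16, 20);
    assert_eq!(hybrid, b"ddddSSSS".to_vec());
}
```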
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::path::PathBuf;
+
+    use crate::util::file_io::load_json;
+
+    #[test]
+    fn test_config_full_length() {
+        // full file
+        let test_fn = PathBuf::from("test_data/CYP2D6_configs/full_length.json");
+        let config: Cyp2d6Config = load_json(&test_fn).unwrap();
+        assert!(config.validate_config().is_ok());
+    }
+
+    #[test]
+    fn test_config_missing_regions() {
+        // this one is missing a CYP2D6 coordinate
+        let test_fn = PathBuf::from("test_data/CYP2D6_configs/missing_regions.json");
+        let config: Cyp2d6Config = load_json(&test_fn).unwrap();
+        assert!(config.validate_config().is_err());
+    }
+
+    #[test]
+    fn test_config_missing_exons() {
+        // this one is missing a CYP2D6 exon
+        let test_fn = PathBuf::from("test_data/CYP2D6_configs/missing_exons.json");
+        let config: Cyp2d6Config = load_json(&test_fn).unwrap();
+        assert!(config.validate_config().is_err());
+    }
+}
\ No newline at end of file
diff --git a/src/cyp2d6/errors.rs b/src/cyp2d6/errors.rs
new file mode 100644
index 0000000..c6b536d
--- /dev/null
+++ b/src/cyp2d6/errors.rs
@@ -0,0 +1,18 @@
+
+/// Errors that can be produced by the CYP2D6 calling algorithm; these result in a failure state
+#[derive(thiserror::Error, Debug, PartialEq)]
+pub enum CallerError {
+    #[error("no alleles were found that can start a chain")]
+    NoChainingHead,
+    #[error("no valid chains were identified")]
+    NoChainsFound,
+    #[error("no successful chain scoring pairs")]
+    NoScorePairs
+}
+
+/// Warnings that can be produced by the CYP2D6 calling algorithm
+#[derive(Debug, PartialEq)]
+pub enum CallerWarning {
+    /// Indicates an allele that was detected but not included as part of the final reported diplotype
+    DanglingAllele { allele_name: String }
+}
diff --git a/src/cyp2d6/haplotyper.rs b/src/cyp2d6/haplotyper.rs
new file mode 100644
index 0000000..779e1d4
--- /dev/null
+++ b/src/cyp2d6/haplotyper.rs
@@ -0,0 +1,828 @@
+
+use hiphase::data_types::variants::Variant;
+use hiphase::wfa_graph::{NodeAlleleMap, WFAGraph, WFAResult};
+use itertools::Itertools;
+use log::{debug, trace};
+use minimap2::Aligner;
+use rust_lib_reference_genome::reference_genome::ReferenceGenome;
+use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet};
+use simple_error::bail;
+use std::collections::BTreeMap;
+
+use crate::cyp2d6::definitions::{Cyp2d6Config, generate_cyp_hybrids};
+use crate::cyp2d6::region_label::{Cyp2d6RegionLabel, Cyp2d6RegionType};
+use crate::data_types::database::PgxDatabase;
+use crate::data_types::mapping::MappingStats;
+
+/// The primary interface for identifying regions of interest within a sequence.
+/// Can be used to find all base type regions (e.g. CYP2D6, link_region, etc.) within a longer sequence, or to find specific full-length star-alleles (e.g., CYP2D6*4.001).
+pub struct Cyp2d6Extractor<'a> {
+    /// Set of loaded variants required for WFA typing
+    loaded_variants: LoadedVariants,
+    /// Contains a map from a haplotype to the set of included variants
+    haplotype_lookup: BTreeMap<Cyp2d6RegionLabel, Vec<u8>>,
+    /// These are the baseline sequences to go from a label to a sequence
+    hybrid_sequences: HashMap<Cyp2d6RegionLabel, String>,
+    /// The set of hybrids with deep typing (e.g. CYP2D6 and some hybrids)
+    mapped_hybrids: HashSet<Cyp2d6RegionLabel>,
+    /// Contains a reference to the reference genome we are using
+    reference_genome: &'a ReferenceGenome,
+    /// Contains a reference to the CYP2D6 config we are using
+    cyp2d6_config: &'a Cyp2d6Config
+}
+
+impl<'a> Cyp2d6Extractor<'a> {
+    /// Creates a new CYP2D6 region extractor based on a loaded PGx database and the provided reference genome sequences.
+    /// # Arguments
+    /// * `database` - a pre-loaded PGx database; should have information on the CYP2D6 allele definitions
+    /// * `reference_genome` - a pre-loaded reference genome, which is used to stitch the exon/intron sequences together
+    /// # Errors
+    /// * if the variants do not load correctly from the database
+    /// * if a variant in the database cannot be found (usually a database error/corruption)
+    /// * if hybrid generation fails
+    pub fn new(database: &'a PgxDatabase, reference_genome: &'a ReferenceGenome) -> Result<Cyp2d6Extractor<'a>, Box<dyn std::error::Error>> {
+        // first, just load the variants in
+        let loaded_variants = load_variant_database(database)?;
+
+        // this is a map from an allele ID to the vector of 0s and 1s for the variant set
+        let mut haplotype_lookup: BTreeMap<Cyp2d6RegionLabel, Vec<u8>> = Default::default();
+        let variant_list = loaded_variants.ordered_variants();
+        let num_variants = variant_list.len();
+        for (_allele_id, allele_def) in database.cyp2d6_gene_def().iter() {
+            assert_eq!(allele_def.gene_name(), "CYP2D6");
+            let mut allele_assignments: Vec<u8> = vec![0; num_variants];
+            for variant_def in allele_def.variants().iter() {
+                // build the variant key
+                let var_pos = variant_def.position();
+                let var_ref = variant_def.reference();
+                let var_alt = variant_def.alternate();
+
+                // now set that bit to a 1
+                let var_index = loaded_variants.index_variant(var_pos, var_ref, var_alt)?;
+                allele_assignments[var_index] = 1;
+            }
+            haplotype_lookup.insert(
+                Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, Some(allele_def.star_allele().to_string())),
+                allele_assignments
+            );
+        }
+
+        // cache this reference
+        let cyp2d6_config = database.cyp2d6_config();
+
+        // here is the core region of interest
+        let cyp_coordinates = cyp2d6_config.cyp_coordinates();
+        let full_d6_region = cyp_coordinates.get("CYP2D6").unwrap().clone();
+
+        /*
+        // 2 - construct a full D6 region that includes a little buffer around the first and last variants in our list
+        // we will use this to anchor our graph later
+        // this step was originally when we were doing some sanity checks; it's just stored in CYP_COORDINATES now
+        let first_variant_pos = loaded_variants.first_variant_pos() as u64;
+        let last_variant_pos = loaded_variants.last_variant_pos() as u64;
+        let buffer = 50;
+
+        // the upstream region (remember, rev-comp) will be from the end of exon 1 to just past the last variant position
+        let first_exon = CYP_REGIONS.get("CYP2D6").unwrap().get("exon1").unwrap();
+        let upstream_d6 = Coordinates::new("chr22".to_string(), first_exon.end(), last_variant_pos+buffer);
+
+        // the downstream region (rev-comp) will be from just before the first variant position to the start of the final exon
+        let last_exon = CYP_REGIONS.get("CYP2D6").unwrap().get("exon9").unwrap();
+        let downstream_d6 = Coordinates::new("chr22".to_string(), first_variant_pos-buffer, last_exon.start());
Coordinates::new("chr22".to_string(), first_variant_pos-buffer, last_exon.start()); + + // this is the full target region for CYP2D6 variant calling; we will search for this in our samples + let derived_d6_region = Coordinates::new("chr22".to_string(), downstream_d6.start(), upstream_d6.end()); + // let full_d7_region = CYP_COORDINATES.get("CYP2D7").unwrap().clone(); + let full_d6_sequence = String::from_utf8(reference_genome.get_slice(full_d6_region.chrom(), full_d6_region.start() as usize, full_d6_region.end() as usize).to_vec())?; + println!(">full_d6_sequence"); + println!("{full_d6_sequence}"); + + // this is a sanity check until we auto-derive the D6/D7 coordinates + // NOTE: if this fails, then we need to check `extraction_region()` when we update. + if derived_d6_region != full_d6_region { + warn!("Hard-coded CYP2D6 coordinates do not match derived search space."); + } + */ + + // make sure our designated coordinates is larger than the variant list set + assert!(full_d6_region.start() <= variant_list[0].position() as u64); + assert!(full_d6_region.end() >= variant_list.last().unwrap().position() as u64); + + // 3 - contruct search sequences that match the D6, D7, hybrid, and deletion regions + let hybrid_sequences: HashMap = generate_cyp_hybrids(reference_genome, cyp2d6_config)?; + + // also enumerate the ones that go through deep genotyping + let mapped_hybrids: HashSet = vec![ + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Cyp2d6, None), + // hybrids currently in the database + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Hybrid, Some("CYP2D6::CYP2D7::exon9".to_string())) // *36 typically + ].into_iter() + .collect(); + + Ok(Cyp2d6Extractor { + loaded_variants, + haplotype_lookup, + hybrid_sequences, + mapped_hybrids, + reference_genome, + cyp2d6_config + }) + } + + /// This will search a given sequence for each of the D6 targets and select the best one when overlaps are detected. + /// This is fairly common since D6 and D7 are similar, and obviously the fusions are as well. 
+
+    /// This will search a given sequence for each of the D6 targets and select the best one when overlaps are detected.
+    /// This is fairly common since D6 and D7 are similar, and obviously the fusions are as well.
+    /// # Arguments
+    /// * `search_sequence` - sequence we are searching through for alleles
+    /// * `penalize_unmapped` - pass-through option on whether unmapped bases add to the numerator (penalized) or subtract from the denominator (non-penalized)
+    /// * `max_missing_frac` - the maximum missing fraction that is allowed; primarily enforced when `penalize_unmapped` is true
+    /// # Errors
+    /// * if the aligner fails to build
+    pub fn find_base_type_in_sequence(
+        &self,
+        search_sequence: &str,
+        penalize_unmapped: bool,
+        max_missing_frac: f64
+    ) -> Result<Vec<AlleleMapping>, Box<dyn std::error::Error>> {
+        // first, handle the stupid case
+        if search_sequence.is_empty() {
+            return Ok(vec![]);
+        }
+
+        // these get converted into our AlleleMapping when finished
+        let mut region_mappings: Vec<(Cyp2d6RegionLabel, std::ops::Range<usize>, MappingStats)> = vec![];
+        let dna_aligner: Aligner = Aligner::builder()
+            .map_hifi()
+            .with_cigar()
+            .with_seq(search_sequence.as_bytes())?;
+
+        // this is the maximum edit penalty we allow
+        // if `penalize_unmapped` is true, then this is (NM + unmapped) / total; otherwise, NM / (total - unmapped)
+        let max_ed_frac = 0.05;
+
+        // if we are penalizing unmapped bases, then this threshold will get tighter
+        let max_penalized_frac = max_missing_frac;
+        let penalize_during_search = false; // we don't want to penalize during the initial search, but we will at the end if enabled
+
+        // we only need cigar and md for debugging
+        // other settings for mapping
+        let output_cigar: bool = true;
+        let output_md: bool = true;
+        let max_frag_len: Option<usize> = None;
+        let extra_flags = None;
+
+        // let's go through the keys in sorted order for output purposes
+        let mut uncollapsed_regions: Vec<(std::ops::Range<usize>, MappingStats, Cyp2d6RegionLabel)> = vec![];
+        let key_order: Vec<&Cyp2d6RegionLabel> = self.hybrid_sequences.keys()
+            // .sorted()
+            .sorted_by(|a, b| {
+                // TODO: this is preserving the previous sort order (which apparently can matter given our current setup); ideally, we would just use .sorted()
+                //       changing that currently causes an issue in one sample; we should figure out a strategy that resolves
+                //       the underlying issue at some point; for refactoring, let's just preserve the order by comparing the full_allele labels
+                a.full_allele().cmp(&b.full_allele())
+            })
+            .collect();
+
+        let penalized_types = [
+            Cyp2d6RegionType::Cyp2d6Deletion, // this one needs the full region because it has to anchor two ends together
+            Cyp2d6RegionType::Rep6, Cyp2d6RegionType::Rep7, // these regions are too similar and get clipped in ways that lead to incorrect answers
+            // TODO: if we add an explicit clustering step, we may be able to relax this
+            //       Xiao says that REP6 and REP7 are near identical except a few bases at the end; I bet that is the source of the issue
+            //       if we cut those out, we could probably just have a "REP" region and push the deltas to the flanks
+        ];
+
+        for &target_id in key_order.iter() {
+            // pull the actual target sequence
+            let target_seq = self.hybrid_sequences.get(target_id).unwrap();
+
+            // now find all of the mappings of this sequence
+            let mappings = dna_aligner.map(
+                target_seq.as_bytes(),
+                output_cigar, output_md, max_frag_len, extra_flags.clone()
+            )?;
+
+            if mappings.is_empty() {
+                trace!("\t{target_id}: None");
+            }
+
+            // it's possible to get multiple mappings
+            for m in mappings.iter() {
+                // all results are relative to the `target_seq`; aka, the D6/D7 allele
+                let nm = m.alignment.as_ref().unwrap().nm as usize;
+                let seq_len = target_seq.len();
+                let unmapped = seq_len - (m.query_end - m.query_start) as usize;
+
+                // the amount clipped at the start is the amount into the query that we start
+                let clipped_start = m.query_start as usize;
+                // the amount clipped at the end is the length minus the query end point
+                let clipped_end = target_seq.len() - m.query_end as usize;
+
+                let mapping_stats = MappingStats::new_with_clippings(
+                    seq_len, nm, unmapped,
+                    clipped_start, clipped_end
+                );
+
+                // for *5 specifically, we need really tight controls because it is a relatively precise signature, so we don't want to miss much
+                let penalize_star5 = penalize_during_search ||
+                    penalized_types.contains(&target_id.region_type());
+                let custom_score = mapping_stats.custom_score(penalize_star5);
+
+                if custom_score.score() > max_ed_frac { // && target_id == "CYP2D6*5" {
+                    // ignore this one
+                    debug!("\tIgnoring {target_id}: {}-{} => {}", m.target_start, m.target_end, mapping_stats.custom_score_string(penalize_star5));
+                } else {
+                    debug!("\t{target_id}: {}-{} => {}", m.target_start, m.target_end, mapping_stats.custom_score_string(penalize_during_search));
+                    uncollapsed_regions.push((
+                        m.target_start as usize..m.target_end as usize,
+                        mapping_stats,
+                        target_id.clone(),
+                    ));
+                }
+            }
+        }
+
+        // order the regions by the range they are a part of
+        uncollapsed_regions.sort_by(|v1, v2| {
+            (v1.0.start, v1.0.end).cmp(&(v2.0.start, v2.0.end))
+        });
+
+        // now collapse the overlapping regions
+        // NOTE: this has the potential to fail for some weird edge cases, mainly if alignment sizes are vastly different, but this should be rare if it happens at all
+        // NOTE: for the purpose of initial pullout, we really just want the larger region (observations suggest it was always the "best" as well); we can try that if this ever becomes an issue
+        //       the last bool here is if anything overlapping had the minimum mappings requirement
+        let mut current_region: Option<(std::ops::Range<usize>, MappingStats, Cyp2d6RegionLabel)> = None;
+        for (ucr_range, ucr_score, ucr_id) in uncollapsed_regions.into_iter() {
+            match current_region.as_ref() {
+                None => {
+                    current_region = Some((ucr_range, ucr_score, ucr_id));
+                }
+                Some(cr) => {
+                    if overlap_score(&ucr_range, &current_region.as_ref().unwrap().0) > 0.9 {
+                        // these overlap enough that we compare them
+                        // if a *5 is involved, we need to compare the full lengths
+                        let star5_pairing = penalized_types.contains(&ucr_id.region_type()) || penalized_types.contains(&cr.2.region_type());
+                        let penalized_scoring = if star5_pairing { true } else { penalize_during_search };
+                        let ucr_priority = get_allele_priority(&ucr_id);
+                        let cr_priority = get_allele_priority(&cr.2);
+
+                        // if our score is lower AND we have the same or better priority
+                        // OR if our priority is better
+                        if (ucr_score.custom_score(penalized_scoring) < cr.1.custom_score(penalized_scoring) &&
+                            ucr_priority >= cr_priority) || ucr_priority > cr_priority {
+                            // the score of this new region is better than what we were looking at, so replace it
+                            current_region = Some((ucr_range, ucr_score, ucr_id));
+                        }
+                    } else {
+                        // not an overlap, save the current region and push this one
+                        let unwrapped = current_region.unwrap();
+                        region_mappings.push((unwrapped.2, unwrapped.0, unwrapped.1));
+                        current_region = Some((ucr_range, ucr_score, ucr_id));
+                    }
+                }
+            };
+        }
+
+        if let Some(unwrapped) = current_region {
+            // handle the last one
+            region_mappings.push((unwrapped.2, unwrapped.0, unwrapped.1));
+        }
+
+        // final results go here
+        let mut ret: Vec<AlleleMapping> = vec![];
+
+        debug!("Collapsed calls:");
+        for (region_label, mapping_region,
mapping_stats) in region_mappings.into_iter() { + // 3b - the D7 alleles and most hybrids are done at this point + let penalized_score = mapping_stats.custom_score(true); + if penalized_score.score() > max_penalized_frac { + debug!("\tIgnoring {region_label} at {mapping_region:?}, too short: {}", mapping_stats.custom_score_string(true)); + } else { + debug!("\t{region_label} at {mapping_region:?}: {}", mapping_stats.custom_score_string(penalize_unmapped)); + ret.push(AlleleMapping::new( + region_label, mapping_region, mapping_stats + )); + } + } + + Ok(ret) + } + + /// This will search a given sequence for D6 targets and select the best one when overlaps are detected. + /// Additionally, it will go deeper than just "CYP2D6" and will attempt a full typing of the allele. + /// # Arguments + /// * `search_sequence` - the sequence to search; unmapped bases are penalized by default + /// * `max_missing_frac` - the maximum missing fraction that is allowed; primarily enforced when `penalized_unmapped` is true + /// * `force_assignment` - if True, then ambiguous CYP2D6 results will be arbitrarily assigned one of the equal values + /// # Errors + /// * if finding base types fails + /// * if assigning haplotype labels to a base type fails + pub fn find_full_type_in_sequence( + &self, search_sequence: &str, max_missing_frac: f64, force_assignment: bool + ) -> Result> { + // for each sequence, figure out what it best matches in our targets + // this should be full length matches UNLESS the consensus is incomplete + // I think this means we want to use a penalized version + let penalize_unmapped = true; + let best_matches = self.find_base_type_in_sequence( + search_sequence, + penalize_unmapped, + max_missing_frac + )?; + + if best_matches.is_empty() { + bail!("no matches found"); + } + + // sometimes we get multiples, usually when wildcards are involved; get the one with the lowest score (score = mismatches) + let best_match = best_matches.iter() + .min_by(|a, b| + a.mapping_stats.custom_score(penalize_unmapped).partial_cmp(&b.mapping_stats.custom_score(penalize_unmapped)).unwrap() + ) + .unwrap(); + + let final_type = if self.mapped_hybrids.contains(best_match.allele_label()) { + debug!("\tConverting {} to full allele definition...", best_match.allele_label()); + self.assign_haplotype( + search_sequence.as_bytes(), + force_assignment + )? + } else { + best_match.allele_label().clone() + }; + Ok(final_type) + } + + /// This will take a chosen region from a sequence and deep genotype it in CYP2D6 using our WFA-based variant system. + /// # Arguments + /// * `sequence` - presumably matches a D6 allele that we want to find the best match for. 
+    /// * `force_assignment` - if true, then ambiguous results will be arbitrarily assigned one of the equal values
+    /// # Errors
+    /// * if graph backbone construction fails
+    /// * if aligner construction fails
+    /// * if WFA graph construction or traversal fails
+    fn assign_haplotype(
+        &self,
+        sequence: &[u8],
+        force_assignment: bool
+    ) -> Result<Cyp2d6RegionLabel, Box<dyn std::error::Error>> {
+        let cyp_coordinates = self.cyp2d6_config.cyp_coordinates();
+
+        // get the relevant sequences from the reference
+        let backbone_coordinates = cyp_coordinates.get("CYP2D6_wfa_backbone").unwrap();
+        let chrom_seq = self.reference_genome.get_full_chromosome(backbone_coordinates.chrom());
+        let graph_backbone = String::from_utf8(self.reference_genome.get_slice(backbone_coordinates.chrom(), backbone_coordinates.start() as usize, backbone_coordinates.end() as usize).to_vec())?;
+
+        // we only need cigar and md for debugging
+        // other settings for mapping
+        let output_cigar: bool = true;
+        let output_md: bool = true;
+        let max_frag_len: Option<usize> = None;
+        let extra_flags = None;
+
+        // now build out a mapper so we make sure we have the right coordinates
+        let dna_aligner: Aligner = Aligner::builder()
+            .map_hifi()
+            .with_cigar()
+            .with_seq(graph_backbone.as_bytes())?;
+
+        // now find all of the mappings of our consensus onto this backbone
+        let mappings = dna_aligner.map(
+            sequence,
+            output_cigar, output_md, max_frag_len, extra_flags.clone()
+        )?;
+
+        // we *should* only find one, but that does not always happen
+        assert!(!mappings.is_empty());
+        let core_mapping = if mappings.len() == 1 {
+            &mappings[0]
+        } else {
+            // we somehow have multiple mappings, pick the longest one
+            let longest_index = mappings.iter()
+                .enumerate()
+                .map(|(i, m)| (m.block_len, i))
+                .max().unwrap()
+                .1;
+            &mappings[longest_index]
+        };
+
+        // figure out which part of the backbone was actually used by pulling out the target coordinates and adding to the backbone start
+        let aligned_start = backbone_coordinates.start() as usize + core_mapping.target_start as usize;
+        let aligned_end = backbone_coordinates.start() as usize + core_mapping.target_end as usize;
+
+        // figure out which part of the sequence is relevant
+        let sub_sequence_start = core_mapping.query_start as usize;
+        let sub_sequence_end = core_mapping.query_end as usize;
+        let sub_sequence = &sequence[sub_sequence_start..sub_sequence_end];
+
+        /*
+        println!(">sub_sequence");
+        println!("{}", std::str::from_utf8(sub_sequence).unwrap());
+        println!(">graph_backbone");
+        println!("{}", graph_backbone);
+        */
+
+        // 4 - extract just that end-to-end region for GraphWFA
+        let start_time = std::time::Instant::now();
+        let (wfa_graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
+            WFAGraph::from_reference_variants(
+                chrom_seq,
+                self.loaded_variants.ordered_variants(), // these are both range style indices
+                // use `target_region` to do the full thing, even if the mapping is only partial
+                // target_region.start() as usize,
+                // target_region.end() as usize
+                // otherwise, let's fix to just the relevant aligned backbone region
+                aligned_start,
+                aligned_end
+            )?;
+
+        // we can probably make this smaller eventually, but this is a safe distance for now
+        let wfa_prune_distance = 1000;
+        let wfa_result: WFAResult = wfa_graph.edit_distance_with_pruning(sub_sequence, wfa_prune_distance)?;
+        debug!(
+            "\t{} WFAGraph result ({}) => num_nodes: {}, read_len: {}, edit_distance: {}",
+            "", start_time.elapsed().as_secs_f32(), wfa_graph.get_num_nodes(), sub_sequence.len(), wfa_result.score()
+        );
+
+        // 5 - assign the
alleles and label it + // we will populate these with the variant level info + let num_variants = self.loaded_variants.ordered_variants().len(); + let mut alleles: Vec = vec![3; num_variants]; + let first_overlap = 0; // we are force matching everything + + // this loop will set the alleles for what was aligned, allowing us to compare to the pre-defined alleles for the haplotypes + for traversed_index in wfa_result.traversed_nodes().iter() { + for &(var_index, allele_assignment) in node_to_alleles.get(traversed_index).unwrap_or(&vec![]).iter() { + let correct_index: usize = first_overlap+var_index; + if alleles[correct_index] == 3 { + alleles[correct_index] = allele_assignment; + } else if alleles[correct_index] != allele_assignment { + alleles[correct_index] = 2; + } + } + } + + // 3 - use that information to score based on matching VI alleles, and then matching total alleles + let mut best_id_set: HashSet = Default::default(); + best_id_set.insert(Cyp2d6RegionLabel::new(Cyp2d6RegionType::Unknown, None)); + let mut best_score: (usize, usize) = (0, 0); + + for (allele_id, haplotype_vec) in self.haplotype_lookup.iter() { + // these start as fully equal, and we remove things that do NOT match + let mut vi_match = 0; + let mut all_match = 0; + + // go through the pairs of alleles and add the ones that match together + // let haplotype_vec = haplotype_lookup.get(allele_id).unwrap(); + assert_eq!(alleles.len(), haplotype_vec.len()); + + for (i, (&seq_value, &hap_value)) in alleles.iter().zip(haplotype_vec.iter()).enumerate() { + // hap_value will *always* be 0 or 1 + assert!(hap_value == 0 || hap_value == 1); + + // seq_value can be 0, 1, 2 (ambiguous), or 3 (unset, usually meaning 0) + let is_match = match seq_value { + // both are clearly set, so compare for equality + 0 | 1 => hap_value == seq_value, + // it could be either 0 or 1, so count it + 2 => true, + // it was not set, this could be either unassigned OR a multi-allelic site; in either case, not counting is best + 3 => false, + v => panic!("Unexpected seq_value={v}") + }; + + if is_match { + // these were a match, count the appropriate fields + all_match += 1; + if self.loaded_variants.is_vi(i) { + vi_match += 1; + } + } + } + + let combined_score = (vi_match, all_match); + match combined_score.cmp(&best_score) { + std::cmp::Ordering::Greater => { + // new best, clear out the old and add the new + trace!("new best: {allele_id} = {combined_score:?}"); + best_id_set.clear(); + best_id_set.insert(allele_id.clone()); + best_score = combined_score; + }, + std::cmp::Ordering::Equal => { + // equal result, add to the existing set + trace!("new equi: {allele_id} = {combined_score:?}"); + best_id_set.insert(allele_id.clone()); + }, + std::cmp::Ordering::Less => {}, // score is less, do nothing + }; + } + + let best_id = if best_id_set.len() == 1 { + // only one result, drain it off + best_id_set.drain().next().unwrap() + } else { + // sort the candidates by allele label (which is ~numerically for tie-breaking) + let ordered_candidates: Vec = best_id_set.into_iter() + .sorted_by(|a, b| a.full_allele().cmp(&b.full_allele())) + .collect(); + let ordered_labels: Vec = ordered_candidates.iter().map(|l| l.to_string()).collect(); + + if force_assignment { + debug!("\tAmbiguous result detected, selecting first candidate; candidates: {ordered_labels:?}"); + ordered_candidates[0].clone() + } else { + debug!("\tAmbiguous result detected, setting to unknown; candidates: {ordered_labels:?}"); + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Unknown, None) + } + 
};
+
+        debug!("\t{} {best_id} -> {best_score:?}, ({:.4}, {:.4})", "", best_score.0 as f64 / self.loaded_variants.num_vi() as f64, best_score.1 as f64 / num_variants as f64);
+
+        Ok(best_id)
+    }
+
+    /// Returns the sequence for a given allele
+    /// # Arguments
+    /// * `allele_name` - the allele to look up
+    pub fn get_allele(&self, allele_name: &Cyp2d6RegionLabel) -> Option<&String> {
+        self.hybrid_sequences.get(allele_name)
+    }
+
+    // getters
+    pub fn loaded_variants(&self) -> &LoadedVariants {
+        &self.loaded_variants
+    }
+
+    pub fn cyp2d6_config(&self) -> &Cyp2d6Config {
+        self.cyp2d6_config
+    }
+}
+
+
+/// Wrapper for variants that have been loaded in preparation for genotyping.
+pub struct LoadedVariants {
+    /// variants ordered by position and sequence
+    ordered_variants: Vec<Variant>,
+    /// a lookup from (position, REF, ALT) to index in `ordered_variants`
+    variant_lookup: HashMap<(usize, String, String), usize>,
+    /// one-to-one with `ordered_variants`; if true, this variant had a VI flag
+    is_vi: Vec<bool>
+}
+
+impl LoadedVariants {
+    /// Constructor for the loaded variants, see parameters.
+    /// # Arguments
+    /// * `ordered_variants` - variants ordered by position and sequence
+    /// * `variant_lookup` - a lookup from (position, REF, ALT) to index in `ordered_variants`
+    /// * `is_vi` - one-to-one with `ordered_variants`; if true, this variant had a VI flag indicating that it distinguishes the core allele
+    /// # Errors
+    /// * if `ordered_variants`, `is_vi`, and `variant_lookup` do not all have the same number of entries
+    pub fn new(ordered_variants: Vec<Variant>, variant_lookup: HashMap<(usize, String, String), usize>, is_vi: Vec<bool>) -> Result<LoadedVariants, Box<dyn std::error::Error>> {
+        if ordered_variants.len() != is_vi.len() {
+            bail!("ordered_variants and is_vi must be same length");
+        }
+        if ordered_variants.len() != variant_lookup.len() {
+            bail!("ordered_variants and variant_lookup must be same length");
+        }
+        Ok(LoadedVariants {
+            ordered_variants, variant_lookup, is_vi
+        })
+    }
+
+    pub fn ordered_variants(&self) -> &[Variant] {
+        &self.ordered_variants
+    }
+
+    /// Searches for a given variant in our loaded variants and returns the index
+    /// # Arguments
+    /// * `position` - the coordinate to find, 0-based
+    /// * `reference` - the reference allele
+    /// * `alternate` - the alternate allele
+    /// # Errors
+    /// * if the allele is not found
+    pub fn index_variant(&self, position: usize, reference: &str, alternate: &str) -> Result<usize, Box<dyn std::error::Error>> {
+        match self.variant_lookup.get(&(position, reference.to_string(), alternate.to_string())) {
+            Some(&idx) => Ok(idx),
+            None => bail!("({position}, {reference}, {alternate}) not found")
+        }
+    }
+
+    /// Retrieves the position of the first variant in the dataset
+    pub fn first_variant_pos(&self) -> i64 {
+        self.ordered_variants[0].position()
+    }
+
+    /// Retrieves the position of the last variant in the dataset
+    pub fn last_variant_pos(&self) -> i64 {
+        self.ordered_variants.last().unwrap().position()
+    }
+
+    /// Returns true if the variant at the given index is VI
+    pub fn is_vi(&self, index: usize) -> bool {
+        self.is_vi[index]
+    }
+
+    /// Returns the total number of variants marked as VI
+    pub fn num_vi(&self) -> usize {
+        self.is_vi.iter().filter(|&x| *x).count()
+    }
+}
+
+/// This will parse the relevant CYP2D6 variants in preparation for using in GraphWFA
+/// # Arguments
+/// * `database` - the pre-loaded PGx database; we need to translate this for GraphWFA
+/// # Errors
+/// * if allele translation to UTF8 fails
+fn load_variant_database(database: &PgxDatabase) -> Result<LoadedVariants, Box<dyn std::error::Error>> {
+    let mut
inserted_variants: HashSet<(usize, String, String)> = Default::default(); + let mut variant_list: Vec = vec![]; + let mut vi_set: HashSet<(usize, String, String)> = Default::default(); + let mut all_set: HashSet<(usize, String, String)> = Default::default(); + for (_allele_id, allele_def) in database.cyp2d6_gene_def().iter() { + for variant_def in allele_def.variants().iter() { + let var_pos = variant_def.position(); + let var_ref = variant_def.reference().to_string(); + let var_alt = variant_def.alternate().to_string(); + let is_vi = variant_def.extras().get("VI").is_some(); + let var_key = (var_pos, var_ref.clone(), var_alt.clone()); + + // mark if this is a VI variant + if is_vi { + vi_set.insert(var_key.clone()); + } + all_set.insert(var_key.clone()); + + if inserted_variants.contains(&var_key) { + // do nothing, we already inserted this one + } else { + // create the variant and insert it + let variant = if var_ref.len() == 1 { + if var_alt.len() == 1 { + Variant::new_snv( + 0, var_pos as i64, + var_ref.into_bytes(), var_alt.into_bytes(), + 0, 1) + } else { + Variant::new_insertion( + 0, var_pos as i64, + var_ref.into_bytes(), var_alt.into_bytes(), + 0, 1 + ) + } + } else if var_alt.len() == 1 { + Variant::new_deletion( + 0, var_pos as i64, + var_ref.len(), var_ref.into_bytes(), var_alt.into_bytes(), + 0, 1) + } else { + Variant::new_indel( + 0, var_pos as i64, + var_ref.len(), var_ref.into_bytes(), var_alt.into_bytes(), + 0, 1 + ) + }; + variant_list.push(variant); + + // mark this key as inserted + inserted_variants.insert(var_key); + } + } + } + + // sort the variants so we can slice it up when we need to later + variant_list.sort_by_key(|v| v.position()); + /* + for v in variant_list.iter() { + println!("{v:?}"); + } + */ + let num_variants = variant_list.len(); + let first_variant_pos = variant_list[0].position() as u64; + let last_variant_pos = variant_list.last().unwrap().position() as u64; + debug!("Found {} unique variants for GraphWFA from chr22:{}-{}", num_variants, first_variant_pos+1, last_variant_pos+1); + + // build a lookup table for each of the variants and also the haplotypes + let mut variant_lookup: HashMap<(usize, String, String), usize> = Default::default(); + // quick lookup from variant index to know if it is VI or not + let mut is_vi_lookup: Vec = vec![false; num_variants]; + for (i, variant) in variant_list.iter().enumerate() { + let var_key = ( + variant.position() as usize, + String::from_utf8(variant.get_allele0().to_vec())?, + String::from_utf8(variant.get_allele1().to_vec())? + ); + + // if this is VI, label it as such + if vi_set.contains(&var_key) { + is_vi_lookup[i] = true; + } + + // now save this in our hashmap index + variant_lookup.insert(var_key, i); + } + + LoadedVariants::new(variant_list, variant_lookup, is_vi_lookup) +} + +/// Contains an allele name, the region it was found, and the mapping stats. 
+#[derive(Clone, Debug)]
+pub struct AlleleMapping {
+    /// The allele that is mapped
+    allele_label: Cyp2d6RegionLabel,
+    /// The coordinate range inside it mapped to
+    region: std::ops::Range<usize>,
+    /// The score for the mapping
+    mapping_stats: MappingStats
+}
+
+impl AlleleMapping {
+    /// Constructor
+    pub fn new(allele_label: Cyp2d6RegionLabel, region: std::ops::Range<usize>, mapping_stats: MappingStats) -> AlleleMapping {
+        AlleleMapping {
+            allele_label,
+            region,
+            mapping_stats
+        }
+    }
+
+    // getters
+    pub fn allele_label(&self) -> &Cyp2d6RegionLabel {
+        &self.allele_label
+    }
+
+    pub fn region(&self) -> &std::ops::Range<usize> {
+        &self.region
+    }
+
+    pub fn mapping_stats(&self) -> &MappingStats {
+        &self.mapping_stats
+    }
+}
+
+/// Given two ranges, this will report the overlap score of the ranges computed as shared / min(l1, l2).
+/// By this definition, any non-overlapping ranges get a score of 0.0, and then overlaps are scored based on the smaller region.
+/// This means that if either region is _fully_ contained in the other, then the score will be 1.0.
+/// # Arguments
+/// * `r1` - the first range
+/// * `r2` - the second range
+fn overlap_score(r1: &std::ops::Range<usize>, r2: &std::ops::Range<usize>) -> f64 {
+    let min_end = r1.end.min(r2.end);
+    let max_start = r1.start.max(r2.start);
+    if max_start >= min_end {
+        0.0
+    } else {
+        let l1 = r1.len() as f64;
+        let l2 = r2.len() as f64;
+        let shared = (min_end - max_start) as f64;
+        /*
+        // old reciprocal overlap score, which is not the best here
+        2.0 * shared / (l1+l2)
+        */
+        shared / l1.min(l2)
+    }
+}
+
+/// Returns an allele priority where higher values take precedence over others when the overlaps are high.
+/// # Arguments
+/// * `allele_id` - the allele ID we want to get priority for
+fn get_allele_priority(allele_id: &Cyp2d6RegionLabel) -> usize {
+    match allele_id.region_type() {
+        Cyp2d6RegionType::Cyp2d6Deletion => 1, // *5 often has high overlap with other components; if detected, we should definitely prioritize it
+        _ => 0
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /*
+    TODO: tests that ideally exist but are difficult to implement:
+    - Cyp2d6Extractor - this relies on a DB and a reference, can we encode this?
+      the reference is the sticking point currently; big on disk, and long to load
+    - find_base_type_in_sequence - same issue
+    - find_full_type_in_sequence - same issue
+    - assign_haplotype - same issue
+    - generate_cyp_hybrids - requires reference, this is the root issue from above
+    */
+
+    #[test]
+    fn test_load_variant_database() {
+        // can be tested on our real DB file
+        let test_db_fn = std::path::PathBuf::from("./data/v0.9.0/cpic_20240404.json.gz");
+        let database: PgxDatabase = crate::util::file_io::load_json(&test_db_fn).unwrap();
+        let vcb = load_variant_database(&database).unwrap();
+
+        // check all the high level, easy-to-verify stats
+        assert_eq!(vcb.first_variant_pos(), 42126309);
+        assert_eq!(vcb.last_variant_pos(), 42132374);
+        assert_eq!(vcb.ordered_variants().len(), 387);
+        assert_eq!(vcb.num_vi(), 144);
+    }
+
+    #[test]
+    fn test_overlap_score() {
+        assert_eq!(overlap_score(&(0..1), &(1..2)), 0.0);      // no overlap
+        assert_eq!(overlap_score(&(0..10), &(1..5)), 1.0);     // fully contained
+        assert_eq!(overlap_score(&(0..10), &(5..100)), 0.5);   // half shared of first
+        assert_eq!(overlap_score(&(15..100), &(0..20)), 0.25); // quarter shared of second
+    }
+}
\ No newline at end of file
diff --git a/src/cyp2d6/mod.rs b/src/cyp2d6/mod.rs
new file mode 100644
index 0000000..02713d2
--- /dev/null
+++ b/src/cyp2d6/mod.rs
@@ -0,0 +1,15 @@
+
+/// The entry function for diplotyping CYP2D6
+pub mod caller;
+/// Functionality that connects multiple regions together into a pair of full chains (i.e., the diplotype)
+pub mod chaining;
+/// Constants that control how the alleles are labeled and defined
+pub mod definitions;
+/// Errors and warnings that can come from the CYP2D6 algorithm that we want to cleanly handle
+pub mod errors;
+/// Functionality for extracting and haplotyping CYP2D6 alleles in a sequence
+pub mod haplotyper;
+/// Wrapper for region labeling and constraining based on the labels
+pub mod region_label;
+/// Contains functionality for generating visualizations of CYP2D6-related components
+pub mod visualization;
diff --git a/src/cyp2d6/region_label.rs b/src/cyp2d6/region_label.rs
new file mode 100644
index 0000000..53adeb1
--- /dev/null
+++ b/src/cyp2d6/region_label.rs
@@ -0,0 +1,289 @@
+
+use std::collections::BTreeMap;
+use std::fmt::Display;
+
+/// Core region types that are associated with our CYP2D6 locus
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd, strum_macros::Display)]
+pub enum Cyp2d6RegionType {
+    /// Generic unknown type
+    #[strum(to_string="UNKNOWN")]
+    Unknown,
+    /// REP6 region, typically before CYP2D6
+    #[strum(to_string="REP6")]
+    Rep6,
+    /// Main CYP2D6 allele type
+    #[strum(to_string="CYP2D6")]
+    Cyp2d6,
+    /// Link region, follows CYP2D6
+    #[strum(to_string="link_region")]
+    LinkRegion,
+    /// REP7, typically follows the link region
+    #[strum(to_string="REP7")]
+    Rep7,
+    /// Spacer region, between REP7 and CYP2D7
+    #[strum(to_string="spacer")]
+    Spacer,
+    /// Main CYP2D7 type
+    #[strum(to_string="CYP2D7")]
+    Cyp2d7,
+    /// CYP2D6*5
+    #[strum(to_string="CYP2D6*5")]
+    Cyp2d6Deletion,
+    /// Currently capturing both D6::D7 and D7::D6 hybrids
+    Hybrid,
+    /// Sentinel for when we call an allele but it turns out to be a false positive
+    FalseAllele
+}
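Given the strum attributes above, the enum's `Display` output is what later shows up in region labels and logs. A small check of the expected serializations (a sketch assuming the `strum`/`strum_macros` derive shown above; variants without a `to_string` override, like `Hybrid`, fall back to the variant name):

```rust
#[cfg(test)]
mod display_sketch {
    use super::*;

    #[test]
    fn test_region_type_display() {
        assert_eq!(Cyp2d6RegionType::Rep6.to_string(), "REP6");
        assert_eq!(Cyp2d6RegionType::LinkRegion.to_string(), "link_region");
        assert_eq!(Cyp2d6RegionType::Cyp2d6Deletion.to_string(), "CYP2D6*5");
        assert_eq!(Cyp2d6RegionType::Hybrid.to_string(), "Hybrid"); // no override, default variant name
    }
}
```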
+
+impl Cyp2d6RegionType {
+    /// Returns true if this is a "classic" CYP2D region
+    pub fn is_cyp2d(&self) -> bool {
+        match self {
+            // these do not count
+            Cyp2d6RegionType::Unknown |
+            Cyp2d6RegionType::Rep6 |
+            Cyp2d6RegionType::LinkRegion |
+            Cyp2d6RegionType::Rep7 |
+            Cyp2d6RegionType::Spacer |
+            Cyp2d6RegionType::FalseAllele => false,
+            // D6, D7, *5, and hybrids count
+            Cyp2d6RegionType::Cyp2d6 |
+            Cyp2d6RegionType::Cyp2d7 |
+            Cyp2d6RegionType::Cyp2d6Deletion |
+            Cyp2d6RegionType::Hybrid => true
+        }
+    }
+
+    /// Returns true if this is a "REP" region
+    pub fn is_rep(&self) -> bool {
+        matches!(self, Cyp2d6RegionType::Rep6 | Cyp2d6RegionType::Rep7)
+    }
+
+    /// Returns true if this type would show up in the final diplotype
+    pub fn is_reported_allele(&self) -> bool {
+        matches!(self,
+            Cyp2d6RegionType::Cyp2d6 |
+            Cyp2d6RegionType::Cyp2d6Deletion |
+            Cyp2d6RegionType::Hybrid
+        )
+    }
+}
+
+/// Fully describes a region we identify, first by the region type (D6 + nearby components) and then optionally with a more descriptive label.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct Cyp2d6RegionLabel {
+    /// The high level type of region this represents
+    region_type: Cyp2d6RegionType,
+    /// If Some, this will generally contain a star allele; e.g. "*4.001".
+    subtype_label: Option<String>
+}
+
+impl Cyp2d6RegionLabel {
+    /// Constructor
+    pub fn new(region_type: Cyp2d6RegionType, subtype_label: Option<String>) -> Cyp2d6RegionLabel {
+        Cyp2d6RegionLabel {
+            region_type,
+            subtype_label
+        }
+    }
+
+    /// This constructs a simplified version of a particular allele.
+    /// E.g. CYP2D6*4.001 -> *4.001 (detailed) OR *4 (!detailed).
+    /// CYP2D7 and other non-descript hybrids should retain the full allele def.
+    /// # Arguments
+    /// * `detailed` - if false, then this will reduce any D6 subunits into their integer form; e.g., *4.001 -> *4
+    /// * `cyp_translate` - a translation hashmap from the internal representation to the user-friendly one
+    pub fn simplify_allele(&self, detailed: bool, cyp_translate: &BTreeMap<String, String>) -> String {
+        match self.region_type {
+            Cyp2d6RegionType::Cyp2d6 |
+            Cyp2d6RegionType::Hybrid => {
+                if let Some(subtype_label) = self.subtype_label.as_deref() {
+                    // we should not have to strip out the '*' here
+                    if let Some(translation) = cyp_translate.get(subtype_label) {
+                        // we have a direct translation already, so do that
+                        format!("*{translation}")
+                    } else if detailed {
+                        // they want detailed, so no changes to the output
+                        format!("*{subtype_label}")
+                    } else {
+                        // non-detailed, simplify any floats into ints
+                        match subtype_label.parse::<f64>() {
+                            Ok(float_value) => {
+                                let int_value = float_value.floor() as i64;
+                                format!("*{int_value}")
+                            },
+                            Err(_e) => {
+                                // we failed to parse this one into a float, so just strip the prefix and replace with an asterisk
+                                format!("*{subtype_label}")
+                            }
+                        }
+                    }
+                } else {
+                    // we don't expect this to happen normally
+                    self.full_allele()
+                }
+            },
+            // this will always be *5
+            Cyp2d6RegionType::Cyp2d6Deletion => "*5".to_string(),
+            // all the others should not get reported, so just spit out the internal string format
+            _ => self.full_allele()
+        }
+    }
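The simplification rules above are small enough to pin down with a toy reimplementation; this standalone function is for illustration only, and the crate's actual entry point is the `simplify_allele` method.

```rust
use std::collections::BTreeMap;

// toy reimplementation of the subtype simplification: translate known hybrids,
// keep detail when requested, otherwise floor the sub-allele to its core number
fn simplify(subtype: &str, detailed: bool, translate: &BTreeMap<String, String>) -> String {
    if let Some(t) = translate.get(subtype) {
        format!("*{t}")
    } else if detailed {
        format!("*{subtype}")
    } else {
        match subtype.parse::<f64>() {
            Ok(v) => format!("*{}", v.floor() as i64),
            Err(_) => format!("*{subtype}"),
        }
    }
}

fn main() {
    let translate: BTreeMap<String, String> =
        [("CYP2D6::CYP2D7::exon2".to_string(), "68".to_string())].into_iter().collect();
    assert_eq!(simplify("4.001", false, &translate), "*4");                  // floored
    assert_eq!(simplify("4.001", true, &translate), "*4.001");               // detailed
    assert_eq!(simplify("CYP2D6::CYP2D7::exon2", false, &translate), "*68"); // translated hybrid
}
```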
+
+    // Assembles a string version of the full allele name
+    pub fn full_allele(&self) -> String {
+        match self.region_type {
+            // pass through the strum format
+            Cyp2d6RegionType::Unknown |
+            Cyp2d6RegionType::Rep6 |
+            Cyp2d6RegionType::LinkRegion |
+            Cyp2d6RegionType::Rep7 |
+            Cyp2d6RegionType::Spacer |
+            Cyp2d6RegionType::Cyp2d7 |
+            Cyp2d6RegionType::Cyp2d6Deletion => format!("{}", self.region_type),
+            // e.g. CYP2D6*4.001
+            Cyp2d6RegionType::Cyp2d6 => if let Some(stl) = self.subtype_label.as_ref() {
+                format!("{}*{}", self.region_type, stl)
+            } else {
+                self.region_type.to_string()
+            },
+            // Hybrids will usually get translated later
+            Cyp2d6RegionType::Hybrid => if let Some(stl) = self.subtype_label.as_ref() {
+                stl.clone()
+            } else {
+                self.region_type.to_string()
+            },
+            // False alleles keep a prefixed label so they stay distinguishable
+            Cyp2d6RegionType::FalseAllele => if let Some(stl) = self.subtype_label.as_ref() {
+                format!("{}_{}", self.region_type, stl)
+            } else {
+                self.region_type.to_string()
+            }
+        }
+    }
+
+    /// Returns true if this label is allowed to be a part of a chain
+    pub fn is_allowed_label(&self) -> bool {
+        !matches!(self.region_type, Cyp2d6RegionType::Unknown | Cyp2d6RegionType::FalseAllele)
+    }
+
+    /// Checks whether a link candidate is allowed to be linked from this label.
+    /// # Arguments
+    /// * `link_candidate` - the label we want to link to
+    pub fn is_allowed_label_pair(&self, link_candidate: &Cyp2d6RegionLabel) -> bool {
+        use Cyp2d6RegionType::*;
+
+        // we can't have two *5s in one allele; this sometimes is "observed" when we have *5 but no D7 alleles identified
+        let type1 = self.region_type();
+        let type2 = link_candidate.region_type();
+        let double_star5 = type1 == Cyp2d6Deletion && type2 == Cyp2d6Deletion;
+
+        let unexpected_order = // there may be a bunch of these
+            // in normal land, we expect REP6 to be the start
+            // Note: it's very similar to REP7, so this restriction may need to change to a penalty if this becomes an issue
+            type2 == Rep6 ||
+            // if we have a CYP2D allele (except *5), we should always expect a link region to follow
+            (type1.is_cyp2d() && type1 != Cyp2d6Deletion && type2 != LinkRegion) ||
+            // if we are joining a link region, we should always have a full allele before it
+            (type2 == LinkRegion && !type1.is_cyp2d()) ||
+            // if we have a link_region, we should always expect a REP to follow
+            (type1 == LinkRegion && !type2.is_rep()) ||
+            // REP should always be preceded by link_region
+            (type2.is_rep() && type1 != LinkRegion) ||
+            // REP should always be followed by either the spacer or a CYP2D
+            (type1.is_rep() && !(type2 == Spacer || type2.is_cyp2d())) ||
+            // if the chained-to is "spacer", then we expect a REP before it OR the special case deletion: *5
+            (type2 == Spacer && !(type1.is_rep() || type1 == Cyp2d6Deletion)) ||
+            // if the chained-from is "spacer", then we should have a CYP2D allele that follows
+            (type1 == Spacer && !type2.is_cyp2d()) ||
+            // if the second allele is a D7 allele, then we should have a spacer before it
+            (type2 == Cyp2d7 && type1 != Spacer) ||
+            // we do not expect any extensions after D7; it's the final link in a chain
+            type1 == Cyp2d7
+            // TODO: we added the above because of a problem in NA12877 where a region gets skipped in a read, leading to problematic mapping
+            //       this mapping has a higher "error" because it's just the wrong allele entirely, and it dominates the compute
+            //       what we really want is a mechanism to recognize and ignore when that happens, likely with a penalty still, but not like what it's currently doing
+            //       ideas: put a cap on error contribution per read? ignore anything > X, and add a static penalty because ignored?
+            //              detect and allow for a "skip" of the bad allele, again with a penalty?
+            //              add back in the explained reads threshold as primary?
+            // TODO: any others?
+ ; + + // only allowed if not a double *5 AND + !double_star5 && + // not in an unexpected order + !unexpected_order + } + + /// This will return true if this allele is a valid start point for a chain. + /// # Arguments + /// * `normalize_all_alleles` - if True, then all CYP2D alleles are used for coverage normalization + pub fn is_candidate_chain_head(&self, normalize_all_alleles: bool) -> bool { + use Cyp2d6RegionType::*; + match self.region_type { + // REP6 is normally the start, unless we have a deletion allele; both are explicitly allowed + Rep6 | + Cyp2d6Deletion => true, + + // Sometimes we don't get a REP6, in which case we should allow these alleles if they're used for normalizing + Cyp2d6 | + Hybrid => self.is_normalizing_allele(normalize_all_alleles), + + // these should NEVER start a chain + Unknown | + LinkRegion | + Rep7 | + Spacer | + Cyp2d7 | + FalseAllele => false + } + } + + /// Checks if a label matches one of those for normalizing coverage under targeted conditions where some alleles are under-captured. + /// In general, we expect all D6 alleles to get captured fine. + /// However, D7 is often an "off-target" allele, and sometimes the hybrids fall into that off-target bucket as well. + /// # Arguments + /// * `normalize_all_alleles` - if true, this will include all full length alleles in normalization + pub fn is_normalizing_allele(&self, normalize_all_alleles: bool) -> bool { + if normalize_all_alleles { + // normalize all valid CYP2D alleles; included D6, D7, hybrids, and deletion + self.region_type.is_cyp2d() + } else { + // only allow the D6 alleles; hybrids, deletions, and D7 have wonky coverage + self.region_type == Cyp2d6RegionType::Cyp2d6 + } + } + + /// Returns true if this type would show up in the final diplotype + pub fn is_reported_allele(&self) -> bool { + self.region_type.is_reported_allele() + } + + // wrappers + /// Returns true if this allele is considered a D6 or D7 allele + pub fn is_cyp2d(&self) -> bool { + self.region_type.is_cyp2d() + } + + /// Creates a new unknown label. + pub fn new_unknown() -> Cyp2d6RegionLabel { + Cyp2d6RegionLabel::new(Cyp2d6RegionType::Unknown, None) + } + + // getters + pub fn region_type(&self) -> Cyp2d6RegionType { + self.region_type + } + + pub fn subtype_label(&self) -> Option<&str> { + self.subtype_label.as_deref() + } +} + +impl Display for Cyp2d6RegionLabel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.full_allele())?; + Ok(()) + } +} \ No newline at end of file diff --git a/src/cyp2d6/visualization.rs b/src/cyp2d6/visualization.rs new file mode 100644 index 0000000..9a733cb --- /dev/null +++ b/src/cyp2d6/visualization.rs @@ -0,0 +1,273 @@ + +use itertools::Itertools; +use log::warn; +use minimap2::Aligner; +use rust_lib_reference_genome::reference_genome::ReferenceGenome; +use simple_error::bail; +use std::collections::BTreeMap; +use std::path::Path; +use waffle_con::multi_consensus::MultiConsensus; + +use crate::cyp2d6::region_label::{Cyp2d6RegionLabel, Cyp2d6RegionType}; +use crate::data_types::coordinates::Coordinates; +use crate::data_types::database::PgxDatabase; +use crate::visualization::igv_session_writer::{BUFFER_LEN, CUSTOM_CONTIG}; + +/// Accepts information about available alleles as well as the reads spanning those alleles and converts it into a visual graph. 
+/// # Arguments
+/// * `hap_labels` - a list of haplotype region labels we can translate into visual Strings
+/// * `chain_frequency` - this is the frequency of each observed chain from the data
+/// * `filename` - the output file path to save the results
+pub fn generate_debug_graph(hap_labels: &[Cyp2d6RegionLabel], chain_frequency: &BTreeMap<Vec<usize>, f64>, filename: &Path) -> Result<(), Box<dyn std::error::Error>> {
+    use layout::backends::svg::SVGWriter;
+    use layout::core::base::Orientation;
+    use layout::core::color::Color;
+    use layout::core::geometry::Point;
+    use layout::core::style::{LineStyleKind, StyleAttr};
+    use layout::core::utils::save_to_file;
+    use layout::std_shapes::shapes::{Arrow, Element, LineEndKind, ShapeKind, RecordDef};
+    use layout::topo::layout::VisualGraph;
+
+    // first, let's partition the chain frequencies into totals for nodes and edges
+    let mut single_counts = vec![0.0; hap_labels.len()];
+    let mut pair_counts: BTreeMap<(usize, usize), f64> = Default::default();
+
+    for (chain, frequency) in chain_frequency.iter() {
+        for index in chain.iter() {
+            single_counts[*index] += *frequency;
+        }
+
+        for window in chain.windows(2) {
+            let entry = pair_counts.entry((window[0], window[1])).or_default();
+            *entry += *frequency;
+        }
+    }
+
+    // the graph we're going to fill in
+    let mut vg = VisualGraph::new(Orientation::LeftToRight);
+
+    // create all the nodes with a count
+    let mut node_handles = vec![];
+    for (i, hl) in hap_labels.iter().enumerate() {
+        if hl.is_allowed_label() {
+            let shape = ShapeKind::Record(
+                RecordDef::Array(vec![
+                    RecordDef::new_text(&i.to_string()),
+                    RecordDef::new_text(&hl.full_allele()),
+                    RecordDef::new_text(&format!("{:.2}", single_counts[i]))
+                ])
+            );
+            let look = StyleAttr::simple();
+            let sz = Point::new(175.0, 100.0);
+
+            let node = Element::create(shape, look, Orientation::TopToBottom, sz);
+            let handle = vg.add_node(node);
+            node_handles.push(Some(handle));
+        } else {
+            node_handles.push(None);
+        }
+    }
+
+    let min_arrow_width = 2;
+    let max_arrow_width = 5;
+    let min_edge_size = *pair_counts.values().min_by(|a, b| a.total_cmp(b)).unwrap_or(&0.0);
+    let max_edge_size = (*pair_counts.values().max_by(|a, b| a.total_cmp(b)).unwrap_or(&0.0)) // get the maximum
+        .max(min_edge_size+1.0); // this makes sure the maximum > minimum by at least 1.0
+
+    // create the edges between them
+    for (chain_pair, frequency) in pair_counts.iter() {
+        // figure out a scaled width
+        let arrow_fraction = (*frequency - min_edge_size) / (max_edge_size - min_edge_size);
+        let inferred_width = min_arrow_width + ((max_arrow_width - min_arrow_width) as f64 * arrow_fraction).round() as usize;
+
+        // figure out the total color on the heatmap space
+        let red_component = ((arrow_fraction * 255.0).floor() as u32) << 16;
+        let blue_component = ((1.0 - arrow_fraction) * 255.0).floor() as u32;
+        let inferred_color = red_component + blue_component;
+
+        // for some reason, we have to shift and add 0xff
+        let color = Color::new((inferred_color << 8) + 0xff);
+
+        // Add an edge between the nodes.
+ let arrow = Arrow::new( + LineEndKind::None, + LineEndKind::Arrow, + LineStyleKind::Normal, + &format!("{:.2}", frequency), + &StyleAttr::new( + color, + inferred_width, + Option::Some(Color::fast("white")), + 0, + 15 + ), + &None, + &None, + ); + vg.add_edge(arrow, node_handles[chain_pair.0].unwrap(), node_handles[chain_pair.1].unwrap()); + } + + // there is a rogue panic in the layout-rs crate that seems rare, unclear how to reproduce it yet + // similar issue submitted to layout-rs in 2022, but no response from devs + // TODO: catch_unwind is clearly a stop-gap, we don't want this in here forever + // I like this crate for ease-of-use, but we may need something more robust with active maintainers + #[allow(clippy::blocks_in_conditions)] + match std::panic::catch_unwind(|| { + let mut vg = vg; // need to move it inside this scope for rust to be happy (panic unwinding) + + // Render the nodes to some rendering backend. + let mut svg = SVGWriter::new(); + vg.do_it(false, false, false, &mut svg); + + // Save the output. + save_to_file(filename.as_os_str().to_str().unwrap(), &svg.finalize()) + }) { + Ok(_v) => Ok(()), + Err(e) => bail!("Received panic while writing SVG file: {:?}", e.downcast_ref::<&str>()) + } +} + +/// This is just a wrapper for the output from the next function, mainly to make clippy happy. +pub struct CustomReference { + /// the full custom sequence + pub sequence: String, + /// list of coordinates / label pairs + pub regions: Vec<(Coordinates, String)> +} + +/// Creates a customized reference genome sequence from our consensus, which we can output to file and use for IGV visuals. +/// This function is specific to CYP2D6 and the database config for it. +/// # Arguments +/// * `reference_genome` - the actual reference genome data (GRCh38) +/// * `database` - the config we loaded +/// * `consensus` - the multi-consensus result, which contains a bunch of sub-units from the D6 region +/// * `hap_labels` - the assigned labels for each consensus +/// * `best_result` - the best chain pair; should be two Vecs of unknown length +/// # Errors +/// * if there are UTF-8 parsing errors +pub fn create_custom_cyp2d6_reference( + reference_genome: &ReferenceGenome, database: &PgxDatabase, + consensus: &MultiConsensus, hap_labels: &[Cyp2d6RegionLabel], best_result: &[Vec] +) -> Result> { + // generic buffer between regions + let buffer_sequence: String = "N".repeat(BUFFER_LEN); + // let d6_surrounding_buffer: usize = 10000; // TODO: do we want upstream/downstream windows? 
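+    // illustrative note (not in the original source): the custom contig is laid out as
+    //   [N-buffer][chain 1, with reference-filled gaps][N-buffer][chain 2]...[N-buffer]
+    // so each region reported below is an offset into this concatenated sequence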
+ + // start with a buffer + let mut ret = buffer_sequence.clone(); + let mut regions: Vec<(Coordinates, String)> = vec![]; + + for haplotype_chain in best_result.iter() { + // process the first element + let hap_index = haplotype_chain[0]; + let hap_sequence = std::str::from_utf8(consensus.consensuses()[hap_index].sequence())?; + + // add the coordinates of what was added + let coordinates = Coordinates::new(CUSTOM_CONTIG.to_string(), + ret.len() as u64, + (ret.len() + hap_sequence.len()) as u64 + ); + let region_name = format!("{hap_index}_{}", hap_labels[hap_index].full_allele()); + regions.push((coordinates, region_name)); + + // now extend our region + ret.push_str(hap_sequence); + + // handle everything else in pairs so we can check for gaps to fill + for (&prev_index, &hap_index) in haplotype_chain.iter().tuple_windows() { + // get the types + let prev_type = hap_labels[prev_index].region_type(); + let hap_type = hap_labels[hap_index].region_type(); + let mut overlap_len = 0; // for almost every case, there is no overlap + if prev_type.is_rep() && hap_type.is_cyp2d() { + // we have a gap here to fill from the end of REP6 to the start of D6 + let chrom = "chr22"; + let rep6_end = database.cyp2d6_config().cyp_coordinates().get("REP6").unwrap().end() as usize; + let d6_start = database.cyp2d6_config().cyp_coordinates().get("CYP2D6").unwrap().start() as usize; + let gap_sequence = std::str::from_utf8(reference_genome.get_slice(chrom, rep6_end, d6_start))?; + ret.push_str(gap_sequence); + } else if prev_type == Cyp2d6RegionType::Spacer && hap_type.is_cyp2d() { + // we have a gap here to fill from the end of REP7 to the start of D7 + let chrom = "chr22"; + let rep7_end = database.cyp2d6_config().cyp_coordinates().get("spacer").unwrap().end() as usize; + let d7_start = database.cyp2d6_config().cyp_coordinates().get("CYP2D7").unwrap().start() as usize; + let gap_sequence = std::str::from_utf8(reference_genome.get_slice(chrom, rep7_end, d7_start))?; + ret.push_str(gap_sequence); + } else if prev_type == Cyp2d6RegionType::Cyp2d6Deletion && hap_type == Cyp2d6RegionType::Spacer { + // figure out the overlap + let align_window = 500; + let align_start = ret.len().saturating_sub(align_window); // shouldn't go below 0, but this protects us + let align_sequence = &ret[align_start..]; + + // build the aligner to our smaller tail region + let dna_aligner: Aligner = Aligner::builder() + .map_hifi() + .with_cigar() + .with_seq(align_sequence.as_bytes())?; + + // we only need cigar and md for debugging + // other settings for mapping + let output_cigar: bool = true; + let output_md: bool = true; + let max_frag_len: Option = None; + let extra_flags = None; + + // build the target + let full_query = consensus.consensuses()[hap_index].sequence(); + let query_end = full_query.len().min(align_window); + let query_sequence = &full_query[..query_end]; + + // first, map the sequence + let mappings = dna_aligner.map( + query_sequence, + output_cigar, output_md, max_frag_len, extra_flags + )?; + + // finally search for a matching overlap and save it if we find one + let bp_thresh = 5; // lets give ourselves a small wiggle room + for m in mappings.iter() { + // we want a mapping that starts at 0 in our query and end at the end of the target + // expected length is ~230 bp or so + if m.query_start <= bp_thresh && (m.target_len - m.target_end) <= bp_thresh { + // this one is a pretty good match, save the overlap and skip out + overlap_len = m.query_end as usize; + break; + } + } + + if overlap_len == 0 { + // we 
didn't find an overlap + warn!("No overlap found between adjacent *5 and spacer region, output reference may have unexplained gaps."); + } + } else { + // TODO: other types to handle? + } + + // first pass, naively extend by each sequence in the chain + let hap_sequence = std::str::from_utf8(&consensus.consensuses()[hap_index].sequence()[overlap_len..])?; + + // add the coordinates of what was added + let coordinates = Coordinates::new(CUSTOM_CONTIG.to_string(), + (ret.len() - overlap_len) as u64, + (ret.len() + hap_sequence.len()) as u64 + ); + + // this mirrors what shows up in debug mode; I think that's fine to not have translations like *68 here for now + let region_name = format!("{hap_index}_{}", hap_labels[hap_index].full_allele()); + regions.push((coordinates, region_name)); + + // now extend our region + ret.push_str(hap_sequence); + } + + // add a buffer between each one + ret.push_str(&buffer_sequence); + } + + // add a buffer at the end also + ret.push_str(&buffer_sequence); + Ok(CustomReference { + sequence: ret, + regions + }) +} \ No newline at end of file diff --git a/src/data_types/alleles.rs b/src/data_types/alleles.rs new file mode 100644 index 0000000..a75fcf6 --- /dev/null +++ b/src/data_types/alleles.rs @@ -0,0 +1,184 @@ + +use std::collections::BTreeMap; + +use serde::{Deserialize, Serialize}; +use simple_error::bail; + +/// Wrapper for all the HLA allele informations +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct AlleleDefinition { + /// The identifier from the upstream DB + id: String, + /// The gene name this is associated with + gene_name: String, + /// The assigned star allele + star_allele: String, + /// The variants defining this allele + variants: Vec +} + +impl AlleleDefinition { + /// Creates a new AlleleDefinition and performs some checks along the way + /// # Arguments + /// * `id` - the identifier, expected to be unique + /// * `description` - basically, the star allele, should be of form "{gene}*{star_allele}", e.g.: "CYP2D6*1" + /// * `dna_sequence` - the DNA sequence, should be ACGT symbols only + pub fn new(opt_id: Option, description: &str, variants: Vec) -> Result> { + let star_split: Vec<&str> = description.split('*').collect(); + if star_split.len() != 2 { + bail!("Star split length != 2 for allele description: {description}"); + } + + let gene_name: String = star_split[0].to_string(); + let star_allele: String = star_split[1].to_string(); + + // if we do not have a special ID, just use the full star allele format + let id = opt_id.unwrap_or(format!("{gene_name}*{star_allele}")); + + Ok(AlleleDefinition { + id, + gene_name, + star_allele, + variants, + }) + } + + pub fn id(&self) -> &str { + &self.id + } + + pub fn gene_name(&self) -> &str { + &self.gene_name + } + + pub fn star_allele(&self) -> &str { + &self.star_allele + } + + pub fn variants(&self) -> &[VariantDefinition] { + &self.variants + } +} + +/// This corresponds to an individual variant or sequence replacement that is part of an allele definition +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct VariantDefinition { + /// an external identifier for this variant + id: Option, + /// the chromosome + chrom: String, + /// the 0-based position + position: usize, + /// the reference sequence that is getting replaced + reference: String, + /// the alternate sequence replacing the reference + alternate: String, + /// additional data can go here, such as the VI field + extras: BTreeMap +} + +impl VariantDefinition { + pub fn new(id: 
Option<String>, chrom: String, position: usize, reference: String, alternate: String, extras: BTreeMap<String, String>)
+        -> Result<VariantDefinition, Box<dyn std::error::Error>> {
+        // check reference and alternate for ACGT only
+        let allowed_symbols = ['A', 'C', 'G', 'T'];
+        if !reference.chars().all(|c| allowed_symbols.contains(&c)) {
+            bail!("Reference sequence contains non-ACGT symbols: {reference}");
+        }
+        if !alternate.chars().all(|c| allowed_symbols.contains(&c)) {
+            bail!("Alternate sequence contains non-ACGT symbols: {alternate}");
+        }
+
+        Ok(VariantDefinition {
+            id,
+            chrom,
+            position,
+            reference,
+            alternate,
+            extras
+        })
+    }
+
+    pub fn position(&self) -> usize {
+        self.position
+    }
+
+    pub fn reference(&self) -> &str {
+        &self.reference
+    }
+
+    pub fn alternate(&self) -> &str {
+        &self.alternate
+    }
+
+    pub fn extras(&self) -> &BTreeMap<String, String> {
+        &self.extras
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_good_allele_def() {
+        let test_name = "test_name".to_string();
+        let test_gene = "CYP2D6";
+        let test_star = "1";
+        let test_description = format!("{test_gene}*{test_star}");
+        let test_result = AlleleDefinition::new(
+            Some(test_name.clone()),
+            &test_description,
+            vec![]
+        ).unwrap();
+        assert_eq!(test_result, AlleleDefinition {
+            id: test_name,
+            gene_name: test_gene.to_string(),
+            star_allele: test_star.to_string(),
+            variants: vec![]
+        });
+    }
+
+    #[test]
+    fn test_bad_allele_name() {
+        let test_result = AlleleDefinition::new(
+            None, "A bad name", vec![]
+        );
+        assert!(test_result.is_err());
+    }
+
+    #[test]
+    fn test_variant_definition() {
+        let id = Some("random_id".to_string());
+        let chrom = "chr22".to_string();
+        let position = 10;
+        let reference = "A".to_string();
+        let alternate = "C".to_string();
+        let extras: BTreeMap<String, String> = Default::default();
+        let vd = VariantDefinition::new(
+            id.clone(), chrom.clone(), position, reference.clone(), alternate.clone(), extras.clone()
+        ).unwrap();
+        assert_eq!(vd, VariantDefinition {
+            id,
+            chrom,
+            position,
+            reference,
+            alternate,
+            extras
+        });
+    }
+
+    #[test]
+    fn test_bad_variant_definition() {
+        let id = Some("random_id".to_string());
+        let chrom = "chr22".to_string();
+        let position = 10;
+        let reference = "B".to_string();
+        let alternate = "C".to_string();
+        let extras: BTreeMap<String, String> = Default::default();
+        let vd = VariantDefinition::new(
+            id.clone(), chrom.clone(), position, reference.clone(), alternate.clone(), extras.clone()
+        );
+        assert!(vd.is_err());
+    }
+}
\ No newline at end of file
diff --git a/src/data_types/coordinates.rs b/src/data_types/coordinates.rs
new file mode 100644
index 0000000..67e31ab
--- /dev/null
+++ b/src/data_types/coordinates.rs
@@ -0,0 +1,70 @@
+
+use serde::{Deserialize, Serialize};
+
+/// Wrapper for basic region coordinates
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub struct Coordinates {
+    /// Chromosome string
+    chrom: String,
+    /// 0-based start, inclusive
+    start: u64,
+    /// 0-based end, exclusive
+    end: u64
+}
+
+impl Coordinates {
+    /// Typical constructor with some verification
+    pub fn new(chrom: String, start: u64, end: u64) -> Coordinates {
+        assert!(start <= end);
+        Coordinates {
+            chrom, start, end
+        }
+    }
+
+    pub fn chrom(&self) -> &str {
+        &self.chrom
+    }
+
+    pub fn start(&self) -> u64 {
+        self.start
+    }
+
+    pub fn end(&self) -> u64 {
+        self.end
+    }
+
+    /// Wrapper for sending to htslib fetch
+    pub fn fetch_definition(&self) -> (&str, u64, u64) {
+        (&self.chrom, self.start, self.end)
+    }
+}
+
+impl std::fmt::Display for Coordinates {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
{
+        // start and end are 0-based half-open, so shift start by +1 for the 1-based display convention
+        write!(f, "{}:{}-{}", self.chrom, self.start+1, self.end)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_coordinates() {
+        let chrom = "chr1".to_string();
+        let start = 10;
+        let end = 20;
+        let coordinate = Coordinates::new(chrom.clone(), start, end);
+        assert_eq!(coordinate.fetch_definition(), (chrom.as_str(), start, end));
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_bad_coordinates() {
+        let chrom = "chr1".to_string();
+        let start = 10;
+        let end = 5;
+        let _coordinate = Coordinates::new(chrom.clone(), start, end);
+    }
+}
\ No newline at end of file
diff --git a/src/data_types/cpic_api_results.rs b/src/data_types/cpic_api_results.rs
new file mode 100644
index 0000000..b6f77e1
--- /dev/null
+++ b/src/data_types/cpic_api_results.rs
@@ -0,0 +1,53 @@
+
+use serde::Deserialize;
+
+// CPIC API quickstart: /~https://github.com/cpicpgx/cpic-data/wiki
+// CPIC API full book: https://documenter.getpostman.com/view/1446428/Szt78VUJ?version=latest
+// Useful postgrest reference: https://postgrest.org/en/v7.0.0/api.html#horizontal-filtering-rows
+
+/// This captures a full CPIC allele definition; we only parse the elements we need though
+#[derive(Debug, Deserialize)]
+pub struct CpicAlleleDefinition {
+    /// The gene for this definition, good for sanity checking mostly
+    #[serde(alias = "genesymbol")]
+    pub gene_symbol: String,
+    /// The name of this allele
+    #[serde(alias = "name")]
+    pub allele_name: String,
+    /// True if this is the reference allele
+    #[serde(alias = "matchesreferencesequence")]
+    pub is_reference: bool,
+    /// True if this is a structural variant allele, which we will ignore
+    #[serde(alias = "structuralvariation")]
+    pub is_sv: bool,
+    /// the list of variants for this allele
+    #[serde(alias = "allele_location_value")]
+    pub variants: Vec<CpicVariantDefinition>
+}
+
+/// This captures a CPIC variant allele definition
+#[derive(Debug, Deserialize)]
+pub struct CpicVariantDefinition {
+    /// The allele for this variant
+    #[serde(alias = "variantallele")]
+    pub variant_allele: String,
+    /// The sequence location definition for the variant
+    pub sequence_location: CpicSequenceLocationDefinition
+}
+
+/// This captures a specific CPIC sequence location
+#[derive(Debug, Deserialize)]
+pub struct CpicSequenceLocationDefinition {
+    /// The unique ID for this variant
+    pub id: u64,
+    /// The name of this variant
+    pub name: String,
+    /// This is typically the g. notation for the variant
+    #[serde(alias = "chromosomelocation")]
+    pub gdot: String,
+    /// The DBSNP ID (e.g.
rs##) + #[serde(alias = "dbsnpid")] + pub dbsnp_id: Option, + /// The build 38 position of the variant, 1-based + pub position: usize +} diff --git a/src/data_types/database.rs b/src/data_types/database.rs new file mode 100644 index 0000000..b045da4 --- /dev/null +++ b/src/data_types/database.rs @@ -0,0 +1,551 @@ + +use lazy_static::lazy_static; +use log::{debug, warn}; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use serde::{Deserialize, Serialize}; +use simple_error::{SimpleError, bail}; +use std::collections::BTreeMap; +use std::collections::btree_map::Entry::{Occupied, Vacant}; + +use crate::cyp2d6::definitions::Cyp2d6Config; +use crate::data_types::alleles::AlleleDefinition; +use crate::data_types::cpic_api_results::CpicAlleleDefinition; +use crate::hla::alleles::{HlaAlleleDefinition, HlaConfig}; + +lazy_static!{ + static ref CPIC_IGNORED_LIST: Vec<&'static str> = vec![ + "CYP2D6", "HLA-A", "HLA-B" + ]; + static ref CPIC_IGNORED_GENES: HashSet<&'static str> = { + let mut hs = HashSet::default(); + for &element in CPIC_IGNORED_LIST.iter() { + hs.insert(element); + } + hs + }; +} + +/// This is the full set of PGx information that we have available +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct PgxDatabase { + /// Metadata for the database + database_metadata: PgxMetadata, + /// This is a map from gene name to all of the relevant information for that gene + gene_entries: BTreeMap, + /// The configuration for HLA genes + #[serde(default)] // will populate with our default() if not in the file + hla_config: HlaConfig, + /// The sequences for the HLA alleles + hla_sequences: BTreeMap, + /// The configuration for the CYP2D6 gene + #[serde(default)] // will populate with our default() if not in the file + cyp2d6_config: Cyp2d6Config, + /// The sequences for CYP2D6 + cyp2d6_gene_def: BTreeMap +} + +impl PgxDatabase { + /// Creates a new database from the CPIC allele definitions + /// # Arguments + /// * `gene_to_chrom` - a simple hashmap from gene name to chromosome + /// * `allele_definitions` - the full set of allele definitions + /// * `hla_version` - the HLA version from GitHub + /// * `hla_sequences` - the hashmap from HLA identifier to the definitions (e.g., sequences) + /// * `pharmvar_version` - the PharmVar version + /// * `cyp2d6_gene_def` - the CYP2D6 gene definition + /// # Errors + /// * if CPIC API inconsistencies are detected + /// * if there is an error while adding a new CPIC allele definition to our database + pub fn new( + gene_to_chrom: &HashMap, allele_definitions: &[CpicAlleleDefinition], + hla_version: String, hla_sequences: BTreeMap, + pharmvar_version: String, cyp2d6_gene_def: BTreeMap + ) -> Result> { + // initialize all the gene entries + let mut gene_entries: BTreeMap = Default::default(); + for (gene_name, chrom) in gene_to_chrom.iter() { + if CPIC_IGNORED_GENES.contains(gene_name.as_str()) { + warn!("Gene {gene_name} is on the CPIC ignored genes list, skipping it and all allele definitions for it."); + continue; + } + gene_entries.insert( + gene_name.clone(), + PgxGene::new(gene_name, chrom) + ); + } + + // now add the allele definitions + for allele_def in allele_definitions.iter() { + // make sure are not ignoring this gene + let gene: &str = &allele_def.gene_symbol; + if CPIC_IGNORED_GENES.contains(gene) { + continue; + } + + // make sure this isn't an SV, we ignore those currently + if allele_def.is_sv { + warn!("SV allele detected, ignoring: {gene}, {}", allele_def.allele_name); + continue; + } + + // add it now + let 
gene_entry: &mut PgxGene = gene_entries.get_mut(gene) + .ok_or(format!("An allele definition was provided for {gene}, but it was not found in the gene to chromosome list."))?; + gene_entry.add_allele(allele_def)?; + } + + // go through the list and remove any genes with no defined alleles + let filtered_entries: BTreeMap = BTreeMap::from_iter( + gene_entries.into_iter() + .filter(|(k, v)| { + let is_empty: bool = v.defined_haplotypes.is_empty(); + if is_empty { + debug!("No defined haplotypes detected for {k}, removing from gene list."); + } else { + debug!("{k} stats: {} variants, {} haplotypes", v.variants().len(), v.defined_haplotypes.len()) + } + !is_empty + }) + ); + + let build_time = chrono::Utc::now(); + let cpic_version = format!("API-{}", build_time.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true)); + let database_metadata = PgxMetadata { + pbstarphase_version: crate::cli::core::FULL_VERSION.to_string(), + cpic_version, + hla_version, + pharmvar_version, + build_time + }; + + // generate the default configs here + let hla_config = HlaConfig::default(); + let cyp2d6_config = Cyp2d6Config::default(); + + // if we made it here, we're all good yo + Ok(PgxDatabase { + database_metadata, + gene_entries: filtered_entries, + hla_config, + hla_sequences, + cyp2d6_config, + cyp2d6_gene_def + }) + } + + /// Validates the loaded database where possible. + /// This does not prevent data errors, but it will prevent crashes due to missing information. + /// # Errors + /// * if the HLA configuration is missing information + /// * if the CYP2D6 configuration is missing information + pub fn validate(&self) -> Result<(), SimpleError> { + self.hla_config.validate_config()?; + self.cyp2d6_config.validate_config() + } + + pub fn database_metadata(&self) -> &PgxMetadata { + &self.database_metadata + } + + pub fn gene_entries(&self) -> &BTreeMap { + &self.gene_entries + } + + pub fn hla_config(&self) -> &HlaConfig { + &self.hla_config + } + + pub fn hla_sequences(&self) -> &BTreeMap { + &self.hla_sequences + } + + pub fn cyp2d6_config(&self) -> &Cyp2d6Config { + &self.cyp2d6_config + } + + pub fn cyp2d6_gene_def(&self) -> &BTreeMap { + &self.cyp2d6_gene_def + } +} + +/// Contains metadata about the construction of the database +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct PgxMetadata { + /// The version of pbstarphase we're running + pbstarphase_version: String, + /// The version of the CPIC database + cpic_version: String, + /// The version of the HLA database + hla_version: String, + /// The version of the PharmVar database + pharmvar_version: String, + /// The time the database was constructed + build_time: chrono::DateTime +} + +/// A PGx gene has defined variants as well as alleles that are composites of the defined variants. +/// There is also always one "reference" allele +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct PgxGene { + /// The name of the gene + gene_name: String, + /// The chromosome the gene is located on + chromosome: String, + /// Variants by their provided ID + variants: BTreeMap, + /// Each name points to an allele that contains variants in the same order as "ordered_variants" + defined_haplotypes: BTreeMap, + /// The allele corresponding to reference, it is the only one that will typically be "full" on the PgxHaplotype definition + reference_allele: Option +} + +impl PgxGene { + /// Creates a new, blank PgxGene with minimum info. Alleles need to be added afterwards. 
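+    /// The reference allele starts unset and no haplotypes are defined; both are populated later through `add_allele`.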
+ /// # Arguments + /// * `gene_name` - the name of the gene + /// * `chromosome` - the chromosome this gene and all of it's variants are on + fn new(gene_name: &str, chromosome: &str) -> PgxGene { + PgxGene { + gene_name: gene_name.to_string(), + chromosome: chromosome.to_string(), + variants: Default::default(), + defined_haplotypes: Default::default(), + reference_allele: None + } + } + + /// This will add a new allele definition for this gene and perform any sanity checks along the way. + /// # Arguments + /// * `allele_definition` - the CPIC allele definition + /// # Errors + /// * if this allele was already provided + /// * if this is flagged as the reference allele, but we already have a reference allele + /// * if this is flagged as containing an SV, we are not handling those currently + /// * if there is a variant definition conflict + /// # Panics + /// * if the gene names do not match + fn add_allele(&mut self, allele_definition: &CpicAlleleDefinition) -> Result<(), Box> { + // make sure gene symbol matches + assert_eq!(self.gene_name, allele_definition.gene_symbol); + + // make sure this allele is not already defined + if self.defined_haplotypes.contains_key(&allele_definition.allele_name) { + bail!("Duplicate allele definition found for {}: {}", self.gene_name, allele_definition.allele_name); + } + + // make sure we did not get a second reference allele, we can't handle that pressure + if self.reference_allele.is_some() && allele_definition.is_reference { + bail!("Multiple reference alleles provided for {}: {} and {}", self.gene_name, self.reference_allele.as_ref().unwrap(), allele_definition.allele_name); + } + + // make sure this is not an SV + if allele_definition.is_sv { + bail!("SV allele detected for {}, these are not handled: {}", self.gene_name, allele_definition.allele_name); + } + + // check if we should mark this as reference allele + let is_reference = allele_definition.is_reference; + if is_reference { + self.reference_allele = Some(allele_definition.allele_name.clone()); + } + + // initialize the defined alleles as "None" for all known variants (this may change in length and assignment) + let mut haplotype: BTreeMap = Default::default(); + + // okay, now we can check the variants and add them as necessary + for ad_variant in allele_definition.variants.iter() { + let variant_name: String = ad_variant.sequence_location.name.clone(); + let dbsnp_id: Option = ad_variant.sequence_location.dbsnp_id.clone(); + let variant_id: u64 = ad_variant.sequence_location.id; + let position: usize = ad_variant.sequence_location.position; + let variant_sequence: &str = &ad_variant.variant_allele; + + match self.variants.entry(variant_id) { + Occupied(mut entry) => { + // we have this variant loaded, do sanity checks + let variant: &mut PgxVariant = entry.get_mut(); + if variant.position != position { + bail!("Encountered variant with id {} but different positions: {} != {}", variant_id, variant.position, position); + } + if variant.dbsnp_id != dbsnp_id { + bail!("Encountered variants with id {} but different dbsnp IDs: {:?} != {:?}", variant_id, dbsnp_id, variant.dbsnp_id); + } + if is_reference { + match &variant.alleles[0] { + Some(ra) => if ra != variant_sequence { + bail!("Encountered variant with id {} but different reference alleles: {} != {}", variant_id, ra, variant_sequence); + }, + None => variant.alleles[0] = Some(variant_sequence.to_string()) + }; + } else { + // check if any alleles already match the sequence + let match_index: Option = variant.alleles.iter() + 
.position(|a| a.as_ref().unwrap_or(&"".to_string()) == variant_sequence); + match match_index { + Some(i) => { + // we already have this sequence in our list + // this should never be the REF allele + assert_ne!(i, 0); + }, + None => { + // this is a new sequence to our list, append it + variant.alleles.push(Some(variant_sequence.to_string())); + } + }; + } + }, + Vacant(entry) => { + // this is a new variant for us + let var_alleles: Vec> = if is_reference { + vec![Some(variant_sequence.to_string())] + } else { + vec![None, Some(variant_sequence.to_string())] + }; + let new_variant = PgxVariant { + name: variant_name, + dbsnp_id, + position, + alleles: var_alleles + }; + + // store it and update our alleles + entry.insert(new_variant); + } + }; + + // now we need to save the allele for this particular haplotype + match haplotype.entry(variant_id) { + Vacant(entry) => { + entry.insert(variant_sequence.to_string()); + }, + Occupied(_entry) => { + bail!("Detected CPIC allele with same variant assigned multiple times: {} {}", allele_definition.allele_name, variant_id); + } + } + } + + // finally, add the allele set we parsed + self.defined_haplotypes.insert( + allele_definition.allele_name.clone(), + PgxHaplotype { haplotype } + ); + + Ok(()) + } + + pub fn gene_name(&self) -> &str { + &self.gene_name + } + + pub fn chromosome(&self) -> &str { + &self.chromosome + } + + pub fn reference_allele(&self) -> Option<&str> { + self.reference_allele.as_deref() + } + + pub fn variants(&self) -> &BTreeMap { + &self.variants + } + + pub fn defined_haplotypes(&self) -> &BTreeMap { + &self.defined_haplotypes + } +} + +/// This is the core information needed to identify a variant in PGx land +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct PgxVariant { + /// the name of the variant + name: String, + /// DBSNP ID if available + dbsnp_id: Option, + /// The 1-based coordinate of the variant + position: usize, + /// All of the alleles, index 0 is *always* reference allele + alleles: Vec> +} + +impl PgxVariant { + pub fn new(name: String, dbsnp_id: Option, position: usize, alleles: Vec>) -> PgxVariant { + PgxVariant { + name, dbsnp_id, position, alleles + } + } + + pub fn name(&self) -> &str { + &self.name + } + + pub fn dbsnp_id(&self) -> &Option { + &self.dbsnp_id + } + + pub fn position(&self) -> usize { + self.position + } + + pub fn alleles(&self) -> &[Option] { + &self.alleles + } +} + +/// For the database representation, it makes more sense to have a sparser format that is just ID -> allele value +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct PgxHaplotype { + /// The PGx variant ID to the allele for this haplotype, only defined ones are stored + haplotype: BTreeMap +} + +impl PgxHaplotype { + pub fn haplotype(&self) -> &BTreeMap { + &self.haplotype + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::path::PathBuf; + + use crate::util::file_io::load_json; + + #[test] + fn test_simple_cacna1s() { + // consts we can tweak + let gene_name = "CACNA1S"; + let chrom = "chr1"; + let cacna1s_fn = PathBuf::from("test_data/CACNA1S/CPIC_API.json"); + + // set up a simple single gene hashmap + let mut gene_to_chrom = HashMap::default(); + gene_to_chrom.insert(gene_name.to_string(), chrom.to_string()); + + // load allele definitions + let cacna1s_allele_defs: Vec = load_json(&cacna1s_fn).unwrap(); + + let mut simple_hla: BTreeMap = Default::default(); + let allele_name: String = "HLA00001".to_string(); + simple_hla.insert( + allele_name.clone(), + 
HlaAlleleDefinition::new(allele_name.clone(), "A*01:01:01:01", Some("ACGT".to_string()), "TGCA".to_string()).unwrap()
+        );
+
+        let mut simple_cyp: BTreeMap<String, AlleleDefinition> = Default::default();
+        let allele_name: String = "PV00124".to_string();
+        simple_cyp.insert(
+            allele_name.clone(),
+            AlleleDefinition::new(Some(allele_name.clone()), "CYP2D6*1", vec![]).unwrap()
+        );
+
+        // build the database
+        let hla_version: String = "hla_v1".to_string();
+        let pharmvar_version: String = "pharmvar_v1".to_string();
+        let pgx_database = PgxDatabase::new(
+            &gene_to_chrom,
+            &cacna1s_allele_defs,
+            hla_version.clone(),
+            simple_hla.clone(),
+            pharmvar_version.clone(),
+            simple_cyp.clone()
+        ).unwrap();
+
+        // check that one gene is inside
+        assert_eq!(pgx_database.gene_entries.len(), 1);
+
+        // check the cacna1s data
+        let cacna1s_entry = pgx_database.gene_entries.get(gene_name).unwrap();
+        assert_eq!(cacna1s_entry.gene_name, gene_name);
+        assert_eq!(cacna1s_entry.chromosome, chrom);
+        assert_eq!(cacna1s_entry.reference_allele.as_ref().unwrap(), "Reference");
+
+        // check one of the variants for cacna1s
+        assert_eq!(cacna1s_entry.variants.len(), 2);
+        let variant = cacna1s_entry.variants.get(&777260).unwrap();
+        assert_eq!(variant.dbsnp_id.as_ref().unwrap(), "rs772226819");
+        assert_eq!(variant.position, 201091993);
+        assert_eq!(variant.alleles, vec![Some("G".to_string()), Some("A".to_string())]);
+
+        // check the alleles as well
+        assert_eq!(cacna1s_entry.defined_haplotypes.len(), 3);
+
+        let reference = cacna1s_entry.defined_haplotypes.get("Reference").unwrap();
+        assert_eq!(reference.haplotype.len(), 2);
+        assert_eq!(reference.haplotype.get(&777260).unwrap(), "G");
+        assert_eq!(reference.haplotype.get(&777261).unwrap(), "C");
+
+        let alt1 = cacna1s_entry.defined_haplotypes.get("c.520C>T").unwrap();
+        assert_eq!(alt1.haplotype.len(), 1);
+        assert_eq!(alt1.haplotype.get(&777260).unwrap(), "A");
+
+        let alt2 = cacna1s_entry.defined_haplotypes.get("c.3257G>A").unwrap();
+        assert_eq!(alt2.haplotype.len(), 1);
+        assert_eq!(alt2.haplotype.get(&777261).unwrap(), "T");
+
+        // check the HLA stuff
+        assert_eq!(pgx_database.database_metadata().hla_version, hla_version);
+        assert_eq!(pgx_database.hla_sequences(), &simple_hla);
+
+        // check the CYP2D6 stuff
+        assert_eq!(pgx_database.database_metadata().pharmvar_version, pharmvar_version);
+        let base_entry = pgx_database.cyp2d6_gene_def().get("PV00124").unwrap();
+        assert_eq!(base_entry.gene_name(), "CYP2D6");
+        assert_eq!(base_entry.star_allele(), "1");
+        assert_eq!(base_entry.variants(), &[]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_error_sv() {
+        // test that SVs cause a panic
+        let gene_name = "CACNA1S";
+        let chrom = "chr1";
+        let mut pgx_gene = PgxGene::new(gene_name, chrom);
+
+        // load allele definitions
+        let cacna1s_fn = PathBuf::from("test_data/CACNA1S/CPIC_API.json");
+        let mut cacna1s_allele_defs: Vec<CpicAlleleDefinition> = load_json(&cacna1s_fn).unwrap();
+
+        // mark as an SV and add it
+        cacna1s_allele_defs[0].is_sv = true;
+        pgx_gene.add_allele(&cacna1s_allele_defs[0]).unwrap();
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_error_duplicate() {
+        // test that duplicate allele definitions cause a panic
+        let gene_name = "CACNA1S";
+        let chrom = "chr1";
+        let mut pgx_gene = PgxGene::new(gene_name, chrom);
+
+        // load allele definitions
+        let cacna1s_fn = PathBuf::from("test_data/CACNA1S/CPIC_API.json");
+        let cacna1s_allele_defs: Vec<CpicAlleleDefinition> = load_json(&cacna1s_fn).unwrap();
+
+        // add the same ID twice
+        pgx_gene.add_allele(&cacna1s_allele_defs[0]).unwrap();
+        pgx_gene.add_allele(&cacna1s_allele_defs[0]).unwrap();
+    }
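+
+    // illustrative sketch (not part of the original changeset): demonstrates the PgxVariant
+    // constructor and accessors with hypothetical values
+    #[test]
+    fn test_pgx_variant_accessors() {
+        let variant = PgxVariant::new(
+            "c.520C>T".to_string(),
+            Some("rs772226819".to_string()),
+            201091993,
+            vec![Some("G".to_string()), Some("A".to_string())]
+        );
+        assert_eq!(variant.name(), "c.520C>T");
+        assert_eq!(variant.dbsnp_id().as_ref().unwrap(), "rs772226819");
+        assert_eq!(variant.position(), 201091993);
+        // index 0 is always the reference allele
+        assert_eq!(variant.alleles(), &[Some("G".to_string()), Some("A".to_string())]);
+    }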
+
+    #[test]
+    #[should_panic]
+    fn test_error_double_reference() {
+        // test that a second reference allele causes a panic
+        let gene_name = "CACNA1S";
+        let chrom = "chr1";
+        let mut pgx_gene = PgxGene::new(gene_name, chrom);
+
+        // load allele definitions
+        let cacna1s_fn = PathBuf::from("test_data/CACNA1S/CPIC_API.json");
+        let mut cacna1s_allele_defs: Vec<CpicAlleleDefinition> = load_json(&cacna1s_fn).unwrap();
+
+        // add two different alleles both flagged as reference
+        cacna1s_allele_defs[0].is_reference = true;
+        cacna1s_allele_defs[1].is_reference = true;
+        pgx_gene.add_allele(&cacna1s_allele_defs[0]).unwrap();
+        pgx_gene.add_allele(&cacna1s_allele_defs[1]).unwrap();
+    }
+}
\ No newline at end of file
diff --git a/src/data_types/mapping.rs b/src/data_types/mapping.rs
new file mode 100644
index 0000000..9458edb
--- /dev/null
+++ b/src/data_types/mapping.rs
@@ -0,0 +1,242 @@
+
+use serde::Serialize;
+use std::ops::AddAssign;
+
+/// Wraps the mapping stats for a read mapped to an HLA locus
+#[derive(Clone, Debug, PartialEq, Serialize)]
+pub struct MappingStats {
+    /// the length of the sequence
+    seq_len: usize,
+    /// the NM tag for the mapping
+    nm: usize,
+    /// the number of unmapped bases
+    unmapped: usize,
+    /// the number of clipped bases at the start
+    clipped_start: Option<usize>,
+    /// the number of clipped bases at the end
+    clipped_end: Option<usize>
+}
+
+impl MappingStats {
+    /// Basic constructor; clipping information is left unset
+    /// # Arguments
+    /// * `seq_len` - the total length of the fragment that was mapped
+    /// * `nm` - the edit distance (NM tag) of the mapping
+    /// * `unmapped` - the number of unmapped bases
+    pub fn new(
+        seq_len: usize, nm: usize, unmapped: usize
+    ) -> MappingStats {
+        MappingStats {
+            seq_len,
+            nm,
+            unmapped,
+            clipped_start: None,
+            clipped_end: None
+        }
+    }
+
+    /// Constructor that also records the clipping information
+    /// # Arguments
+    /// * `seq_len` - the total length of the fragment that was mapped
+    /// * `nm` - the edit distance (NM tag) of the mapping
+    /// * `unmapped` - the number of unmapped bases
+    /// * `clipped_start` - the number of clipped bases at the start
+    /// * `clipped_end` - the number of clipped bases at the end
+    pub fn new_with_clippings(
+        seq_len: usize, nm: usize, unmapped: usize,
+        clipped_start: usize, clipped_end: usize
+    ) -> MappingStats {
+        MappingStats {
+            seq_len,
+            nm,
+            unmapped,
+            clipped_start: Some(clipped_start),
+            clipped_end: Some(clipped_end)
+        }
+    }
+
+    /// Calculates the default mapping scores for this mapping.
+    /// This approach penalizes based on both the number of mismatches and unmapped bases.
+    pub fn mapping_score(&self) -> MappingScore {
+        // let dna_score = MappingScore::score_value(self.seq_len, self.nm, self.unmapped);
+        // MappingScore::new(dna_score)
+        self.custom_score(true)
+    }
+
+    /// Calculates a parameterized mapping score.
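+    /// Illustrative example (assuming a 100 bp fragment with NM=3 and 10 unmapped bases):
+    /// ```ignore
+    /// let stats = MappingStats::new(100, 3, 10);
+    /// assert_eq!(stats.custom_score(true).score(), 13.0 / 100.0);  // unmapped bases count as errors
+    /// assert_eq!(stats.custom_score(false).score(), 3.0 / 90.0);   // unmapped bases excluded entirely
+    /// ```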
+ /// # Arguments + /// * `penalize_unmapped` - if True, this will penalize the score based on unmapped reads + pub fn custom_score(&self, penalize_unmapped: bool) -> MappingScore { + let seq_len = if penalize_unmapped { + self.seq_len + } else { + // remove unmapped from consideration for length + self.seq_len - self.unmapped + }; + let nm = self.nm; + let unmapped = if penalize_unmapped { + self.unmapped + } else { + // remove unmapped as a penalty + 0 + }; + let dna_score = MappingScore::score_value(seq_len, nm, unmapped); + MappingScore::new(dna_score) + } + + /// Wrapper for writing a scoring string to the screen + pub fn score_string(&self) -> String { + self.custom_score_string(true) + } + + /// Wrapper for writing a custom scoring string to the screen. + pub fn custom_score_string(&self, penalize_unmapped: bool) -> String { + let score = self.custom_score(penalize_unmapped); + if penalize_unmapped { + // unmapped is a penalty + format!("{:.5}=({}+{})/{}", score.score(), self.nm, self.unmapped, self.seq_len) + } else { + // unmapped is removed from counting + format!("{:.5}={}/({}-{})", score.score(), self.nm, self.seq_len, self.unmapped) + } + } + + // getters + pub fn clipped_start(&self) -> Option { + self.clipped_start + } + + pub fn clipped_end(&self) -> Option { + self.clipped_end + } + + pub fn nm(&self) -> usize { + self.nm + } + + pub fn unmapped(&self) -> usize { + self.unmapped + } +} + + +/// Contains the score for aligning an HLA sequence against a read. +/// This is basically an error rate, defined as (edit_distance + unmapped) / seq_len +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] +pub struct MappingScore { + /// the score for the alignment + score: f64 +} + +impl MappingScore { + /// Simple constructor placeholder + pub fn new(score: f64) -> MappingScore { + MappingScore { + score + } + } + + /// Returns the worst possible value for a mapping + pub fn worst_value() -> f64 { + 1.0 + } + + // Returns the worst possible score (cDNA + DNA) for a mapping + pub fn worst_score() -> MappingScore { + MappingScore { + score: MappingScore::worst_value() + } + } + + /// Returns an empty score so we can accumulate scores with AddAssign + pub fn zero_score() -> MappingScore { + MappingScore { + score: 0.0 + } + } + + /// Convenient comparator since Eq and Ord cannot be derived with f64 + pub fn min(self, other: MappingScore) -> MappingScore { + if self <= other { + self + } else { + other + } + } + + /// Calculates the harmonic mean of a bunch of scores. + /// # Arguments + /// * `scores` - the values to derive the harmonic mean from + /// # Panics + /// * if any of the provided scores are 0.0 + pub fn harmonic_mean(scores: &[MappingScore]) -> MappingScore { + let mut harmonic_sum: f64 = 0.0; + + for score in scores.iter() { + assert!(score.score() > 0.0, "dna_score must be > 0.0"); + harmonic_sum += 1.0 / score.score(); + } + + MappingScore { + score: if harmonic_sum > 0.0 { scores.len() as f64 / harmonic_sum } else { 0.0 } + } + } + + pub fn score(&self) -> f64 { + self.score + } + + /// Calculates the score of a set of values. + /// If `map_nm + unmapped == 0.0`, this will add a partial error to keep the score above 0 (harmonic mean requirement). 
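+    /// Concretely, the returned value is `max(map_nm + unmapped, 0.1) / mapping_len`.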
+ pub fn score_value(mapping_len: usize, map_nm: usize, unmapped: usize) -> f64 { + let numerator = ((map_nm + unmapped) as f64).max(0.1); + let denominator = (mapping_len) as f64; + numerator / denominator + } +} + +// we are not really using this anymore, but it will not hurt to keep it +impl AddAssign for MappingScore { + fn add_assign(&mut self, rhs: Self) { + // an individual score is at most 1.0, so we can just add them together + self.score += rhs.score(); + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_mapping_stats() { + let mapping_stats = MappingStats::new( + 10, 1, 0, + ); + assert_eq!(mapping_stats.mapping_score(), MappingScore::new(0.1)); + } + + #[test] + fn test_score_min() { + let s1 = MappingScore::new(1.0); + let s2 = MappingScore::new(0.9); + let s3 = MappingScore::new(0.2); + + assert_eq!(s1.min(s2), s2); + assert_eq!(s1.min(s3), s3); + assert_eq!(s2.min(s3), s3); + } + + #[test] + fn test_harmonic_mean() { + let s1 = MappingScore::new(0.2); + let s2 = MappingScore::new(0.4); + let s3 = MappingScore::new(0.2); + + let arrayed = [s1, s2, s3]; + let expected = MappingScore::new( + 3.0 / (5.0+2.5+5.0), // 0.24 + ); + assert_eq!(expected, MappingScore::harmonic_mean(&arrayed)); + } +} \ No newline at end of file diff --git a/src/data_types/mod.rs b/src/data_types/mod.rs new file mode 100644 index 0000000..4060b50 --- /dev/null +++ b/src/data_types/mod.rs @@ -0,0 +1,15 @@ + +/// Contains serialization for generic alleles that only have a DNA definition +pub mod alleles; +/// Contains the coordinates functionality +pub mod coordinates; +/// Contains serialization for CPIC API result types +pub mod cpic_api_results; +/// Contains definitions related to our underlying database of genes -> alleles -> variants +pub mod database; +/// Contains mapping stats for individual alignments +pub mod mapping; +/// Contains utility to normalize a variant to a standard definition that is non-ambiguous +pub mod normalized_variant; +/// Contains definitions related to the representation of a final diplotype +pub mod pgx_diplotypes; diff --git a/src/data_types/normalized_variant.rs b/src/data_types/normalized_variant.rs new file mode 100644 index 0000000..d73ed28 --- /dev/null +++ b/src/data_types/normalized_variant.rs @@ -0,0 +1,792 @@ + +use lazy_static::lazy_static; +use regex::Regex; +use rust_lib_reference_genome::reference_genome::ReferenceGenome; +use rustc_hash::FxHashSet as HashSet; +use serde::Serialize; +use simple_error::bail; + +lazy_static! { + /// This matches tandem-repeat like patterns: AC(8) OR ACGTAGT(3). + /// "seq" matches the bases and "count" matches the contained repeat value. 
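+    /// For example, "AC(8)" captures seq = "AC" and count = "8".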
+ pub static ref TR_REGEX: Regex = Regex::new(r"^(?[A-Z]+)\((?[0-9]+)\)$").unwrap(); +} + +/// A normalized variant is unambiguously defined and left-aligned to the reference genome +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize)] +pub struct NormalizedVariant { + /// chromosome of the variant + chrom: String, + /// 0-based position of the variant + position: usize, + /// ref allele + reference: String, + /// alt allele + alternate: String +} + +impl NormalizedVariant { + /// This will take an allele definition and normalize it using the reference genome + /// # Arguments + /// * `chrom` - the chromosome the variant is positioned on + /// * `position` - the 0-based coordinate of the variant + /// * `ref_allele` - the given reference sequence + /// * `alt_allele` - the given alternate sequence + /// * `reference_genome` - the pre-loaded reference genome; if None, some normalization steps will not happen and it may fail to normalize + /// # Errors + /// * if the reference allele is empty + /// * if a variant is not yet supported + /// * if the ref_allele does not match the given reference + /// * if there is unexpected sequence + pub fn new(chrom: String, position: usize, ref_allele: &str, alt_allele: &str, reference_genome: Option<&ReferenceGenome>) -> Result> { + // I think CPIC is smart enough to always put "del", but best to double check + if ref_allele.is_empty() { + bail!("ref_allele cannot be empty"); + } + + // this shouldn't happen from what I've seen + if ref_allele == "del" && !alt_allele.starts_with("ins") { + bail!("Unexpected non-ins alt sequence with a del reference"); + } + + // these may change as we normalize the alleles, put into byte vecs for manipulation + let mut position: usize = position; + let mut ref_allele: Vec = parse_sequence(ref_allele); + let mut alt_allele: Vec = parse_sequence(alt_allele); + + // check if we were given a reference genome to normalize with + let chrom_seq = match reference_genome { + Some(rg) => { + // TODO: this will panic if the chromosome is not in the reference genome + if rg.contig_keys().contains(&chrom) { + // we have the contig, make sure our reference allele matches + let cs = rg.get_full_chromosome(&chrom); + let rg_seq = &cs[position..position+ref_allele.len()]; + if ref_allele != rg_seq { + let ref_allele = std::str::from_utf8(&ref_allele).unwrap_or("utf8-error"); + let rg_seq = std::str::from_utf8(rg_seq).unwrap_or("utf8-error"); + bail!("At {chrom}:{position}, provided reference allele has {ref_allele:?} but reference genome has {rg_seq:?}"); + } + Some(cs) + } else { + bail!("Reference genome does not contain contig {chrom:?}"); + } + }, + None => None + }; + + // make sure that we have some sequences + if ref_allele.is_empty() && alt_allele.is_empty() { + bail!("ref_allele and alt_allele cannot both be empty"); + } else if ref_allele.is_empty() { + // we are inserting _after_ this position (see https://www.ncbi.nlm.nih.gov/snp/rs777311140 for example) + // position does not change, we just need to pre-pend this position + if let Some(cs) = chrom_seq { + ref_allele.insert(0, cs[position]); + alt_allele.insert(0, cs[position]); + } + } else if alt_allele.is_empty() { + if position == 0 { + bail!("alt_allele is empty at position 0"); + } + + // we need to pre-pend the anchor base because one allele is empty + if let Some(cs) = chrom_seq { + position -= 1; + ref_allele.insert(0, cs[position]); + alt_allele.insert(0, cs[position]); + } + } + + // at this point, we should only have ACGTs + // first, chop off any duplicate 
sequence at the end + while ref_allele.len() > 1 && alt_allele.len() > 1 && ref_allele[ref_allele.len() - 1] == alt_allele[alt_allele.len() - 1] { + // position does not change here + ref_allele.pop(); + alt_allele.pop(); + } + + // similarly, chop off any duplicate sequence at the start + while ref_allele.len() > 1 && alt_allele.len() > 1 && ref_allele[0] == alt_allele[0] { + // we need to shift position as we chop + position += 1; + ref_allele.remove(0); + alt_allele.remove(0); + } + + // now we need to see if there is any shifting that needs to happen + while ref_allele[ref_allele.len() - 1] == alt_allele[alt_allele.len() - 1] { + // TODO: I think this assertion is true, lets leave it in for ourselves + assert!(ref_allele.len() == 1 || alt_allele.len() == 1); + + if position == 0 { + // we cannot prepend, just break out? + break; + } else if let Some(cs) = chrom_seq { + // this is pre-pending the reference base + position -= 1; + ref_allele.insert(0, cs[position]); + alt_allele.insert(0, cs[position]); + } else { + // we cannot pre-pend because no reference + break; + } + + // remove the trailing character that matched + ref_allele.pop(); + alt_allele.pop(); + } + + // we have finished the byte normalizing, convert everything back into a string + let reference: String = String::from_utf8(ref_allele)?; + let alternate: String = String::from_utf8(alt_allele)?; + + // one last sanity check on the allowed bases + let allowed_bases: HashSet = HashSet::from_iter(['A', 'C', 'G', 'T']); + if reference.chars().all(|c| allowed_bases.contains(&c)) && + alternate.chars().all(|c| allowed_bases.contains(&c)) { + // all sequences are composed of ACGT + Ok(NormalizedVariant { + chrom, + position, + reference, + alternate + }) + } else { + bail!("ACGT alleles only"); + } + } + + /// This will take an allele definition and normalize it using the reference genome, potentially producing multiple variants that can match. + /// If one of the results is "None", it indicates that one of the variants is the reference. + /// For example, if you have A->R, this is the same as (A->A OR A->G) where A->A is just the reference allele. + /// # Arguments + /// * `chrom` - the chromosome the variant is positioned on + /// * `position` - the 0-based coordinate of the variant + /// * `ref_allele` - the given reference sequence + /// * `alt_allele` - the given alternate sequence + /// * `reference_genome` - the pre-loaded reference genome; if None, some normalization steps will not happen and it may fail to normalize + /// # Errors + /// * if there are errors creating the sub-alleles + pub fn multi_new(chrom: String, position: usize, ref_allele: &str, alt_allele: &str, reference_genome: Option<&ReferenceGenome>) -> Result>, Box> { + // do any conversions + let multi_alt = match alt_allele { + // IUPAC code: https://www.bioinformatics.org/sms/iupac.html + "M" => vec!["A", "C"], + "R" => vec!["A", "G"], + "Y" => vec!["C", "T"], + // there is one allele with this pattern: "delinsCC; delinsCCC; delinsCCCC; delinsCCCCC; delinsCCCCCC; delinsCCCCCCC" + _ => alt_allele.split("; ").collect() + }; + + // now iterate over each possible allele and save it + let mut ret: Vec> = vec![]; + for aa in multi_alt { + if ref_allele == aa { + // ref and alt match, so just store None; this allows us to match later + ret.push(None); + } else { + // they're different, so convert it + ret.push( + Some( + Self::new( + chrom.clone(), + position, + ref_allele, + aa, + reference_genome + )? 
+ ) + ); + } + } + Ok(ret) + } + + pub fn chrom(&self) -> &str { + &self.chrom + } + + pub fn position(&self) -> usize { + self.position + } +} + +/// Converts a sequence from CPIC into a Vec representation, while parsing any regex along the way. +/// STR regex is expanded and indel events are parsed into the corresponding base-only sequence. +/// # Arguments +/// * `sequence` - the sequence we are converting / parsing +fn parse_sequence(sequence: &str) -> Vec { + if let Some((_, [seq, count])) = TR_REGEX.captures(sequence).map(|c| c.extract()) { + // this is a pattern like ACGT(8), so we make ACGT * 8 and return that + let count = count.parse::().unwrap(); + seq.repeat(count).into_bytes() + } else if sequence.starts_with("delins") { + // just skip the "delins" sequence + sequence.as_bytes()[6..].to_vec() + } else if sequence.starts_with("ins") { + // just skip the "ins" sequence + sequence.as_bytes()[3..].to_vec() + } else if sequence.starts_with("del") { + // this is basically saying to delete the whole thing, so return empty vec + vec![] + } else { + sequence.as_bytes().to_vec() + } +} + +/// The possible genotypes we allow +#[derive(Clone, Copy, Debug, PartialEq, Serialize)] +pub enum Genotype { + /// 0/0 + #[serde(rename = "0/0")] + HomozygousReference, + /// 0/1 + #[serde(rename = "0/1")] + HeterozygousUnphased, + /// 0|1 + #[serde(rename = "0|1")] + HeterozygousPhased, + /// 1|0 + #[serde(rename = "1|0")] + HeterozygousPhasedFlip, + /// 1/1 + #[serde(rename = "1/1")] + HomozygousAlternate +} + +/// A normalized genotype is composed of both the genotype and the phase set +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct NormalizedGenotype { + /// The genotype + genotype: Genotype, + /// The phase set ID, only for phased alleles + phase_set: Option +} + +impl NormalizedGenotype { + /// Basic constructor + pub fn new(genotype: Genotype, phase_set: Option) -> NormalizedGenotype { + NormalizedGenotype { + genotype, phase_set + } + } + + pub fn genotype(&self) -> Genotype { + self.genotype + } + + pub fn phase_set(&self) -> &Option { + &self.phase_set + } +} + +/// Normalized PGx haplotype has both a haplotype name and a list of required variants. +/// This is complicated by the fact that multiple variants can "match" an allele, for example a "C" -> "R" is ["C" -> "A" OR "C" -> "G"]. +/// This struct handles the logic around comparing these mixtures of AND and OR composite haplotype definitions. +#[derive(Debug, Eq, PartialEq)] +pub struct NormalizedPgxHaplotype { + // the name of this haplotype + haplotype_name: String, + // variants defining the haplotype - the outer Vec is AND, the inner Vec is OR; None indicates that it doesn't have to match this + variants: Vec>> +} + +impl NormalizedPgxHaplotype { + /// Basic constructor + pub fn new(haplotype_name: String) -> NormalizedPgxHaplotype { + NormalizedPgxHaplotype { + haplotype_name, + variants: vec![] + } + } + + /// Adds a single required variant that matches one of the provided NormalizedVariants + /// # Arguments + /// * `variant` - a Vec of normalized variants, any of them would be considered a match for this variant + pub fn add_variant(&mut self, variant: Vec>) { + self.variants.push(variant); + } + + /// Returns true if a collection of NormalizedVariants matches this haplotype. + /// A match requires that both all of the provided variants match a variant in this haplotype AND that all variants in the haplotype have a match. + /// If double matches are detected (e.g. same variant twice), then this will return false. 
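+    /// Illustrative example: a haplotype with variants `[[Some(v1)], [Some(v2), None]]` matches `[v1, v2]`
+    /// and also `[v1]` alone (the second slot permits a reference match via `None`), but not `[v2]` or `[v1, v1]`.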
+ /// # Arguments + /// * `other_variants` - the list of other variants we want to match + pub fn matches(&self, other_variants: &[NormalizedVariant]) -> bool { + // initialize that all variants are unmatched so far + let mut match_vec: Vec = vec![false; self.variants.len()]; + for ov in other_variants.iter() { + // find the first variant that matches this one + // if we ever get multiple matches, this logic will need to change + let match_index = self.variants.iter() + .position(|v| { + v.iter().any(|sub_v| sub_v.as_ref() == Some(ov)) + }); + match match_index { + Some(mi) => { + if match_vec[mi] { + // something has already matched this loci, we cannot double match + return false; + } else { + // mark this one as matched + match_vec[mi] = true; + } + }, + None => { + // there is no match for this allele + return false; + } + } + } + + // all the other_variants had a match, make sure everything in this haplotype also had a match + match_vec.iter() + .enumerate() + .all(|(i, &matched)| { + // allowed if we either had a match OR None is allowed + matched || self.variants[i].contains(&None) + }) + } + + pub fn haplotype_name(&self) -> &str { + &self.haplotype_name + } + + pub fn variants(&self) -> &[Vec>] { + &self.variants + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + /// Utility that loads our tiny reference for us + fn load_test_reference() -> ReferenceGenome { + let ref_fn = PathBuf::from("test_data/test_reference.fa"); + ReferenceGenome::from_fasta(&ref_fn).unwrap() + } + + /// Checks a basic SNP, no reference + #[test] + fn test_normalize_snp() { + let chrom = "chr1".to_string(); + let position = 10; + let reference = "A".to_string(); + let alternate = "C".to_string(); + let na = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, None).unwrap(); + assert_eq!(na, NormalizedVariant { + chrom, + position, + reference, + alternate + }); + } + + /// Checks an A->R (non-multi) to verify we get an error. + #[test] + fn test_normalize_multisnp() { + let chrom = "chr1".to_string(); + let position = 10; + let reference = "A".to_string(); + let alternate = "R".to_string(); + let na = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, None); + assert!(na.is_err()); + } + + /// Check basic indel normalization (no reference). 
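+    /// Without a reference genome, only the shared suffix/prefix trimming applies; no anchoring or left-shifting occurs.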
+ #[test] + fn test_normalize_indel() { + let chrom = "chr1".to_string(); + let position = 10; + let reference = "AC".to_string(); + let alternate = "ACC".to_string(); + let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, None).unwrap(); + let expected = NormalizedVariant { + chrom, + position, + reference: "A".to_string(), + alternate: "AC".to_string() + }; + assert_eq!(nv, expected); + } + + /// Same as above, but with reference + #[test] + fn test_normalize_ins_ref_001() { + // this case should get normalize to chr1:10 A -> AC + let chrom = "chr1".to_string(); + let position = 10; + let reference = "AC".to_string(); + let alternate = "ACC".to_string(); + let reference_genome = load_test_reference(); + let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap(); + let expected = NormalizedVariant { + chrom, + position, + reference: "A".to_string(), + alternate: "AC".to_string() + }; + assert_eq!(nv, expected); + } + + /// tests for removing redundant bases in prefix + #[test] + fn test_normalize_ins_ref_002() { + // this case should get normalize to chr1:12 A -> AC + let chrom = "chr1".to_string(); + let position = 10; + let reference = "ACAC".to_string(); + let alternate = "ACACC".to_string(); + let reference_genome = load_test_reference(); + let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap(); + let expected = NormalizedVariant { + chrom, + position: 12, + reference: "A".to_string(), + alternate: "AC".to_string() + }; + assert_eq!(nv, expected); + } + + /// tests for left shifting a large insertion in a TR + #[test] + fn test_normalize_ins_ref_003() { + // this case should get normalize to chr1:9 A -> AACAC + let chrom = "chr1".to_string(); + let position = 10; + let reference = "ACACACACAC".to_string(); + let alternate = "ACACACACACACAC".to_string(); + let reference_genome = load_test_reference(); + let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap(); + let expected = NormalizedVariant { + chrom, + position: 9, + reference: "A".to_string(), + alternate: "AACAC".to_string() + }; + assert_eq!(nv, expected); + } + + /// test for normalizing a deletion in a TR + #[test] + fn test_normalize_del_ref_001() { + // this case should get normalize to chr1:9 AAC -> A + let chrom = "chr1".to_string(); + let position = 16; + let reference = "ACAC".to_string(); + let alternate = "AC".to_string(); + let reference_genome = load_test_reference(); + let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap(); + let expected = NormalizedVariant { + chrom, + position: 9, + reference: "AAC".to_string(), + alternate: "A".to_string() + }; + assert_eq!(nv, expected); + } + + /// test for normalizing a del in an TR + #[test] + fn test_normalize_del_ref_002() { + // this case should get normalize to chr1:9 AAC -> A + let chrom = "chr1".to_string(); + let position = 16; + let reference = "ACAC".to_string(); + let alternate = "AC".to_string(); + let reference_genome = load_test_reference(); + let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap(); + let expected = NormalizedVariant { + chrom, + position: 9, + reference: "AAC".to_string(), + alternate: "A".to_string() + }; + assert_eq!(nv, expected); + } + + /// make sure that empty ref & alt throws an error + #[test] + fn test_empty_refalt() { 
+        // an empty ref and alt cannot be normalized, so this should return an error
+        let chrom = "chr2".to_string();
+        let position = 13;
+        let reference = "".to_string();
+        let alternate = "".to_string();
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome));
+        assert!(nv.is_err());
+    }
+
+    /// make sure that an empty alt will add the anchor base and left-shift
+    #[test]
+    fn test_empty_alt() {
+        // this case should get normalized to chr2:9 CAGT -> C
+        let chrom = "chr2".to_string();
+        let position = 13;
+        let reference = "AGT".to_string();
+        let alternate = "".to_string();
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = NormalizedVariant {
+            chrom,
+            position: 9,
+            reference: "CAGT".to_string(),
+            alternate: "C".to_string()
+        };
+        assert_eq!(nv, expected);
+    }
+
+    /// make sure the reference always matches
+    #[test]
+    fn test_ref_mismatch() {
+        // this case should fail to normalize due to a reference mismatch
+        let chrom = "chr2".to_string();
+        let position = 13;
+        let reference = "MISS".to_string();
+        let alternate = "A".to_string();
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome));
+        assert!(nv.is_err());
+    }
+
+    /// CPIC has an annoying syntax for insertions; this checks that we appropriately anchor it.
+    #[test]
+    fn test_cpic_ins() {
+        // this case should get normalized to chr2:9 C -> CAGT
+        let chrom = "chr2".to_string();
+        let position = 12;
+        let reference = "del".to_string();
+        let alternate = "insAGT".to_string();
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = NormalizedVariant {
+            chrom,
+            position: 9,
+            reference: "C".to_string(),
+            alternate: "CAGT".to_string()
+        };
+        assert_eq!(nv, expected);
+    }
+
+    /// tests a CPIC-style deletion
+    #[test]
+    fn test_cpic_del() {
+        // this case should get normalized to chr2:9 CAGT -> C
+        let chrom = "chr2".to_string();
+        let position = 13;
+        let reference = "AGT".to_string();
+        let alternate = "delAGT".to_string();
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = NormalizedVariant {
+            chrom,
+            position: 9,
+            reference: "CAGT".to_string(),
+            alternate: "C".to_string()
+        };
+        assert_eq!(nv, expected);
+    }
+
+    /// tests a CPIC-style indel
+    #[test]
+    fn test_cpic_delins() {
+        // this case should get normalized to chr2:10 A -> CGG
+        let chrom = "chr2".to_string();
+        let position = 10;
+        let reference = "A".to_string();
+        let alternate = "delinsCGG".to_string();
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = NormalizedVariant {
+            chrom,
+            position,
+            reference: "A".to_string(),
+            alternate: "CGG".to_string()
+        };
+        assert_eq!(nv, expected);
+    }
+
+    /// Tests a CPIC deletion in a TR
+    #[test]
+    fn test_cpic_tr_del() {
+        // this case should get normalized to chr2:9 CAGT -> C
+        let chrom = "chr2".to_string();
+        let position = 10;
+        let reference = "AGT(3)".to_string();
+        let alternate = "AGT(2)".to_string();
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = NormalizedVariant {
+            chrom,
+            position: 9,
+            reference: "CAGT".to_string(),
+            alternate: "C".to_string()
+        };
+        assert_eq!(nv, expected);
+    }
+
+    /// Tests a CPIC insertion in a TR
+    #[test]
+    fn test_cpic_tr_ins() {
+        // this case should get normalized to chr2:9 C -> CAGT
+        let chrom = "chr2".to_string();
+        let position = 10;
+        let reference = "AGT(3)".to_string();
+        let alternate = "AGT(4)".to_string();
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = NormalizedVariant {
+            chrom,
+            position: 9,
+            reference: "C".to_string(),
+            alternate: "CAGT".to_string()
+        };
+        assert_eq!(nv, expected);
+    }
+
+    /// Tests a CPIC IUPAC multi-match with the reference allele
+    #[test]
+    fn test_multinew_iupac_ref_included() {
+        // this case should get normalized to chr1:10 A -> [None, G]
+        let chrom = "chr1".to_string();
+        let position = 10;
+        let reference = "A".to_string();
+        let alternate = "R".to_string(); // A or G
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::multi_new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = vec![
+            None,
+            Some(NormalizedVariant {
+                chrom,
+                position: 10,
+                reference: "A".to_string(),
+                alternate: "G".to_string()
+            })
+        ];
+        assert_eq!(nv, expected);
+    }
+
+    /// Tests a CPIC IUPAC multi-match where both are non-reference
+    #[test]
+    fn test_multinew_iupac_double_alt() {
+        // this case should get normalized to chr1:10 A -> [C, T]
+        let chrom = "chr1".to_string();
+        let position = 10;
+        let reference = "A".to_string();
+        let alternate = "Y".to_string(); // C or T
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::multi_new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = vec![
+            Some(NormalizedVariant {
+                chrom: chrom.clone(),
+                position: 10,
+                reference: "A".to_string(),
+                alternate: "C".to_string()
+            }),
+            Some(NormalizedVariant {
+                chrom,
+                position: 10,
+                reference: "A".to_string(),
+                alternate: "T".to_string()
+            })
+        ];
+        assert_eq!(nv, expected);
+    }
+
+    /// Tests a CPIC multi-match using the "; " delimiter. There is only one in CPIC thus far.
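+    /// Each "; "-delimited entry is normalized independently, so the expected result below is
+    /// one normalized variant per alternate, in the order they were listed.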
+    #[test]
+    fn test_multinew_semicolon() {
+        // this case should get normalized to chr1:10 A -> [C, CC, CCC]
+        let chrom = "chr1".to_string();
+        let position = 10;
+        let reference = "A".to_string();
+        let alternate = "delinsC; delinsCC; delinsCCC".to_string(); // C or CC or CCC
+        let reference_genome = load_test_reference();
+        let nv = NormalizedVariant::multi_new(chrom.clone(), position, &reference, &alternate, Some(&reference_genome)).unwrap();
+        let expected = vec![
+            Some(NormalizedVariant {
+                chrom: chrom.clone(),
+                position: 10,
+                reference: "A".to_string(),
+                alternate: "C".to_string()
+            }),
+            Some(NormalizedVariant {
+                chrom: chrom.clone(),
+                position: 10,
+                reference: "A".to_string(),
+                alternate: "CC".to_string()
+            }),
+            Some(NormalizedVariant {
+                chrom,
+                position: 10,
+                reference: "A".to_string(),
+                alternate: "CCC".to_string()
+            })
+        ];
+        assert_eq!(nv, expected);
+    }
+
+    /// make sure an empty haplotype only matches the empty list
+    #[test]
+    fn test_ref_matches() {
+        // can only match the reference allele
+        let pgx_hap = NormalizedPgxHaplotype::new("test".to_string());
+        let test_variant = NormalizedVariant::new("chr1".to_string(), 10, "A", "C", None).unwrap();
+        assert!(pgx_hap.matches(&[]));
+        assert!(!pgx_hap.matches(&[
+            test_variant
+        ]));
+    }
+
+    /// test a simple single-variant haplotype match
+    #[test]
+    fn test_alt_matches() {
+        // can only match the alternate allele
+        let mut pgx_hap = NormalizedPgxHaplotype::new("test".to_string());
+        let test_variant = NormalizedVariant::new("chr1".to_string(), 10, "A", "C", None).unwrap();
+        pgx_hap.add_variant(vec![Some(test_variant.clone())]);
+        assert!(!pgx_hap.matches(&[]));
+        assert!(pgx_hap.matches(&[
+            test_variant
+        ]));
+    }
+
+    /// tests that optional matches work both with and without the optional variant
+    #[test]
+    fn test_optional_matches() {
+        // optional variant, so both with and without will match
+        let mut pgx_hap = NormalizedPgxHaplotype::new("test".to_string());
+        let test_variant = NormalizedVariant::new("chr1".to_string(), 10, "A", "C", None).unwrap();
+        pgx_hap.add_variant(vec![None, Some(test_variant.clone())]);
+        assert!(pgx_hap.matches(&[]));
+        assert!(pgx_hap.matches(&[
+            test_variant
+        ]));
+    }
+
+    /// tests a multi-variant list; each variant individually should match, but both together should not
+    #[test]
+    fn test_multivariant_matches() {
+        // two alternate forms at the same site
+        let mut pgx_hap = NormalizedPgxHaplotype::new("test".to_string());
+        let test_variant_form1 = NormalizedVariant::new("chr1".to_string(), 10, "A", "C", None).unwrap();
+        let test_variant_form2 = NormalizedVariant::new("chr1".to_string(), 10, "A", "T", None).unwrap();
+        pgx_hap.add_variant(vec![Some(test_variant_form1.clone()), Some(test_variant_form2.clone())]);
+
+        // either one by itself should match, both together should not
+        assert!(pgx_hap.matches(&[
+            test_variant_form1.clone()
+        ]));
+        assert!(pgx_hap.matches(&[
+            test_variant_form2.clone()
+        ]));
+        assert!(!pgx_hap.matches(&[
+            test_variant_form1,
+            test_variant_form2
+        ]));
+    }
+}
diff --git a/src/data_types/pgx_diplotypes.rs b/src/data_types/pgx_diplotypes.rs
new file mode 100644
index 0000000..3a5b29f
--- /dev/null
+++ b/src/data_types/pgx_diplotypes.rs
@@ -0,0 +1,348 @@
+
+use serde::Serialize;
+use simple_error::bail;
+use std::collections::BTreeMap;
+use std::collections::btree_map::Entry::{Occupied, Vacant};
+
+use crate::data_types::database::PgxMetadata;
+use crate::data_types::normalized_variant::{NormalizedVariant, NormalizedGenotype};
+use crate::hla::mapping::HlaMappingStats;
+
+/// Intended to be serialized to JSON as the final result
+#[derive(Debug, Serialize)]
+pub struct PgxDiplotypes {
+    /// Version of the tool that generated the calls
+    pbstarphase_version: String,
+    /// Metadata for the database
+    database_metadata: PgxMetadata,
+    /// Map from gene name to diplotype call
+    gene_details: BTreeMap<String, PgxGeneDetails>
+}
+
+impl PgxDiplotypes {
+    /// Basic constructor, will perform sanity checks if necessary
+    pub fn new(database_metadata: PgxMetadata) -> PgxDiplotypes {
+        PgxDiplotypes {
+            pbstarphase_version: crate::cli::core::FULL_VERSION.to_string(),
+            database_metadata,
+            gene_details: Default::default()
+        }
+    }
+
+    /// Simple wrapper for our diplotype insertion to make sure we do not double insert
+    /// # Arguments
+    /// * `gene` - the gene name we are saving the diplotype for
+    /// * `diplotype` - the diplotype call getting saved
+    pub fn insert(&mut self, gene: String, diplotype: PgxGeneDetails) -> Result<(), Box<dyn std::error::Error>> {
+        match self.gene_details.entry(gene) {
+            Vacant(entry) => entry.insert(diplotype),
+            Occupied(entry) => bail!("Entry for {} is already occupied.", entry.key())
+        };
+        Ok(())
+    }
+
+    pub fn database_metadata(&self) -> &PgxMetadata {
+        &self.database_metadata
+    }
+
+    pub fn gene_details(&self) -> &BTreeMap<String, PgxGeneDetails> {
+        &self.gene_details
+    }
+}
+
+/// Wrapper for all of the details for a single gene
+#[derive(Clone, Debug, PartialEq, Serialize)]
+pub struct PgxGeneDetails {
+    /// Contains the list of exactly matching diplotypes
+    diplotypes: Vec<Diplotype>,
+    /// Contains an optional list of simplified diplotypes
+    simple_diplotypes: Option<Vec<Diplotype>>,
+    /// Contains the list of identified variants
+    variant_details: Option<Vec<PgxVariantDetails>>,
+    /// Contains the list of alignments analyzed
+    mapping_details: Option<Vec<PgxMappingDetails>>,
+    /// Contains the list of multi-mapping alignments analyzed
+    multi_mapping_details: Option<Vec<PgxMultiMappingDetails>>
+}
+
+impl PgxGeneDetails {
+    /// Basic constructor for wrapping our details up into a nice bundle
+    pub fn new(diplotypes: Vec<Diplotype>, simple_diplotypes: Option<Vec<Diplotype>>, variant_details: Vec<PgxVariantDetails>) -> Result<PgxGeneDetails, Box<dyn std::error::Error>> {
+        if let Some(sd) = simple_diplotypes.as_ref() {
+            if diplotypes.len() != sd.len() {
+                bail!("diplotypes and simple_diplotypes must be the same length");
+            }
+        }
+        Ok(PgxGeneDetails {
+            diplotypes,
+            simple_diplotypes,
+            variant_details: Some(variant_details),
+            mapping_details: None,
+            multi_mapping_details: None
+        })
+    }
+
+    /// This is the one for HLA (currently, it will probably change to multi-mapping when we synchronize the approaches)
+    pub fn new_from_mappings(diplotypes: Vec<Diplotype>, simple_diplotypes: Option<Vec<Diplotype>>, mapping_details: Vec<PgxMappingDetails>) -> Result<PgxGeneDetails, Box<dyn std::error::Error>> {
+        if let Some(sd) = simple_diplotypes.as_ref() {
+            if diplotypes.len() != sd.len() {
+                bail!("diplotypes and simple_diplotypes must be the same length");
+            }
+        }
+        Ok(PgxGeneDetails {
+            diplotypes,
+            simple_diplotypes,
+            variant_details: None,
+            mapping_details: Some(mapping_details),
+            multi_mapping_details: None
+        })
+    }
+
+    /// This is the one for CYP2D6
+    pub fn new_from_multi_mappings(diplotypes: Vec<Diplotype>, simple_diplotypes: Option<Vec<Diplotype>>, multi_mapping_details: Vec<PgxMultiMappingDetails>) -> Result<PgxGeneDetails, Box<dyn std::error::Error>> {
+        if let Some(sd) = simple_diplotypes.as_ref() {
+            if diplotypes.len() != sd.len() {
+                bail!("diplotypes and simple_diplotypes must be the same length");
+            }
+        }
+        Ok(PgxGeneDetails {
+            diplotypes,
+            simple_diplotypes,
+            variant_details: None,
+            mapping_details: None,
+            multi_mapping_details: Some(multi_mapping_details)
+        })
+    }
+
+    /// Generic wrapper function to create a "NO_MATCH" result, usually from some algorithm failure.
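+    /// Downstream serialization then reports the gene as "NO_MATCH/NO_MATCH".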
+    /// This does not include additional details.
+    pub fn no_match() -> PgxGeneDetails {
+        let diplotypes = vec![
+            Diplotype::new("NO_MATCH", "NO_MATCH")
+        ];
+        PgxGeneDetails {
+            diplotypes,
+            simple_diplotypes: None,
+            variant_details: None,
+            mapping_details: None,
+            multi_mapping_details: None
+        }
+    }
+
+    pub fn diplotypes(&self) -> &[Diplotype] {
+        &self.diplotypes
+    }
+
+    pub fn simple_diplotypes(&self) -> &[Diplotype] {
+        if let Some(sd) = self.simple_diplotypes.as_ref() {
+            sd
+        } else {
+            self.diplotypes()
+        }
+    }
+
+    pub fn variant_details(&self) -> Option<&[PgxVariantDetails]> {
+        self.variant_details.as_deref()
+    }
+
+    pub fn mapping_details(&self) -> Option<&[PgxMappingDetails]> {
+        self.mapping_details.as_deref()
+    }
+}
+
+/// Contains all the information related to a single gene's diplotype result
+#[derive(Clone, Debug, PartialEq, Serialize)]
+pub struct Diplotype {
+    /// short string for haplotype 1
+    hap1: String,
+    /// short string for haplotype 2
+    hap2: String,
+    /// combination diplotype call
+    diplotype: String
+    // TODO: we will likely include deeper evidence information in the future, e.g. variant calls, counts, etc.
+}
+
+impl Diplotype {
+    pub fn new(hap1: &str, hap2: &str) -> Diplotype {
+        Diplotype {
+            hap1: hap1.to_string(),
+            hap2: hap2.to_string(),
+            diplotype: format!("{}/{}", hap1, hap2)
+        }
+    }
+
+    /// If homozygous, return the single haplotype
+    pub fn homozygous_haplotype(&self) -> Option<&str> {
+        if self.hap1 == self.hap2 {
+            Some(&self.hap1)
+        } else {
+            None
+        }
+    }
+
+    pub fn diplotype(&self) -> &str {
+        &self.diplotype
+    }
+
+    /// Returns a PharmCAT-formatted diplotype.
+    /// See https://pharmcat.org/using/Outside-Call-Format/#diplotypes for more details.
+    pub fn pharmcat_diplotype(&self) -> String {
+        let h1 = if self.hap1.contains('+') {
+            format!("[{}]", self.hap1)
+        } else {
+            self.hap1.clone()
+        };
+        let h2 = if self.hap2.contains('+') {
+            format!("[{}]", self.hap2)
+        } else {
+            self.hap2.clone()
+        };
+        format!("{h1}/{h2}")
+    }
+}
+
+/// Contains all the details for a variant that was identified through our process
+#[derive(Clone, Debug, PartialEq, Serialize)]
+pub struct PgxVariantDetails {
+    /// CPIC assigned variant ID
+    cpic_variant_id: u64,
+    /// CPIC assigned name
+    cpic_name: String,
+    /// DBSNP id when available
+    dbsnp: Option<String>,
+    /// The normalized variant we loaded
+    normalized_variant: NormalizedVariant,
+    /// The normalized genotype we loaded
+    normalized_genotype: NormalizedGenotype
+}
+
+impl PgxVariantDetails {
+    pub fn new(cpic_variant_id: u64, cpic_name: String, dbsnp: Option<String>, normalized_variant: NormalizedVariant, normalized_genotype: NormalizedGenotype) -> PgxVariantDetails {
+        PgxVariantDetails {
+            cpic_variant_id,
+            cpic_name,
+            dbsnp,
+            normalized_variant,
+            normalized_genotype
+        }
+    }
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize)]
+pub struct PgxMappingDetails {
+    /// Read mapping ID
+    read_qname: String,
+    /// The ID of the best matching star allele from the database
+    best_hla_id: String,
+    /// The star allele string for the best match
+    best_star_allele: String,
+    /// The mapping stats for the best match
+    best_mapping_stats: HlaMappingStats,
+    /// If true, this mapping was ignored due to high error rate
+    is_ignored: bool
+}
+
+impl PgxMappingDetails {
+    pub fn new(read_qname: String, best_hla_id: String, best_star_allele: String, best_mapping_stats: HlaMappingStats, is_ignored: bool) -> PgxMappingDetails {
+        PgxMappingDetails {
+            read_qname,
+            best_hla_id,
+            best_star_allele,
+            best_mapping_stats,
+            is_ignored
+        }
+    }
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize)]
+pub struct PgxMultiMappingDetails {
+    /// Read mapping ID
+    read_qname: String,
+    /// Coordinates in the read corresponding to the extracted sequence
+    read_position: std::ops::Range<usize>,
+    /// The final consensus ID this was assigned to
+    consensus_id: usize,
+    /// The final star allele this was assigned to
+    consensus_star_allele: String
+}
+
+impl PgxMultiMappingDetails {
+    pub fn new(
+        read_qname: String, read_position: std::ops::Range<usize>, consensus_id: usize, consensus_star_allele: String
+    ) -> PgxMultiMappingDetails {
+        PgxMultiMappingDetails {
+            read_qname, read_position, consensus_id, consensus_star_allele
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pgx_diplotypes() {
+        let hap1 = "A";
+        let hap2 = "B";
+        let diplotype = vec![Diplotype::new(hap2, hap1)];
+
+        let mut diplotypes = PgxDiplotypes::new(Default::default());
+        let gene_details = PgxGeneDetails::new(diplotype, None, vec![]).unwrap();
+        assert!(gene_details.mapping_details.is_none());
+        diplotypes.insert("CACNA1S".to_string(), gene_details.clone()).unwrap();
+
+        let map = &diplotypes.gene_details;
+        assert_eq!(map.len(), 1);
+        assert_eq!(map.get("CACNA1S").unwrap(), &gene_details);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_duplicate_diplotype() {
+        let hap1 = "A";
+        let hap2 = "B";
+        let diplotype = vec![Diplotype::new(hap2, hap1)];
+
+        let mut diplotypes = PgxDiplotypes::new(Default::default());
+        let gene_details = PgxGeneDetails::new(diplotype, None, vec![]).unwrap();
+        diplotypes.insert("CACNA1S".to_string(), gene_details.clone()).unwrap();
+        diplotypes.insert("CACNA1S".to_string(), gene_details).unwrap();
+    }
+
+    #[test]
+    fn test_new_from_mappings() {
+        let hap1 = "A";
+        let hap2 = "B";
+        let diplotype = vec![Diplotype::new(hap2, hap1)];
+
+        let mut diplotypes = PgxDiplotypes::new(Default::default());
+        let gene_details = PgxGeneDetails::new_from_mappings(diplotype, None, vec![]).unwrap();
+        assert!(gene_details.variant_details.is_none());
+        diplotypes.insert("HLA-A".to_string(), gene_details.clone()).unwrap();
+
+        let map = &diplotypes.gene_details;
+        assert_eq!(map.len(), 1);
+        assert_eq!(map.get("HLA-A").unwrap(), &gene_details);
+    }
+
+    #[test]
+    fn test_diplotype() {
+        // this is a basic test for now, will likely get more complicated over time
+        let hap1 = "A";
+        let hap2 = "B";
+        let diplotype = Diplotype::new(hap2, hap1);
+        assert_eq!(diplotype.diplotype(), "B/A");
+    }
+
+    #[test]
+    fn test_pharmcat_diplotype() {
+        let diplotype = Diplotype::new("*4", "*1");
+        assert_eq!(diplotype.pharmcat_diplotype(), "*4/*1");
+
+        let diplotype = Diplotype::new("*4x2", "*1");
+        assert_eq!(diplotype.pharmcat_diplotype(), "*4x2/*1");
+
+        let diplotype = Diplotype::new("*4 + *68", "*1");
+        assert_eq!(diplotype.pharmcat_diplotype(), "[*4 + *68]/*1");
+    }
+}
diff --git a/src/diplotyper.rs b/src/diplotyper.rs
new file mode 100644
index 0000000..9abd019
--- /dev/null
+++ b/src/diplotyper.rs
@@ -0,0 +1,1131 @@
+
+use log::{debug, error, info, trace, warn};
+use rust_htslib::bcf;
+use rust_htslib::bcf::Read;
+use rust_htslib::bcf::record::GenotypeAllele;
+use rust_lib_reference_genome::reference_genome::ReferenceGenome;
+use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet};
+use simple_error::bail;
+use std::collections::hash_map::Entry::{Occupied, Vacant};
+use std::path::{Path, PathBuf};
+
+use crate::cli::diplotype::DiplotypeSettings;
+use crate::cyp2d6::caller::diplotype_cyp2d6;
+use crate::data_types::database::{PgxDatabase, PgxGene, PgxVariant};
+use crate::data_types::normalized_variant::{Genotype, NormalizedGenotype, NormalizedVariant, NormalizedPgxHaplotype};
+use crate::data_types::pgx_diplotypes::{PgxDiplotypes, PgxGeneDetails, PgxVariantDetails, Diplotype};
+use crate::hla::caller::diplotype_hla;
+use crate::util::file_io::load_file_lines;
+use crate::visualization::debug_bam_writer::DebugBamWriter;
+
+/// This is the main function to call all of the diplotypes.
+/// It handles all the VCF parsing and variant normalization.
+/// # Arguments
+/// * `database` - the pre-loaded database of PGx data
+/// * `opt_vcf_fn` - the optional VCF that we will scan for variants from the database; we assume it is indexed
+/// * `reference_genome` - the pre-loaded reference genome; if None, some normalization steps will not happen
+/// * `bam_fns` - any BAM files that will be used for alignment-based calling (HLA, CYP2D6)
+/// * `cli_settings` - the full settings for diplotyping
+/// # Errors
+/// * if there are errors normalizing database entries
+/// * if there are errors loading the VCF file
+/// * if there are errors calling the diplotypes
+pub fn call_diplotypes(
+    database: &PgxDatabase, opt_vcf_fn: Option<&Path>, reference_genome: Option<&ReferenceGenome>,
+    bam_fns: &[PathBuf], cli_settings: &DiplotypeSettings
+) -> Result<PgxDiplotypes, Box<dyn std::error::Error>> {
+    let mut diplotypes: PgxDiplotypes = PgxDiplotypes::new(database.database_metadata().clone());
+
+    // figure out the set of genes to include / exclude
+    let opt_exclude_set: Option<HashSet<String>> = if let Some(efn) = cli_settings.exclude_fn.as_deref() {
+        Some(load_file_lines(efn)?)
+    } else {
+        None
+    };
+    let opt_include_set: Option<HashSet<String>> = if let Some(ifn) = cli_settings.include_fn.as_deref() {
+        Some(load_file_lines(ifn)?)
+    } else {
+        None
+    };
+
+    if let Some(vcf_fn) = opt_vcf_fn {
+        for (gene_name, gene_entry) in database.gene_entries().iter() {
+            // assume include set has everything UNLESS it is specified
+            let included = opt_include_set.as_ref().map(|include_set| include_set.contains(gene_name)).unwrap_or(true);
+            if !included {
+                debug!("Skipping {gene_name}, not in include set");
+                continue;
+            }
+
+            // assume exclude set has nothing UNLESS it is specified
+            let excluded = opt_exclude_set.as_ref().map(|exclude_set| exclude_set.contains(gene_name)).unwrap_or(false);
+            if excluded {
+                debug!("Skipping {gene_name}, part of exclude set");
+                continue;
+            }
+
+            info!("Solving {gene_name}...");
+
+            // we need to normalize all of the variants in our database and load haplotypes as well
+            let (variant_hash, normalized_haplotypes) = load_database_haplotypes(gene_entry, reference_genome)?;
+            debug!("Loaded {} normalized variants.", variant_hash.len());
+            debug!("Loaded {} normalized haplotypes.", normalized_haplotypes.len());
+
+            // make sure we have variants, otherwise we can't do anything
+            if variant_hash.is_empty() {
+                warn!("No variants found for {gene_name}, returning default reference allele.");
+                let reference_name: &str = gene_entry.reference_allele().unwrap_or("NO_REFERENCE_ALLELE");
+                let all_ref_diplotype = Diplotype::new(reference_name, reference_name);
+
+                debug!("\t{:?}", all_ref_diplotype.diplotype());
+                let gene_details = PgxGeneDetails::new(
+                    vec![all_ref_diplotype],
+                    None,
+                    vec![]
+                )?;
+                diplotypes.insert(gene_name.clone(), gene_details)?;
+                continue;
+            }
+
+            // load the normalized variants from the VCF
+            let vcf_variants: HashMap<NormalizedVariant, NormalizedGenotype> = load_vcf_variants(vcf_fn, &variant_hash, reference_genome)?;
+            debug!("Loaded {} normalized genotypes.", vcf_variants.len());
+
+            // finally, call the diplotype from the loaded variants
+            let diplotype = solve_diplotype(&normalized_haplotypes, &vcf_variants)?;
+            debug!("Diplotype for {gene_name} => {:?}", diplotype.iter().map(|d| d.diplotype()).collect::<Vec<_>>());
+            let variant_details: Vec<PgxVariantDetails> = vcf_variants.into_iter()
+                .map(|(nv, ng)| {
+                    let variant_meta = variant_hash.get(&nv).unwrap();
+                    PgxVariantDetails::new(
+                        variant_meta.variant_id,
+                        variant_meta.name.clone(),
+                        variant_meta.dbsnp_id.clone(),
+                        nv,
+                        ng
+                    )
+                })
+                .collect();
+            let gene_details = PgxGeneDetails::new(
+                diplotype,
+                None,
+                variant_details
+            )?;
+            diplotypes.insert(gene_name.clone(), gene_details)?;
+        }
+    } else {
+        info!("No VCF file provided, all variant based diplotyping was skipped.");
+    }
+
+    if !bam_fns.is_empty() {
+        // this check is not _really_ necessary, but we have an Option so let's keep it safe
+        if reference_genome.is_none() {
+            bail!("Reference genome is required for reading alignment files");
+        }
+
+        let mut debug_bam_writer: Option<DebugBamWriter> = match cli_settings.debug_folder.as_ref() {
+            Some(debug_folder) => {
+                let extension = "debug_consensus.bam";
+                let consensus_fn = debug_folder.join(extension);
+                Some(DebugBamWriter::new(consensus_fn, reference_genome.unwrap())?)
+            },
+            None => None
+        };
+
+        if !cli_settings.debug_skip_hla {
+            // this is the initial HLA list
+            let initial_hla_list = [
+                "HLA-A".to_string(),
+                "HLA-B".to_string()
+            ];
+
+            // filter out anything that is excluded due to CLI parameters
+            let final_hla_list: Vec<String> = initial_hla_list.into_iter()
+                .filter(|gene_name| {
+                    // assume include set has everything UNLESS it is specified
+                    let included = opt_include_set.as_ref().map(|include_set| include_set.contains(gene_name)).unwrap_or(true);
+                    if !included {
+                        debug!("Skipping {gene_name}, not in include set");
+                    }
+
+                    // assume exclude set has nothing UNLESS it is specified
+                    let excluded = opt_exclude_set.as_ref().map(|exclude_set| exclude_set.contains(gene_name)).unwrap_or(false);
+                    if excluded {
+                        debug!("Skipping {gene_name}, part of exclude set");
+                    }
+
+                    // this one is only kept if it is both included AND NOT excluded
+                    included && !excluded
+                })
+                .collect();
+
+            // only call this if something is in the list to diplotype
+            if !final_hla_list.is_empty() {
+                // user gave us BAM files, so let's add in the HLA genes we have ready
+                let hla_calls = diplotype_hla(
+                    &final_hla_list,
+                    database,
+                    bam_fns,
+                    reference_genome.unwrap(),
+                    debug_bam_writer.as_mut(),
+                    cli_settings
+                )?;
+
+                // add in each result, insert will make sure we do not duplicate
+                for (gene_name, gene_details) in hla_calls.into_iter() {
+                    diplotypes.insert(gene_name, gene_details)?;
+                }
+            }
+        }
+
+        // assume include set has everything UNLESS it is specified
+        let gene_name = "CYP2D6";
+        let included = opt_include_set.as_ref().map(|include_set| include_set.contains(gene_name)).unwrap_or(true);
+        if !included {
+            debug!("Skipping {gene_name}, not in include set");
+        }
+
+        // assume exclude set has nothing UNLESS it is specified
+        let excluded = opt_exclude_set.as_ref().map(|exclude_set| exclude_set.contains(gene_name)).unwrap_or(false);
+        if excluded {
+            debug!("Skipping {gene_name}, part of exclude set");
+        }
+
+        if included && !excluded {
+            // CYP2D6 also requires a BAM file
+            match diplotype_cyp2d6(
+                database,
+                bam_fns,
+                reference_genome.unwrap(),
+                debug_bam_writer.as_mut(),
+                cli_settings
+            ) {
+                // happy path, we produced a D6 call
+                Ok(cyp2d6_call) => diplotypes.insert("CYP2D6".to_string(), cyp2d6_call)?,
+                // unhappy path - this could be an "expected" error due to something like low coverage
+                // OR it could be something where we want to propagate something unexpected so we get a user report
+                Err(e) => {
+                    // first, check if this is an error we "expect" to happen
+                    use crate::cyp2d6::errors::CallerError;
+                    if let Some(caller_error) = e.downcast_ref::<CallerError>() {
+                        error!("Received error while calling CYP2D6: {caller_error}");
+                        error!("Setting result to NO_MATCH state");
+                        diplotypes.insert("CYP2D6".to_string(), PgxGeneDetails::no_match())?;
+                    } else {
+                        // not an expected error, propagate and maybe a user will message us for debugging
+                        return Err(e);
+                    }
+                }
+            };
+        }
+
+        // we finished all processing, finalize the debug BAM if it exists
+        if let Some(dbw) = debug_bam_writer.as_mut() {
+            // write all the records we have saved from the HLA/CYP2D6 processes
+            match dbw.write_all_records() {
+                Ok(()) => {},
+                Err(e) => {
+                    error!("Error while writing debug BAM: {e}");
+                    error!("Continuing processes...");
+                }
+            }
+        }
+    } else {
+        info!("No BAM files were provided, all alignment based diplotyping was skipped.");
+    }
+
+    Ok(diplotypes)
+}
+
+/// This is just a basic wrapper for variant-level metadata that we can tag on NormalizedVariants.
+#[derive(Clone, Debug, Default, PartialEq)]
+struct VariantMeta {
+    /// CPIC variant ID
+    pub variant_id: u64,
+    /// CPIC variant name
+    pub name: String,
+    /// DBSNP ID, if available
+    pub dbsnp_id: Option<String>
+}
+
+/// This will load all the haplotypes from the database and normalize them.
+/// It returns the set of loaded variants AND the haplotypes.
+/// Note that if a variant fails to normalize, the whole haplotype will be ignored.
+///
+/// Returns a tuple (`variant_hash`, `normalized_haplotypes`):
+/// * `variant_hash` - a HashMap from a NormalizedVariant to the original variant metadata
+/// * `normalized_haplotypes` - a Vec of NormalizedPgxHaplotypes loaded from the database
+/// # Arguments
+/// * `gene_entry` - the single gene entry from our database
+/// * `reference_genome` - the pre-loaded reference genome; if None, some normalization steps will not happen
+/// # Errors
+/// * if a variant_id is present in a haplotype definition but not in the variant set (aka, undefined)
+/// * if a variant has fewer than two alleles
+/// * if a variant in the database is incomplete
+#[allow(clippy::type_complexity)]
+fn load_database_haplotypes(gene_entry: &PgxGene, reference_genome: Option<&ReferenceGenome>)
+    -> Result<(HashMap<NormalizedVariant, VariantMeta>, Vec<NormalizedPgxHaplotype>), Box<dyn std::error::Error>> {
+    let mut normalized_haplotypes: Vec<NormalizedPgxHaplotype> = vec![];
+    let mut normalized_variants: HashMap<NormalizedVariant, VariantMeta> = Default::default();
+
+    let pgx_variants = gene_entry.variants();
+    for (haplotype_name, pgx_haplotype) in gene_entry.defined_haplotypes() {
+        // initialize a haplotype
+        let mut normalized_haplotype = NormalizedPgxHaplotype::new(haplotype_name.clone());
+        let mut normalized_variant_meta: Vec<VariantMeta> = vec![];
+        let mut normalized: bool = true;
+        for (variant_id, variant_allele) in pgx_haplotype.haplotype().iter() {
+            let variant: &PgxVariant = pgx_variants.get(variant_id).ok_or(format!("variant {variant_id} is referenced but not defined"))?;
+            let dbsnp: &Option<String> = variant.dbsnp_id();
+            let variant_name: String = variant.name().to_string();
+
+            let alleles = variant.alleles();
+            if alleles.len() < 2 {
+                bail!("Encountered variant {variant_id} with fewer than two alleles.");
+            }
+            for allele in alleles.iter() {
+                if allele.is_none() {
+                    bail!("Encountered variant {variant_id} with undefined alleles.");
+                }
+            }
+
+            // ref is allele 0
+            let ref_allele = alleles[0].as_deref().unwrap();
+            let alt_allele = variant_allele.as_str();
+
+            // check if this is a reference allele
+            if ref_allele != alt_allele {
+                match NormalizedVariant::multi_new(
+                    gene_entry.chromosome().to_string(),
+                    // 1-based -> 0-based
+                    variant.position() - 1,
+                    ref_allele,
+                    alt_allele,
+                    reference_genome
+                ) {
+                    Ok(nv) => {
+                        // the allele was successfully constructed, add it to this haplotype
+                        normalized_haplotype.add_variant(nv);
+                        normalized_variant_meta.push(VariantMeta {
+                            variant_id: *variant_id,
+                            name: variant_name,
+                            dbsnp_id: dbsnp.clone()
+                        });
+                    },
+                    Err(e) => {
+                        warn!("Error while normalizing database variant {variant_id}: {e}, {variant:?}");
+                        warn!("Ignoring {haplotype_name:?} due to variant incompatibility.");
+                        normalized = false;
+                        break;
+                    }
+                }
+            } else {
+                // this is a reference allele, we can just ignore it for now
+            }
+        }
+
+        if normalized {
+            // we need to add every variant we found to the hash set
+            assert_eq!(normalized_haplotype.variants().len(), normalized_variant_meta.len());
+            for (or_variants, variant_meta) in normalized_haplotype.variants().iter().zip(normalized_variant_meta.into_iter()) {
+                // this is a list of logically OR'ed variants; they should all have the same metadata though (except None)
+                for opt_nv in or_variants.iter() {
+                    match opt_nv {
+                        Some(nv) => {
+                            // add this variant if it is not already added
+                            match normalized_variants.entry(nv.clone()) {
+                                Occupied(entry) => {
+                                    // we already saved this one, make sure there are no surprises
+                                    assert_eq!(entry.get(), &variant_meta);
+                                },
+                                Vacant(entry) => {
+                                    // new one to add
+                                    entry.insert(variant_meta.clone());
+                                }
+                            };
+                        },
+                        None => {
+                            // do nothing, it's just indicating optional
+                        }
+                    };
+                }
+            }
+
+            // now we need to save this haplotype
+            normalized_haplotypes.push(normalized_haplotype);
+        }
+    }
+
+    Ok(
+        (normalized_variants, normalized_haplotypes)
+    )
+}
+
+/// This will load all the identified variants from a VCF file.
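+/// For each database variant, an indexed fetch is issued around the expected position
+/// (a +/- 50 bp window), and each ALT allele of each overlapping record is normalized
+/// and compared against the database variant for an exact match.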
+/// # Arguments
+/// * `vcf_fn` - the path to the VCF file, which must have an index
+/// * `variant_hash` - the set of normalized variants we are looking for
+/// * `reference_genome` - the pre-loaded reference genome; if None, some normalization steps will not happen
+/// # Errors
+/// * if there are issues opening the indexed VCF
+/// * if the chromosome for the variants cannot be found in the VCF
+/// # Panics
+/// * if the variant_hash is empty, the chrom.unwrap() will panic
+fn load_vcf_variants(vcf_fn: &Path, variant_hash: &HashMap<NormalizedVariant, VariantMeta>, reference_genome: Option<&ReferenceGenome>)
+    -> Result<HashMap<NormalizedVariant, NormalizedGenotype>, Box<dyn std::error::Error>> {
+    // first we need to open up the vcf and pull out the header
+    let mut vcf_reader: bcf::IndexedReader = bcf::IndexedReader::from_path(vcf_fn)?;
+    let vcf_header: bcf::header::HeaderView = vcf_reader.header().clone();
+
+    // prep our return struct
+    let mut ret: HashMap<NormalizedVariant, NormalizedGenotype> = Default::default();
+
+    // iterate over each variant, search for the corresponding entry in the VCF
+    for (variant, _v_meta) in variant_hash.iter() {
+        trace!("Searching for variant: {variant:?}");
+        let chrom: &str = variant.chrom();
+        let chrom_index: u32 = vcf_header.name2rid(chrom.as_bytes())?;
+        let position: usize = variant.position();
+        const BUFFER: usize = 50;
+        let min_search: usize = position.saturating_sub(BUFFER);
+        let max_search: usize = position.saturating_add(BUFFER);
+
+        // if we find anything, this gets changed
+        let mut search_genotype: Option<NormalizedGenotype> = None;
+        match vcf_reader.fetch(chrom_index, min_search as u64, Some(max_search as u64)) {
+            Ok(()) => {
+                // we can iterate as normal
+                for record_result in vcf_reader.records() {
+                    let record: rust_htslib::bcf::Record = record_result?;
+                    let alleles: Vec<&str> = record.alleles().iter()
+                        .map(|a| std::str::from_utf8(a).unwrap_or("UTF8_ERROR"))
+                        .collect();
+                    let ref_allele = alleles[0];
+
+                    // TODO: if we want to allow for multi-sample VCFs, we need to adjust this
+                    let all_genotypes = record.genotypes()?;
+                    let genotype = all_genotypes.get(0);
+                    if genotype.len() != 2 {
+                        warn!("Error while parsing genotype.len() != 2, ignoring: {} {} {:?} => {:?}", chrom, record.pos(), alleles, genotype);
+                        continue;
+                    }
+
+                    // figure out what the genotype is
+                    let mut is_phased = false;
+                    let gt1 = match genotype[0] {
+                        GenotypeAllele::Unphased(at) => Some(at),
+                        GenotypeAllele::Phased(at) => {
+                            is_phased = true;
+                            Some(at)
+                        },
+                        //TODO: ignore these for now, not sure how to handle it?
+                        GenotypeAllele::UnphasedMissing |
+                        GenotypeAllele::PhasedMissing => None
+                    };
+                    let gt2 = match genotype[1] {
+                        GenotypeAllele::Unphased(at) => Some(at),
+                        GenotypeAllele::Phased(at) => {
+                            is_phased = true;
+                            Some(at)
+                        },
+                        //TODO: ignore these for now, not sure how to handle it?
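+                        // as with gt1 above, missing alleles map to None and the record is skipped below with a warning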
+                        GenotypeAllele::UnphasedMissing |
+                        GenotypeAllele::PhasedMissing => None
+                    };
+
+                    // if we encounter empty genotypes, we will ignore them for now
+                    if gt1.is_none() || gt2.is_none() {
+                        warn!("Error while parsing incomplete genotype, ignoring: {} {} {:?} => {:?}", chrom, record.pos(), alleles, genotype);
+                        continue;
+                    }
+                    let gt1 = gt1.unwrap() as usize;
+                    let gt2 = gt2.unwrap() as usize;
+
+                    let phase_set: Option<usize> = if is_phased {
+                        // check for a phase set ID if we are phased
+                        match record.format(b"PS").integer() {
+                            Ok(all_ps_tag) => {
+                                // phase set parsing was fine
+                                let ps_tag = all_ps_tag[0];
+                                assert_eq!(ps_tag.len(), 1);
+                                Some(ps_tag[0] as usize)
+                            },
+                            Err(e) => {
+                                // phase set parsing failed, which is weird
+                                warn!("Failed to parse \"PS\" tag for variant, setting unphased: {} {} {:?} => {}", chrom, record.pos(), alleles, e);
+                                is_phased = false;
+                                None
+                            }
+                        }
+                    } else {
+                        // marked as unphased, so no ID
+                        None
+                    };
+
+                    for (alt_index, &alt_allele) in alleles.iter().enumerate().skip(1) {
+                        match NormalizedVariant::new(
+                            chrom.to_string(),
+                            record.pos() as usize,
+                            ref_allele,
+                            alt_allele,
+                            reference_genome
+                        ) {
+                            Ok(nv) => {
+                                if nv == *variant {
+                                    if alt_index == gt1 && alt_index == gt2 {
+                                        // homozygous call - we know this can never be reference because we did .skip(1)
+                                        if phase_set.is_some() {
+                                            // we do not expect homozygous records to have a phase set
+                                            bail!("Homozygous record detected with a phase set ID (PS): {}", record.desc());
+                                        }
+                                        assert!(search_genotype.is_none());
+                                        search_genotype = Some(NormalizedGenotype::new(
+                                            Genotype::HomozygousAlternate,
+                                            phase_set
+                                        ));
+                                    } else if alt_index == gt1 && is_phased {
+                                        // heterozygous call like 1|0
+                                        if phase_set.is_none() {
+                                            // heterozygous and phased, we need a PS tag
+                                            bail!("Phased record detected without a phase set ID (PS): {}", record.desc());
+                                        }
+                                        assert!(search_genotype.is_none());
+                                        search_genotype = Some(NormalizedGenotype::new(
+                                            Genotype::HeterozygousPhasedFlip,
+                                            phase_set
+                                        ));
+                                    } else if alt_index == gt2 && is_phased {
+                                        // heterozygous call like 0|1
+                                        if phase_set.is_none() {
+                                            // heterozygous and phased, we need a PS tag
+                                            bail!("Phased record detected without a phase set ID (PS): {}", record.desc());
+                                        }
+                                        assert!(search_genotype.is_none());
+                                        search_genotype = Some(NormalizedGenotype::new(
+                                            Genotype::HeterozygousPhased,
+                                            phase_set
+                                        ));
+                                    } else if (alt_index == gt1 || alt_index == gt2) && !is_phased {
+                                        // heterozygous call like 0/1 (this can handle situations like 1/2 also though)
+                                        if phase_set.is_some() {
+                                            // heterozygous and marked as unphased, so if we have a PS tag, something is weird
+                                            bail!("Unphased heterozygous record detected with a phase set ID (PS): {}", record.desc());
+                                        }
+                                        assert!(search_genotype.is_none());
+                                        search_genotype = Some(NormalizedGenotype::new(
+                                            Genotype::HeterozygousUnphased,
+                                            phase_set
+                                        ));
+                                    } else {
+                                        // neither, possibly hom-reference or a different form of the allele
+                                    }
+                                } else {
+                                    // this is a variant, just not one that matches what we are looking for
+                                }
+                            },
+                            Err(e) => {
+                                warn!("Error parsing VCF variant {} {} {:?}: {e}", chrom, record.pos(), alleles);
+                            }
+                        };
+                    }
+                }
+            },
+            Err(e) => {
+                // this usually happens when there are no entries for the chromosome
+                // error in the search, we will handle these later if they pop up
+                warn!("Received '{}', while seeking to {}:{}-{}, assuming no variants present", e, chrom, min_search, max_search);
+            }
+        };
+
+        // check if the search found anything
+        if let Some(ng) = search_genotype {
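+            // exactly one matching ALT genotype was found in the search window; save it for this database variant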
debug!("Genotype found for {variant:?}: {ng:?}"); + ret.insert(variant.clone(), ng); + } else { + trace!("Genotype not found."); + } + } + + Ok(ret) +} + +/// This is the workhorse function for computing the diplotype. +/// This initial implementation requires an *exact* match of the two haplotypes. +/// There can be ambiguity though, even with exact matches, so a Vec is returned. +/// # Arguments +/// * `normalized_haplotypes` - all possible haplotypes that are defined +/// * `variant_calls` - the set of variants that were identified as well as their genotype +/// # Errors +/// * None so far +fn solve_diplotype(normalized_haplotypes: &[NormalizedPgxHaplotype], variant_calls: &HashMap) -> Result, Box> { + // build up our base homozygous haplotype and also order the het variants at the same time + let mut base_haplotype: Vec = vec![]; + let mut het_variants: Vec = vec![]; + let mut null_haplogroups: usize = 0; + let mut identified_haplogroups: HashSet = Default::default(); + for (variant, genotype) in variant_calls.iter() { + match genotype.genotype() { + Genotype::HomozygousReference => panic!("we do not actually do anything with this"), + Genotype::HomozygousAlternate => base_haplotype.push(variant.clone()), + Genotype::HeterozygousUnphased | + Genotype::HeterozygousPhased | + Genotype::HeterozygousPhasedFlip => { + // always save the variant + het_variants.push(variant.clone()); + + // figure out the haplogroup + match genotype.phase_set() { + Some(ps) => { identified_haplogroups.insert(*ps); }, + None => null_haplogroups += 1 + }; + } + }; + } + + // now the magic + let diplotype: Vec = if het_variants.is_empty() { + // there are no heterozygous variants, so we are returning a homozygous allele + let mut matched: Option = None; + for haplotype in normalized_haplotypes.iter() { + if haplotype.matches(&base_haplotype) { + assert!(matched.is_none()); + matched = Some(haplotype.haplotype_name().to_string()); + } + } + + // since we require exact matching, it's possible we fail to find something + let match_name: String = matched.unwrap_or("NO_MATCH".to_string()); + vec![ + Diplotype::new(&match_name, &match_name) + ] + } else { + // complicated path - we have heterozygous variants to resolve + let total_haplogroups: usize = null_haplogroups + identified_haplogroups.len(); + + // if we have 'x' hets, there are 2^(x-1) combinations due to symmetry; iterate over that combination count + let max_combinations = 2_usize.pow(total_haplogroups as u32 - 1); + let mut valid_diplotypes: Vec = vec![]; + for combination in 0..max_combinations { + // first resolve this combination into the two haplotypes + let mut h1: Vec = base_haplotype.clone(); + let mut h2: Vec = base_haplotype.clone(); + + let mut combo_index: usize = 0; + let mut ps_lookup: HashMap = Default::default(); + for hv in het_variants.iter() { + // let is_h1: bool = ((combination >> i) & 0x1) != 0; + let genotype = variant_calls.get(hv).unwrap(); + + // this basically controls the permutation we are on + // if we have 3 iterations of unphased variants, the first one is h1=000, h2=111; i.e. 
+                let is_h1: bool = match genotype.phase_set() {
+                    Some(ps) => {
+                        match ps_lookup.entry(*ps) {
+                            Occupied(entry) => {
+                                // we already assigned this PS
+                                *entry.get()
+                            },
+                            Vacant(entry) => {
+                                // we have not assigned this PS, so do so now
+                                let r = ((combination >> combo_index) & 0x1) != 0;
+                                entry.insert(r);
+                                combo_index += 1;
+                                r
+                            }
+                        }
+                    },
+                    None => {
+                        // unphased, so we always combo bump
+                        let r = ((combination >> combo_index) & 0x1) != 0;
+                        combo_index += 1;
+                        r
+                    }
+                };
+
+                // controls orientation
+                let orientation01: bool = match genotype.genotype() {
+                    // unphased 0/1 and phased 0|1 orientations are treated the same
+                    Genotype::HeterozygousUnphased |
+                    Genotype::HeterozygousPhased => true,
+                    // flipped 1|0 orientation
+                    Genotype::HeterozygousPhasedFlip => false,
+                    _ => panic!("we should not have homs in the het list")
+                };
+
+                if is_h1 == orientation01 {
+                    // if (is_h1 AND normal orientation) OR (is_h2 AND flipped orientation)
+                    h1.push(hv.clone());
+                } else {
+                    // if (is_h1 AND flipped orientation) OR (is_h2 AND normal orientation)
+                    h2.push(hv.clone());
+                }
+            }
+
+            assert_eq!(combo_index, total_haplogroups);
+            debug!("\t combination {} = {:?}, {:?}", combination, h1, h2);
+
+            // now compare them to see if they both match an allele
+            let mut h1_matched: Option<String> = None;
+            let mut h2_matched: Option<String> = None;
+            for haplotype in normalized_haplotypes.iter() {
+                if haplotype.matches(&h1) {
+                    assert!(h1_matched.is_none());
+                    h1_matched = Some(haplotype.haplotype_name().to_string());
+                }
+                if haplotype.matches(&h2) {
+                    assert!(h2_matched.is_none());
+                    h2_matched = Some(haplotype.haplotype_name().to_string());
+                }
+            }
+
+            if let (Some(h1_name), Some(h2_name)) = (h1_matched, h2_matched) {
+                // both matched, save this combination
+                valid_diplotypes.push(Diplotype::new(&h1_name, &h2_name));
+            }
+        }
+
+        if valid_diplotypes.is_empty() {
+            // no exact matching combinations were found
+            vec![
+                Diplotype::new("NO_MATCH", "NO_MATCH")
+            ]
+        } else {
+            // we found one or more
+            valid_diplotypes
+        }
+    };
+
+    Ok(diplotype)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::path::PathBuf;
+
+    use crate::util::file_io::load_json;
+
+    fn load_test_reference() -> ReferenceGenome {
+        let ref_fn = PathBuf::from("test_data/test_reference.fa");
+        ReferenceGenome::from_fasta(&ref_fn).unwrap()
+    }
+
+    fn create_dummy_cli() -> DiplotypeSettings {
+        Default::default()
+    }
+
+    /// mainly tests that the loaded database values match expectations; CACNA1S is a relatively simple starter
+    #[test]
+    fn test_load_database_haplotypes() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/CACNA1S/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let gene_entry: &PgxGene = database.gene_entries().get("CACNA1S").unwrap();
+
+        // get the haplotypes
+        let (normalized_variants, normalized_haplotypes) = load_database_haplotypes(gene_entry, None).unwrap();
+
+        // check the variants
+        let v1 = NormalizedVariant::new("chr1".to_string(), 201091992, "G", "A", None).unwrap();
+        let v1_meta = VariantMeta { variant_id: 777260, name: "faux".to_string(), dbsnp_id: Some("rs772226819".to_string()) };
+        let v2 = NormalizedVariant::new("chr1".to_string(), 201060814, "C", "T", None).unwrap();
+        let v2_meta = VariantMeta { variant_id: 777261, name: "faux".to_string(), dbsnp_id: Some("rs1800559".to_string()) };
+        let expected_variants = HashMap::from_iter(vec![
+            (v1.clone(), v1_meta),
+            (v2.clone(), v2_meta)
+        ].into_iter());
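+        // the two CACNA1S database variants should be the only normalized entries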
+        assert_eq!(normalized_variants, expected_variants);
+
+        // check the haplotypes
+        assert_eq!(normalized_haplotypes.len(), 3);
+        let h1 = NormalizedPgxHaplotype::new("Reference".to_string());
+        let mut h2 = NormalizedPgxHaplotype::new("c.3257G>A".to_string());
+        h2.add_variant(vec![Some(v2)]);
+        let mut h3 = NormalizedPgxHaplotype::new("c.520C>T".to_string());
+        h3.add_variant(vec![Some(v1)]);
+        let expected_haplotypes = vec![h1, h2, h3];
+        assert_eq!(normalized_haplotypes, expected_haplotypes);
+    }
+
+    /// tests that we load variants from a homozygous-only VCF correctly
+    #[test]
+    fn test_load_vcf_variants() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/CACNA1S/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let gene_entry: &PgxGene = database.gene_entries().get("CACNA1S").unwrap();
+
+        // get the haplotypes
+        let (normalized_variants, _normalized_haplotypes) = load_database_haplotypes(gene_entry, None).unwrap();
+
+        // now we need to load from VCF
+        let vcf_fn = PathBuf::from("./test_data/CACNA1S/hom.vcf.gz");
+        let vcf_variants = load_vcf_variants(&vcf_fn, &normalized_variants, None).unwrap();
+
+        // make sure we have this variant as homozygous
+        let expected_variant = NormalizedVariant::new("chr1".to_string(), 201060814, "C", "T", None).unwrap();
+        let expected_genotype = NormalizedGenotype::new(Genotype::HomozygousAlternate, None);
+
+        // we just expect the one hom call
+        assert_eq!(vcf_variants.len(), 1);
+        assert_eq!(*vcf_variants.get(&expected_variant).unwrap(), expected_genotype);
+    }
+
+    /// tests that bad variants throw errors
+    #[test]
+    fn test_invalid_ps_vcf() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/CACNA1S/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let gene_entry: &PgxGene = database.gene_entries().get("CACNA1S").unwrap();
+
+        // get the haplotypes
+        let (normalized_variants, _normalized_haplotypes) = load_database_haplotypes(gene_entry, None).unwrap();
+
+        // now we need to test the bad VCF
+        let vcf_fn = PathBuf::from("./test_data/CACNA1S/bad_hom_ps.vcf.gz");
+        let result = load_vcf_variants(&vcf_fn, &normalized_variants, None);
+        assert!(result.is_err());
+    }
+
+    /// This is basically a full test of the single CACNA1S homozygous call
+    #[test]
+    fn test_solve_diplotype_hom() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/CACNA1S/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/CACNA1S/hom.vcf.gz");
+
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), None, &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("CACNA1S").unwrap().diplotypes(), vec![Diplotype::new("c.3257G>A", "c.3257G>A")]);
+    }
+
+    /// This is basically a full test of the single CACNA1S heterozygous call
+    #[test]
+    fn test_solve_diplotype_het() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/CACNA1S/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/CACNA1S/het.vcf.gz");
+
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), None, &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("CACNA1S").unwrap().diplotypes(), vec![Diplotype::new("Reference", "c.3257G>A")]);
+    }
+
+    /// This is basically a full test of the CACNA1S compound heterozygous call
+    #[test]
+    fn test_solve_diplotype_compound_het() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/CACNA1S/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/CACNA1S/compound_het.vcf.gz");
+
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), None, &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("CACNA1S").unwrap().diplotypes(), vec![Diplotype::new("c.520C>T", "c.3257G>A")]);
+    }
+
+    /// This is basically a full test of the RNR1 compound het for overlapping variants
+    #[test]
+    fn test_solve_diplotype_overlapping_compound_het() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/RNR1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/RNR1-faux/compound_het.vcf.gz");
+
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("MT-RNR1").unwrap().diplotypes(), vec![Diplotype::new("961T>del", "961T>del+Cn")]);
+    }
+
+    /// This is basically a full test of the RNR1 homozygous call for overlapping variants
+    #[test]
+    fn test_solve_diplotype_overlapping_hom() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/RNR1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/RNR1-faux/hom.vcf.gz");
+
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("MT-RNR1").unwrap().diplotypes(), vec![Diplotype::new("961T>del+Cn", "961T>del+Cn")]);
+    }
+
+    /// This is basically a full test of the UGT1A1 variant *1/*80+*28
+    #[test]
+    fn test_solve_same_phase_001() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/UGT1A1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/UGT1A1-faux/same_phase_001.vcf.gz");
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("UGT1A1").unwrap().diplotypes(), vec![Diplotype::new("*1", "*80+*28")]);
+    }
+
+    /// Same as above, but reversed; for now it keeps the alleles ordered by the phasing
+    #[test]
+    fn test_solve_same_phase_002() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/UGT1A1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/UGT1A1-faux/same_phase_002.vcf.gz");
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("UGT1A1").unwrap().diplotypes(), vec![Diplotype::new("*80+*28", "*1")]);
+    }
+
+    /// Opposite phased alleles
+    #[test]
+    fn test_solve_opposite_phase_001() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/UGT1A1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/UGT1A1-faux/opposite_phase_001.vcf.gz");
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("UGT1A1").unwrap().diplotypes(), vec![Diplotype::new("*28", "*80")]);
+    }
+
+    /// Opposite phased alleles, and swap to *37
+    #[test]
+    fn test_solve_opposite_phase_002() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/UGT1A1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/UGT1A1-faux/opposite_phase_002.vcf.gz");
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("UGT1A1").unwrap().diplotypes(), vec![Diplotype::new("*80", "*37")]);
+    }
+
+    /// Homozygous allele + phased allele
+    #[test]
+    fn test_solve_hethom_phase_001() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/UGT1A1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/UGT1A1-faux/hethom_phase_001.vcf.gz");
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("UGT1A1").unwrap().diplotypes(), vec![Diplotype::new("*80+*28", "*80+*37")]);
+    }
+
+    /// Different phase sets, so effectively unphased; should return two options
+    #[test]
+    fn test_solve_different_phaseset_001() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/UGT1A1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/UGT1A1-faux/different_phaseset_001.vcf.gz");
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("UGT1A1").unwrap().diplotypes(), vec![
+            Diplotype::new("*1", "*80+*28"),
+            Diplotype::new("*28", "*80")
+        ]);
+    }
+
+    /// Different phase sets, so effectively unphased; should return two options that shift the *80 around
+    #[test]
+    fn test_solve_different_phaseset_002() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/UGT1A1-faux/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+        let reference_genome = load_test_reference();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/UGT1A1-faux/different_phaseset_002.vcf.gz");
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), Some(&reference_genome), &[], &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 1);
+        assert_eq!(*diplotypes.get("UGT1A1").unwrap().diplotypes(), vec![
+            Diplotype::new("*28", "*80+*37"),
+            Diplotype::new("*37", "*80+*28")
+        ]);
+    }
+
+    /// test when we don't provide a VCF, we should get no calls;
+    /// we prevent this in the CLI, but worth checking that we don't crash
+    #[test]
+    fn test_no_files() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("./data/v0.9.0/cpic_20240404.json.gz");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+
+        // no reference, VCF, or BAM inputs provided
+        let no_ref_fn = None;
+        let no_vcf_fn = None;
+        let no_bams = [];
+        let diplotype = call_diplotypes(&database, no_vcf_fn, no_ref_fn, &no_bams, &create_dummy_cli()).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        // make sure we didn't call anything
+        let diplotypes = diplotype.gene_details();
+        assert_eq!(diplotypes.len(), 0);
+    }
+
+    #[test]
+    fn test_include_set() {
+        // load the database
+        let database_fn: PathBuf = PathBuf::from("test_data/CACNA1S/database.json");
+        let database: PgxDatabase = load_json(&database_fn).unwrap();
+
+        // and here's the VCF
+        let vcf_fn = PathBuf::from("./test_data/CACNA1S/compound_het.vcf.gz");
+
+        // test a CLI that just includes CACNA1S
+        let mut cli = create_dummy_cli();
+        cli.include_fn = Some(PathBuf::from("test_data/CACNA1S/CACNA1S_gene_list.txt"));
+
+        let diplotype = call_diplotypes(&database, Some(&vcf_fn), None, &[], &cli).unwrap();
+
+        // make sure the metadata matches exactly
+        assert_eq!(database.database_metadata(), diplotype.database_metadata());
+
+        let diplotypes = diplotype.gene_details();
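+        // CACNA1S is in the include list, so the normal compound het call is produced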
assert_eq!(diplotypes.len(), 1); + assert_eq!(*diplotypes.get("CACNA1S").unwrap().diplotypes(), vec![Diplotype::new("c.520C>T", "c.3257G>A")]); + + // now do the same test, but with an empty include list + let mut cli = create_dummy_cli(); + cli.include_fn = Some(PathBuf::from("test_data/empty_gene_list.txt")); + + let diplotype = call_diplotypes(&database, Some(&vcf_fn), None, &[], &cli).unwrap(); + + // make sure the metadata matches exactly + assert_eq!(database.database_metadata(), diplotype.database_metadata()); + + let diplotypes = diplotype.gene_details(); + assert_eq!(diplotypes.len(), 0); + } + + #[test] + fn test_exclude_set() { + // load the database + let database_fn: PathBuf = PathBuf::from("test_data/CACNA1S/database.json"); + let database: PgxDatabase = load_json(&database_fn).unwrap(); + + // and here's the VCF + let vcf_fn = PathBuf::from("./test_data/CACNA1S/compound_het.vcf.gz"); + + // test a CLI that excludes nothing from the list + let mut cli = create_dummy_cli(); + cli.exclude_fn = Some(PathBuf::from("test_data/empty_gene_list.txt")); + + let diplotype = call_diplotypes(&database, Some(&vcf_fn), None, &[], &cli).unwrap(); + + // make sure the metadata matches exactly + assert_eq!(database.database_metadata(), diplotype.database_metadata()); + + let diplotypes = diplotype.gene_details(); + assert_eq!(diplotypes.len(), 1); + assert_eq!(*diplotypes.get("CACNA1S").unwrap().diplotypes(), vec![Diplotype::new("c.520C>T", "c.3257G>A")]); + + // now do the same test, but with our gene in the exclude list + let mut cli = create_dummy_cli(); + cli.exclude_fn = Some(PathBuf::from("test_data/CACNA1S/CACNA1S_gene_list.txt")); + + let diplotype = call_diplotypes(&database, Some(&vcf_fn), None, &[], &cli).unwrap(); + + // make sure the metadata matches exactly + assert_eq!(database.database_metadata(), diplotype.database_metadata()); + + let diplotypes = diplotype.gene_details(); + assert_eq!(diplotypes.len(), 0); + } +} \ No newline at end of file diff --git a/src/hla/alleles.rs b/src/hla/alleles.rs new file mode 100644 index 0000000..fb1453d --- /dev/null +++ b/src/hla/alleles.rs @@ -0,0 +1,311 @@ + +use serde::{Deserialize, Serialize}; +use simple_error::{SimpleError, bail}; +use std::collections::BTreeMap; + +use crate::data_types::coordinates::Coordinates; + +/// Contains the configuration for the HLA database. +/// These are generally coordinates that we do not expect to change except between reference builds. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct HlaConfig { + /// High-level coordinates of the HLA regions + hla_coordinates: BTreeMap<String, Coordinates>, + /// Strand orientation for the gene relative to the reference genome + #[serde(default="HlaConfig::default_strand")] + hla_is_forward_strand: BTreeMap<String, bool>, + /// Specific subregion, like exons + hla_exons: BTreeMap<String, Vec<Coordinates>>, +} + +impl HlaConfig { + /// This function should be called after loading a config to verify that everything required to run the algorithms is present.
+ pub fn validate_config(&self) -> Result<(), SimpleError> { + // make sure all expected regions are defined + let expected_hla_coordinates = [ + "HLA-A", "HLA-B" + ]; + for &k in expected_hla_coordinates.iter() { + if !self.hla_coordinates.contains_key(k) { + bail!("Coordinates for \"{}\" were not found in provided hla_coordinates.", k); + } + } + + // make sure all exon regions are defined + let expected_num_exons = 8; + for &k in expected_hla_coordinates.iter() { + if !self.hla_exons.contains_key(k) { + bail!("Data for \"{}\" was not found in provided hla_exons.", k); + } + let exons = self.hla_exons.get(k).unwrap(); + if exons.len() != expected_num_exons { + bail!("Found {} exons for \"{}\", expected {}.", exons.len(), k, expected_num_exons); + } + } + + Ok(()) + } + + // getters + pub fn hla_coordinates(&self) -> &BTreeMap<String, Coordinates> { + &self.hla_coordinates + } + + pub fn hla_is_forward_strand(&self) -> &BTreeMap<String, bool> { + &self.hla_is_forward_strand + } + + pub fn hla_exons(&self) -> &BTreeMap<String, Vec<Coordinates>> { + &self.hla_exons + } + + // Defaults for strand orientations. Having this as a separate function allows us to do an update without requiring a DB update. + fn default_strand() -> BTreeMap<String, bool> { + let mut hla_is_forward_strand: BTreeMap<String, bool> = Default::default(); + hla_is_forward_strand.insert("HLA-A".to_string(), true); + hla_is_forward_strand.insert("HLA-B".to_string(), false); + hla_is_forward_strand + } +} + +impl Default for HlaConfig { + fn default() -> Self { + let mut hla_coordinates: BTreeMap<String, Coordinates> = Default::default(); + let preshift = 1; // coordinates below are from UCSC and are not 0-base shifted + let postshift = 0; + // HLA-A + // chr6:29,942,532-29,945,870 from UCSC browser + // 03:01:01:01 blats to chr6:29942254-29945755, partially updated below (old end was higher); TODO: systematically solve this? + hla_coordinates.insert("HLA-A".to_string(), Coordinates::new("chr6".to_string(), 29942254 - preshift, 29945870 - postshift)); + // HLA-B + // chr6:31,353,875-31,357,179 from UCSC browser + // 08:01:01:01 blats to chr6:31353362-31357442, updated below; TODO: systematically solve this? + hla_coordinates.insert("HLA-B".to_string(), Coordinates::new("chr6".to_string(), 31353362 - preshift, 31357442 - postshift)); + + let hla_is_forward_strand: BTreeMap<String, bool> = HlaConfig::default_strand(); + + /* + // we were originally using this as a filter, but it's no longer necessary; preserving in case we need this in the future + // the HLA lengths + static ref HLA_CDNA_LENGTHS: HashMap<String, usize> = { + let mut hla_lengths: HashMap<String, usize> = Default::default(); + hla_lengths.insert("HLA-A".to_string(), 1098); + hla_lengths.insert("HLA-B".to_string(), 1089); + hla_lengths + }; + */ + + // TODO: figure out how to automate this part in the future?
+ let mut hla_exons: BTreeMap<String, Vec<Coordinates>> = Default::default(); + hla_exons.insert("HLA-A".to_string(), + vec![ + // 0-based, exclusive, sorted; copied from RefSeq file + Coordinates::new("chr6".to_string(), 29942532 - preshift, 29942626 - postshift), + Coordinates::new("chr6".to_string(), 29942757 - preshift, 29943026 - postshift), + Coordinates::new("chr6".to_string(), 29943268 - preshift, 29943543 - postshift), + Coordinates::new("chr6".to_string(), 29944122 - preshift, 29944397 - postshift), + Coordinates::new("chr6".to_string(), 29944500 - preshift, 29944616 - postshift), + Coordinates::new("chr6".to_string(), 29945059 - preshift, 29945091 - postshift), + Coordinates::new("chr6".to_string(), 29945234 - preshift, 29945281 - postshift), + Coordinates::new("chr6".to_string(), 29945451 - preshift, 29945870 - postshift) + ] + ); + hla_exons.insert("HLA-B".to_string(), + vec![ + // 0-based, exclusive, sorted; copied from RefSeq file + Coordinates::new("chr6".to_string(), 31353875 - preshift, 31354296 - postshift), + Coordinates::new("chr6".to_string(), 31354479 - preshift, 31354526 - postshift), + Coordinates::new("chr6".to_string(), 31354633 - preshift, 31354665 - postshift), + Coordinates::new("chr6".to_string(), 31355107 - preshift, 31355223 - postshift), + Coordinates::new("chr6".to_string(), 31355317 - preshift, 31355592 - postshift), + Coordinates::new("chr6".to_string(), 31356167 - preshift, 31356442 - postshift), + Coordinates::new("chr6".to_string(), 31356688 - preshift, 31356957 - postshift), + Coordinates::new("chr6".to_string(), 31357086 - preshift, 31357179 - postshift) + ] + ); + + Self { + hla_coordinates, + hla_is_forward_strand, + hla_exons + } + } +} + +/// Wrapper for all the HLA allele information +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct HlaAlleleDefinition { + /// The identifier from IMGT + hla_id: String, + /// The gene name this goes to; e.g. HLA-A + gene_name: String, + /// The assigned star allele as a Vec; e.g.
["01", "01", "01"] is a three field allele + star_allele: Vec, + /// The DNA sequence for the record + dna_sequence: Option, + /// The cDNA sequence for the record + cdna_sequence: String +} + +impl HlaAlleleDefinition { + /// Creates a new HlaAlleleDefinition and performs some checks along the way + /// # Arguments + /// * `hla_id` - the identifier, expected to be of form "HLA:HLA00001" + /// * `description` - basically, the star allele, should be of form "A*01:01:01:01" + /// * `dna_sequence` - the optional DNA sequence, should be ACGT symbols only + /// * `cdna_sequence` - the cDNA sequence, should be ACGT symbols only + pub fn new(hla_id: String, description: &str, dna_sequence: Option, cdna_sequence: String) -> Result> { + let star_split: Vec<&str> = description.split('*').collect(); + if star_split.len() != 2 { + bail!("Star split length != 2 for allele description: {description}"); + } + let gene_name: String = format!("HLA-{}", star_split[0]); + + let star_allele: Vec = star_split[1].split(':').map(String::from).collect(); + if star_allele.len() > 4 { + bail!("Unexpected number of fields for allele description: {description}"); + } + + let allowed_symbols = ['A', 'C', 'G', 'T']; + if let Some(dna) = dna_sequence.as_ref() { + if !dna.chars().all(|c| allowed_symbols.contains(&c)) { + bail!("DNA sequence contains non-ACGT symbols."); + } + } + if !cdna_sequence.chars().all(|c| allowed_symbols.contains(&c)) { + bail!("cDNA sequence contains non-ACGT symbols."); + } + + Ok(HlaAlleleDefinition { + hla_id, + gene_name, + star_allele, + dna_sequence, + cdna_sequence + }) + } + + pub fn hla_id(&self) -> &str { + &self.hla_id + } + + pub fn gene_name(&self) -> &str { + &self.gene_name + } + + pub fn star_allele(&self) -> &[String] { + &self.star_allele + } + + pub fn dna_sequence(&self) -> Option<&str> { + self.dna_sequence.as_deref() + } + + pub fn cdna_sequence(&self) -> &str { + &self.cdna_sequence + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::path::PathBuf; + + use crate::util::file_io::load_json; + + #[test] + fn test_config_full_length() { + // full file + let test_fn = PathBuf::from("test_data/HLA_configs/full_length.json"); + let config: HlaConfig = load_json(&test_fn).unwrap(); + assert!(config.validate_config().is_ok()); + } + + #[test] + fn test_config_missing_regions() { + // this one is missing a CYP2D6 coordinate + let test_fn = PathBuf::from("test_data/HLA_configs/missing_regions.json"); + let config: HlaConfig = load_json(&test_fn).unwrap(); + assert!(config.validate_config().is_err()); + } + + #[test] + fn test_config_missing_exons() { + // this one is missing a CYP2D6 exon + let test_fn = PathBuf::from("test_data/HLA_configs/missing_exons.json"); + let config: HlaConfig = load_json(&test_fn).unwrap(); + assert!(config.validate_config().is_err()); + } + + #[test] + fn test_good_allele_def() { + let test_name = "test_name".to_string(); + let test_gene = "A"; + let test_star = "01:01:01:01"; + let test_description = format!("{test_gene}*{test_star}"); + let test_dna = Some("ACGT".to_string()); + let test_cdna = "CG".to_string(); + let test_result = HlaAlleleDefinition::new( + test_name.clone(), + &test_description, + test_dna.clone(), + test_cdna.clone() + ).unwrap(); + assert_eq!(test_result, HlaAlleleDefinition { + hla_id: test_name, + gene_name: "HLA-A".to_string(), + star_allele: vec!["01".to_string(); 4], + dna_sequence: test_dna, + cdna_sequence: test_cdna + }); + } + + #[test] + fn test_bad_fields() { + // not getting tested + let test_name = 
"test_name".to_string(); + let test_dna = Some("ACGT".to_string()); + let test_cdna = "CG".to_string(); + + // too many fields + let test_gene = "A"; + let test_star = "01:01:01:01:01"; + let test_description = format!("{test_gene}*{test_star}"); + let test_result = HlaAlleleDefinition::new( + test_name, + &test_description, + test_dna, + test_cdna + ); + assert!(test_result.is_err()); + } + + #[test] + fn test_bad_alleles() { + let test_name = "test_name".to_string(); + let test_gene = "A"; + let test_star = "01:01:01:01"; + let test_description = format!("{test_gene}*{test_star}"); + + // bad dna + let test_bad = "BOB".to_string(); + let test_good = "CG".to_string(); + let test_result = HlaAlleleDefinition::new( + test_name.clone(), + &test_description, + Some(test_bad.clone()), + test_good.clone() + ); + assert!(test_result.is_err()); + + // bad cnda + let test_result = HlaAlleleDefinition::new( + test_name.clone(), + &test_description, + Some(test_good), + test_bad + ); + assert!(test_result.is_err()); + } +} \ No newline at end of file diff --git a/src/hla/caller.rs b/src/hla/caller.rs new file mode 100644 index 0000000..4fa5915 --- /dev/null +++ b/src/hla/caller.rs @@ -0,0 +1,1075 @@ + +use bio::bio_types::genome::AbstractInterval; +use log::{debug, error, info, trace, warn}; +use minimap2::Aligner; +use rust_htslib::bam::record::{Cigar, CigarString}; +use rust_htslib::bam::Read; +use rust_htslib::bam::ext::BamRecordExtensions; +use rust_lib_reference_genome::reference_genome::ReferenceGenome; +use rustc_hash::FxHashMap as HashMap; +use simple_error::bail; +use statrs::distribution::{Binomial, DiscreteCDF}; +use waffle_con::cdwfa_config::{CdwfaConfig, CdwfaConfigBuilder, CdwfaConfigBuilderError}; +use waffle_con::consensus::ConsensusDWFA; +use waffle_con::dual_consensus::{DualConsensus, DualConsensusDWFA}; +use std::collections::BTreeMap; +use std::path::PathBuf; + +use crate::cli::diplotype::DiplotypeSettings; +use crate::data_types::coordinates::Coordinates; +use crate::data_types::database::PgxDatabase; +use crate::data_types::mapping::MappingStats; +use crate::data_types::pgx_diplotypes::{Diplotype, PgxGeneDetails, PgxMappingDetails}; +use crate::hla::alleles::HlaAlleleDefinition; +use crate::hla::debug::{HlaDebug, ReadMappingStats}; +use crate::hla::mapping::HlaMappingStats; +use crate::hla::processed_match::HlaProcessedMatch; +use crate::util::file_io::save_fasta; +use crate::util::sequence::reverse_complement; +use crate::visualization::debug_bam_writer::{unmapped_record, DebugBamWriter}; + +/// This is the main function to call for HLA diplotyping from a BAM file. 
+/// # Arguments +/// * `gene_list` - the list of HLA genes to diplotype +/// * `database` - the pre-loaded database +/// * `bam_filenames` - list of BAM files containing reads to scan +/// * `reference_filename` - reference genome file path +/// * `cli_settings` - settings for diplotyping +/// # Errors +/// * if a gene is provided that we do not support +/// * if we cannot open or parse a BAM file correctly +pub fn diplotype_hla( + gene_list: &[String], database: &PgxDatabase, + bam_filenames: &[PathBuf], reference_genome: &ReferenceGenome, + mut debug_bam_writer: Option<&mut DebugBamWriter>, + cli_settings: &DiplotypeSettings +) -> Result, Box> { + // if we have disabled cDNA scoring AND the DNA requirement is NOT enabled; then we can cause errors later due to lack of comparator sequence + if cli_settings.disable_cdna_scoring && !cli_settings.hla_require_dna { + bail!("If cDNA scoring is disabled, require HLA DNA must be enabled"); + } + + // prep all the bam readers + let mut bam_readers: Vec = vec![]; + for bam_fn in bam_filenames.iter() { + let mut b = rust_htslib::bam::IndexedReader::from_path(bam_fn)?; + b.set_reference(reference_genome.filename())?; + bam_readers.push(b); + } + + //set up job configuration + let mut ret: HashMap = Default::default(); + let mut debug_stats = HlaDebug::new(); + + for gene_name in gene_list.iter() { + info!("Solving {gene_name}..."); + + // get the coordinates for this gene + let gene_coordinates = match database.hla_config().hla_coordinates().get(gene_name) { + Some(c) => c, + None => bail!("No coordinates for {gene_name}") + }; + debug!("Gene coordinates: {gene_coordinates:?}"); + + // build a reference DNA aligner + let buffer = 100; + let reference_coordinates = Coordinates::new(gene_coordinates.chrom().to_string(), gene_coordinates.start() - buffer, gene_coordinates.end() + buffer); + debug!("Buffered coordinates: {reference_coordinates:?}"); + let reference_sequence = reference_genome.get_slice(reference_coordinates.chrom(), reference_coordinates.start() as usize, reference_coordinates.end() as usize); + let ref_len = reference_sequence.len(); + let dna_aligner: Aligner = Aligner::builder() + .map_hifi() + .with_cigar() + .with_seq(reference_sequence)?; + + // we only need cigar and md for debugging + // other settings for mapping + let output_cigar: bool = true; + let output_md: bool = true; + let max_frag_len: Option = None; + let extra_flags = None; + + // this is where we collect all the DNA segments; using BTreeMap for defined iteration order + let mut read_segments: BTreeMap = Default::default(); + let mut spliced_segments: BTreeMap = Default::default(); + + // store all the relevant mapping details + let mut mapping_details: Vec = Default::default(); + + // iterate over each bam, and fetch the reads + for (bam_index, bam) in bam_readers.iter_mut().enumerate() { + match bam.fetch(reference_coordinates.fetch_definition()) { + Ok(()) => {}, + Err(e) => { + let filename = &bam_filenames[bam_index]; + warn!("Received error \"{e}\" while fetching {reference_coordinates} in {filename:?}, assuming no reads for region."); + continue; + } + }; + + for read_entry in bam.records() { + let mut read = read_entry.unwrap(); + + //build out the cigar info + read.cache_cigar(); + + let qname: String = std::str::from_utf8(read.qname())?.to_string(); + let full_range = read.range(); + + if full_range.start > reference_coordinates.start() || full_range.end < reference_coordinates.end() { + continue; + } + + let read_bytes = read.seq().as_bytes(); + let 
d_mappings = dna_aligner.map( + &read_bytes, + output_cigar, output_md, max_frag_len, extra_flags.clone() + )?; + + let mut best_stats = MappingStats::new(ref_len, ref_len, 0); + let mut best_mapping = None; + for m in d_mappings.iter() { + // scoring is based on the lowest edit distance, including unmapped + let nm = m.alignment.as_ref().unwrap().nm as usize; + + // some sanity checks while we debug + let unmapped = ref_len - (m.target_end - m.target_start) as usize; + let stats = MappingStats::new(ref_len, nm, unmapped); + trace!("\t{stats:?}"); + if stats.mapping_score().score() < best_stats.mapping_score().score() { + best_stats = stats; + best_mapping = Some(m); + } + } + + if best_stats.mapping_score().score() > cli_settings.max_error_rate { + debug!("Best score for {qname} was {}, ignoring read.", best_stats.mapping_score().score()); + + // this one is ignored, so add the stats here to make it clear that we intentionally ignored it later + let mapping_stats = HlaMappingStats::from_mapping_stats(None, Some(best_stats)); + let ignored_details = PgxMappingDetails::new( + qname, + "REFERENCE".to_string(), + "REFERENCE".to_string(), + mapping_stats, + true + ); + mapping_details.push(ignored_details); + } else if let Some(bm) = best_mapping { + debug!("Best score for {qname}: {}", best_stats.score_string()); + let start = bm.query_start as usize; + let end = bm.query_end as usize; + let read_segment = std::str::from_utf8(&read_bytes[start..end])?.to_string(); + read_segments.insert(qname.clone(), read_segment); + let spliced_read = splice_read(&mut read, database, gene_name)?; + spliced_segments.insert(qname, spliced_read); + } else { + debug!("No mappings found for {qname}, ignoring read."); + } + } + } + + let best_result = if read_segments.is_empty() { + // we did not find any reads, definitely no way to get a consensus from that + ("NO_READS".to_string(), "NO_READS".to_string()) + } else { + let mut consensus_map: BTreeMap = Default::default(); + let consensus = run_dual_consensus(&spliced_segments, cli_settings)?; + let dual_passes = is_passing_dual(&consensus, cli_settings); + + // check if we found two using the cDNA + let consensus = if dual_passes { + // we did, continue on + debug!("cDNA dual consensus successful."); + consensus + } else { + // we did not, run dual on DNA + debug!("cDNA dual consensus was homozygous, attempting dual consensus on DNA."); + run_dual_consensus(&read_segments, cli_settings)? 
+ }; + + // output debug records + let mut debug_records = vec![]; + + // re-run consensus on the groupings; this is required because we might have solved it via cDNA and need the DNA now + let mut consensus_dwfa1 = ConsensusDWFA::with_config(dwfa_config_from_cli(cli_settings)?)?; + let mut consensus_dwfa2 = ConsensusDWFA::with_config(dwfa_config_from_cli(cli_settings)?)?; + for ((_qname, read_segment), &is_consensus1) in read_segments.iter().zip(consensus.is_consensus1().iter()) { + if is_consensus1 { + consensus_dwfa1.add_sequence(read_segment.as_bytes())?; + } else { + consensus_dwfa2.add_sequence(read_segment.as_bytes())?; + } + } + + let is_forward_strand = *database.hla_config().hla_is_forward_strand().get(gene_name).unwrap(); + let c1_list = consensus_dwfa1.consensus()?; + let con1 = std::str::from_utf8(c1_list[0].sequence())?.to_string(); + let con1_label = format!("consensus1_{gene_name}"); + if is_forward_strand { + consensus_map.insert(con1_label, con1.clone()); + } else { + consensus_map.insert(con1_label, String::from_utf8(reverse_complement(con1.as_bytes())?)?); + } + + let (_map_results1, best_map1) = score_consensus( + &dna_aligner, &reference_coordinates, &con1, + database, gene_name, cli_settings + )?; + debug!("best_map1: {:?} {:?}", best_map1.best_match_id(), best_map1.best_match_star()); + + // add the consensus to our debug records + let tags = [("HP".to_string(), format!("1_consensus1_{gene_name}"))].into_iter().collect(); + match unmapped_record( + &format!("consensus1_{gene_name}"), + &con1, + &tags + ) { + Ok(umr) => { + debug_records.push(umr); + }, + Err(e) => { + error!("Error while creating unmapped record: {e}"); + } + }; + + // also add the matching haplotype for comparison + if let Some(best_id) = best_map1.best_match_id() { + let star_allele = best_map1.best_match_star().unwrap_or("NO_STAR_DEF").to_string(); + let dna_sequence = database.hla_sequences().get(best_id).unwrap().dna_sequence(); + if let Some(sequence) = dna_sequence { + let sequence = if is_forward_strand { + sequence.to_string() + } else { + String::from_utf8(reverse_complement(sequence.as_bytes())?)? 
+ }; + let tags = [("HP".to_string(), format!("2_{gene_name}*{star_allele}"))].into_iter().collect(); + + match unmapped_record( + &star_allele, + &sequence, + &tags + ) { + Ok(umr) => { + debug_records.push(umr); + }, + Err(e) => { + error!("Error while creating unmapped record: {e}"); + } + }; + } else { + // I don't think we want a warning here since this is fairly common and outside user control + } + } + + // save the ID + let best_id1 = best_map1.best_match_id().unwrap_or("NO_ID_MATCH").to_string(); + debug_stats.add_read(gene_name.clone(), "consensus1".to_string(), best_map1)?; + + let best_dual_result = if consensus.is_dual() { + // we have a dual consensus, type the second one also + let c2_list = consensus_dwfa2.consensus()?; + let con2 = std::str::from_utf8(c2_list[0].sequence())?.to_string(); + let con2_label = format!("consensus2_{gene_name}"); + if is_forward_strand { + consensus_map.insert(con2_label, con2.clone()); + } else { + consensus_map.insert(con2_label, String::from_utf8(reverse_complement(con2.as_bytes())?)?); + } + + let (_map_results2, best_map2) = score_consensus( + &dna_aligner, &reference_coordinates, &con2, + database, gene_name, cli_settings + )?; + debug!("best_map2: {:?} {:?}", best_map2.best_match_id(), best_map2.best_match_star()); + + // add the consensus to our debug records + let tags = [("HP".to_string(), format!("4_consensus2_{gene_name}"))].into_iter().collect(); + match unmapped_record( + &format!("consensus2_{gene_name}"), + &con2, + &tags + ) { + Ok(umr) => { + debug_records.push(umr); + }, + Err(e) => { + error!("Error while creating unmapped record: {e}"); + } + }; + + // also add the matching haplotype for comparison + if let Some(best_id) = best_map2.best_match_id() { + // get the star allele and sequence + let star_allele = best_map2.best_match_star().unwrap_or("NO_STAR_DEF").to_string(); + let dna_sequence = database.hla_sequences().get(best_id).unwrap().dna_sequence(); + if let Some(sequence) = dna_sequence { + let sequence = if is_forward_strand { + sequence.to_string() + } else { + String::from_utf8(reverse_complement(sequence.as_bytes())?)? 
+ }; + + let tags = [("HP".to_string(), format!("5_{gene_name}*{star_allele}"))].into_iter().collect(); + match unmapped_record( + &star_allele, + &sequence, + &tags + ) { + Ok(umr) => { + debug_records.push(umr); + }, + Err(e) => { + error!("Error while creating unmapped record: {e}"); + } + }; + } else { + // I don't think we want a warning here since this is fairly common and outside user control + } + } + + // save the id + let best_id2 = best_map2.best_match_id().unwrap_or("NO_ID_MATCH").to_string(); + debug_stats.add_read(gene_name.clone(), "consensus2".to_string(), best_map2)?; + + // we found a dual consensus, first check if the CDF and MAF are passing + let total_count = read_segments.len(); + let counts1 = consensus.is_consensus1().iter().filter(|&&b| b).count(); + let counts2 = total_count - counts1; + + let dual_passed = is_passing_dual(&consensus, cli_settings); + if dual_passed { + // MAF and CDF likelihood is above cutoff, so assume heterozygous is correct + (best_id1.clone(), best_id2) + } else { + // MAF or CDF likelihood is below cutoff, so report homozygous for the dominant allele + debug!("MAF or CDF failed, returning homozygous result"); + if counts1 > counts2 { + (best_id1.clone(), best_id1.clone()) + } else { + (best_id2.clone(), best_id2) + } + } + } else { + debug!("best_map2: No second consensus, homozygous result"); + (best_id1.clone(), best_id1.clone()) + }; + + if let Some(debug_folder) = cli_settings.debug_folder.as_ref() { + // save the consensus sequences + let extension = format!("consensus_{gene_name}.fa"); + let consensus_fn = debug_folder.join(extension); + debug!("Saving consensus for {gene_name} to {consensus_fn:?}"); + save_fasta(&consensus_map, &consensus_fn)?; + } + + // pattern here is a little weird because we need the mutable reference multiple times in the loop + if let Some(dbw) = debug_bam_writer.as_mut() { + // add each extra target + for eid in cli_settings.debug_hla_targets.iter() { + if let Some(hap_def) = database.hla_sequences().get(eid) { + // make sure it's a gene match, otherwise ignore for now + if hap_def.gene_name() != gene_name { + continue; + } + let dna_sequence = hap_def.dna_sequence(); + if let Some(sequence) = dna_sequence { + // parse the star allele and get the correct sequence orientation + let star_allele = hap_def.star_allele().join(":"); + let sequence = if is_forward_strand { + sequence.to_string() + } else { + String::from_utf8(reverse_complement(sequence.as_bytes())?)? 
+ }; + + // save the record + let tags = [("HP".to_string(), format!("0_debug-target_{eid}_{gene_name}*{star_allele}"))].into_iter().collect(); + match unmapped_record( + &star_allele, + &sequence, + &tags + ) { + Ok(umr) => { + debug_records.push(umr); + }, + Err(e) => { + error!("Error while creating unmapped record: {e}"); + } + }; + } else { + warn!("Debug target \"{eid}\" does not have a DNA sequence in the database, ignoring it in output BAM"); + } + } else { + warn!("Debug target \"{eid}\" was not found in the database, ignoring it in output BAM"); + } + } + + // we have a debug writer, so add all of the reads to this list also + for ((qname, read_segment), &is_consensus1) in read_segments.iter().zip(consensus.is_consensus1().iter()) { + let con_type = if is_consensus1 { 1 } else { 2 }; + let order_index = if is_consensus1 { 3 } else { 6 }; + let phase_label = format!("{order_index}_consensus{con_type}_sequence"); + let tags = [("HP".to_string(), phase_label)].into_iter().collect(); + match unmapped_record( + qname, + read_segment, + &tags + ) { + Ok(umr) => { + debug_records.push(umr); + }, + Err(e) => { + error!("Error while creating unmapped record: {e}"); + } + }; + } + + // now map each record + match dbw.map_records_to_region(&debug_records, &reference_coordinates) { + Ok(()) => {}, + Err(e) => { + error!("Error while mappings records to debug BAM: {e}"); + } + }; + } + + best_dual_result + }; + + // For now, we only return one result, but lets leave the mechanisms for multiple in the future + let best_combination = [best_result]; + + // collect the diplotypes + let diplotypes: Vec = best_combination.iter().map(|(k1, k2)| { + let star1 = match database.hla_sequences().get(k1) { + Some(allele_def) => { + let s1 = allele_def.star_allele(); + format!("*{}", s1.join(":")) + }, + None => k1.clone() + }; + let star2 = match database.hla_sequences().get(k2) { + Some(allele_def) => { + let s2 = allele_def.star_allele(); + format!("*{}", s2.join(":")) + }, + None => k2.clone() + }; + Diplotype::new( + &star1, + &star2 + ) + }).collect(); + + debug!("Diplotype for {gene_name} => {:?}", diplotypes.iter().map(|d| d.diplotype()).collect::>()); + ret.insert(gene_name.clone(), PgxGeneDetails::new_from_mappings( + diplotypes, + None, + mapping_details + )?); + } + + // if we have a debug output file, we can write it now + // if let Some(debug_fn) = cli_settings.debug_hla_filename.as_ref() { + if let Some(debug_folder) = cli_settings.debug_folder.as_ref() { + let debug_fn = debug_folder.join("hla_debug.json"); + debug!("Saving HLA debug to {:?}", debug_fn); + crate::util::file_io::save_json(&debug_stats, &debug_fn)?; + } + + Ok(ret) +} + +/// Wrapper function for if we allow an allele definition +/// # Arguments +/// * `hla_allele_def` - the definition in question +/// * `gene_name` - the name of the gene we are matching against +/// * `cli_settings` - any settings from the CLI; some control the behavior of this function +fn is_allowed_allele_def(hla_allele_def: &HlaAlleleDefinition, gene_name: &str, cli_settings: &DiplotypeSettings) -> bool { + // require the gene name to match AND + hla_allele_def.gene_name() == gene_name && + // require that either we have DNA sequence OR that the DNA sequence is not required + (hla_allele_def.dna_sequence().is_some() || !cli_settings.hla_require_dna) +} + +/// Wrapper script for setting up the consensus configuration from our CLI. 
+/// # Arguments +/// * `cli_settings` - the provided CLI options +/// # Errors +/// * if the CLI options are not allowed by our config +fn dwfa_config_from_cli(cli_settings: &DiplotypeSettings) -> Result { + CdwfaConfigBuilder::default() + .min_count(cli_settings.min_consensus_count) + .min_af(cli_settings.min_consensus_fraction) + .dual_max_ed_delta(cli_settings.dual_max_ed_delta) + .allow_early_termination(false) + .weighted_by_ed(false) // currently, I'm not convinced on either approach + .consensus_cost(waffle_con::cdwfa_config::ConsensusCost::L1Distance) + .max_queue_size(20) + .max_capacity_per_size(10) + .build() +} + +/// Runs dual consensus on an ordered collection of sequences. Useful because we may solve via cDNA or DNA depending on dataset. +/// If multiple equal options are found, this only returns the first one. +/// # Arguments +/// * `segments` - a map from segment ID to a segment sequence; traversed in order +/// * `cli_settings` - contains controls for the consensus step +/// # Errors +/// * if the cli_settings are not valid for consensus +/// * if the consensus itself has errors while running +fn run_dual_consensus(segments: &BTreeMap, cli_settings: &DiplotypeSettings) -> Result> { + // now prep the priority consensus runner - we have HPC as priority, then full length + let mut consensus_dwfa = DualConsensusDWFA::with_config(dwfa_config_from_cli(cli_settings)?)?; + for (_qname, read_segment) in segments.iter() { + consensus_dwfa.add_sequence(read_segment.as_bytes())?; + } + + let mut consensus_list = consensus_dwfa.consensus()?; + if consensus_list.len() > 1 { + warn!("Found multiple solutions, selecting first."); + } + Ok(consensus_list.remove(0)) +} + +/// Checks a DualConsensus solution to see if it passes our MAF and CDF cutoffs from the CLI. +/// # Arguments +/// * `dual_consensus` - the consensus to check; if not dual, this will always return false +/// * `cli_settings` - contains the cutoffs we compare against +fn is_passing_dual(dual_consensus: &DualConsensus, cli_settings: &DiplotypeSettings) -> bool { + if dual_consensus.is_dual() { + // we found a dual consensus, first check if the CDF and MAF are passing + let maf_cutoff = cli_settings.min_consensus_fraction; + let total_count = dual_consensus.is_consensus1().len(); + let counts1 = dual_consensus.is_consensus1().iter().filter(|&&b| b).count(); + let counts2 = total_count - counts1; + let minor_count = counts1.min(counts2) as f64; + let maf = minor_count / (total_count as f64); + + // binomial distribution based decision making for het/hom + let cdf_cutoff = cli_settings.min_cdf; + let distro = Binomial::new(0.5, total_count as u64).unwrap(); + let cdf = distro.cdf(minor_count as u64); + + let is_passing = maf >= maf_cutoff && cdf >= cdf_cutoff; + debug!("DualConsensus detected: counts1={counts1}, counts2={counts2}, MAF={maf:.5}, CDF={cdf:.5}; is_passing={is_passing}"); + is_passing + } else { + false + } +} + +/// Given a consensus sequence from a particular region, this will score it against all the entries in the database by comparing both the cDNA and DNA sequences. +/// cDNA takes priority, and then DNA is mostly a tie-breaker (4th-field). 
+/// # Arguments +/// * `ref_aligner` - a preconstructed aligner to the reference sequence, which allows us to align and then splice out the cDNA +/// * `ref_coordinates` - coordinates of the reference sequence, which are needed to create an artificial read +/// * `consensus` - the consensus sequence we are comparing to the DB +/// * `database` - contains all the HLA entries +/// * `gene_name` - gene we are trying to label +/// * `cli_settings` - parameters that control this step +fn score_consensus( + ref_aligner: &Aligner, ref_coordinates: &Coordinates, consensus: &str, + database: &PgxDatabase, gene_name: &str, cli_settings: &DiplotypeSettings +) -> Result<(HashMap<String, HlaMappingStats>, ReadMappingStats), Box<dyn std::error::Error>> { + // we need cigar here + // other settings for mapping + let output_cigar: bool = true; + let output_md: bool = true; + let max_frag_len: Option<usize> = None; + let extra_flags = None; + + // target is reference; query is the consensus + let d_mappings = ref_aligner.map( + consensus.as_bytes(), + output_cigar, output_md, max_frag_len, extra_flags.clone() + )?; + assert_eq!(d_mappings.len(), 1); + let d_map = &d_mappings[0]; + + // we will only pull out the mapped region when we make a record + let mapped_region = (d_map.query_start as usize)..(d_map.query_end as usize); + let mapped_sequence = &consensus.as_bytes()[mapped_region]; + + let cigar_vec: Vec<Cigar> = d_map.alignment.as_ref().unwrap() + .cigar.as_ref().unwrap().iter() + .map(|&(cigar_len, cigar_type)| { + match cigar_type { + 0 => Cigar::Match(cigar_len), + 1 => Cigar::Ins(cigar_len), + 2 => Cigar::Del(cigar_len), + _ => panic!("Unknown cigar type: {cigar_type}") + } + }).collect(); + let cigar_string = CigarString(cigar_vec); + + // create an artificial bam record that we can feed to our score_read function + let mut consensus_record = rust_htslib::bam::Record::new(); + let record_name = b"consensus"; + let qual = vec![255; mapped_sequence.len()]; + let cigar = Some(&cigar_string); + consensus_record.set( + record_name, + cigar, + mapped_sequence, + &qual + ); + let start_align = ref_coordinates.start() as i64 + d_map.target_start as i64; + consensus_record.set_pos(start_align); + + // score it as if we pulled it directly from a BAM file + score_read(consensus_record, database, gene_name, cli_settings) +} + +/// Workhorse function for scoring every allele in the HLA database against a single read. +/// Originally, this was intended to be called in parallel, once for each read sequence. +/// Now, it is called up to twice per gene, once for each consensus sequence after re-mapping it back to the reference. +/// Returns a hash map of all the scores as well as the single best ID.
+/// # Arguments +/// * `read` - the read we are aligning against, which is always on the forward strand +/// * `database` - the full PGx database, including HLA sequences +/// * `gene_name` - the gene we want to compare against +/// # Errors +/// * if there are errors parsing the read record +/// * if there are errors when aligning the sequences +fn score_read(mut read: rust_htslib::bam::Record, database: &PgxDatabase, gene_name: &str, cli_settings: &DiplotypeSettings) + -> Result<(HashMap<String, HlaMappingStats>, ReadMappingStats), Box<dyn std::error::Error>> { + // result is a map of HLA_ID -> score + let mut ret: HashMap<String, HlaMappingStats> = Default::default(); + + let is_forward_strand = *database.hla_config().hla_is_forward_strand().get(gene_name).unwrap(); + + // get the read sequence for alignment + let qname: String = std::str::from_utf8(read.qname())?.to_string(); + let read_sequence: Vec<u8> = read.seq().as_bytes(); + let read_string: String = String::from_utf8(if is_forward_strand { + read_sequence + } else { + reverse_complement(&read_sequence)? + })?; + + let mut cdna_aligner = if !cli_settings.disable_cdna_scoring { + let fw_spliced = splice_read(&mut read, database, gene_name)?; + let prespliced_read = if is_forward_strand { + fw_spliced + } else { + String::from_utf8(reverse_complement(fw_spliced.as_bytes())?)? + }; + + // since the read is pre-spliced, we can use the typical map mode for HiFi + Aligner::builder() + .map_hifi() + .with_cigar() + .with_seq(prespliced_read.as_bytes())? + } else { + // does not really matter, but this is the older splicing approach + // FWIW, John has a modified example, and we found that k=5 will work but it's SUPER slow + Aligner::builder() + .splice() + .with_cigar() + .with_seq(read_string.as_bytes())? + }; + + // DNA aligner is much more straightforward + let mut dna_aligner: Aligner = Aligner::builder() + .map_hifi() + .with_cigar() + .with_seq(read_string.as_bytes())?; + + // default options: a: 1, b: 4, q: 6, e: 2, q2: 26, e2: 1 + // descriptions from minimap2 here: /~https://github.com/lh3/minimap2/blob/69e36299168d739dded1c5549f662793af10da83/minimap.h#L157 + // John suggested increasing "a" from 1 to some higher value to remove clipping + // equality does not seem to err on the side of inclusion; we had two identical + 1 mismatch and a=2 did not work (2 + 2 - 4); a=3 did work + // so in theory, if we have one matching base separated by a mismatch, then a > 4 to catch it + cdna_aligner.mapopt.a = 5; + dna_aligner.mapopt.a = 5; + + // we only need cigar and md for debugging + // other settings for mapping + let output_cigar: bool = true; + let output_md: bool = true; + let max_frag_len: Option<usize> = None; + // see /~https://github.com/lh3/minimap2/blob/69e36299168d739dded1c5549f662793af10da83/minimap.h#L36 for more flag options + let extra_flags = Some(vec![0x4000000]); // enables the X/= cigar tags + + // used to be controlled by targets when we had one entry per read, now we can just assume this is fine + let all_hla_targets: bool = true; // cli_settings.debug_hla_targets.contains(&"all".to_string()); + let mut read_mapping_stats = ReadMappingStats::new(); + + /* + * Note for future Matt: we tried using HPC DNA aligner as well. It sort of worked, but the problem is when true variation + * gets masked by the HPC. Also, if the HPC alleles are different lengths, it always picks the longer one. + * We need something smarter, and I don't think HPC is it due to variant masking...
+ */ + let aligners = [ + cdna_aligner, // cDNA + dna_aligner // DNA + ]; + + // this is all of the best results thus far + let mut best_match: HlaProcessedMatch = HlaProcessedMatch::worst_match(aligners.len()); + for (hla_id, hla_allele_def) in database.hla_sequences().iter() { + // only check the sequences for this particular gene + if !is_allowed_allele_def(hla_allele_def, gene_name, cli_settings) { + continue; + } + + // these are the ordered rankings of our sequence mappings; cDNA -> HPC DNA -> full DNA + let sequences = [ + // cDNA sequence - 2nd/3rd field + if cli_settings.disable_cdna_scoring { None } else { Some(hla_allele_def.cdna_sequence()) }, + // lastly DNA sequence - 4th field, HPC tie-break + hla_allele_def.dna_sequence() + ]; + + // sanity check for dev + assert_eq!(aligners.len(), sequences.len()); + + // these should be the same length as sequences at the end + let mut current_match = HlaProcessedMatch::new(hla_id.clone())?; + + for (aligner, opt_sequence) in aligners.iter().zip(sequences.iter()) { + // initialize the best_mapping as a worst result (100% ED) + let mut best_mapping = None; + let mut best_stats = MappingStats::new(1, 1, 0); + + let seq_len = if let Some(sequence) = opt_sequence { + // cDNA mapper first + let mappings = aligner.map( + sequence.as_bytes(), + output_cigar, output_md, max_frag_len, extra_flags.clone() + )?; + let seq_len = sequence.len(); + + for m in mappings.iter() { + // some sanity checks while we debug + assert!(m.query_end <= seq_len as i32); + assert!(m.query_start >= 0); + assert!(m.query_start <= m.query_end); + + let is_forward = m.strand == minimap2::Strand::Forward; + if !is_forward { + // can happen for bad mappings, so we should just ignore them + continue; + } + + // scoring is based on the lowest edit distance, including unmapped + let nm = m.alignment.as_ref().unwrap().nm as usize; + let unmapped = seq_len - (m.query_end - m.query_start) as usize; + let mapping_stats = MappingStats::new(seq_len, nm, unmapped); + + if mapping_stats.mapping_score() < best_stats.mapping_score() { + // replace it + best_stats = mapping_stats; + best_mapping = Some(m.clone()); + } + } + seq_len + } else { + 0 + }; + + // add what we found for this haplotype aligner + current_match.add_mapping(best_mapping, seq_len)?; + } + + // first, do all the debug and tracking of the comparisons + let hla_mapping_stats = HlaMappingStats::from_mapping_stats( + current_match.full_mapping_stats()[0].clone(), + current_match.full_mapping_stats()[1].clone() + ); + let match_mapping = current_match.full_mappings()[0].as_ref(); + let d_mapping = current_match.full_mappings()[1].as_ref(); + + // check if we want to store the full info for this one + let hla_star_allele = database.hla_sequences().get(hla_id).unwrap().star_allele().join(":"); + if all_hla_targets || cli_settings.debug_hla_targets.contains(&hla_star_allele) { + // save this info using the star allele notation for simplicity + read_mapping_stats.add_mapping(hla_star_allele, match_mapping, d_mapping)?; + } else if cli_settings.debug_hla_targets.contains(hla_id) { + // we need to save this info using the HLA ID + read_mapping_stats.add_mapping(hla_id.clone(), match_mapping, d_mapping)?; + } + + // now do the comparison to the current best, and overwrite if better + if current_match.is_better_match(&best_match)? 
{ + // pull out cigars for debug + let match_cigar = match_mapping.map(|bm| bm.alignment.as_ref().unwrap().cigar_str.as_ref().unwrap()); + let d_cigar = d_mapping.map(|bm| bm.alignment.as_ref().unwrap().cigar_str.as_ref().unwrap()); + + debug!("{qname} {hla_id} -> new best {:?}, {}", hla_allele_def.star_allele(), hla_mapping_stats.score_string()); + debug!("\tcDNA cigar -> {:?}", match_cigar); + debug!("\tcDNA mapping -> {:?}", match_mapping); + debug!("\tDNA cigar -> {:?}", d_cigar); + debug!("\tDNA mapping -> {:?}", d_mapping); + + // the current is better, time to replace things + best_match = current_match; + } + + // now save the result for this ID + ret.insert(hla_id.clone(), hla_mapping_stats); + } + + // set the best match before returning + let best_hla_id = best_match.haplotype().to_string(); + if !best_hla_id.is_empty() { + let best_hla_star = database.hla_sequences().get(&best_hla_id).unwrap().star_allele().join(":"); + read_mapping_stats.set_best_match(best_hla_id, best_hla_star); + } else { + // default sets it to "" and ""; so no operations needed here + } + Ok((ret, read_mapping_stats)) +} + +/// Accepts a bam record and extracts just the parts that align to the cDNA for a gene. +/// # Arguments +/// * `read` - the record to parse +/// * `database` - contains coordinates for splicing +/// * `gene_name` - the gene we are splicing +fn splice_read(read: &mut rust_htslib::bam::Record, database: &PgxDatabase, gene_name: &str) -> Result> { + // get the DNA string out + let read_sequence: Vec = read.seq().as_bytes(); + let read_string: String = String::from_utf8(read_sequence)?; + + // we should have already called this, but do it again to be safe + read.cache_cigar(); + + //build a lookup from reference coordinate -> sequence coordinate + let mut coordinate_lookup: HashMap = Default::default(); + for bp in read.aligned_pairs() { + let segment_index = bp[0] as usize; + let ref_index = bp[1] as usize; + coordinate_lookup.insert(ref_index, segment_index); + } + + // splice in the coordinates from the lookup + let mut splice_segments: Vec<(usize, usize)> = vec![]; + for exon_coordinates in database.hla_config().hla_exons().get(gene_name).unwrap().iter() { + let mut first_exon_base = exon_coordinates.start() as usize; + let mut last_exon_base = (exon_coordinates.end() - 1) as usize; + + // find the first base in this exon that the read maps to + while !coordinate_lookup.contains_key(&first_exon_base) && first_exon_base <= last_exon_base { + first_exon_base += 1; + } + + // find the last base (inclusive) in this exon that the read maps to + while !coordinate_lookup.contains_key(&last_exon_base) && first_exon_base <= last_exon_base { + last_exon_base -= 1; + } + + // if there is a non-zero number of bases, we will have a splice segment to add + if first_exon_base <= last_exon_base { + // these are both inclusive + let first_read_base = *coordinate_lookup.get(&first_exon_base).unwrap(); + let last_read_base = *coordinate_lookup.get(&last_exon_base).unwrap(); + + // add one to make this a normal range + splice_segments.push((first_read_base, last_read_base+1)); + } + } + + // we don't want to oversplice, so set the first one to the read start and the last one to the read end + // splice_segments[0].0 = 0; + // splice_segments.last_mut().unwrap().1 = read_string.len(); + + // now create the pre-spliced read using the segments we found + let prespliced_read: String = splice_segments.iter() + .map(|&(s, e)| read_string[s..e].to_string()) + .collect::>() + .join(""); + Ok(prespliced_read) +} 
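// The trimming loop above is the subtle part of splice_read, so here is a minimal,
// self-contained sketch of just that step. The helper name and toy numbers are
// illustrative only (not part of this module): exon bounds are walked inward until
// both ends land on reference positions the read covers, then converted to a
// half-open read-coordinate range through the aligned-pair lookup.
fn _sketch_trim_exon_to_read(mut first: usize, mut last: usize, lookup: &std::collections::HashMap<usize, usize>) -> Option<(usize, usize)> {
    // walk forward to the first exon base that the read actually maps
    while !lookup.contains_key(&first) && first <= last {
        first += 1;
    }
    // walk backward to the last exon base that the read actually maps
    while !lookup.contains_key(&last) && first <= last {
        last -= 1;
    }
    if first <= last {
        // +1 converts the inclusive end into a normal exclusive range, matching splice_segments
        Some((lookup[&first], lookup[&last] + 1))
    } else {
        // the read covers no base of this exon, so it contributes no spliced segment
        None
    }
}
// Example: a read covering reference 105..=109 at read offsets 0..=4 turns an exon
// spanning reference 100..=107 into the read range (0, 3).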
+ +#[cfg(test)] +mod tests { + use rust_htslib::bam::record::{Cigar, CigarString}; + use std::str::FromStr; + + use super::*; + use crate::data_types::mapping::MappingScore; + use crate::util::file_io::load_json; + + /* + // writing a test for this will be complicated and may be better served with our end-to-end testing; punting for now + #[test] + fn test_diplotype_hla() { + panic!("no impl"); + } + */ + + #[test] + fn test_is_allowed_allele_def() { + // base case, should be allowed + let mut cli_settings = Default::default(); + let gene_name = "HLA-A"; + let hla_allele_def = HlaAlleleDefinition::new( + "HLA1".to_string(), "A*01", Some("ACGT".to_string()), "AG".to_string() + ).unwrap(); + assert!(is_allowed_allele_def(&hla_allele_def, gene_name, &cli_settings)); + + // wrong gene, should not be allowed + let hla_allele_def = HlaAlleleDefinition::new( + "HLA1".to_string(), "B*01", Some("ACGT".to_string()), "AG".to_string() + ).unwrap(); + assert!(!is_allowed_allele_def(&hla_allele_def, gene_name, &cli_settings)); + + // missing DNA, should not be allowed if require DNA is enabled + cli_settings.hla_require_dna = true; + let hla_allele_def = HlaAlleleDefinition::new( + "HLA1".to_string(), "A*01", None, "AG".to_string() + ).unwrap(); + assert!(!is_allowed_allele_def(&hla_allele_def, gene_name, &cli_settings)); + + // remove the requirement and it's allowed again + cli_settings.hla_require_dna = false; + let hla_allele_def = HlaAlleleDefinition::new( + "HLA1".to_string(), "A*01", None, "AG".to_string() + ).unwrap(); + assert!(is_allowed_allele_def(&hla_allele_def, gene_name, &cli_settings)); + } + + fn load_default_database() -> PgxDatabase { + let database_fn = PathBuf::from_str("./test_data/HLA-faux/database.json").unwrap(); + let database = load_json(&database_fn).unwrap(); + database + } + + #[test] + fn test_reference_alleles() { + // these are the reference alleles for each, so they should exactly match + let genes = vec!["HLA-A".to_string(), "HLA-B".to_string()]; + let hla_ids = vec!["HLA:HLA00037".to_string(), "HLA:HLA00132".to_string()]; + let hla_stars = vec!["03:01:01:01".to_string(), "07:02:01:01".to_string()]; + let mapping_positions = vec![29942254, 31353362]; + let is_revcomp = vec![false, true]; + + // load our proxy database that just has the reference alleles + let database = load_default_database(); + let mut cli_settings: DiplotypeSettings = Default::default(); + cli_settings.hla_require_dna = false; + cli_settings.disable_cdna_scoring = false; + cli_settings.debug_hla_targets = hla_ids.clone(); + + for (i, gene_name) in genes.iter().enumerate() { + let hla_key = &hla_ids[i]; + let hla_star = &hla_stars[i]; + let mut test_sequence = database.hla_sequences().get(hla_key).expect("the HLA key is present") + .dna_sequence().expect("the DNA sequence is present") + .to_string(); + + // HLA-B is on the opposite strand, so we need to rev-comp it to match the reference correctly; otherwise splicing gets jacked + if is_revcomp[i] { + test_sequence = test_sequence.chars() + .rev() + .map(|c| match c { + 'A' => 'T', + 'C' => 'G', + 'G' => 'C', + 'T' => 'A', + _ => panic!("unexpected {c}") + }) + .collect::(); + } + + // create a read that exactly matches the test sequence + let mut read = rust_htslib::bam::Record::new(); + let cigar_string = CigarString(vec![Cigar::Match(test_sequence.len() as u32); 1]); + read.set( + "read_name".as_bytes(), + Some(&cigar_string), + test_sequence.as_bytes(), + &vec![20; test_sequence.len()] + ); + read.set_pos(mapping_positions[i] - 1); + + // score 
it + let (all_scores, mapping_stats) = score_read(read, &database, gene_name, &cli_settings).unwrap(); + let best_score = mapping_stats.best_match_id().unwrap(); + assert_eq!(hla_key, best_score); + let best_score_hla = mapping_stats.best_match_star().unwrap(); + assert_eq!(hla_star, best_score_hla); + + let key_score = all_scores.get(hla_key).unwrap(); + let cdna_len = database.hla_sequences().get(hla_key).unwrap().cdna_sequence().len(); + let dna_len = test_sequence.len(); + assert_eq!(key_score, &HlaMappingStats::new( + // figure out the oddity here, it's that HLA-A*03:01:01:01 is the reference allele, this should work now + Some(cdna_len), Some(0), Some(0), + Some(dna_len), Some(0), Some(0) + )); + } + } + + // make the settings match the default from clap + fn get_default_cli_settings() -> DiplotypeSettings { + let mut default_settings: DiplotypeSettings = Default::default(); + default_settings.max_error_rate = 0.05; + default_settings.min_cdf = 0.001; + default_settings + } + + #[test] + fn test_score_bad_read() { + // nothing should match this read + let mut record = rust_htslib::bam::record::Record::new(); + record.set("test".as_bytes(), None, "ACGT".as_bytes(), &[255; 4]); + + // verify that we can successfully run this read, even though it's going to get thrown away + let database = load_default_database(); + let gene_name = "HLA-A"; + let mut cli_settings = get_default_cli_settings(); + cli_settings.disable_cdna_scoring = true; // this read will never work with cDNA + let (hash_scores, best_result) = score_read(record, &database, gene_name, &cli_settings).unwrap(); + + // make sure there is no best score + assert!(best_result.best_match_id().is_none()); + assert!(best_result.best_match_star().is_none()); + + // we get one mapping stat per entry in the DB + assert_eq!(best_result.mapping_stats().len(), 1); + + // finally, make sure all voting is marked as terrible, this will lead to getting discarded later + for (_key, score) in hash_scores.iter() { + // make sure both scores are the worst + assert_eq!(score.mapping_score().cdna_score(), MappingScore::worst_score()); + assert_eq!(score.mapping_score().dna_score(), MappingScore::worst_score()); + } + } + + /// Wrapper for a running test on whether a DualConsensus passes just based on counts. 
+ fn run_passing_test(c1: usize, c2: usize) -> bool { + use waffle_con::consensus::Consensus; + let mut cli_settings: DiplotypeSettings = Default::default(); + cli_settings.min_cdf = 0.001; + cli_settings.min_consensus_fraction = 0.10; + + let total = c1+c2; + let mut is_consensus1 = vec![true; c1]; + is_consensus1.extend(vec![false; c2]); + + let dual_consensus = DualConsensus::new( + // these do not matter + Consensus::new(vec![], waffle_con::cdwfa_config::ConsensusCost::L1Distance, vec![]), + Some(Consensus::new(vec![], waffle_con::cdwfa_config::ConsensusCost::L1Distance, vec![])), + // this is all that matters for the test + is_consensus1, + // also don't matter + vec![None; total], + vec![None; total], + ).unwrap(); + is_passing_dual(&dual_consensus, &cli_settings) + } + + #[test] + fn test_is_passing_dual() { + // imbalanced, should fail + assert!(!run_passing_test(3, 20)); + assert!(!run_passing_test(20, 3)); + + // close enough to pass + assert!(run_passing_test(10, 20)); + assert!(run_passing_test(20, 10)); + } +} \ No newline at end of file diff --git a/src/hla/debug.rs b/src/hla/debug.rs new file mode 100644 index 0000000..32070a5 --- /dev/null +++ b/src/hla/debug.rs @@ -0,0 +1,151 @@ + +use serde::Serialize; +use simple_error::bail; +use std::collections::BTreeMap; +use std::collections::btree_map::Entry::{Occupied, Vacant}; + +/// Primary object that gets converted into JSON +#[derive(Default, Serialize)] +pub struct HlaDebug { + /// Each gene & read has a detailed set of mappings + read_mapping_stats: BTreeMap> +} + +impl HlaDebug { + /// Constructor, default works for now, but we may change in the future as things get added + pub fn new() -> HlaDebug { + Default::default() + } + + /// Adds a read to the debug collection + /// # Arguments + /// * `qname` - the read name to insert + /// * `stats` - the HLA sequence mapping stats for this read + /// # Errors + /// * if the read has already been added + pub fn add_read(&mut self, gene: String, qname: String, stats: ReadMappingStats) -> Result<(), Box> { + let gene_entry = self.read_mapping_stats.entry(gene).or_default(); + match gene_entry.entry(qname.clone()) { + Occupied(_entry) => { + bail!("Entry {qname} is already occupied"); + }, + Vacant(entry) => { + entry.insert(stats); + Ok(()) + } + } + } +} + +/// Wrapper for an individual read stats +#[derive(Default, Serialize)] +pub struct ReadMappingStats { + /// The best matching ID + best_match_id: Option, + /// The best matching star allele + best_match_star: Option, + /// The details around mappings for the read + mapping_stats: BTreeMap +} + +impl ReadMappingStats { + /// Generic constructor, may add some constraints later + pub fn new() -> ReadMappingStats { + Default::default() + } + + /// sets the best match, this usually is not known when constructed + pub fn set_best_match(&mut self, new_best_id: String, new_best_star: String) { + self.best_match_id = Some(new_best_id); + self.best_match_star = Some(new_best_star); + } + + pub fn best_match_id(&self) -> Option<&str> { + self.best_match_id.as_deref() + } + + pub fn best_match_star(&self) -> Option<&str> { + self.best_match_star.as_deref() + } + + pub fn mapping_stats(&self) -> &BTreeMap { + &self.mapping_stats + } + + /// Adds a mapping to the stats for this read + /// # Arguments + /// * `hla_id` - the HLA ID for the mapping + /// * `cdna_mm2` - the minimap2 mapping for the cDNA sequence, optional + /// * `dna_mm2` - the minimap2 mapping for the DNA sequence, optional + /// # Errors + /// * if the HLA ID has already 
been inserted + pub fn add_mapping(&mut self, hla_id: String, cdna_mm2: Option<&minimap2::Mapping>, dna_mm2: Option<&minimap2::Mapping>) + -> Result<(), Box> { + match self.mapping_stats.entry(hla_id.clone()) { + Occupied(_entry) => { + bail!("Entry {hla_id} is already occupied!"); + }, + Vacant(entry) => { + let cdna_mapping: Option = cdna_mm2.map(DetailedMappingStats::from_mapping); + let dna_mapping: Option = dna_mm2.map(DetailedMappingStats::from_mapping); + let paired_mapping = PairedMappingStats { + cdna_mapping, dna_mapping + }; + entry.insert(paired_mapping); + Ok(()) + } + } + } +} + +/// Wrapper for a read to allele comparison, both cDNA and DNA are optional +#[derive(Debug, Serialize)] +pub struct PairedMappingStats { + /// the cDNA mapping stats + cdna_mapping: Option, + /// the DNA mapping stats + dna_mapping: Option +} + +/// Wrapper for an individual mapping statistic set +#[derive(Debug, Serialize)] +struct DetailedMappingStats { + /// length of the sequence + query_len: usize, + /// length of the match + match_len: usize, + /// number of mismatches + nm: usize, + /// number of unmapped bases + unmapped: usize, + /// Cigar string + cigar: String, + /// The incorrect matching bases + md: String +} + +impl DetailedMappingStats { + /// Creates a detailed mapping stat we can serialize from the minimap2 mapping + /// # Arguments + /// * `mapping` - the minimap2 mapping, we will pull relevant info out + /// # Panics + /// * if the mapping is missing detailed alignment information + fn from_mapping(mapping: &minimap2::Mapping) -> DetailedMappingStats { + let query_len = mapping.query_len.unwrap().get() as usize; + let match_len = mapping.match_len as usize; + let alignment = mapping.alignment.as_ref().unwrap(); + let nm = alignment.nm as usize; + let unmapped = query_len - (mapping.query_end - mapping.query_start) as usize; + let cigar = alignment.cigar_str.as_ref().unwrap().clone(); + let md = alignment.md.as_ref().unwrap().clone(); + + DetailedMappingStats { + query_len, + match_len, + nm, + unmapped, + cigar, + md + } + } +} \ No newline at end of file diff --git a/src/hla/mapping.rs b/src/hla/mapping.rs new file mode 100644 index 0000000..ce148b6 --- /dev/null +++ b/src/hla/mapping.rs @@ -0,0 +1,230 @@ + +use serde::Serialize; +use std::ops::AddAssign; + +use crate::data_types::mapping::{MappingScore, MappingStats}; + +/// Wraps the mapping stats for read to an HLA locus +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct HlaMappingStats { + /// the cDNA mapping stats + cdna_stats: Option, + /// the DNA mapping stats + dna_stats: Option +} + +impl HlaMappingStats { + /// Basic constructor, but will verify some assumptions + /// # Arguments + /// * `cdna_len` - the total length of the cDNA fragment that was mapped to the read + /// * `cdna_nm` - the edit distance (NM tag) of the mapping against the cDNA + /// * `cdna_unmapped` - the number of unmapped bases in the cDNA + /// * `dna_len` - the total length of the DNA fragment that was mapped to the read + /// * `dna_nm` - the edit distance (NM tag) of the mapping against the DNA + /// * `dna_unmapped` - the number of unmapped bases in the DNA + /// # Panics + /// * if both the cDNA and DNA are unset + /// * if cDNA fields are only partially filled, i.e. all must be Some or None + /// * if DNA fields are only partially filled, i.e. 
all must be Some or None + pub fn new( + cdna_len: Option<usize>, cdna_nm: Option<usize>, cdna_unmapped: Option<usize>, + dna_len: Option<usize>, dna_nm: Option<usize>, dna_unmapped: Option<usize> + ) -> HlaMappingStats { + // make sure at least one is set + assert!(cdna_len.is_some() || dna_len.is_some()); + // make sure all of cDNA is either Some or None + assert!(cdna_len.is_some() == cdna_nm.is_some() && cdna_len.is_some() == cdna_unmapped.is_some()); + // make sure all of DNA is either Some or None + assert!(dna_len.is_some() == dna_nm.is_some() && dna_len.is_some() == dna_unmapped.is_some()); + + let cdna_stats = cdna_len.map(|l| { + MappingStats::new(l, cdna_nm.unwrap(), cdna_unmapped.unwrap()) + }); + + let dna_stats = dna_len.map(|l| { + MappingStats::new(l, dna_nm.unwrap(), dna_unmapped.unwrap()) + }); + + HlaMappingStats { + cdna_stats, dna_stats + } + } + + /// Wrapper constructor for directly passing in established mapping stats. + /// Does not perform sanity checks on the input. + pub fn from_mapping_stats(cdna_stats: Option<MappingStats>, dna_stats: Option<MappingStats>) -> Self { + Self { + cdna_stats, + dna_stats + } + } + + /// Calculates the mapping scores for this mapping. + /// Defaults to the worst score when mappings are unavailable. + /// # Panics + /// * if a dev somehow creates one with incomplete cDNA or DNA entries, don't do that! + pub fn mapping_score(&self) -> HlaMappingScore { + let cdna_score = match self.cdna_stats.as_ref() { + Some(cds) => { + cds.mapping_score() + }, + None => MappingScore::worst_score() + }; + let dna_score = match self.dna_stats.as_ref() { + Some(ds) => { + ds.mapping_score() + }, + None => MappingScore::worst_score() + }; + HlaMappingScore::new(cdna_score, dna_score) + } + + /// Wrapper for writing a scoring string to the screen + /// # Panics + /// * if a dev somehow creates one with incomplete cDNA or DNA entries, don't do that! + pub fn score_string(&self) -> String { + let cdna_str = match self.cdna_stats.as_ref() { + Some(cds) => cds.score_string(), + None => "None".to_string() + }; + let dna_str = match self.dna_stats.as_ref() { + Some(ds) => ds.score_string(), + None => "None".to_string() + }; + format!( + "{}, {}", cdna_str, dna_str + ) + } + + pub fn has_cdna(&self) -> bool { + self.cdna_stats.is_some() + } + + pub fn has_dna(&self) -> bool { + self.dna_stats.is_some() + } +} +
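+// Worked example for mapping_score() (a sketch; not one of this module's tests): a cDNA mapping +// over a 100 bp sequence with NM=2 and 3 unmapped bases scores (2 + 3) / 100 = 0.05, while the +// missing DNA half falls back to MappingScore::worst_score(); lower is better: +// let stats = HlaMappingStats::new(Some(100), Some(2), Some(3), None, None, None); +// assert_eq!(stats.mapping_score().cdna_score(), MappingScore::new(0.05));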
+ +/// Contains the score for aligning an HLA sequence against a read. +/// This is basically an error rate, defined as (edit_distance + unmapped) / seq_len +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] +pub struct HlaMappingScore { + /// the score for the cDNA alignment + cdna_score: MappingScore, + /// the score for the DNA alignment + dna_score: MappingScore +} + +impl HlaMappingScore { + /// Simple constructor placeholder + pub fn new(cdna_score: MappingScore, dna_score: MappingScore) -> HlaMappingScore { + HlaMappingScore { + cdna_score, dna_score + } + } + + /// Useful wrapper when we just want to directly pass values in + pub fn from_values(cdna_value: f64, dna_value: f64) -> HlaMappingScore { + let cdna_score = MappingScore::new(cdna_value); + let dna_score = MappingScore::new(dna_value); + HlaMappingScore { + cdna_score, dna_score + } + } + + /// Returns the worst possible score (cDNA + DNA) for a mapping + pub fn worst_score() -> HlaMappingScore { + HlaMappingScore { + cdna_score: MappingScore::worst_score(), + dna_score: MappingScore::worst_score() + } + } + + /// Returns an empty score so we can accumulate scores with AddAssign + pub fn zero_score() -> HlaMappingScore { + HlaMappingScore { + cdna_score: MappingScore::zero_score(), + dna_score: MappingScore::zero_score() + } + } + + /// Convenient comparator since Eq and Ord cannot be derived with f64 + pub fn min(self, other: HlaMappingScore) -> HlaMappingScore { + if self <= other { + self + } else { + other + } + } + + pub fn cdna_score(&self) -> MappingScore { + self.cdna_score + } + + pub fn dna_score(&self) -> MappingScore { + self.dna_score + } +} + +// we are not really using this anymore, but it will not hurt to keep it +impl AddAssign for HlaMappingScore { + fn add_assign(&mut self, rhs: Self) { + // an individual score is at most 1.0, so we can just add them together + self.cdna_score += rhs.cdna_score; + self.dna_score += rhs.dna_score; + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_mapping_stats() { + let mapping_stats = HlaMappingStats::new( + Some(10), Some(1), Some(0), + Some(20), Some(0), Some(1) + ); + assert_eq!(mapping_stats.mapping_score(), HlaMappingScore::from_values(0.1, 0.05)); + } + + #[test] + #[should_panic] + fn test_empty_stats() { + let _mapping_stats = HlaMappingStats::new( + None, None, None, + None, None, None + ); + } + + #[test] + #[should_panic] + fn test_partial_cdna_stats() { + let _mapping_stats = HlaMappingStats::new( + Some(10), None, None, + None, None, None + ); + } + + #[test] + #[should_panic] + fn test_partial_dna_stats() { + let _mapping_stats = HlaMappingStats::new( + None, None, None, + Some(10), None, None + ); + } + + #[test] + fn test_score_min() { + let s1 = HlaMappingScore::from_values(1.0, 0.5); + let s2 = HlaMappingScore::from_values(0.9, 1.0); + let s3 = HlaMappingScore::from_values(1.0, 0.2); + + assert_eq!(s1.min(s2), s2); + assert_eq!(s1.min(s3), s3); + assert_eq!(s2.min(s3), s2); + } +} \ No newline at end of file diff --git a/src/hla/mod.rs b/src/hla/mod.rs new file mode 100644 index 0000000..c981d07 --- /dev/null +++ b/src/hla/mod.rs @@ -0,0 +1,11 @@ + +/// Contains definitions related to HLA alleles +pub mod alleles; +/// Contains the core calling functionality +pub mod caller; +/// Contains debug output functionality targeted at HLA +pub mod debug; +/// Contains the mapping stats for HLA-mapped reads +pub mod mapping; +/// Wrapper for HLA match processing and comparing +pub mod processed_match; \ No newline at end of file diff --git a/src/hla/processed_match.rs b/src/hla/processed_match.rs new
file mode 100644 index 0000000..aa8fed6 --- /dev/null +++ b/src/hla/processed_match.rs @@ -0,0 +1,271 @@ + +use log::debug; +use simple_error::{bail, SimpleError}; + +use crate::data_types::mapping::MappingStats; +use crate::hla::mapping::HlaMappingStats; + +/// Container for a processed HLA match. Mostly helpful for wrapping the `is_better_match` function. +pub struct HlaProcessedMatch { + /// Name for this haplotype + haplotype: String, + /// Collection of full mappings + full_mappings: Vec<Option<minimap2::Mapping>>, + /// Collection of end-to-end stats, mostly for tie-breaking + full_mapping_stats: Vec<Option<MappingStats>>, + /// Collection of processed cigars for comparing + processed_cigars: Vec<Option<Vec<usize>>>, + /// Ranges that were actually included in the processing + processed_ranges: Vec<std::ops::Range<usize>>, +} + +impl HlaProcessedMatch { + // constructors + /// Creates a new processed match with the given haplotype name + pub fn new(haplotype: String) -> Result<Self, SimpleError> { + if haplotype.is_empty() { + bail!("Haplotype name cannot be empty") + } + Ok(Self { + haplotype, + full_mappings: vec![], + full_mapping_stats: vec![], + processed_cigars: vec![], + processed_ranges: vec![] + }) + } + + /// Creates a terrible default, everything should beat this + pub fn worst_match(num_sequences: usize) -> Self { + Self { + haplotype: Default::default(), + full_mappings: vec![None; num_sequences], + full_mapping_stats: vec![None; num_sequences], + processed_cigars: vec![None; num_sequences], + processed_ranges: vec![0..0; num_sequences] + } + } + + /// Adds a mapping to this processed match; can be None to indicate missing cDNA/DNA comparators + /// # Arguments + /// * `mapping` - the minimap2 full mapping + /// * `seq_len` - baseline sequence length (`target`) from the alignment + pub fn add_mapping(&mut self, mapping: Option<minimap2::Mapping>, seq_len: usize) -> Result<(), SimpleError> { + let (opt_processed_cigar, opt_mapping_stats, processed_range) = if let Some(m) = mapping.as_ref() { + // first, get the processed cigar + if m.strand != minimap2::Strand::Forward { + bail!("Reverse strand mappings are not supported by HlaProcessedMatch"); + } + let cigar = m.alignment.as_ref().unwrap().cigar.as_ref().unwrap(); + let target_offset = m.target_start as usize; + let target_len = m.target_len as usize; + let clip_start = m.query_start as usize; + let clip_end = seq_len - m.query_end as usize; + let processed_cigar = process_mm_cigar( + cigar, + target_offset, target_len, + clip_start, clip_end + )?; + + // regions that overlap from this processed segment + let pc_start = target_offset - clip_start; + let pc_end = (m.target_end as usize) + clip_end; + + // create the mapping also + let nm = m.alignment.as_ref().unwrap().nm as usize; + let unmapped = seq_len - (m.query_end - m.query_start) as usize; + let mapping_stats = MappingStats::new(seq_len, nm, unmapped); + + // make sure our last count equals the NM + unmapped; otherwise we goofed while processing + assert_eq!(processed_cigar.len(), target_len+1); // we should have a lookup before/after each base + assert_eq!(*processed_cigar.last().unwrap(), nm + unmapped); + (Some(processed_cigar), Some(mapping_stats), pc_start..pc_end) + } else { + (None, None, 0..0) + }; + + // add the relevant info + self.full_mappings.push(mapping); + self.full_mapping_stats.push(opt_mapping_stats); + self.processed_cigars.push(opt_processed_cigar); + self.processed_ranges.push(processed_range); + + Ok(()) + } +
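+ // Intended selection pattern (a sketch with illustrative names; not code from this file): + // let mut best = HlaProcessedMatch::worst_match(2); + // for candidate in candidate_matches { + //     if candidate.is_better_match(&best)? { + //         best = candidate; + //     } + // }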
+ /// Wrapper function for determining if this match is better than the other + pub fn is_better_match(&self, rhs: &Self) -> Result<bool, SimpleError> { + // sanity check + if self.processed_cigars.len() != rhs.processed_cigars.len() { + bail!("RHS has different processed cigar length"); + } + + // useful when we want to get at something specific in our debug comparisons, should be false in all commits + let manual_debug = false; //self.haplotype == "HLA:HLA24487"; + + // we need to compare these sequentially + for (i, (opt_lhs_pc, opt_rhs_pc)) in self.processed_cigars.iter().zip(rhs.processed_cigars.iter()).enumerate() { + if let (Some(lhs_pc), Some(rhs_pc)) = (opt_lhs_pc, opt_rhs_pc) { + // get the two ranges and figure out the overlap + let lhs_range = &self.processed_ranges[i]; + let rhs_range = &rhs.processed_ranges[i]; + let overlap_start = lhs_range.start.max(rhs_range.start); + let overlap_end = lhs_range.end.min(rhs_range.end); + assert!(overlap_start < overlap_end); + + let lhs_nm = lhs_pc[overlap_end] - lhs_pc[overlap_start]; + let rhs_nm = rhs_pc[overlap_end] - rhs_pc[overlap_start]; + + if manual_debug { + debug!("DEBUG_MODE"); + debug!("lhs: {}", self.haplotype); + debug!("rhs: {}", rhs.haplotype); + debug!("overlap: {overlap_start}..{overlap_end}"); + debug!("lhs_nm: {lhs_nm}"); + debug!("rhs_nm: {rhs_nm}"); + debug!("lhs_m : {:?}", self.full_mappings[i]); + debug!("rhs_m : {:?}", rhs.full_mappings[i]); + } + + match lhs_nm.cmp(&rhs_nm) { + std::cmp::Ordering::Less => { + // LHS is better + return Ok(true); + }, + std::cmp::Ordering::Equal => { + // equal at this level, so iterate to next + }, + std::cmp::Ordering::Greater => { + // RHS is better + return Ok(false); + }, + }; + } else if opt_lhs_pc.is_none() && opt_rhs_pc.is_none() { + // both are absent, iterate + } else if opt_lhs_pc.is_some() { + // LHS has a value, but RHS does not; mark LHS as better + return Ok(true); + } else { + // RHS has a value, but LHS does not; mark RHS as better + assert!(opt_rhs_pc.is_some()); + return Ok(false); + } + } + + // the following is only programmed with cDNA & DNA in mind; verify that here + assert_eq!(self.full_mapping_stats.len(), 2); + assert_eq!(rhs.full_mapping_stats.len(), 2); + + // tie-break by comparing the scores from end-to-end mappings + let lhs_stats = HlaMappingStats::from_mapping_stats( + self.full_mapping_stats[0].clone(), + self.full_mapping_stats[1].clone() + ); + let rhs_stats = HlaMappingStats::from_mapping_stats( + rhs.full_mapping_stats[0].clone(), + rhs.full_mapping_stats[1].clone() + ); + Ok(lhs_stats.mapping_score() < rhs_stats.mapping_score()) + } + + // getters + pub fn haplotype(&self) -> &str { + &self.haplotype + } + + pub fn full_mappings(&self) -> &[Option<minimap2::Mapping>] { + &self.full_mappings + } + + pub fn full_mapping_stats(&self) -> &[Option<MappingStats>] { + &self.full_mapping_stats + } +} +
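+// For intuition on the processed-cigar vectors built below (toy values): a 3 bp target with one +// mismatch at the second base yields [0, 0, 1, 1], so pc[j] - pc[i] is the edit count within +// target positions i..j, which is exactly the overlap comparison used in is_better_match above.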
+/// This will load a cigar and output a Vec of length equal to the target sequence + 1 with a value equal to the number of edits before that position. +/// Nice little shortcut for making sure we are comparing equivalent regions. +/// The value at index "i" is the number of edits before position "i" in the string. +/// Thus, the first value in this vec is always 0, and the last should equal NM + unmapped. +fn process_mm_cigar(cigar: &[(u32, u8)], target_offset: usize, target_len: usize, clip_start: usize, clip_end: usize) -> Result<Vec<usize>, SimpleError> { + // alignment starts target_offset into the vec, so everything before is a 0 + assert!(clip_start <= target_offset); + let mut ret = vec![0; target_offset - clip_start + 1]; // there should always be at least 1 zero at the start + let mut current_nm = 0; + + // if we have soft clipping at the start, add a region that is basically a bunch of mismatches + for _i in 0..clip_start { + current_nm += 1; + ret.push(current_nm); + } + + for &(length, cigar_type) in cigar.iter() { + // cigar types: /~https://github.com/lh3/minimap2/blob/69e36299168d739dded1c5549f662793af10da83/minimap.h#L57 + match cigar_type { + // I - insertion of bases, we should increase current NM but don't insert anything + 1 => { current_nm += length as usize; }, + // D | X - deletion of reference character or mismatch to reference; either way, we increase by one for each base + 2 | 8 => { + for _i in 0..length { + current_nm += 1; + ret.push(current_nm); + } + }, + // = - matches, so just copy NM value at this point + 7 => ret.extend(std::iter::repeat(current_nm).take(length as usize)), + // we should not have any of the others + unexpected => bail!("Unexpected cigar type: {unexpected}") + }; + } + + // if we have soft clipping at the end, add a region that is basically a bunch of mismatches + for _i in 0..clip_end { + current_nm += 1; + ret.push(current_nm); + } + + // extend to fill out whatever remains, with 1 extra so we have lookups after each position + assert!(ret.len() <= target_len+1); + ret.extend(std::iter::repeat(current_nm).take(target_len+1 - ret.len())); + Ok(ret) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_process_mm_cigar() { + + // (length, type) + let cigar = [ + (2, 7), // match + (1, 8), // mismatch, + (2, 7), // match + (1, 1), // insertion + (2, 7), // match + (1, 2), // deletion + (2, 7), // match + ]; + + // first try exact match overlap + // ==X==I==D== + let expected_pc = vec![0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3]; + let target_offset = 0; + let target_len = 10; + let clip_start = 0; + let clip_end = 0; + let result = process_mm_cigar(&cigar, target_offset, target_len, clip_start, clip_end).unwrap(); + assert_eq!(expected_pc, result); + + // Now add some clipping and offsets to the test + // -SS==X==I==D==SSS-- + let expected_pc = vec![0, 0, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 7, 8, 8, 8]; + let target_offset = 3; + let target_len = 18; + let clip_start = 2; + let clip_end = 3; + let result = process_mm_cigar(&cigar, target_offset, target_len, clip_start, clip_end).unwrap(); + assert_eq!(expected_pc, result); + } + +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..ed5354c --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,17 @@ + +/// Contains functionality for constructing our CPIC database +pub mod build_database; +/// Contains all the CLI related functionality +pub mod cli; +/// Contains all functionality for identifying and calling CYP2D6 diplotypes +pub mod cyp2d6; +/// Contains any specialized data types that are shared across the tooling +pub mod data_types; +/// Contains the functionality for diplotyping a gene +pub mod diplotyper; +/// Contains the specialized functionality for HLA genes +pub mod hla; +/// Contains generic utilities that are handy wrappers +pub mod util; +/// Contains shared visualization utilities +pub mod visualization; diff --git a/src/main.rs b/src/main.rs
new file mode 100644 index 0000000..460496b --- /dev/null +++ b/src/main.rs @@ -0,0 +1,223 @@ + +use log::{LevelFilter, error, info}; +use rust_lib_reference_genome::reference_genome::ReferenceGenome; +use serde::Serialize; +use std::fs::File; +use std::path::Path; + +use pbstarphase::cli::diplotype::{DiplotypeSettings, check_diplotype_settings}; +use pbstarphase::cli::core::{Commands, get_cli}; +use pbstarphase::cli::db_build::{BuildSettings, check_build_settings}; +use pbstarphase::data_types::database::PgxDatabase; +use pbstarphase::data_types::pgx_diplotypes::{PgxDiplotypes, Diplotype}; +use pbstarphase::util::file_io::{load_json, save_json}; + +/// This will run the "build" mode of the tool +/// # Arguments +/// * `settings` - the BuildSettings object +fn run_build(settings: BuildSettings) { + // get the settings + // let settings: Settings = get_raw_settings(); + let filter_level: LevelFilter = match settings.verbosity { + 0 => LevelFilter::Info, + 1 => LevelFilter::Debug, + _ => LevelFilter::Trace + }; + + // immediately setup logging first + env_logger::builder() + .format_timestamp_millis() + .filter_level(filter_level) + .init(); + + // okay, now we can check all the other settings + let cli_settings: BuildSettings = check_build_settings(settings); + + // all the work + let pgx_db: PgxDatabase = match pbstarphase::build_database::pull_database_cpic_api() { + Ok(pdb) => pdb, + Err(e) => { + error!("Error while building CPIC database: {e}"); + std::process::exit(exitcode::IOERR); + } + }; + + // debug!("Full database:\n{pgx_db:#?}"); + // save the database to the defined file + info!("Saving database to {:?}", cli_settings.output_database); + match save_json(&pgx_db, &cli_settings.output_database) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing database to file: {e}"); + std::process::exit(exitcode::IOERR); + } + }; +} + +/// This will run the "diplotype" mode of the tool +/// # Arguments +/// * `settings` - the DiplotypeSettings object +fn run_diplotype(settings: DiplotypeSettings) { + // get the settings + // let settings: Settings = get_raw_settings(); + let filter_level: LevelFilter = match settings.verbosity { + 0 => LevelFilter::Info, + 1 => LevelFilter::Debug, + _ => LevelFilter::Trace + }; + + // immediately setup logging first + env_logger::builder() + .format_timestamp_millis() + .filter_level(filter_level) + .init(); + + // okay, now we can check all the other settings + let cli_settings: DiplotypeSettings = match check_diplotype_settings(settings) { + Ok(s) => s, + Err(e) => { + error!("Error while processing CLI settings: {e}"); + std::process::exit(exitcode::USAGE); + } + }; + + // create a debug folder if specified + if let Some(debug_folder) = cli_settings.debug_folder.as_ref() { + info!("Creating debug folder at {debug_folder:?}..."); + match std::fs::create_dir_all(debug_folder) { + Ok(()) => {}, + Err(e) => { + error!("Error while creating debug folder: {e}"); + std::process::exit(exitcode::IOERR); + } + } + } + + // first load the database + info!("Loading PGx database from {:?}...", cli_settings.input_database); + let database: PgxDatabase = match load_json(&cli_settings.input_database) { + Ok(db) => db, + Err(e) => { + error!("Error while loading PGx database file: {e}"); + std::process::exit(exitcode::IOERR); + } + }; + + // we also need to validate that the database is complete enough to run + if let Err(e) = database.validate() { + error!("Error while validating PGx database file: {e}"); + std::process::exit(exitcode::IOERR); + } + + // pre-load the
reference genome also + info!("Loading reference genome from {:?}...", cli_settings.reference_filename); + let reference_genome: ReferenceGenome = match ReferenceGenome::from_fasta(&cli_settings.reference_filename) { + Ok(rg) => rg, + Err(e) => { + error!("Error while loading reference genome file: {e}"); + std::process::exit(exitcode::IOERR); + } + }; + + // now hand it to the diplotype caller + let diplotypes: PgxDiplotypes = match pbstarphase::diplotyper::call_diplotypes( + &database, + cli_settings.vcf_filename.as_deref(), + Some(&reference_genome), + &cli_settings.bam_filenames, + &cli_settings + ) { + Ok(dc) => dc, + Err(e) => { + error!("Error while calling diplotypes: {e}"); + std::process::exit(exitcode::DATAERR); + } + }; + + // debug!("Full diplotypes:\n{diplotypes:#?}"); + // save the diplotypes to the defined file + info!("Saving diplotypes to {:?}", cli_settings.diplotype_filename); + match save_json(&diplotypes, &cli_settings.diplotype_filename) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing diplotypes to file: {e}"); + std::process::exit(exitcode::IOERR); + } + }; + + if let Some(filename) = cli_settings.pharmcat_tsv.as_ref() { + info!("Saving PharmCAT diplotypes to {:?}", filename); + match save_pharmcat_tsv(&diplotypes, filename) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing PharmCAT diplotypes to file: {e}"); + std::process::exit(exitcode::IOERR); + } + }; + } +} + +/// Wrapper for the pharmcat output +#[derive(Serialize)] +struct PharmCatRow { + #[serde(rename = "#gene")] + gene: String, + diplotype: String +} + +/// Helper function to save the basic TSV file for feeding into PharmCAT +/// # Arguments +/// * `diplotypes` - our reported diplotypes +/// * `filename` - the output filename, TSV +/// # Errors +/// * if we have any errors opening or writing to the file +fn save_pharmcat_tsv(diplotypes: &PgxDiplotypes, filename: &Path) -> Result<(), Box<dyn std::error::Error>> { + // we always write a tab delimiter here; this could be switched to b',' if CSV output is ever needed + let delimiter: u8 = b'\t'; + let mut csv_writer: csv::Writer<File> = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + + // make sure we go through the blocks in order + for (gene, details) in diplotypes.gene_details().iter() { + // check if we have one result or multiple + let diplotypes = details.simple_diplotypes(); + let diplotype_dt: Diplotype = if diplotypes.len() > 1 { + Diplotype::new("Multiple", "Multiple") + } else { + diplotypes[0].clone() + }; + + // check if it's a haplotype or diplotype gene + let diplotype: String = if gene == "MT-RNR1" { + // PharmCAT only accepts a single haplotype for MT (makes sense) + diplotype_dt.homozygous_haplotype().unwrap_or("Unknown").to_string() + } else { + // all others are a diplotype + diplotype_dt.pharmcat_diplotype() + }; + + // write the row out + let block_row = PharmCatRow { + gene: gene.clone(), + diplotype + }; + csv_writer.serialize(&block_row)?; + } + csv_writer.flush()?; + Ok(()) +}
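+// The emitted TSV is consumed by PharmCAT and looks roughly like this (values are hypothetical): +// #gene   diplotype +// CACNA1S Reference/c.520C>T +// MT-RNR1 Reference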
+ +fn main() { + let cli = get_cli(); + match cli.command { + Commands::Build(settings) => { + run_build(*settings); + }, + Commands::Diplotype(settings) => { + run_diplotype(*settings); + } + } + + info!("Process finished successfully."); +} \ No newline at end of file diff --git a/src/util/file_io.rs b/src/util/file_io.rs new file mode 100644 index 0000000..a244f57 --- /dev/null +++ b/src/util/file_io.rs @@ -0,0 +1,93 @@ + +use bio::io::fasta; +use rustc_hash::FxHashSet as HashSet; +use simple_error::bail; +use std::collections::BTreeMap; +use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::fs::File; +use std::path::Path; + +/// Helper function that loads a JSON file into some deserializable type; a helpful generic +/// # Arguments +/// * `filename` - the file path to open and parse +/// # Errors +/// * if the file does not open properly +/// * if the deserialization throws errors +pub fn load_json<T: serde::de::DeserializeOwned>(filename: &Path) -> Result<T, Box<dyn std::error::Error>> { + let fp: Box<dyn std::io::Read> = if filename.extension().unwrap_or_default() == "gz" { + Box::new( + flate2::read::MultiGzDecoder::new( + File::open(filename)? + ) + ) + } else { + Box::new(File::open(filename)?) + }; + let result: T = serde_json::from_reader(fp)?; + Ok(result) +} + +/// This will save a generic serializable struct to JSON. +/// # Arguments +/// * `data` - the data in memory +/// * `out_filename` - user provided path to write to +/// # Errors +/// * if opening or writing to the file throw errors +/// * if JSON serialization throws errors +pub fn save_json<T: serde::Serialize>(data: &T, out_filename: &Path) -> Result<(), Box<dyn std::error::Error>> { + let file: Box<dyn Write> = if out_filename.extension().unwrap_or_default() == "gz" { + Box::new( + flate2::write::GzEncoder::new( + File::create(out_filename)?, + flate2::Compression::best() + ) + ) + } else { + Box::new(File::create(out_filename)?) + }; + let mut writer = BufWriter::new(file); + serde_json::to_writer_pretty(&mut writer, data)?; + writer.flush()?; + Ok(()) +}
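+// Round-trip sketch (hypothetical path; both helpers handle gzip transparently when the +// extension is .gz): +// save_json(&database, Path::new("database.json.gz"))?; +// let database: PgxDatabase = load_json(Path::new("database.json.gz"))?;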
+ +/// Helper function that will just read a file line-by-line and return the list as a HashSet. +/// # Arguments +/// * `filename` - The file to load into the hash set +/// # Errors +/// * if a file is provided but cannot be opened or read +pub fn load_file_lines(filename: &Path) -> Result<HashSet<String>, Box<dyn std::error::Error>> { + // open the file and throw into a buffered reader + let file = File::open(filename)?; + let reader = BufReader::new(file); + + // now add each line + let mut hashset: HashSet<String> = Default::default(); + for line in reader.lines() { + hashset.insert(line?); + } + Ok(hashset) +} + +/// Given a map of keys and values, this will write them out as a FASTA file +/// # Arguments +/// * `data` - each key is the entry label, each value is the DNA sequence +/// * `filename` - location to save fasta file to +pub fn save_fasta(data: &BTreeMap<String, String>, filename: &Path) -> Result<(), Box<dyn std::error::Error>> { + let mut fasta_writer = fasta::Writer::to_file(filename)?; + for (k, v) in data.iter() { + fasta_writer.write(k, None, v.as_bytes())?; + } + Ok(()) +} + +/// Fasta indexer, mirrored from https://docs.rs/rust-htslib/latest/src/rust_htslib/faidx/mod.rs.html#37-48 +/// Ideally, we would call that directly, but it requires an htslib update and that seems to be going poorly... +pub fn index_fasta(filename: &Path) -> Result<(), Box<dyn std::error::Error>> { + let os_path = std::ffi::CString::new(filename.display().to_string())?; + let rc = unsafe { rust_htslib::htslib::fai_build(os_path.as_ptr()) }; + if rc < 0 { + bail!("Error {rc} while building index for {filename:?}"); + } + Ok(()) +} \ No newline at end of file diff --git a/src/util/homopolymers.rs b/src/util/homopolymers.rs new file mode 100644 index 0000000..25b3981 --- /dev/null +++ b/src/util/homopolymers.rs @@ -0,0 +1,97 @@ + +use itertools::Itertools; + +/// This will take a sequence and homopolymer-compress it, removing many common errors but also potentially compressing away true variation. +/// # Arguments +/// * `sequence` - the sequence to run HPC on +/// # Errors +/// * if String::from_utf8 fails +pub fn hpc(sequence: &str) -> Result<String, std::string::FromUtf8Error> { + String::from_utf8( + sequence.as_bytes().iter() + .dedup() + .cloned() + .collect::<Vec<u8>>() + ) +} + + +/// This will take a sequence and an index and return the new index in a homo-polymer compressed version. +/// # Arguments +/// * `sequence` - the sequence to run HPC on +/// * `position` - the position we want to find in the new HPC +pub fn hpc_pos(sequence: &str, position: usize) -> usize { + let mut total_length = 0; + let mut offset = 0; + for (_k, g) in &sequence.as_bytes().iter() + .group_by(|&&v| v) { // creates runs + let l = g.count(); // get the run length + total_length += l; + if position < total_length { + break; + } + offset += 1; + } + + offset +} + + +/// This will take a sequence and homopolymer-compress it, removing many common errors but also potentially compressing away true variation. +/// It then aligns it to an HPC guide sequence and adds a prefix buffer if needed. +/// # Arguments +/// * `sequence` - the sequence to run HPC on +/// * `guide_sequence` - the sequence we want to then align against and add a prefix for the offset; this gets HPC'd before we do the alignment +/// * `guide_offset` - the offset into the guide where the sequence starts to align +/// # Errors +/// * if String::from_utf8 fails for either sequence +pub fn hpc_with_guide(sequence: &str, guide_sequence: &str, guide_offset: usize) -> Result<(String, usize), Box<dyn std::error::Error>> { + // first, HPC the input + let hpc_sequence = hpc(sequence)?; + + // now figure out where this should start in the guide sequence + let hpc_guide_offset = hpc_pos(guide_sequence, guide_offset); + + // now solve the prefix and append it + // let prefix = String::from_utf8(vec![b'*'; hpc_guide_offset])?; + // let final_hpc = prefix + &hpc_sequence; + Ok((hpc_sequence, hpc_guide_offset)) +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hpc() { + let sequence = "AACAAAAAAGGGTAACAA"; + let expected = "ACAGTACA"; + let hpc_result = hpc(sequence).unwrap(); + assert_eq!(&hpc_result, expected); + } + + #[test] + fn test_hpc_pos() { + let sequence = "AACCCGTTTT"; + for (i, &c) in sequence.as_bytes().iter().enumerate() { + let expected = match c { + b'A' => 0, + b'C' => 1, + b'G' => 2, + b'T' => 3, + _ => panic!("should not happen") + }; + assert_eq!(expected, hpc_pos(sequence, i)); + } + } + + #[test] + fn test_hpc_guide() { + let guide_sequence = "ATTGGGGGAACCCGTTTT"; + let sequence = "GAACCCGTTTT"; + let hpc_result = hpc_with_guide(sequence, guide_sequence, 6).unwrap(); + assert_eq!(hpc_result.0, "GACGT"); // ATT -> ** + assert_eq!(hpc_result.1, 2); + } +} \ No newline at end of file diff --git a/src/util/mod.rs b/src/util/mod.rs new file mode 100644 index 0000000..2d66676 --- /dev/null +++ b/src/util/mod.rs @@ -0,0 +1,9 @@ + +/// Generic functionality for reading/writing serializable object to file +pub mod file_io; +/// Homopolymer-compression related functions +pub mod homopolymers; +/// Functions for sequence manipulation +pub mod sequence; +/// Contains generic statistical functions +pub mod stats; diff --git a/src/util/sequence.rs b/src/util/sequence.rs new file mode 100644 index 0000000..e1b06c3 --- /dev/null +++ b/src/util/sequence.rs @@ -0,0 +1,43 @@ + +use simple_error::{bail, SimpleError}; + +/// Creates a reverse complement sequence from an input.
+/// # Arguments +/// * `original` - the sequence to rev-comp +/// # Errors +/// * if any non-ACGNT character is provided +pub fn reverse_complement(original: &[u8]) -> Result<Vec<u8>, SimpleError> { + original.iter() + .rev() + .map(|c| { + match c { + b'A' => Ok(b'T'), + b'C' => Ok(b'G'), + b'G' => Ok(b'C'), + b'T' => Ok(b'A'), + b'N' => Ok(b'N'), + _ => bail!("Unexpected character for reverse-complement: {c}") + } + }) + .collect::<Result<Vec<u8>, SimpleError>>() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_reverse_complement() { + let sequence = b"ACCGGGTN"; + let expected = b"NACCCGGT"; + let rc_result = reverse_complement(sequence).unwrap(); + assert_eq!(&rc_result, expected); + } + + #[test] + fn test_reverse_complement_invalid() { + let sequence = b"b"; + let rc_result = reverse_complement(sequence); + assert!(rc_result.is_err()); + } +} \ No newline at end of file diff --git a/src/util/stats.rs b/src/util/stats.rs new file mode 100644 index 0000000..382393d --- /dev/null +++ b/src/util/stats.rs @@ -0,0 +1,72 @@ + +/// This is an amalgamation of the Multinomial distribution from statrs and the factorial function which needs to be in ln mode. +/// Given a set of expected probabilities and observations, it will return the log-likelihood of that observation distribution. +/// References: /~https://github.com/statrs-dev/statrs/blob/e8e9c61b860241c70f9c71f2e07fbd6dde2cf44f/src/function/factorial.rs#L71 +/// /~https://github.com/statrs-dev/statrs/blob/e8e9c61b860241c70f9c71f2e07fbd6dde2cf44f/src/distribution/multinomial.rs#L222 +/// # Arguments +/// * `probs` - The probability of observing each category, the sum is expected to add to 1.0 +/// * `obs` - The number of times each category was observed +/// # Panics +/// * if `probs.len() != obs.len()` +pub fn multinomial_ln_pmf(probs: &[f64], obs: &[u64]) -> f64 { + use statrs::function::factorial::ln_factorial; + + // copy of sanity checks + if probs.len() != obs.len() { + panic!("Expected probs and obs to have equal lengths."); + } + + // we just derive total count here + let total_count: u64 = obs.iter().sum(); + assert!(total_count > 0); + + // here is where we have to implement the multinomial in log space + let mut coeff = ln_factorial(total_count); // initialized to total_count! + for observed_count in obs.iter() { + // subtract out each observed_count!
(this is division in normal space) + coeff -= ln_factorial(*observed_count); + } + + // the factorial just gets added because it is multiplied in combinatorial space + let val = coeff + + probs.iter() // this whole zip-iter is just copy pasted, but it's basically doing all the probability multiplication + .zip(obs.iter()) + .map(|(pi, xi)| *xi as f64 * pi.ln()) + .fold(0.0, |acc, x| acc + x); + val +} + +#[cfg(test)] +mod tests { + use super::*; + + use assert_approx_eq::assert_approx_eq; + + #[test] + fn test_multinomial() { + // 1 category - should basically always be 1.0 + let probs = [1.0]; + let obs = [10]; + assert_approx_eq!(multinomial_ln_pmf(&probs, &obs), 1.0_f64.ln()); + + // 2 categories (binomial) + let probs = [0.25, 0.75]; + let obs = [1, 3]; // normal ratios + let expected_prob: f64 = 4.0 * 0.25 * 0.75_f64.powf(3.0); + assert_approx_eq!(multinomial_ln_pmf(&probs, &obs), expected_prob.ln()); + + let obs = [3, 1]; // abnormal ratios + let expected_prob = 4.0 * 0.25_f64.powf(3.0) * 0.75; + assert_approx_eq!(multinomial_ln_pmf(&probs, &obs), expected_prob.ln()); + + // 3+ categories + let probs = [0.25, 0.25, 0.5]; + let obs = [1, 1, 2]; // normal ratios + let expected_prob: f64 = (4.0 * 3.0 * 2.0 / 2.0) * 0.25 * 0.25 * 0.5_f64.powf(2.0); + assert_approx_eq!(multinomial_ln_pmf(&probs, &obs), expected_prob.ln()); + + let obs = [2, 2, 0]; // abnormal ratios + let expected_prob: f64 = (4.0 * 3.0 * 2.0 / 2.0 / 2.0) * 0.25_f64.powf(4.0); + assert_approx_eq!(multinomial_ln_pmf(&probs, &obs), expected_prob.ln()); + } +} \ No newline at end of file diff --git a/src/visualization/debug_bam_writer.rs b/src/visualization/debug_bam_writer.rs new file mode 100644 index 0000000..a6bf558 --- /dev/null +++ b/src/visualization/debug_bam_writer.rs @@ -0,0 +1,236 @@ + +use log::{debug, info, warn}; +use minimap2::Aligner; +use rust_htslib::bam::record::{Cigar, CigarString}; +use rust_lib_reference_genome::reference_genome::ReferenceGenome; +use simple_error::bail; +use std::path::PathBuf; +use std::collections::{BTreeMap, HashMap}; + +use crate::data_types::coordinates::Coordinates; + +/// Creates an unmapped record with the minimal content we might need +pub fn unmapped_record(qname: &str, sequence: &str, tags: &BTreeMap<String, String>) -> Result<rust_htslib::bam::Record, Box<dyn std::error::Error>> { + // we need to create a new record that is unmapped + let mut record = rust_htslib::bam::Record::new(); + let qual = vec![255_u8; sequence.len()]; // this method has no provided quals + let cigar = None; // cigar comes after we align + + // set the basics for the record + record.set( + qname.as_bytes(), + cigar, + sequence.as_bytes(), + &qual + ); + + // set the chromosome and position + record.set_tid(-1); + record.set_pos(-1); + record.set_mapq(20); // do we care about adjusting this? + for (key, value) in tags.iter() { + record.push_aux(key.as_bytes(), rust_htslib::bam::record::Aux::String(value))?; + } + + Ok(record) +} + +/// This is a debug BAM writer; all records are kept in memory until `write_all_records(...)` is called. +/// This is not meant to write a full BAM, but just a few small sections. +pub struct DebugBamWriter<'a> { + /// Output path + out_fn: PathBuf, + /// Reference genome + reference_genome: &'a ReferenceGenome, + /// The actual record writer + writer: Option<rust_htslib::bam::Writer>, + /// Contains all records that will eventually get written to the BAM + records: HashMap<String, Vec<rust_htslib::bam::Record>> +} +
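+// Intended flow (a sketch): records accumulate in memory and nothing hits disk until the final call: +// let mut bam_writer = DebugBamWriter::new(out_fn, &reference_genome)?; +// bam_writer.map_records_to_region(&unmapped_records, &target_region)?; +// bam_writer.write_all_records()?;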
+impl<'a> DebugBamWriter<'a> { + /// Creates a new writer by building the header and prepping the writer for later. + /// # Arguments + /// * `out_fn` - the output filename to save everything to + /// * `reference_genome` - need to build out the header + pub fn new(out_fn: PathBuf, reference_genome: &'a ReferenceGenome) -> Result<Self, Box<dyn std::error::Error>> { + // create a default header + let mut header = rust_htslib::bam::Header::new(); + let mut header_record = rust_htslib::bam::header::HeaderRecord::new(b"HD"); + header_record.push_tag(b"VN", "1.5"); + header_record.push_tag(b"SO", "coordinate"); + header.push_record(&header_record); + + // for each chromosome in our output, we need to add it to the header + for chromosome in reference_genome.contig_keys().iter() { + // @SQ SN:chr22 LN:50818468 + let mut header_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ"); + let target_length = reference_genome.get_full_chromosome(chromosome).len(); + header_record.push_tag(b"SN", chromosome); + header_record.push_tag(b"LN", target_length); + header.push_record(&header_record); + } + + // finally, init the writer for later + let writer = Some(rust_htslib::bam::Writer::from_path(&out_fn, &header, rust_htslib::bam::Format::Bam)?); + Ok(Self { + out_fn, + reference_genome, + writer, + records: Default::default() + }) + } + + /// For a given collection of unmapped records, this will attempt to map them to a specified region and add them to our collection. + /// # Arguments + /// * `unmapped_records` - the record information to map and save as a BAM record + /// * `target_region` - the region targeted for alignment + pub fn map_records_to_region(&mut self, unmapped_records: &[rust_htslib::bam::Record], target_region: &Coordinates) -> Result<(), Box<dyn std::error::Error>> { + if self.writer.is_none() { + bail!("This writer has already written everything!"); + } + debug!("Generating records for {target_region}..."); + + // build a mapper for the region + let region_sequence = self.reference_genome.get_slice(target_region.chrom(), target_region.start() as usize, target_region.end() as usize); + let dna_aligner: Aligner = Aligner::builder() + .map_hifi() + .with_cigar() + .with_seq(region_sequence)?; + + // we only need cigar and md for debugging + // other settings for mapping + let output_cigar: bool = true; + let output_md: bool = true; + let max_frag_len: Option<usize> = None; + let extra_flags = None; + + // get the tid which will be shared for all records in this region + let tid = match self.writer.as_ref().unwrap().header().tid(target_region.chrom().as_bytes()) { + Some(t) => t as i32, + None => bail!("Could not find chromosome \"{}\" in reference genome.", target_region.chrom()) + }; + + // get the vec we populate with records on this chromosome + let record_vec = self.records.entry(target_region.chrom().to_string()).or_default(); + + for umr in unmapped_records.iter() { + // we will need to pull all of these out for setting later + let qname = umr.qname(); + let sequence = umr.seq().as_bytes(); + let qual = umr.qual(); + + // first, map the sequence + let mappings = dna_aligner.map( + &sequence, + output_cigar, output_md, max_frag_len, extra_flags.clone() + )?; + + // pick the mapping with the largest match_len of those created + if mappings.is_empty() { + // technically can happen, but it shouldn't; regardless we don't want to panic here + debug!("Failed to map unmapped record: {:?}", std::str::from_utf8(umr.qname())); + continue; + } + + // find the max based on (match_length - edit_distance) + // in the event of a tie, pick the earlier position + let core_mapping = mappings.iter() + .max_by_key(|&m| (m.match_len - m.alignment.as_ref().unwrap().nm,
std::cmp::Reverse(m.target_start))) + .unwrap(); + + let mut cigar = CigarString( + core_mapping.alignment.as_ref().unwrap() + .cigar.as_ref().unwrap().iter() + .map(|&(l, t)| { + match t { + 0 => Cigar::Match(l), + 1 => Cigar::Ins(l), + 2 => Cigar::Del(l), + _ => panic!("unhandled cigar type: {t}") + } + }) + .collect() + ); + + let start_delta = core_mapping.query_start; + if start_delta > 0 { + cigar.insert(0, Cigar::SoftClip(start_delta as u32)); + } + + let end_delta = sequence.len() - core_mapping.query_end as usize; + if end_delta > 0 { + cigar.push(Cigar::SoftClip(end_delta as u32)); + } + + // these should come from the mapping + let cigar = Some(&cigar); + let pos = target_region.start() as i64 + core_mapping.target_start as i64; + + // initialize the mapped record by just cloning the other record and then overwriting the particulars + let mut record = umr.clone(); + record.set( + qname, + cigar, + &sequence, + qual + ); + + // set the chromosome and position + record.set_tid(tid); + record.set_pos(pos); + record.set_mapq(20); // do we care about adjusting this? + + // push it into our "to-save" list + record_vec.push(record); + } + + // done with all records, should be good! + Ok(()) + } + + /// Writes all records out that we've given to this struct + pub fn write_all_records(&mut self) -> Result<(), Box<dyn std::error::Error>> { + if let Some(writer) = self.writer.as_mut() { + info!("Writing all records to {:?}...", self.out_fn); + for chromosome in self.reference_genome.contig_keys().iter() { + if let Some(record_vec) = self.records.get_mut(chromosome) { + // we have records to write on this chromosome + // first, we need to order by position + record_vec.sort_by_key(|a| a.pos()); + + // now send it to file + for record in record_vec.iter() { + writer.write(record)?; + } + } + } + } else { + bail!("This writer has already written everything!"); + } + + // delete our BAM file handle, flushing it and closing the file prior to indexing + self.writer = None; + + // build the index + info!("Building index..."); + let idx_type = rust_htslib::bam::index::Type::Bai; + match rust_htslib::bam::index::build( + &self.out_fn, + None, + idx_type, + 1 + ) { + Ok(()) => { + info!("Finished building index for {:?}", self.out_fn); + }, + Err(e) => { + warn!("Error while building index for {:?}: {}", self.out_fn, e); + warn!("Continuing with other processing..."); + } + }; + + // finish up + Ok(()) + } +} \ No newline at end of file diff --git a/src/visualization/igv_session_writer.rs b/src/visualization/igv_session_writer.rs new file mode 100644 index 0000000..e829bce --- /dev/null +++ b/src/visualization/igv_session_writer.rs @@ -0,0 +1,241 @@ + +use log::debug; +use rust_lib_reference_genome::reference_genome::ReferenceGenome; +use std::collections::BTreeMap; +use std::path::PathBuf; + +use crate::data_types::coordinates::Coordinates; +use crate::util::file_io::save_fasta; +use crate::visualization::debug_bam_writer::DebugBamWriter; + +pub const BUFFER_LEN: usize = 1000; +pub const CUSTOM_CONTIG: &str = "custom_contig"; + +const SESSION_PATH: &str = "custom_igv_session.xml"; +const REFERENCE_PATH: &str = "custom_reference.fa"; +const REGIONS_PATH: &str = "custom_regions.bed"; +const ALIGN_PATH: &str = "custom_alignments.bam"; +
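+// Per the constants above, a finished session folder is expected to hold custom_igv_session.xml, +// custom_reference.fa (plus the .fai built for it), custom_regions.bed, and custom_alignments.bam +// (plus the .bai built after writing).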
+/// Wrapper structure that will write out a folder containing everything needed for a custom IGV session +pub struct IgvSessionWriter { + /// The folder we will save everything to + session_folder: PathBuf, + /// Our custom reference genome file + reference_genome: ReferenceGenome, + /// Our regions in the reference genome + regions: Vec<(Coordinates, String)>, + /// Collection of unmapped records we will map + unmapped_records: Vec<rust_htslib::bam::Record> +} + +impl IgvSessionWriter { + /// Creates a new session writer with the given collection of information + /// # Arguments + /// * `session_folder` - the folder we will eventually save everything to + /// * `reference_genome` - the reference genome we will align to and write to FASTA + /// * `regions` - labeled regions that get written out as a BED overlay + /// * `unmapped_records` - the records that will get mapped to the custom reference + /// # Errors + /// * None so far + pub fn new(session_folder: PathBuf, reference_genome: ReferenceGenome, regions: Vec<(Coordinates, String)>, unmapped_records: Vec<rust_htslib::bam::Record>) -> Self { + IgvSessionWriter { + session_folder, + reference_genome, + regions, + unmapped_records + } + } + + /// Attempts to save all the data that has been provided to the session writer + /// # Errors + /// * if we cannot create the session_folder or any of the sub-files (permissions) + pub fn write_session(&mut self) -> Result<(), Box<dyn std::error::Error>> { + // create the folder that captures all our other outputs + debug!("Creating IGV session folder at {:?}...", self.session_folder); + std::fs::create_dir_all(&self.session_folder)?; + + // save our reference genome file + self.save_reference_genome()?; + + // save the regions to a BED + self.save_regions_bed()?; + + // create our alignments + self.save_aligned_bam()?; + + // finally save the session file + self.save_session_file()?; + + Ok(()) + } + + /// Saves the reference genome to a FASTA file + fn save_reference_genome(&self) -> Result<(), Box<dyn std::error::Error>> { + let reference_filename = self.session_folder.join(REFERENCE_PATH); + debug!("Creating custom reference file at {reference_filename:?}"); + + let mut fasta_map: BTreeMap<String, String> = Default::default(); + for contig_key in self.reference_genome.contig_keys().iter() { + let value = std::str::from_utf8(self.reference_genome.get_full_chromosome(contig_key))?; + fasta_map.insert(contig_key.clone(), value.to_string()); + } + save_fasta(&fasta_map, &reference_filename)?; + // TODO: ideally, we would bump to latest rust_htslib and do this; however, apparently we get compile issues because + // HiPhase (dep) is on 0.39.5 and I guess that causes problems + // likely solution: split off the HiPhase components into a separate crate that can be shared; the components we need do not need htslib + // alternate solution: copy the build function in 0.47.0 + // rust_htslib::faidx::build(&reference_filename)?; + crate::util::file_io::index_fasta(&reference_filename)?; + Ok(()) + } + + /// This will save the regions of interest to a BED file, which is then overlaid in IGV + fn save_regions_bed(&self) -> Result<(), Box<dyn std::error::Error>> { + let regions_filename = self.session_folder.join(REGIONS_PATH); + debug!("Creating custom regions file at {regions_filename:?}"); + + let mut bed_writer = csv::WriterBuilder::new() + .delimiter(b'\t') + .from_path(regions_filename)?; + + // chrom start end label + for (coordinate, label) in self.regions.iter() { + let start = coordinate.start().to_string(); + let end = coordinate.end().to_string(); + bed_writer.write_record([ + coordinate.chrom(), + &start, + &end, + label + ])?; + } + + Ok(()) + } + + /// Creates an aligned BAM file for visualization + fn save_aligned_bam(&self) -> Result<(), Box<dyn std::error::Error>> { + let bam_filename = self.session_folder.join(ALIGN_PATH); + debug!("Creating custom BAM file at {bam_filename:?}"); + + // initialize the writer with our custom genome + let mut debug_bam_writer = DebugBamWriter::new(bam_filename, &self.reference_genome)?; + + // map all the reads to it + let chrom_len =
self.reference_genome.get_full_chromosome(CUSTOM_CONTIG).len(); + let target_region = Coordinates::new( + CUSTOM_CONTIG.to_string(), + BUFFER_LEN as u64, + (chrom_len - BUFFER_LEN) as u64 + ); + debug_bam_writer.map_records_to_region(&self.unmapped_records, &target_region)?; + + // write all the records we got + debug_bam_writer.write_all_records() + } + + /// Creates the full session file and saves it + fn save_session_file(&self) -> Result<(), Box<dyn std::error::Error>> { + use quick_xml::events::{Event, BytesDecl, BytesEnd, BytesStart}; + + let igv_filename = self.session_folder.join(SESSION_PATH); + debug!("Creating custom IGV session file at {igv_filename:?}"); + + let file_handle = std::fs::File::create(igv_filename)?; + let mut writer = quick_xml::writer::Writer::new_with_indent(file_handle, b' ', 4); + writer.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("no"))))?; + + // write the start of the session block + let mut session_start = BytesStart::new("Session"); + session_start.push_attribute(("genome", REFERENCE_PATH)); + writer.write_event(Event::Start(session_start))?; + + // this collects all the files as Resources + writer.write_event(Event::Start(BytesStart::new("Resources")))?; + + // add the bam file as a resource + let mut bam_resource = BytesStart::new("Resource"); + bam_resource.push_attribute(("type", "bam")); + bam_resource.push_attribute(("path", ALIGN_PATH)); + // bam_resource.push_attribute(("index", format!("{ALIGN_PATH}.bai").as_str())); // for some reason, this actually breaks it + writer.write_event(Event::Empty(bam_resource))?; + + // add the bed regions as a resource + let mut bed_resource = BytesStart::new("Resource"); + bed_resource.push_attribute(("type", "bed")); + bed_resource.push_attribute(("path", REGIONS_PATH)); + writer.write_event(Event::Empty(bed_resource))?; + writer.write_event(Event::End(BytesEnd::new("Resources")))?; + + // add the first panel, which is the alignment panel + let mut panel_start = BytesStart::new("Panel"); + panel_start.push_attribute(("name", "Panel0")); + writer.write_event(Event::Start(panel_start))?; + + let alignment_tracks = [ + vec![ + ("attributeKey", "custom_alignments.bam Coverage"), + ("autoScale", "true"), + ("clazz", "org.broad.igv.sam.CoverageTrack"), + ("id", "custom_alignments.bam_coverage") + ], + vec![ + ("attributeKey", "custom_alignments.bam Junctions"), + ("autoScale", "false"), + ("clazz", "org.broad.igv.sam.SpliceJunctionTrack"), + ("id", "custom_alignments.bam_junctions"), + ("visible", "false") + ], + vec![ + ("attributeKey", "custom_alignments.bam"), + ("clazz", "org.broad.igv.sam.AlignmentTrack"), + ("id", "custom_alignments.bam") + ] + ]; + for track_attributes in alignment_tracks.into_iter() { + let mut track = BytesStart::new("Track"); + for ta in track_attributes.into_iter() { + track.push_attribute(ta); + } + writer.write_event(Event::Empty(track))?; + } + + writer.write_event(Event::End(BytesEnd::new("Panel")))?; + + // add the second panel, which has a defined name + let mut panel_start = BytesStart::new("Panel"); + panel_start.push_attribute(("name", "FeaturePanel")); + writer.write_event(Event::Start(panel_start))?; + + let feature_tracks = [ + vec![ + ("attributeKey", "Reference sequence"), + ("clazz", "org.broad.igv.track.SequenceTrack"), + ("id", "Reference sequence") + ], + vec![ + ("attributeKey", "custom_regions.bed"), + ("clazz", "org.broad.igv.track.FeatureTrack"), + ("displayMode", "EXPANDED"), + ("id", "custom_regions.bed") + ] + ]; + for track_attributes in feature_tracks.into_iter() {
+ let mut track = BytesStart::new("Track"); + for ta in track_attributes.into_iter() { + track.push_attribute(ta); + } + writer.write_event(Event::Empty(track))?; + } + + writer.write_event(Event::End(BytesEnd::new("Panel")))?; + + // add the layout config + let mut panel_start = BytesStart::new("PanelLayout"); + panel_start.push_attribute(("dividerFractions", "0.0,0.85")); + writer.write_event(Event::Empty(panel_start))?; + + // write the end of the session block + writer.write_event(Event::End(BytesEnd::new("Session")))?; + + Ok(()) + } +} diff --git a/src/visualization/mod.rs b/src/visualization/mod.rs new file mode 100644 index 0000000..8c354eb --- /dev/null +++ b/src/visualization/mod.rs @@ -0,0 +1,5 @@ + +/// Shared writer for a debug BAM file +pub mod debug_bam_writer; +/// Creates IGV sessions with custom reference genomes and aligned reads +pub mod igv_session_writer; diff --git a/test_data/CACNA1S/CACNA1S_gene_list.txt b/test_data/CACNA1S/CACNA1S_gene_list.txt new file mode 100644 index 0000000..f602e98 --- /dev/null +++ b/test_data/CACNA1S/CACNA1S_gene_list.txt @@ -0,0 +1 @@ +CACNA1S \ No newline at end of file diff --git a/test_data/CACNA1S/CPIC_API.json b/test_data/CACNA1S/CPIC_API.json new file mode 100644 index 0000000..283c11b --- /dev/null +++ b/test_data/CACNA1S/CPIC_API.json @@ -0,0 +1,103 @@ +[ + { + "id": 777266, + "version": 58, + "genesymbol": "CACNA1S", + "name": "c.3257G>A", + "pharmvarid": null, + "matchesreferencesequence": false, + "structuralvariation": false, + "allele_location_value": [ + { + "alleledefinitionid": 777266, + "locationid": 777261, + "variantallele": "T", + "version": 1, + "sequence_location": { + "id": 777261, + "version": 59, + "name": "c.3257G>A", + "chromosomelocation": "g.201060815C>T", + "genelocation": "g.56752G>A", + "proteinlocation": "p.R1086H", + "genesymbol": "CACNA1S", + "dbsnpid": "rs1800559", + "position": 201060815 + } + } + ] + }, + { + "id": 777264, + "version": 58, + "genesymbol": "CACNA1S", + "name": "c.520C>T", + "pharmvarid": null, + "matchesreferencesequence": false, + "structuralvariation": false, + "allele_location_value": [ + { + "alleledefinitionid": 777264, + "locationid": 777260, + "variantallele": "A", + "version": 1, + "sequence_location": { + "id": 777260, + "version": 59, + "name": "c.520C>T", + "chromosomelocation": "g.201091993G>A", + "genelocation": "g.25574C>T", + "proteinlocation": "p.R174W", + "genesymbol": "CACNA1S", + "dbsnpid": "rs772226819", + "position": 201091993 + } + } + ] + }, + { + "id": 777262, + "version": 58, + "genesymbol": "CACNA1S", + "name": "Reference", + "pharmvarid": null, + "matchesreferencesequence": true, + "structuralvariation": false, + "allele_location_value": [ + { + "alleledefinitionid": 777262, + "locationid": 777260, + "variantallele": "G", + "version": 1, + "sequence_location": { + "id": 777260, + "version": 59, + "name": "c.520C>T", + "chromosomelocation": "g.201091993G>A", + "genelocation": "g.25574C>T", + "proteinlocation": "p.R174W", + "genesymbol": "CACNA1S", + "dbsnpid": "rs772226819", + "position": 201091993 + } + }, + { + "alleledefinitionid": 777262, + "locationid": 777261, + "variantallele": "C", + "version": 1, + "sequence_location": { + "id": 777261, + "version": 59, + "name": "c.3257G>A", + "chromosomelocation": "g.201060815C>T", + "genelocation": "g.56752G>A", + "proteinlocation": "p.R1086H", + "genesymbol": "CACNA1S", + "dbsnpid": "rs1800559", + "position": 201060815 + } + } + ] + } +] \ No newline at end of file diff --git a/test_data/CACNA1S/bad_hom_ps.vcf.gz 
b/test_data/CACNA1S/bad_hom_ps.vcf.gz new file mode 100644 index 0000000..2218252 Binary files /dev/null and b/test_data/CACNA1S/bad_hom_ps.vcf.gz differ diff --git a/test_data/CACNA1S/bad_hom_ps.vcf.gz.tbi b/test_data/CACNA1S/bad_hom_ps.vcf.gz.tbi new file mode 100644 index 0000000..1aa4663 Binary files /dev/null and b/test_data/CACNA1S/bad_hom_ps.vcf.gz.tbi differ diff --git a/test_data/CACNA1S/compound_het.vcf.gz b/test_data/CACNA1S/compound_het.vcf.gz new file mode 100644 index 0000000..cf203e6 Binary files /dev/null and b/test_data/CACNA1S/compound_het.vcf.gz differ diff --git a/test_data/CACNA1S/compound_het.vcf.gz.tbi b/test_data/CACNA1S/compound_het.vcf.gz.tbi new file mode 100644 index 0000000..bf45a36 Binary files /dev/null and b/test_data/CACNA1S/compound_het.vcf.gz.tbi differ diff --git a/test_data/CACNA1S/database.json b/test_data/CACNA1S/database.json new file mode 100644 index 0000000..804f0e7 --- /dev/null +++ b/test_data/CACNA1S/database.json @@ -0,0 +1,56 @@ +{ + "database_metadata": { + "pbstarphase_version": "0.1.0-9ac34a4", + "cpic_version": "API-2023-09-05T14:03:45.314899420Z", + "build_time": "2023-09-05T14:03:45.314899420Z", + "hla_version" : "fake_version", + "pharmvar_version" : "fake_version" + }, + "gene_entries": { + "CACNA1S": { + "gene_name": "CACNA1S", + "chromosome": "chr1", + "variants": { + "777260": { + "name" : "faux", + "dbsnp_id": "rs772226819", + "position": 201091993, + "alleles": [ + "G", + "A" + ] + }, + "777261": { + "name" : "faux", + "dbsnp_id": "rs1800559", + "position": 201060815, + "alleles": [ + "C", + "T" + ] + } + }, + "defined_haplotypes": { + "Reference": { + "haplotype": { + "777260": "G", + "777261": "C" + } + }, + "c.3257G>A": { + "haplotype": { + "777261": "T" + } + }, + "c.520C>T": { + "haplotype": { + "777260": "A" + } + } + }, + "reference_allele": "Reference" + } + }, + "hla_sequences" : {}, + "cyp2d6_gene_def" : {} +} \ No newline at end of file diff --git a/test_data/CACNA1S/het.vcf.gz b/test_data/CACNA1S/het.vcf.gz new file mode 100644 index 0000000..9719909 Binary files /dev/null and b/test_data/CACNA1S/het.vcf.gz differ diff --git a/test_data/CACNA1S/het.vcf.gz.tbi b/test_data/CACNA1S/het.vcf.gz.tbi new file mode 100644 index 0000000..7435448 Binary files /dev/null and b/test_data/CACNA1S/het.vcf.gz.tbi differ diff --git a/test_data/CACNA1S/hom.vcf.gz b/test_data/CACNA1S/hom.vcf.gz new file mode 100644 index 0000000..0f0858f Binary files /dev/null and b/test_data/CACNA1S/hom.vcf.gz differ diff --git a/test_data/CACNA1S/hom.vcf.gz.tbi b/test_data/CACNA1S/hom.vcf.gz.tbi new file mode 100644 index 0000000..7435448 Binary files /dev/null and b/test_data/CACNA1S/hom.vcf.gz.tbi differ diff --git a/test_data/CYP2D6_configs/full_length.json b/test_data/CYP2D6_configs/full_length.json new file mode 100644 index 0000000..0a3ce63 --- /dev/null +++ b/test_data/CYP2D6_configs/full_length.json @@ -0,0 +1,236 @@ +{ + "cyp_coordinates": { + "CYP2D6": { + "chrom": "chr22", + "start": 42126259, + "end": 42132424 + }, + "CYP2D6_wfa_backbone": { + "chrom": "chr22", + "start": 42126259, + "end": 42132424 + }, + "CYP2D7": { + "chrom": "chr22", + "start": 42139965, + "end": 42145903 + }, + "REP6": { + "chrom": "chr22", + "start": 42123191, + "end": 42125963 + }, + "REP7": { + "chrom": "chr22", + "start": 42135343, + "end": 42138115 + }, + "link_region": { + "chrom": "chr22", + "start": 42132423, + "end": 42135344 + }, + "spacer": { + "chrom": "chr22", + "start": 42138114, + "end": 42139679 + } + }, + "cyp_regions": { + "CYP2D6": { + "exon1": { + 
"chrom": "chr22", + "start": 42130611, + "end": 42130810 + }, + "exon2": { + "chrom": "chr22", + "start": 42129737, + "end": 42129909 + }, + "exon3": { + "chrom": "chr22", + "start": 42129032, + "end": 42129185 + }, + "exon4": { + "chrom": "chr22", + "start": 42128783, + "end": 42128944 + }, + "exon5": { + "chrom": "chr22", + "start": 42128173, + "end": 42128350 + }, + "exon6": { + "chrom": "chr22", + "start": 42127841, + "end": 42127983 + }, + "exon7": { + "chrom": "chr22", + "start": 42127446, + "end": 42127634 + }, + "exon8": { + "chrom": "chr22", + "start": 42126850, + "end": 42126992 + }, + "exon9": { + "chrom": "chr22", + "start": 42126498, + "end": 42126752 + } + }, + "CYP2D7": { + "exon1": { + "chrom": "chr22", + "start": 42144283, + "end": 42144483 + }, + "exon2": { + "chrom": "chr22", + "start": 42143409, + "end": 42143581 + }, + "exon3": { + "chrom": "chr22", + "start": 42142727, + "end": 42142880 + }, + "exon4": { + "chrom": "chr22", + "start": 42142478, + "end": 42142639 + }, + "exon5": { + "chrom": "chr22", + "start": 42141867, + "end": 42142044 + }, + "exon6": { + "chrom": "chr22", + "start": 42141533, + "end": 42141675 + }, + "exon7": { + "chrom": "chr22", + "start": 42141151, + "end": 42141339 + }, + "exon8": { + "chrom": "chr22", + "start": 42140554, + "end": 42140696 + }, + "exon9": { + "chrom": "chr22", + "start": 42140202, + "end": 42140456 + } + } + }, + "cyp2d6_star5_del": { + "chrom": "chr22", + "start": 42123191, + "end": 42135343 + }, + "cyp_translate": { + "CYP2D6::CYP2D7::exon2": "68", + "CYP2D6::CYP2D7::exon8": "61", + "CYP2D6::CYP2D7::intron1": "68", + "CYP2D6::CYP2D7::intron8": "63", + "CYP2D7::CYP2D6::exon2": "13", + "CYP2D7::CYP2D6::exon3": "13", + "CYP2D7::CYP2D6::exon4": "13", + "CYP2D7::CYP2D6::exon5": "13", + "CYP2D7::CYP2D6::exon6": "13", + "CYP2D7::CYP2D6::exon7": "13", + "CYP2D7::CYP2D6::exon8": "13", + "CYP2D7::CYP2D6::exon9": "13", + "CYP2D7::CYP2D6::intron1": "13", + "CYP2D7::CYP2D6::intron2": "13", + "CYP2D7::CYP2D6::intron3": "13", + "CYP2D7::CYP2D6::intron4": "13", + "CYP2D7::CYP2D6::intron5": "13", + "CYP2D7::CYP2D6::intron6": "13", + "CYP2D7::CYP2D6::intron7": "13", + "CYP2D7::CYP2D6::intron8": "13" + }, + "inferred_connections": [ + [ + "*1", + "*1" + ], + [ + "*10", + "*10" + ], + [ + "*10", + "*36" + ], + [ + "*146", + "*146" + ], + [ + "*17", + "*17" + ], + [ + "*2", + "*2" + ], + [ + "*28", + "*28" + ], + [ + "*29", + "*29" + ], + [ + "*3", + "*3" + ], + [ + "*35", + "*35" + ], + [ + "*4", + "*4" + ], + [ + "*4", + "*68" + ], + [ + "*41", + "*41" + ], + [ + "*43", + "*43" + ], + [ + "*45", + "*45" + ], + [ + "*6", + "*6" + ], + [ + "*9", + "*9" + ] + ], + "unexpected_singletons": [ + "*36", + "*68" + ] +} \ No newline at end of file diff --git a/test_data/CYP2D6_configs/missing_exons.json b/test_data/CYP2D6_configs/missing_exons.json new file mode 100644 index 0000000..1cb0ca7 --- /dev/null +++ b/test_data/CYP2D6_configs/missing_exons.json @@ -0,0 +1,231 @@ +{ + "cyp_coordinates": { + "CYP2D6": { + "chrom": "chr22", + "start": 42126259, + "end": 42132424 + }, + "CYP2D6_wfa_backbone": { + "chrom": "chr22", + "start": 42126259, + "end": 42132424 + }, + "CYP2D7": { + "chrom": "chr22", + "start": 42139965, + "end": 42145903 + }, + "REP6": { + "chrom": "chr22", + "start": 42123191, + "end": 42125963 + }, + "REP7": { + "chrom": "chr22", + "start": 42135343, + "end": 42138115 + }, + "link_region": { + "chrom": "chr22", + "start": 42132423, + "end": 42135344 + }, + "spacer": { + "chrom": "chr22", + "start": 42138114, + "end": 42139679 + } + }, + 
"cyp_regions": { + "CYP2D6": { + "exon1": { + "chrom": "chr22", + "start": 42130611, + "end": 42130810 + }, + "exon3": { + "chrom": "chr22", + "start": 42129032, + "end": 42129185 + }, + "exon4": { + "chrom": "chr22", + "start": 42128783, + "end": 42128944 + }, + "exon5": { + "chrom": "chr22", + "start": 42128173, + "end": 42128350 + }, + "exon6": { + "chrom": "chr22", + "start": 42127841, + "end": 42127983 + }, + "exon7": { + "chrom": "chr22", + "start": 42127446, + "end": 42127634 + }, + "exon8": { + "chrom": "chr22", + "start": 42126850, + "end": 42126992 + }, + "exon9": { + "chrom": "chr22", + "start": 42126498, + "end": 42126752 + } + }, + "CYP2D7": { + "exon1": { + "chrom": "chr22", + "start": 42144283, + "end": 42144483 + }, + "exon2": { + "chrom": "chr22", + "start": 42143409, + "end": 42143581 + }, + "exon3": { + "chrom": "chr22", + "start": 42142727, + "end": 42142880 + }, + "exon4": { + "chrom": "chr22", + "start": 42142478, + "end": 42142639 + }, + "exon5": { + "chrom": "chr22", + "start": 42141867, + "end": 42142044 + }, + "exon6": { + "chrom": "chr22", + "start": 42141533, + "end": 42141675 + }, + "exon7": { + "chrom": "chr22", + "start": 42141151, + "end": 42141339 + }, + "exon8": { + "chrom": "chr22", + "start": 42140554, + "end": 42140696 + }, + "exon9": { + "chrom": "chr22", + "start": 42140202, + "end": 42140456 + } + } + }, + "cyp2d6_star5_del": { + "chrom": "chr22", + "start": 42123191, + "end": 42135343 + }, + "cyp_translate": { + "CYP2D6::CYP2D7::exon2": "68", + "CYP2D6::CYP2D7::exon8": "61", + "CYP2D6::CYP2D7::intron1": "68", + "CYP2D6::CYP2D7::intron8": "63", + "CYP2D7::CYP2D6::exon2": "13", + "CYP2D7::CYP2D6::exon3": "13", + "CYP2D7::CYP2D6::exon4": "13", + "CYP2D7::CYP2D6::exon5": "13", + "CYP2D7::CYP2D6::exon6": "13", + "CYP2D7::CYP2D6::exon7": "13", + "CYP2D7::CYP2D6::exon8": "13", + "CYP2D7::CYP2D6::exon9": "13", + "CYP2D7::CYP2D6::intron1": "13", + "CYP2D7::CYP2D6::intron2": "13", + "CYP2D7::CYP2D6::intron3": "13", + "CYP2D7::CYP2D6::intron4": "13", + "CYP2D7::CYP2D6::intron5": "13", + "CYP2D7::CYP2D6::intron6": "13", + "CYP2D7::CYP2D6::intron7": "13", + "CYP2D7::CYP2D6::intron8": "13" + }, + "inferred_connections": [ + [ + "*1", + "*1" + ], + [ + "*10", + "*10" + ], + [ + "*10", + "*36" + ], + [ + "*146", + "*146" + ], + [ + "*17", + "*17" + ], + [ + "*2", + "*2" + ], + [ + "*28", + "*28" + ], + [ + "*29", + "*29" + ], + [ + "*3", + "*3" + ], + [ + "*35", + "*35" + ], + [ + "*4", + "*4" + ], + [ + "*4", + "*68" + ], + [ + "*41", + "*41" + ], + [ + "*43", + "*43" + ], + [ + "*45", + "*45" + ], + [ + "*6", + "*6" + ], + [ + "*9", + "*9" + ] + ], + "unexpected_singletons": [ + "*36", + "*68" + ] +} \ No newline at end of file diff --git a/test_data/CYP2D6_configs/missing_regions.json b/test_data/CYP2D6_configs/missing_regions.json new file mode 100644 index 0000000..7612c86 --- /dev/null +++ b/test_data/CYP2D6_configs/missing_regions.json @@ -0,0 +1,231 @@ +{ + "cyp_coordinates": { + "CYP2D6_wfa_backbone": { + "chrom": "chr22", + "start": 42126259, + "end": 42132424 + }, + "CYP2D7": { + "chrom": "chr22", + "start": 42139965, + "end": 42145903 + }, + "REP6": { + "chrom": "chr22", + "start": 42123191, + "end": 42125963 + }, + "REP7": { + "chrom": "chr22", + "start": 42135343, + "end": 42138115 + }, + "link_region": { + "chrom": "chr22", + "start": 42132423, + "end": 42135344 + }, + "spacer": { + "chrom": "chr22", + "start": 42138114, + "end": 42139679 + } + }, + "cyp_regions": { + "CYP2D6": { + "exon1": { + "chrom": "chr22", + "start": 42130611, + "end": 42130810 
+ }, + "exon2": { + "chrom": "chr22", + "start": 42129737, + "end": 42129909 + }, + "exon3": { + "chrom": "chr22", + "start": 42129032, + "end": 42129185 + }, + "exon4": { + "chrom": "chr22", + "start": 42128783, + "end": 42128944 + }, + "exon5": { + "chrom": "chr22", + "start": 42128173, + "end": 42128350 + }, + "exon6": { + "chrom": "chr22", + "start": 42127841, + "end": 42127983 + }, + "exon7": { + "chrom": "chr22", + "start": 42127446, + "end": 42127634 + }, + "exon8": { + "chrom": "chr22", + "start": 42126850, + "end": 42126992 + }, + "exon9": { + "chrom": "chr22", + "start": 42126498, + "end": 42126752 + } + }, + "CYP2D7": { + "exon1": { + "chrom": "chr22", + "start": 42144283, + "end": 42144483 + }, + "exon2": { + "chrom": "chr22", + "start": 42143409, + "end": 42143581 + }, + "exon3": { + "chrom": "chr22", + "start": 42142727, + "end": 42142880 + }, + "exon4": { + "chrom": "chr22", + "start": 42142478, + "end": 42142639 + }, + "exon5": { + "chrom": "chr22", + "start": 42141867, + "end": 42142044 + }, + "exon6": { + "chrom": "chr22", + "start": 42141533, + "end": 42141675 + }, + "exon7": { + "chrom": "chr22", + "start": 42141151, + "end": 42141339 + }, + "exon8": { + "chrom": "chr22", + "start": 42140554, + "end": 42140696 + }, + "exon9": { + "chrom": "chr22", + "start": 42140202, + "end": 42140456 + } + } + }, + "cyp2d6_star5_del": { + "chrom": "chr22", + "start": 42123191, + "end": 42135343 + }, + "cyp_translate": { + "CYP2D6::CYP2D7::exon2": "68", + "CYP2D6::CYP2D7::exon8": "61", + "CYP2D6::CYP2D7::intron1": "68", + "CYP2D6::CYP2D7::intron8": "63", + "CYP2D7::CYP2D6::exon2": "13", + "CYP2D7::CYP2D6::exon3": "13", + "CYP2D7::CYP2D6::exon4": "13", + "CYP2D7::CYP2D6::exon5": "13", + "CYP2D7::CYP2D6::exon6": "13", + "CYP2D7::CYP2D6::exon7": "13", + "CYP2D7::CYP2D6::exon8": "13", + "CYP2D7::CYP2D6::exon9": "13", + "CYP2D7::CYP2D6::intron1": "13", + "CYP2D7::CYP2D6::intron2": "13", + "CYP2D7::CYP2D6::intron3": "13", + "CYP2D7::CYP2D6::intron4": "13", + "CYP2D7::CYP2D6::intron5": "13", + "CYP2D7::CYP2D6::intron6": "13", + "CYP2D7::CYP2D6::intron7": "13", + "CYP2D7::CYP2D6::intron8": "13" + }, + "inferred_connections": [ + [ + "*1", + "*1" + ], + [ + "*10", + "*10" + ], + [ + "*10", + "*36" + ], + [ + "*146", + "*146" + ], + [ + "*17", + "*17" + ], + [ + "*2", + "*2" + ], + [ + "*28", + "*28" + ], + [ + "*29", + "*29" + ], + [ + "*3", + "*3" + ], + [ + "*35", + "*35" + ], + [ + "*4", + "*4" + ], + [ + "*4", + "*68" + ], + [ + "*41", + "*41" + ], + [ + "*43", + "*43" + ], + [ + "*45", + "*45" + ], + [ + "*6", + "*6" + ], + [ + "*9", + "*9" + ] + ], + "unexpected_singletons": [ + "*36", + "*68" + ] +} \ No newline at end of file diff --git a/test_data/HLA-faux/database.json b/test_data/HLA-faux/database.json new file mode 100644 index 0000000..2eaa5dc --- /dev/null +++ b/test_data/HLA-faux/database.json @@ -0,0 +1,37 @@ +{ + "database_metadata": { + "pbstarphase_version": "0.8.0-79d4679", + "cpic_version": "API-2023-12-19T16:11:50.938951041Z", + "hla_version": "v.354.0-alpha", + "pharmvar_version" : "fake_version", + "build_time": "2023-12-19T16:11:50.938951041Z" + }, + "gene_entries": {}, + "hla_sequences": { + "HLA:HLA00037": { + "hla_id": "HLA:HLA00037", + "gene_name": "HLA-A", + "star_allele": [ + "03", + "01", + "01", + "01" + ], + "dna_sequence": 
"CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGAGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTTCATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGCCCGCACGCACCCACCGGGACTCAGATTCTCCCCAGACGCCGAGGATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGGGTGAGTGCGGGGTCGGGAGGGAAACCGCCTCTGCGGGGAGAAGCAAGGGGCCCTCCTGGCGGGGGCGCAGGACCGGGGGAGCCGCGCCGGGACGAGGGTCGGGCAGGTCTCAGCCACTGCTCGCCCCCAGGCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAGGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATGTGAAGGCCCAGTCACAGACTGACCGAGTGGACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGCCGGTGAGTGACCCCGGCCGGGGGCGCAGGTCAGGACCCCTCATCCCCCACGGACGGGCCAGGTCGCCCACAGTCTCCGGGTCCGAGATCCACCCCGAAGCCGCGGGACCCCGAGACCCTTGCCCCGGGAGAGGCCCAGGCGCCTTTACCCGGTTTCATTTTCAGTTTAGGCCAAAAATCCCCCCGGGTTGGTCGGGGCTGGGCGGGGCTCGGGGGACTGGGCTGACCGCGGGGTCGGGGCCAGGTTCTCACACCATCCAGATAATGTATGGCTGCGACGTGGGGTCGGACGGGCGCTTCCTCCGCGGGTACCGGCAGGACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCTTGGACCGCGGCGGACATGGCGGCTCAGATCACCAAGCGCAAGTGGGAGGCGGCCCATGAGGCGGAGCAGTTGAGAGCCTACCTGGATGGCACGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCACGGGTACCAGGGGCCACGGGGCGCCTCCCTGATCGCCTGTAGATCTCCCGGGCTGGCCTCCCACAAGGAGGGGAGACAATTGGGACCAACACTAGAATATCACCCTCCCTCTGGTCCTGAGGGAGAGGAATCCTCCTGGGTTCCAGATCCTGTACCAGAGAGTGACTCTGAGGTTCCGCCCTGCTCTCTGACACAATTAAGGGATAAAATCTCTGAAGGAGTGACGGGAAGACGATCCCTCGAATACTGATGAGTGGTTCCCTTTGACACCGGCAGCAGCCTTGGGCCCGTGACTTTTCCTCTCAGGCCTTGTTCTCTGCTTCACACTCAATGTGTGTGGGGGTCTGAGTCCAGCACTTCTGAGTCCCTCAGCCTCCACTCAGGTCAGGACCAGAAGTCGCTGTTCCCTTCTCAGGGAATAGAAGATTATCCCAGGTGCCTGTGTCCAGGCTGGTGTCTGGGTTCTGTGCTCTCTTCCCCATCCCGGGTGTCCTGTCCATTCTCAAGATGGCCACATGCGTGCTGGTGGAGTGTCCCATGACAGATGCAAAATGCCTGAATTTTCTGACTCTTCCCGTCAGACCCCCCCAAGACACATATGACCCACCACCCCATCTCTGACCATGAGGCCACCCTGAGGTGCTGGGCCCTGGGCTTCTACCCTGCGGAGATCACACTGACCTGGCAGCGGGATGGGGAGGACCAGACCCAGGACACGGAGCTCGTGGAGACCAGGCCTGCAGGGGATGGAACCTTCCAGAAGTGGGCGGCTGTGGTGGTGCCTTCTGGAGAGGAGCAGAGATACACCTGCCATGTGCAGCATGAGGGTCTGCCCAAGCCCCTCACCCTGAGATGGGGTAAGGAGGGAGATGGGGGTGTCATGTCTCTTAGGGAAAGCAGGAGCCTCTCTGGAGACCTTTAGCAGGGTCAGGGCCCCTCACCTTCCCCTCTTTTCCCAGAGCTGTCTTCCCAGCCCACCATCCCCATCGTGGGCATCATTGCTGGCCTGGTTCTCCTTGGAGCTGTGATCACTGGAGCTGTGGTCGCTGCCGTGATGTGGAGGAGGAAGAGCTCAGGTGGAGAAGGGGTGAAGGGTGGGGTCTGAGATTTCTTGTCTCACTGAGGGTTCCAAGCCCCAGCTAGAAATGTGCCCTGTCTCATTACTGGGAAGCACCGTCCACAATCATGGGCCTACCCAGTCTGGGCCCTGTGTGCCAGCACTTACTCTTTTGTAAAGCACCTGTTAAAATGAAGGACAGATTTATCACCTTGATTACGGCGGTGATGGGACCTGATCCCAGCAGTCACAAGTCACAGGGGAAGGTCCCTGAGGACAGACCTCAGGAGGGCTATTGGTCCAGGACCCACACCTGCTTTCTTCATGTTTCCTGATCCCGCCCTGGGTCTGCAGTCACACATTTCTGGAAACTTCTCTGGGGTCCAAGACTAGGAGGTTCCTCTAGGACCTTAAGGCCCTGGCTCCTTTCTGGTATCTCACAGGACATTTTCTTCTCACAGATAGAAAAGGAGGGAGTTACACTCAGGCTGCAAGTAAGTATGAAGGAGGCTGATGCCTGAGGTCCTTGGGATATTGTGTTTGGGAGCCCATGGGGGAGCCCACCCACCTCACAATTCCTCCTCTAGCCACATCTTCTGTGGGATCTGACCAGGTTCTGTTTTTGTTCTACCCCAGGCAGTGACAGTGCCCAGGGCTCTGATGTGTCCCTCACAGCTTGTAAAGGTGAGAGCTTGGAGGACCTAATGTGTGTTGGGTGTTGGGCGGAACAGTGGACACAGCTGTGCTATGGGGTTTCTTTGCATTGGATGTATTGAGCATGCGATGGGCTGTTTAAGGTGTGACCCCTCACTGTGATGGATATGAATTTGTTCATGAATATTTTTTTCTATAGTGTGAGACAGCTGCCTTGTGTGGGACTGAGAGGCAAGAGTTGTTCCTGCCCTTCCCTTTGTGACTTGAAGAACCCTGACTTTGTTTCTGCAAAGGCACCTGCATGTGTCTGTGTTCGTGTAGGCATAATGTGAGGAGGTGGGGAGACCACCCCACCCCCATGTCCACCATGACCCTCTTCCCACGCTGACCTGTGCTCCCTCCCCAATCATCTTTCCTGTTCCAGAGAGGTGGGGCTGAGGTGTCTCCATCTCTGTCTCAACTTCATGGTGCACTGAGCTGTAACTTCTTCCTTCCCTATTAAAA", + "cdna_sequence": 
"ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGGGCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAGGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATGTGAAGGCCCAGTCACAGACTGACCGAGTGGACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGCCGGTTCTCACACCATCCAGATAATGTATGGCTGCGACGTGGGGTCGGACGGGCGCTTCCTCCGCGGGTACCGGCAGGACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCTTGGACCGCGGCGGACATGGCGGCTCAGATCACCAAGCGCAAGTGGGAGGCGGCCCATGAGGCGGAGCAGTTGAGAGCCTACCTGGATGGCACGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCACGGACCCCCCCAAGACACATATGACCCACCACCCCATCTCTGACCATGAGGCCACCCTGAGGTGCTGGGCCCTGGGCTTCTACCCTGCGGAGATCACACTGACCTGGCAGCGGGATGGGGAGGACCAGACCCAGGACACGGAGCTCGTGGAGACCAGGCCTGCAGGGGATGGAACCTTCCAGAAGTGGGCGGCTGTGGTGGTGCCTTCTGGAGAGGAGCAGAGATACACCTGCCATGTGCAGCATGAGGGTCTGCCCAAGCCCCTCACCCTGAGATGGGAGCTGTCTTCCCAGCCCACCATCCCCATCGTGGGCATCATTGCTGGCCTGGTTCTCCTTGGAGCTGTGATCACTGGAGCTGTGGTCGCTGCCGTGATGTGGAGGAGGAAGAGCTCAGATAGAAAAGGAGGGAGTTACACTCAGGCTGCAAGCAGTGACAGTGCCCAGGGCTCTGATGTGTCCCTCACAGCTTGTAAAGTGTGA" + }, + "HLA:HLA00132": { + "hla_id": "HLA:HLA00132", + "gene_name": "HLA-B", + "star_allele": [ + "07", + "02", + "01", + "01" + ], + "dna_sequence": "GATCAGGACGAAGTCCCAGGTCCCGGACGGGGCTCTCAGGGTCTCAGGCTCCGAGGGCCGCGTCTGCAATGGGGAGGCGCAGCGTTGGGGATTCCCCACTCCCCTGAGTTTCACTTCTTCTCCCAACTTGTGTCGGGTCCTTCTTCCAGGATACTCGTGACGCGTCCCCACTTCCCACTCCCATTGGGTATTGGATATCTAGAGAAGCCAATCAGCGTCGCCGCGGTCCCAGTTCTAAAGTCCCCACGCACCCACCCGGACTCAGAGTCTCCTCAGACGCCGAGATGCTGGTCATGGCGCCCCGAACCGTCCTCCTGCTGCTCTCGGCGGCCCTGGCCCTGACCGAGACCTGGGCCGGTGAGTGCGGGTCGGGAGGGAAATGGCCTCTGCCGGGAGGAGCGAGGGGACCGCAGGCGGGGGCGCAGGACCTGAGGAGCCGCGCCGGGAGGAGGGTCGGGCGGGTCTCAGCCCCTCCTCACCCCCAGGCTCCCACTCCATGAGGTATTTCTACACCTCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCTCAGTGGGCTACGTGGACGACACCCAGTTCGTGAGGTTCGACAGCGACGCCGCGAGTCCGAGAGAGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCGGAACACACAGATCTACAAGGCCCAGGCACAGACTGACCGAGAGAGCCTGCGGAACCTGCGCGGCTACTACAACCAGAGCGAGGCCGGTGAGTGACCCCGGCCCGGGGCGCAGGTCACGACTCCCCATCCCCCACGTACGGCCCGGGTCGCCCCGAGTCTCCGGGTCCGAGATCCGCCTCCCTGAGGCCGCGGGACCCGCCCAGACCCTCGACCGGCGAGAGCCCCAGGCGCGTTTACCCGGTTTCATTTTCAGTTGAGGCCAAAATCCCCGCGGGTTGGTCGGGGCGGGGCGGGGCTCGGGGGACTGGGCTGACCGCGGGGCCGGGGCCAGGGTCTCACACCCTCCAGAGCATGTACGGCTGCGACGTGGGGCCGGACGGGCGCCTCCTCCGCGGGCATGACCAGTACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCCGCGGACACGGCGGCTCAGATCACCCAGCGCAAGTGGGAGGCGGCCCGTGAGGCGGAGCAGCGGAGAGCCTACCTGGAGGGCGAGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACAAGCTGGAGCGCGCTGGTACCAGGGGCAGTGGGGAGCCTTCCCCATCTCCTATAGGTCGCCGGGGATGGCCTCCCACGAGAAGAGGAGGAAAATGGGATCAGCGCTAGAATGTCGCCCTCCGTTGAATGGAGAATGGCATGAGTTTTCCTGAGTTTCCTCTGAGGGCCCCCTCTTCTCTCTAGACAATTAAGGAATGACGTCTCTGAGGAAATGGAGGGGAAGACAGTCCCTAGAATACTGATCAGGGGTCCCCTTTGACCCCTGCAGCAGCCTTGGGAACCGTGACTTTTCCTCTCAGGCCTTGTTCTCTGCCTCACACTCAGTGTGTTTGGGGCTCTGATTCCAGCACTTCTGAGTCACTTTACCTCCACTCAGATCAGGAGCAGAAGTCCCTGTTCCCCGCTCAGAGACTCGAACTTTCCAATGAATAGGAGATTATCCCAGGTGCCTGCGTCCAGGCTGGTGTCTGGGTTCTGTGCCCCTTCCCCACCCCAGGTGTCCTGTCCATTCTCAGGCTGGTCACATGGGTGGTCCTAGGGTGTCCCATGAAAGATGCAAAGCGCCTGAATTTTCTGACTCTTCCCATCAGACCCCCCAAAGACACACGTGACCCACCACCCCATCTCTGACCATGAGGCCACCCTGAGGTGCTGGGCCCTGGGTTTCTACCCTGCGGAGATCACACTGACCTGGCAGCGGGATGGCGAGGACCAAACTCAGGACACTGAGCTTGTGGAGACCAGACCAGCAGGAGATAGAACCTTCCAGAAGTGGGCAGCTGTGGTGGTGCCTTCTGGAGAAGAGCAGAGATACACATGCCATGTACAGCATGAGGGGCTGCCGAAGCCCCTCACCCTGAGATGGGGTAAGGAGGGGGATGAGGGGTCATATCTCTTCTCAGGGAAAGCAGGAGCCCTTCAGCAGGGTCAGGGCCCCTCATCTTCCCCTCCTTTCCCAGAGCCGTCTTCCCAGTCCACCGTCCCCATCGTGGGCATTGTTGCTGGCCTGGCTGTCCTAGCAGTTGTGGTCATCGGAGCTGTGG
TCGCTGCTGTGATGTGTAGGAGGAAGAGTTCAGGTAGGGAAGGGGTGAGGGGTGGGGTCTGGGTTTTCTTGTCCCACTGGGGGTTTCAAGCCCCAGGTAGAAGTGTTCCCTGCCTCATTACTGGGAAGCAGCATGCACACAGGGGCTAACGCAGCCTGGGACCCTGTGTGCCAGCACTTACTCTTTTGTGCAGCACATGTGACAATGAAGGATGGATGTATCACCTTGATGGTTGTGGTGTTGGGGTCCTGATTCCAGCATTCATGAGTCAGGGGAAGGTCCCTGCTAAGGACAGACCTTAGGAGGGCAGTTGGTCCAGGACCCACACTTGCTTTCCTCGTGTTTCCTGATCCTGCCCTGGGTCTGTAGTCATACTTCTGGAAATTCCTTTTGGGTCCAAGACTAGGAGGTTCCTCTAAGATCTCATGGCCCTGCTTCCTCCCAGTGCCCTCACAGGACATTTTCTTCCCACAGGTGGAAAAGGAGGGAGCTACTCTCAGGCTGCGTGTAAGTGGTGGGGGTGGGAGTGTGGAGGAGCTCACCCACCCCATAATTCCTCCTGTCCCACGTCTCCTGCGGGCTCTGACCAGGTCCTGTTTTTGTTCTACTCCAGGCAGCGACAGTGCCCAGGGCTCTGATGTGTCTCTCACAGCTTGAAAAGGTGAGATTCTTGGGGTCTAGAGTGGGTGGGGTGGCGGGTCTGGGGGTGGGTGGGGCAGAGGGGAAAGGCCTGGGTAATGGGGATTCTTTGATTGGGATGTTTCGCGTGTGTGGTGGGCTGTTTAGAGTGTCATCGCTTACCATGACTAACCAGAATTTGTTCATGACTGTTGTTTTCTGTAGCCTGAGACAGCTGTCTTGTGAGGGACTGAGATGCAGGATTTCTTCACGCCTCCCCTTTGTGACTTCAAGAGCCTCTGGCATCTCTTTCTGCAAAGGCACCTGAATGTGTCTGCGTCCCTGTTAGCATAATGTGAGGAGGTGGAGAGACAGCCCACCCTTGTGTCCACTGTGACCCCTGTTCCCATGCTGACCTGTGTTTCCTCCCCAGTCATCTTTCTTGTTCCAGAGAGGTGGGGCTGGATGTCTCCATCTCTGTCTCAACTTTACGTGCACTGAGCTGCAACTTCTTACTTCCCTACTGAAAATAAGAATCTGAATATAAATTTGTTTTCTCAAATATTTGCTATGAGAGGTTGATGGATTAATTAAATAAGTCAATTCCTGGAATTTGAGAGAGCAAATAAAGACCTGAGAACCTTCCAGAATCTGCATGTTCGCTGTGCTGAGTCTGTTGCAGGTGGGGTGTGGAGAAGGCTGTGGGGGGCCGAGTGTGGATGGGGCCTGTGCCCATTTGGTGTTGAGTCCATCATGGGCTTTATGTGGTTAGTCCTCAGCTGGGTCACCTTCACTGCTCCATTGTCCTTGTCCCTTCAGTGGAAACTTGTCCAGTGGGAGCTGTGACCACAGAGGCTCACACATCGCCCAGGGCGGCCCCTGCACACGGGGGTCTCTGTGCATTCTGAGACAAATTTTCAGAGCCATTCACCTCCTGCTCTGCTTCTAGAGCTCCTTTTCTGCTCTGCTCTTCTGCCCTCTCTCCCTGCCCTGGTTCTAGTGATCTTGGTGCTGAATCCAATCCCAACTCATGAATCTGTAAAGCAGAGTCTAATTTAGACTTACATTTGTCTGTGAAATTGGACCCGTCATCAAGGACTGTTCTTTCCTGAAGAGAGAACCTGATTGTGTGCTGCAGTGTGCTGGGGCAGGGGGTGCGG", + "cdna_sequence": "ATGCTGGTCATGGCGCCCCGAACCGTCCTCCTGCTGCTCTCGGCGGCCCTGGCCCTGACCGAGACCTGGGCCGGCTCCCACTCCATGAGGTATTTCTACACCTCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCTCAGTGGGCTACGTGGACGACACCCAGTTCGTGAGGTTCGACAGCGACGCCGCGAGTCCGAGAGAGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCGGAACACACAGATCTACAAGGCCCAGGCACAGACTGACCGAGAGAGCCTGCGGAACCTGCGCGGCTACTACAACCAGAGCGAGGCCGGGTCTCACACCCTCCAGAGCATGTACGGCTGCGACGTGGGGCCGGACGGGCGCCTCCTCCGCGGGCATGACCAGTACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCCGCGGACACGGCGGCTCAGATCACCCAGCGCAAGTGGGAGGCGGCCCGTGAGGCGGAGCAGCGGAGAGCCTACCTGGAGGGCGAGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACAAGCTGGAGCGCGCTGACCCCCCAAAGACACACGTGACCCACCACCCCATCTCTGACCATGAGGCCACCCTGAGGTGCTGGGCCCTGGGTTTCTACCCTGCGGAGATCACACTGACCTGGCAGCGGGATGGCGAGGACCAAACTCAGGACACTGAGCTTGTGGAGACCAGACCAGCAGGAGATAGAACCTTCCAGAAGTGGGCAGCTGTGGTGGTGCCTTCTGGAGAAGAGCAGAGATACACATGCCATGTACAGCATGAGGGGCTGCCGAAGCCCCTCACCCTGAGATGGGAGCCGTCTTCCCAGTCCACCGTCCCCATCGTGGGCATTGTTGCTGGCCTGGCTGTCCTAGCAGTTGTGGTCATCGGAGCTGTGGTCGCTGCTGTGATGTGTAGGAGGAAGAGTTCAGGTGGAAAAGGAGGGAGCTACTCTCAGGCTGCGTGCAGCGACAGTGCCCAGGGCTCTGATGTGTCTCTCACAGCTTGA" + } + }, + "cyp2d6_gene_def" : {} +} \ No newline at end of file diff --git a/test_data/HLA_configs/full_length.json b/test_data/HLA_configs/full_length.json new file mode 100644 index 0000000..e0fa7ad --- /dev/null +++ b/test_data/HLA_configs/full_length.json @@ -0,0 +1,100 @@ +{ + "hla_coordinates": { + "HLA-A": { + "chrom": "chr6", + "start": 29942254, + "end": 29945870 + }, + "HLA-B": { + "chrom": "chr6", + "start": 31353362, + "end": 31357442 + } + }, + "hla_exons": { + "HLA-A": [ + { + "chrom": "chr6", + "start": 29942531, + "end": 29942626 + }, + { + "chrom": "chr6", + "start": 29942756, + "end": 29943026 + }, + { + "chrom": "chr6", + "start": 
29943267, + "end": 29943543 + }, + { + "chrom": "chr6", + "start": 29944121, + "end": 29944397 + }, + { + "chrom": "chr6", + "start": 29944499, + "end": 29944616 + }, + { + "chrom": "chr6", + "start": 29945058, + "end": 29945091 + }, + { + "chrom": "chr6", + "start": 29945233, + "end": 29945281 + }, + { + "chrom": "chr6", + "start": 29945450, + "end": 29945870 + } + ], + "HLA-B": [ + { + "chrom": "chr6", + "start": 31353874, + "end": 31354296 + }, + { + "chrom": "chr6", + "start": 31354478, + "end": 31354526 + }, + { + "chrom": "chr6", + "start": 31354632, + "end": 31354665 + }, + { + "chrom": "chr6", + "start": 31355106, + "end": 31355223 + }, + { + "chrom": "chr6", + "start": 31355316, + "end": 31355592 + }, + { + "chrom": "chr6", + "start": 31356166, + "end": 31356442 + }, + { + "chrom": "chr6", + "start": 31356687, + "end": 31356957 + }, + { + "chrom": "chr6", + "start": 31357085, + "end": 31357179 + } + ] + } +} \ No newline at end of file diff --git a/test_data/HLA_configs/missing_exons.json b/test_data/HLA_configs/missing_exons.json new file mode 100644 index 0000000..3b0e5f2 --- /dev/null +++ b/test_data/HLA_configs/missing_exons.json @@ -0,0 +1,95 @@ +{ + "hla_coordinates": { + "HLA-A": { + "chrom": "chr6", + "start": 29942254, + "end": 29945870 + }, + "HLA-B": { + "chrom": "chr6", + "start": 31353362, + "end": 31357442 + } + }, + "hla_exons": { + "HLA-A": [ + { + "chrom": "chr6", + "start": 29942531, + "end": 29942626 + }, + { + "chrom": "chr6", + "start": 29942756, + "end": 29943026 + }, + { + "chrom": "chr6", + "start": 29943267, + "end": 29943543 + }, + { + "chrom": "chr6", + "start": 29944121, + "end": 29944397 + }, + { + "chrom": "chr6", + "start": 29944499, + "end": 29944616 + }, + { + "chrom": "chr6", + "start": 29945058, + "end": 29945091 + }, + { + "chrom": "chr6", + "start": 29945233, + "end": 29945281 + }, + { + "chrom": "chr6", + "start": 29945450, + "end": 29945870 + } + ], + "HLA-B": [ + { + "chrom": "chr6", + "start": 31354478, + "end": 31354526 + }, + { + "chrom": "chr6", + "start": 31354632, + "end": 31354665 + }, + { + "chrom": "chr6", + "start": 31355106, + "end": 31355223 + }, + { + "chrom": "chr6", + "start": 31355316, + "end": 31355592 + }, + { + "chrom": "chr6", + "start": 31356166, + "end": 31356442 + }, + { + "chrom": "chr6", + "start": 31356687, + "end": 31356957 + }, + { + "chrom": "chr6", + "start": 31357085, + "end": 31357179 + } + ] + } +} \ No newline at end of file diff --git a/test_data/HLA_configs/missing_regions.json b/test_data/HLA_configs/missing_regions.json new file mode 100644 index 0000000..29323d1 --- /dev/null +++ b/test_data/HLA_configs/missing_regions.json @@ -0,0 +1,95 @@ +{ + "hla_coordinates": { + "HLA-A": { + "chrom": "chr6", + "start": 29942254, + "end": 29945870 + } + }, + "hla_exons": { + "HLA-A": [ + { + "chrom": "chr6", + "start": 29942531, + "end": 29942626 + }, + { + "chrom": "chr6", + "start": 29942756, + "end": 29943026 + }, + { + "chrom": "chr6", + "start": 29943267, + "end": 29943543 + }, + { + "chrom": "chr6", + "start": 29944121, + "end": 29944397 + }, + { + "chrom": "chr6", + "start": 29944499, + "end": 29944616 + }, + { + "chrom": "chr6", + "start": 29945058, + "end": 29945091 + }, + { + "chrom": "chr6", + "start": 29945233, + "end": 29945281 + }, + { + "chrom": "chr6", + "start": 29945450, + "end": 29945870 + } + ], + "HLA-B": [ + { + "chrom": "chr6", + "start": 31353874, + "end": 31354296 + }, + { + "chrom": "chr6", + "start": 31354478, + "end": 31354526 + }, + { + "chrom": "chr6", + "start": 31354632, + "end": 
31354665 + }, + { + "chrom": "chr6", + "start": 31355106, + "end": 31355223 + }, + { + "chrom": "chr6", + "start": 31355316, + "end": 31355592 + }, + { + "chrom": "chr6", + "start": 31356166, + "end": 31356442 + }, + { + "chrom": "chr6", + "start": 31356687, + "end": 31356957 + }, + { + "chrom": "chr6", + "start": 31357085, + "end": 31357179 + } + ] + } +} \ No newline at end of file diff --git a/test_data/README.md b/test_data/README.md new file mode 100644 index 0000000..aa2db3e --- /dev/null +++ b/test_data/README.md @@ -0,0 +1,7 @@ +# Test data files +A collection of input files for testing. + +1. CACNA1S - Contains the real CPIC database for CACNA1S, a relatively simple gene to test against + 1. `CPIC_API.json` - CPIC results for the gene _CACNA1S_ only; generated via [https://api.cpicpgx.org/v1/allele_definition?genesymbol=eq.CACNA1S&select=*,%20allele_location_value(*,%20sequence_location(*))&order=name](https://api.cpicpgx.org/v1/allele_definition?genesymbol=eq.CACNA1S&select=*,%20allele_location_value(*,%20sequence_location(*))&order=name) +2. RNR1-faux - Contains fake data simulating the poly-allelic calls present in RNR1. This primarily focuses on variants like rs1556422499, which take the form of T -> [delT, delinsCn]. +3. UGT1A1-faux - Contains fake data simulating calls present in UGT1A1. This primarily focuses on testing the various allele combinations when phased data is provided, including: same phase, opposite phase, homozygous + phased variant, and phased variants in different phase sets (PS tag). Real variants were altered to work with our small reference file and to simulate the real allelic combinations from UGT1A1. diff --git a/test_data/RNR1-faux/compound_het.vcf.gz b/test_data/RNR1-faux/compound_het.vcf.gz new file mode 100644 index 0000000..fe0107d Binary files /dev/null and b/test_data/RNR1-faux/compound_het.vcf.gz differ diff --git a/test_data/RNR1-faux/compound_het.vcf.gz.tbi b/test_data/RNR1-faux/compound_het.vcf.gz.tbi new file mode 100644 index 0000000..6680be0 Binary files /dev/null and b/test_data/RNR1-faux/compound_het.vcf.gz.tbi differ diff --git a/test_data/RNR1-faux/database.json b/test_data/RNR1-faux/database.json new file mode 100644 index 0000000..d836da4 --- /dev/null +++ b/test_data/RNR1-faux/database.json @@ -0,0 +1,47 @@ +{ + "database_metadata": { + "pbstarphase_version": "0.1.0-9ac34a4", + "cpic_version": "API-2023-09-05T14:03:45.314899420Z", + "build_time": "2023-09-05T14:03:45.314899420Z", + "hla_version" : "fake_version", + "pharmvar_version" : "fake_version" + }, + "gene_entries": { + "MT-RNR1": { + "gene_name": "MT-RNR1", + "chromosome": "chr2", + "variants": { + "1104230": { + "name" : "faux", + "dbsnp_id": "rs1556422499", + "position": 13, + "alleles": [ + "T", + "delT", + "delinsCC; delinsCCC; delinsCCCC; delinsCCCCC; delinsCCCCCC; delinsCCCCCCC" + ] + } + }, + "defined_haplotypes": { + "961T>del": { + "haplotype": { + "1104230": "delT" + } + }, + "961T>del+Cn": { + "haplotype": { + "1104230": "delinsCC; delinsCCC; delinsCCCC; delinsCCCCC; delinsCCCCCC; delinsCCCCCCC" + } + }, + "Reference": { + "haplotype": { + "1104230": "T" + } + } + }, + "reference_allele": "Reference" + } + }, + "hla_sequences" : {}, + "cyp2d6_gene_def" : {} +} \ No newline at end of file diff --git a/test_data/RNR1-faux/hom.vcf.gz b/test_data/RNR1-faux/hom.vcf.gz new file mode 100644 index 0000000..1bab1e2 Binary files /dev/null and b/test_data/RNR1-faux/hom.vcf.gz differ diff --git a/test_data/RNR1-faux/hom.vcf.gz.tbi
b/test_data/RNR1-faux/hom.vcf.gz.tbi new file mode 100644 index 0000000..ecd2a5d Binary files /dev/null and b/test_data/RNR1-faux/hom.vcf.gz.tbi differ diff --git a/test_data/UGT1A1-faux/database.json b/test_data/UGT1A1-faux/database.json new file mode 100644 index 0000000..8af6cb5 --- /dev/null +++ b/test_data/UGT1A1-faux/database.json @@ -0,0 +1,74 @@ +{ + "database_metadata": { + "pbstarphase_version": "0.1.0-9ac34a4", + "cpic_version": "API-2023-09-05T14:03:45.314899420Z", + "build_time": "2023-09-05T14:03:45.314899420Z", + "hla_version" : "fake_version", + "pharmvar_version" : "fake_version" + }, + "gene_entries": { + "UGT1A1": { + "gene_name": "UGT1A1", + "chromosome": "chr2", + "variants": { + "779484": { + "name" : "faux", + "dbsnp_id": "faux-rs887829", + "position": 2, + "alleles": [ + "C", + "T" + ] + }, + "1000510": { + "name" : "faux", + "dbsnp_id": "faux-rs3064744", + "position": 11, + "alleles": [ + "AGT(3)", + "AGT(4)", + "AGT(5)" + ] + } + }, + "defined_haplotypes": { + "*1": { + "haplotype": { + "779484": "C", + "1000510": "AGT(3)" + } + }, + "*28": { + "haplotype": { + "1000510": "AGT(4)" + } + }, + "*37": { + "haplotype": { + "1000510": "AGT(5)" + } + }, + "*80": { + "haplotype": { + "779484": "T" + } + }, + "*80+*28": { + "haplotype": { + "779484": "T", + "1000510": "AGT(4)" + } + }, + "*80+*37": { + "haplotype": { + "779484": "T", + "1000510": "AGT(5)" + } + } + }, + "reference_allele": "*1" + } + }, + "hla_sequences" : {}, + "cyp2d6_gene_def" : {} +} \ No newline at end of file diff --git a/test_data/UGT1A1-faux/different_phaseset_001.vcf.gz b/test_data/UGT1A1-faux/different_phaseset_001.vcf.gz new file mode 100644 index 0000000..bd816d7 Binary files /dev/null and b/test_data/UGT1A1-faux/different_phaseset_001.vcf.gz differ diff --git a/test_data/UGT1A1-faux/different_phaseset_001.vcf.gz.tbi b/test_data/UGT1A1-faux/different_phaseset_001.vcf.gz.tbi new file mode 100644 index 0000000..346458d Binary files /dev/null and b/test_data/UGT1A1-faux/different_phaseset_001.vcf.gz.tbi differ diff --git a/test_data/UGT1A1-faux/different_phaseset_002.vcf.gz b/test_data/UGT1A1-faux/different_phaseset_002.vcf.gz new file mode 100644 index 0000000..7cb3a26 Binary files /dev/null and b/test_data/UGT1A1-faux/different_phaseset_002.vcf.gz differ diff --git a/test_data/UGT1A1-faux/different_phaseset_002.vcf.gz.tbi b/test_data/UGT1A1-faux/different_phaseset_002.vcf.gz.tbi new file mode 100644 index 0000000..6b72f4d Binary files /dev/null and b/test_data/UGT1A1-faux/different_phaseset_002.vcf.gz.tbi differ diff --git a/test_data/UGT1A1-faux/hethom_phase_001.vcf.gz b/test_data/UGT1A1-faux/hethom_phase_001.vcf.gz new file mode 100644 index 0000000..5f0ad56 Binary files /dev/null and b/test_data/UGT1A1-faux/hethom_phase_001.vcf.gz differ diff --git a/test_data/UGT1A1-faux/hethom_phase_001.vcf.gz.tbi b/test_data/UGT1A1-faux/hethom_phase_001.vcf.gz.tbi new file mode 100644 index 0000000..f766943 Binary files /dev/null and b/test_data/UGT1A1-faux/hethom_phase_001.vcf.gz.tbi differ diff --git a/test_data/UGT1A1-faux/opposite_phase_001.vcf.gz b/test_data/UGT1A1-faux/opposite_phase_001.vcf.gz new file mode 100644 index 0000000..52b28a8 Binary files /dev/null and b/test_data/UGT1A1-faux/opposite_phase_001.vcf.gz differ diff --git a/test_data/UGT1A1-faux/opposite_phase_001.vcf.gz.tbi b/test_data/UGT1A1-faux/opposite_phase_001.vcf.gz.tbi new file mode 100644 index 0000000..a6cd574 Binary files /dev/null and b/test_data/UGT1A1-faux/opposite_phase_001.vcf.gz.tbi differ diff --git 
a/test_data/UGT1A1-faux/opposite_phase_002.vcf.gz b/test_data/UGT1A1-faux/opposite_phase_002.vcf.gz new file mode 100644 index 0000000..43935d0 Binary files /dev/null and b/test_data/UGT1A1-faux/opposite_phase_002.vcf.gz differ diff --git a/test_data/UGT1A1-faux/opposite_phase_002.vcf.gz.tbi b/test_data/UGT1A1-faux/opposite_phase_002.vcf.gz.tbi new file mode 100644 index 0000000..04d896a Binary files /dev/null and b/test_data/UGT1A1-faux/opposite_phase_002.vcf.gz.tbi differ diff --git a/test_data/UGT1A1-faux/same_phase_001.vcf.gz b/test_data/UGT1A1-faux/same_phase_001.vcf.gz new file mode 100644 index 0000000..0e80e3c Binary files /dev/null and b/test_data/UGT1A1-faux/same_phase_001.vcf.gz differ diff --git a/test_data/UGT1A1-faux/same_phase_001.vcf.gz.tbi b/test_data/UGT1A1-faux/same_phase_001.vcf.gz.tbi new file mode 100644 index 0000000..d09498c Binary files /dev/null and b/test_data/UGT1A1-faux/same_phase_001.vcf.gz.tbi differ diff --git a/test_data/UGT1A1-faux/same_phase_002.vcf.gz b/test_data/UGT1A1-faux/same_phase_002.vcf.gz new file mode 100644 index 0000000..6321a6d Binary files /dev/null and b/test_data/UGT1A1-faux/same_phase_002.vcf.gz differ diff --git a/test_data/UGT1A1-faux/same_phase_002.vcf.gz.tbi b/test_data/UGT1A1-faux/same_phase_002.vcf.gz.tbi new file mode 100644 index 0000000..d09498c Binary files /dev/null and b/test_data/UGT1A1-faux/same_phase_002.vcf.gz.tbi differ diff --git a/test_data/empty_gene_list.txt b/test_data/empty_gene_list.txt new file mode 100644 index 0000000..e69de29 diff --git a/test_data/test_reference.fa b/test_data/test_reference.fa new file mode 100644 index 0000000..1ceb431 --- /dev/null +++ b/test_data/test_reference.fa @@ -0,0 +1,6 @@ +>chr1 +AAAAAAAAAA +ACACACACAC +>chr2 +ACACACACAC +AGTAGTAGTA \ No newline at end of file
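Note on how these fixtures fit together: the faux `database.json` files use 1-based `position` values that index directly into the tiny records of `test_data/test_reference.fa` (for example, chr2 position 2 is the `C` of the faux rs887829 site in UGT1A1-faux, and chr2 position 13 is the `T` deleted by the faux rs1556422499 site in RNR1-faux). The sketch below is illustrative only — it is not part of this diff or the repository's actual test harness. It assumes `serde_json` is available as a dependency, and it simply prints, for each UGT1A1-faux variant, the FASTA base that its position points at; the `AGT(n)` alleles are symbolic repeat notation, so only the SNV allele will literally match the printed base.

```rust
use std::collections::HashMap;
use std::fs;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Naive FASTA parse; test_reference.fa holds only two 20 bp records,
    // so a line-based parse with no error recovery is sufficient here.
    let fasta = fs::read_to_string("test_data/test_reference.fa")?;
    let mut chroms: HashMap<String, String> = HashMap::new();
    let mut current = String::new();
    for line in fasta.lines() {
        if let Some(name) = line.strip_prefix('>') {
            current = name.trim().to_string();
            chroms.insert(current.clone(), String::new());
        } else if let Some(seq) = chroms.get_mut(&current) {
            seq.push_str(line.trim());
        }
    }

    // Load one of the faux databases and report, for each variant, the
    // reference base that its 1-based `position` points at in the FASTA.
    let raw = fs::read_to_string("test_data/UGT1A1-faux/database.json")?;
    let db: serde_json::Value = serde_json::from_str(&raw)?;
    let entry = &db["gene_entries"]["UGT1A1"];
    let chrom = entry["chromosome"].as_str().ok_or("missing chromosome")?;
    let seq = chroms.get(chrom).ok_or("chromosome not in FASTA")?;
    for (id, var) in entry["variants"].as_object().ok_or("missing variants")?.iter() {
        let pos = var["position"].as_u64().ok_or("missing position")? as usize;
        let first_allele = var["alleles"][0].as_str().unwrap_or("?");
        println!(
            "variant {id}: {chrom}:{pos} fasta base = {}, first allele = {first_allele}",
            &seq[pos - 1..pos]
        );
    }
    Ok(())
}
```

On the UGT1A1-faux fixture this would print `C` for variant 779484 (matching its reference allele `C`) and `A` for variant 1000510, whose alleles are expressed in `AGT(n)` repeat notation rather than as literal bases — the same cross-checking idea applies to the RNR1-faux and CACNA1S databases above.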