Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement hictk metadata command #204

Merged
merged 13 commits into from
Aug 16, 2024
4 changes: 3 additions & 1 deletion .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ jobs:
apt-get update
apt-get install -y --no-install-recommends python3-pip xz-utils

python3 -m pip install 'cooler==0.10.0' 'numpy<2'
python3 -m pip install 'cooler==0.10.0' 'numpy<2' 'pyyaml'

- name: Run integration tests
run: |
Expand Down Expand Up @@ -207,6 +207,8 @@ jobs:

test/scripts/hictk_merge.sh build/src/hictk/hictk

test/scripts/hictk_metadata.sh build/src/hictk/hictk

test/scripts/hictk_rename_chromosomes.sh build/src/hictk/hictk

test/scripts/hictk_validate.sh build/src/hictk/hictk
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/macos-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ jobs:
run: |
echo 'cooler==0.10.0' > requirements.txt
echo 'numpy<2' >> requirements.txt
echo 'pyyaml' >> requirements.txt

- uses: actions/setup-python@v5
with:
Expand Down Expand Up @@ -430,6 +431,10 @@ jobs:
run: |
test/scripts/hictk_merge.sh bin/hictk

- name: Test hictk metadata
run: |
test/scripts/hictk_metadata.sh bin/hictk

- name: Test hictk rename-chroms
run: |
test/scripts/hictk_rename_chromosomes.sh bin/hictk
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/ubuntu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,10 @@ jobs:
run: |
test/scripts/hictk_merge.sh bin/hictk

- name: Test hictk metadata
run: |
test/scripts/hictk_metadata.sh bin/hictk

- name: Test hictk rename-chroms
run: |
test/scripts/hictk_rename_chromosomes.sh bin/hictk
Expand Down
2 changes: 2 additions & 0 deletions conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,12 @@ def requirements(self):
self.requires("hdf5/1.14.3#31ccd8d4de83844f5db48471df1944a1")
self.requires("highfive/2.9.0#c57477beed8b0110fadeb6da8f48bcc5")
self.requires("libdeflate/1.20#3bd86e0160becf992346aa68b4938c40")
self.requires("nlohmann_json/3.11.3#45828be26eb619a2e04ca517bb7b828d")
self.requires("parallel-hashmap/1.3.12#dc7755096d8a1fac7792fdd85760b6ca")
self.requires("readerwriterqueue/1.0.6#aaa5ff6fac60c2aee591e9e51b063b83")
self.requires("span-lite/0.11.0#519fd49fff711674cfed8cd17d4ed422")
self.requires("spdlog/1.14.1#972bbf70be1da4bc57ea589af0efde03")
self.requires("tomlplusplus/3.4.0#85dbfed71376fb8dc23cdcc0570e4727")
self.requires("zstd/1.5.6#afefe79a309bc2a7b9f56c2093504c8b")

def validate(self):
Expand Down
50 changes: 36 additions & 14 deletions docs/cli_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Subcommands
fix-mcool Fix corrupted .mcool files.
load Build .cool and .hic files from interactions in various text formats.
merge Merge multiple Cooler or .hic files into a single file.
metadata Print file metadata to stdout.
rename-chromosomes, rename-chroms
Rename chromosomes found in a Cooler file.
validate Validate .hic and Cooler files.
Expand Down Expand Up @@ -60,7 +61,7 @@ hictk balance ice
- genome-wide interactions (gw)
- trans-only interactions (trans)
- cis-only interactions (cis)
--tmpdir TEXT [/tmp] Path to a folder where to store temporary data.
--tmpdir TEXT:DIR [/tmp] Path to a folder where to store temporary data.
--ignore-diags UINT [2] Number of diagonals (including the main diagonal) to mask before balancing.
--mad-max FLOAT:NONNEGATIVE [5]
Mask bins using the MAD-max filter.
Expand All @@ -78,15 +79,16 @@ hictk balance ice
Rescale weights such that rows sum approximately to 2.
--name TEXT Name to use when writing weights to file.
Defaults to ICE, INTER_ICE and GW_ICE when --mode is cis, trans and gw, respectively.
--create-weight-link Create a symbolic link to the balancing weights at clr::/bins/weight.
--create-weight-link,--no-create-weight-link{false}
Create a symbolic link to the balancing weights at clr::/bins/weight.
Ignored when balancing .hic files
--in-memory Store all interactions in memory (greatly improves performance).
--stdout Write balancing weights to stdout instead of writing them to the input file.
--chunk-size UINT:POSITIVE [10000000]
Number of interactions to process at once. Ignored when using --in-memory.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
-t,--threads UINT:UINT in [1 - 16] [1]
-t,--threads UINT:UINT in [1 - 32] [1]
Maximum number of parallel threads to spawn.
-l,--compression-lvl UINT:INT in [0 - 19] []
Compression level used to compress temporary files using ZSTD.
Expand Down Expand Up @@ -132,7 +134,7 @@ hictk balance scale
Number of interactions to process at once. Ignored when using --in-memory.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
-t,--threads UINT:UINT in [1 - 16] [1]
-t,--threads UINT:UINT in [1 - 32] [1]
Maximum number of parallel threads to spawn.
-l,--compression-lvl UINT:INT in [0 - 19] []
Compression level used to compress temporary files using ZSTD.
Expand Down Expand Up @@ -194,12 +196,12 @@ hictk convert
Pass NONE to avoid copying normalization vectors.
--fail-if-norm-not-found Fail if any of the requested normalization vectors are missing.
-g,--genome TEXT Genome assembly name. By default this is copied from the .hic file metadata.
--tmpdir TEXT Path where to store temporary files.
--tmpdir TEXT:DIR [/tmp] Path where to store temporary files.
--chunk-size UINT:POSITIVE [10000000]
Batch size to use when converting .[m]cool to .hic.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
-t,--threads UINT:UINT in [2 - 16] [2]
-t,--threads UINT:UINT in [2 - 32] [2]
Maximum number of parallel threads to spawn.
When converting from hic to cool, only two threads will be used.
-l,--compression-lvl UINT:INT in [1 - 12] [6]
Expand Down Expand Up @@ -257,7 +259,7 @@ hictk fix-mcool
output TEXT REQUIRED Path where to store the restored .mcool.
Options:
-h,--help Print this help message and exit
--tmpdir TEXT [/tmp] Path to a folder where to store temporary data.
--tmpdir TEXT:DIR [/tmp] Path to a folder where to store temporary data.
--skip-balancing Do not recompute or copy balancing weights.
--check-base-resolution Check whether the base resolution is corrupted.
--in-memory Store all interactions in memory while balancing (greatly improves performance).
Expand All @@ -266,7 +268,7 @@ hictk fix-mcool
Ignored when using --in-memory.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
-t,--threads UINT:UINT in [1 - 16] [1]
-t,--threads UINT:UINT in [1 - 32] [1]
Maximum number of parallel threads to spawn (only applies to the balancing stage).
-l,--compression-lvl UINT:INT in [0 - 19] []
Compression level used to compress temporary files using ZSTD (only applies to the balancing stage).
Expand Down Expand Up @@ -309,10 +311,10 @@ hictk load
-l,--compression-lvl UINT:INT bounded to [1 - 12]
Compression level used to compress interactions.
Defaults to 6 and 10 for .cool and .hic files, respectively.
-t,--threads UINT:UINT in [1 - 16] [1]
-t,--threads UINT:UINT in [1 - 32] [1]
Maximum number of parallel threads to spawn.
When loading interactions in a .cool file, only a single thread will be used.
--tmpdir TEXT [/tmp] Path to a folder where to store temporary data.
--tmpdir TEXT:DIR [/tmp] Path to a folder where to store temporary data.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.

Expand All @@ -338,16 +340,36 @@ hictk merge
-l,--compression-lvl UINT:INT bounded to [1 - 12]
Compression level used to compress interactions.
Defaults to 6 and 10 for .cool and .hic files, respectively.
-t,--threads UINT:UINT in [1 - 16] [1]
-t,--threads UINT:UINT in [1 - 32] [1]
Maximum number of parallel threads to spawn.
When merging interactions in Cooler format, only a single thread will be used.
--tmpdir TEXT [/tmp] Path to a folder where to store temporary data.
--tmpdir TEXT:DIR [/tmp] Path to a folder where to store temporary data.
--skip-all-vs-all,--no-skip-all-vs-all{false}
Do not generate All vs All matrix.
Has no effect when merging .cool files.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.

hictk metadata
--------------

.. code-block:: text

Print file metadata to stdout.
Usage: hictk metadata [OPTIONS] uri
Positionals:
uri TEXT:(((Cooler) OR (Multires-cooler)) OR (Single-cell-cooler)) OR (HiC) REQUIRED
Path to a .hic or .[ms]cool file (Cooler URI syntax supported).
Options:
-h,--help Print this help message and exit
-f,--output-format TEXT:{json,toml,yaml} [json]
Format used to return file metadata.
Should be one of: json, toml, or yaml.
--include-file-path,--exclude-file-path{false}
Output the given input path using attribute "uri".
--recursive Print metadata for each resolution or cell contained in a
multi-resolution or single-cell file.

hictk rename-chromosomes
------------------------

Expand Down Expand Up @@ -410,7 +432,7 @@ hictk zoomify
-l,--compression-lvl UINT:INT bounded to [1 - 12] [6]
Compression level used to compress interactions.
Defaults to 6 and 10 for .mcool and .hic files, respectively.
-t,--threads UINT:UINT in [1 - 16] [1]
-t,--threads UINT:UINT in [1 - 32] [1]
Maximum number of parallel threads to spawn.
When zoomifying interactions from a .cool file, only a single thread will be used.
--chunk-size UINT [10000000]
Expand All @@ -419,6 +441,6 @@ hictk zoomify
--skip-all-vs-all,--no-skip-all-vs-all{false}
Do not generate All vs All matrix.
Has no effect when zoomifying .cool files.
--tmpdir TEXT [/tmp] Path to a folder where to store temporary data.
--tmpdir TEXT:DIR [/tmp] Path to a folder where to store temporary data.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
1 change: 1 addition & 0 deletions docs/generate_cli_reference.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ subcommands=(
fix-mcool
load
merge
metadata
rename-chromosomes
validate
zoomify
Expand Down
14 changes: 12 additions & 2 deletions src/hictk/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@

find_package(CLI11 REQUIRED)
find_package(FMT REQUIRED)
find_package(nlohmann_json REQUIRED)
find_package(readerwriterqueue REQUIRED)
find_package(spdlog REQUIRED)
find_package(tomlplusplus REQUIRED)
find_package(Filesystem REQUIRED)
find_package(Threads REQUIRED)

Expand All @@ -21,6 +23,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_fix_mcool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_metadata.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_rename_chromosomes.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_zoomify.cpp
Expand All @@ -33,6 +36,11 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/fix_mcool/fix_mcool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/merge/merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/cool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/hic.hpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/mcool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/scool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/metadata.cpp
${CMAKE_CURRENT_SOURCE_DIR}/rename_chromosomes/rename_chromosomes.cpp
${CMAKE_CURRENT_SOURCE_DIR}/validate/validate.cpp
${CMAKE_CURRENT_SOURCE_DIR}/zoomify/zoomify.cpp)
Expand All @@ -44,11 +52,13 @@ target_link_system_libraries(
hictk
PRIVATE
CLI11::CLI11
fmt::fmt-header-only
nlohmann_json::nlohmann_json
readerwriterqueue::readerwriterqueue
spdlog::spdlog_header_only
std::filesystem
tomlplusplus::tomlplusplus
PUBLIC
fmt::fmt-header-only
spdlog::spdlog_header_only
Threads::Threads)

include(GNUInstallDirs)
Expand Down
10 changes: 10 additions & 0 deletions src/hictk/cli/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
_subcommand = subcommand::load;
} else if (_cli.get_subcommand("merge")->parsed()) {
_subcommand = subcommand::merge;
} else if (_cli.get_subcommand("metadata")->parsed()) {
_subcommand = subcommand::metadata;
} else if (_cli.get_subcommand("rename-chromosomes")->parsed()) {
_subcommand = subcommand::rename_chromosomes;
} else if (_cli.get_subcommand("validate")->parsed()) {
Expand Down Expand Up @@ -92,6 +94,8 @@
return "load";
case merge:
return "merge";
case metadata:
return "metadata";

Check warning on line 98 in src/hictk/cli/cli.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli.cpp#L97-L98

Added lines #L97 - L98 were not covered by tests
case rename_chromosomes:
return "rename-chromosomes";
case validate:
Expand All @@ -116,6 +120,7 @@
make_fix_mcool_subcommand();
make_load_subcommand();
make_merge_subcommand();
make_metadata_subcommand();
make_rename_chromosomes_subcommand();
make_validate_subcommand();
make_zoomify_subcommand();
Expand All @@ -141,6 +146,8 @@
case merge:
validate_merge_subcommand();
break;
case metadata:
break;
case rename_chromosomes:
validate_rename_chromosomes_subcommand();
break;
Expand Down Expand Up @@ -174,6 +181,9 @@
case merge:
transform_args_merge_subcommand();
break;
case metadata:
transform_args_metadata_subcommand();
break;
case rename_chromosomes:
transform_args_rename_chromosomes_subcommand();
break;
Expand Down
77 changes: 77 additions & 0 deletions src/hictk/cli/cli_metadata.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Copyright (C) 2024 Roberto Rossini <roberros@uio.no>
//
// SPDX-License-Identifier: MIT

#include <fmt/format.h>
#include <fmt/ranges.h>
#include <spdlog/spdlog.h>

#include <CLI/CLI.hpp>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <limits>
#include <stdexcept>
#include <string>
#include <variant>
#include <vector>

#include "hictk/tools/cli.hpp"
#include "hictk/tools/config.hpp"

namespace hictk::tools {

/// Register the `metadata` subcommand along with its positional argument and flags.
///
/// While options are being declared, `_config` temporarily holds a MetadataConfig so
/// that each option can be bound to one of its fields. `_config` is reset to
/// std::monostate before returning; the preparse callback registered below assigns a
/// fresh MetadataConfig again when it is invoked.
void Cli::make_metadata_subcommand() {
  auto* subcmd = _cli.add_subcommand("metadata", "Print file metadata to stdout.");
  subcmd->fallthrough();
  subcmd->preparse_callback([this](std::size_t /*unused*/) {
    // the config variant is expected to still hold its first alternative here
    assert(_config.index() == 0);
    _config = MetadataConfig{};
  });

  // Give the option declarations below something to bind to
  _config = MetadataConfig{};
  auto& cfg = std::get<MetadataConfig>(_config);

  // clang-format off
  auto* uri_arg = subcmd->add_option(
      "uri",
      cfg.uri,
      "Path to a .hic or .[ms]cool file (Cooler URI syntax supported).");
  uri_arg->check(IsValidCoolerFile | IsValidMultiresCoolerFile | IsValidSingleCellCoolerFile | IsValidHiCFile);
  uri_arg->required();

  auto* format_opt = subcmd->add_option(
      "-f,--output-format",
      cfg.output_format,
      "Format used to return file metadata.\n"
      "Should be one of: json, toml, or yaml.");
  format_opt->check(CLI::IsMember({"json", "toml", "yaml"}));
  format_opt->capture_default_str();

  auto* file_path_flag = subcmd->add_flag(
      "--include-file-path,!--exclude-file-path",
      cfg.include_file_path,
      "Output the given input path using attribute \"uri\".");
  file_path_flag->capture_default_str();

  auto* recursive_flag = subcmd->add_flag(
      "--recursive",
      cfg.recursive,
      "Print metadata for each resolution or cell contained in a\n"
      "multi-resolution or single-cell file.");
  recursive_flag->capture_default_str();
  // clang-format on

  // Leave the config empty until the subcommand is actually selected
  _config = std::monostate{};
}

/// Post-process the parsed arguments of the `metadata` subcommand.
///
/// Fills in `input_format` from the given URI and converts the verbosity value
/// into the numeric scale used by spdlog levels.
void Cli::transform_args_metadata_subcommand() {
  auto& cfg = std::get<MetadataConfig>(_config);

  cfg.input_format = infer_input_format(cfg.uri);

  // The verbosity is subtracted from the numeric value of spdlog::level::critical,
  // so a larger verbosity maps to a numerically smaller spdlog level.
  assert(cfg.verbosity > 0 && cfg.verbosity < 5);
  const auto critical_level = static_cast<std::uint8_t>(spdlog::level::critical);
  cfg.verbosity = critical_level - cfg.verbosity;
}

} // namespace hictk::tools
Loading
Loading