Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make dense trio MT #651

Merged
merged 3 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions gnomad_qc/v4/resources/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def get_gnomad_v4_vds(
entries_to_keep: Optional[List[str]] = None,
annotate_het_non_ref: bool = False,
checkpoint_variant_data: bool = False,
naive_coalesce_partitions: Optional[int] = None,
) -> hl.vds.VariantDataset:
"""
Get gnomAD v4 data with desired filtering and metadata annotations.
Expand Down Expand Up @@ -96,6 +97,8 @@ def get_gnomad_v4_vds(
'_het_non_ref') to the variant data. Default is False.
:param checkpoint_variant_data: Whether to checkpoint the variant data MT after
splitting and filtering. Default is False.
:param naive_coalesce_partitions: Optional argument to coalesce the VDS to a
specific number of partitions using naive coalesce.
:return: gnomAD v4 dataset with chosen annotations and filters.
"""
if remove_hard_filtered_samples and remove_hard_filtered_samples_no_sex:
Expand Down Expand Up @@ -158,6 +161,12 @@ def get_gnomad_v4_vds(
logger.info("Filtering to chromosome %s...", chrom)
vds = hl.vds.filter_chromosomes(vds, keep=chrom)

if naive_coalesce_partitions:
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
vds = hl.vds.VariantDataset(
vds.reference_data.naive_coalesce(naive_coalesce_partitions),
vds.variant_data.naive_coalesce(naive_coalesce_partitions),
)

if filter_partitions:
logger.info("Filtering to %s partitions...", len(filter_partitions))
vds = hl.vds.VariantDataset(
Expand Down
25 changes: 25 additions & 0 deletions gnomad_qc/v4/resources/sample_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,31 @@ def ped_filter_param_json_path(
return f"{get_sample_qc_root(version)}/relatedness/trios/gnomad.exomes.v{version}.ped_filters.json"


def dense_trio_mt(
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
releasable: bool = True,
test: bool = False,
) -> VersionedMatrixTableResource:
"""
Get the VersionedMatrixTableResource for the dense trio MatrixTable.

:param releasable: Whether to get the resource for the releasable trios only.
:param test: Whether to use a tmp path for a test resource.
:return: VersionedMatrixTableResource of dense trio MatrixTable.
"""
data_type = "exomes"
return VersionedMatrixTableResource(
CURRENT_SAMPLE_QC_VERSION,
{
version: MatrixTableResource(
f"{get_sample_qc_root(version, test, data_type='exomes')}"
f"/relatedness/trios/gnomad.{data_type}.v{version}.trios"
f"{'.releasable' if releasable else ''}.dense.mt"
)
for version in SAMPLE_QC_VERSIONS
},
)


######################################################################
# Other resources
######################################################################
Expand Down
95 changes: 93 additions & 2 deletions gnomad_qc/v4/sample_qc/identify_trios.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import hail as hl
from gnomad.sample_qc.relatedness import (
create_fake_pedigree,
filter_to_trios,
get_duplicated_samples,
get_duplicated_samples_ht,
infer_families,
Expand All @@ -24,8 +25,9 @@
)
from gnomad_qc.slack_creds import slack_token
from gnomad_qc.v4.resources.basics import get_gnomad_v4_vds
from gnomad_qc.v4.resources.meta import project_meta
from gnomad_qc.v4.resources.meta import meta, project_meta
from gnomad_qc.v4.resources.sample_qc import (
dense_trio_mt,
duplicates,
finalized_outlier_filtering,
interval_qc_pass,
Expand Down Expand Up @@ -385,12 +387,62 @@ def filter_ped(
return hl.Pedigree([trio for trio in ped.trios if trio.s in trios]), cutoffs


def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollection:
def create_dense_trio_mt(
fam_ht: hl.Table,
meta_ht: hl.Table,
releasable_only: bool = True,
test: bool = False,
naive_coalesce_partitions: Optional[int] = None,
) -> hl.MatrixTable:
"""
Create a dense MatrixTable for high quality trios.

:param fam_ht: Table with family information.
:param meta_ht: Table with metadata information.
:param releasable_only: Whether to only include trios that are releasable. Default
is True.
:param test: Whether to filter to chr20 for testing. Default is False.
:param naive_coalesce_partitions: Optional Number of partitions to coalesce the VDS
to. Default is None.
:return: Dense MatrixTable with high quality trios.
"""
filter_expr = meta_ht.high_quality
if releasable_only:
filter_expr &= meta_ht.project_meta.releasable

# Filter the metadata table to only samples in high quality and releasable trios.
meta_ht = meta_ht.filter(filter_expr)
fam_ht = fam_ht.filter(
hl.is_defined(meta_ht[fam_ht.id])
& hl.is_defined(meta_ht[fam_ht.pat_id])
& hl.is_defined(meta_ht[fam_ht.mat_id])
)
meta_ht = filter_to_trios(meta_ht, fam_ht)

# Get the gnomAD VDS filtered to high quality releasable trios.
vds = get_gnomad_v4_vds(
high_quality_only=True,
chrom="chr20" if test else None,
filter_samples_ht=meta_ht,
entries_to_keep=["LA", "LGT", "LAD", "LPGT", "LPL", "DP", "GQ", "SB"],
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
naive_coalesce_partitions=naive_coalesce_partitions,
)

return hl.vds.to_dense_mt(vds)


def get_trio_resources(
overwrite: bool,
test: bool,
dense_trio_mt_releasable_only: bool = True,
) -> PipelineResourceCollection:
"""
Get PipelineResourceCollection for all resources needed in the trio identification pipeline.

:param overwrite: Whether to overwrite existing resources.
:param test: Whether to use test resources.
:param dense_trio_mt_releasable_only: Whether to only include trios that are
releasable in the dense trio MT. Default is True.
:return: PipelineResourceCollection containing resources for all steps of the
trio identification pipeline.
"""
Expand Down Expand Up @@ -457,6 +509,16 @@ def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollectio
},
pipeline_input_steps=[infer_families, run_mendel_errors],
)
create_dense_trio_mt = PipelineStepResourceCollection(
"--create-dense-trio-mt",
output_resources={
"dense_trio_mt": dense_trio_mt(
releasable=dense_trio_mt_releasable_only, test=test
)
},
pipeline_input_steps=[finalize_ped],
add_input_resources={"Finalized metadata HT": {"meta_ht": meta()}},
)

# Add all steps to the trio identification pipeline resource collection.
trio_pipeline.add_steps(
Expand All @@ -466,6 +528,7 @@ def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollectio
"create_fake_pedigree": create_fake_pedigree,
"run_mendel_errors": run_mendel_errors,
"finalize_ped": finalize_ped,
"create_dense_trio_mt": create_dense_trio_mt,
}
)

Expand Down Expand Up @@ -566,6 +629,17 @@ def main(args):
with hl.hadoop_open(res.filter_json, "w") as d:
d.write(json.dumps(filters))

if args.create_dense_trio_mt:
res = trio_resources.create_dense_trio_mt
res.check_resource_existence()
create_dense_trio_mt(
res.final_ped.ht(),
res.meta_ht.ht(),
args.releasable_only,
test,
naive_coalesce_partitions=args.naive_coalesce_partitions,
).write(res.dense_trio_mt.path, overwrite=args.overwrite)


def get_script_argument_parser() -> argparse.ArgumentParser:
"""Get script argument parser."""
Expand Down Expand Up @@ -724,6 +798,23 @@ def get_script_argument_parser() -> argparse.ArgumentParser:
type=int,
default=24,
)
dense_trio_mt_args = parser.add_argument_group("Create dense trio MT.")
dense_trio_mt_args.add_argument(
"--create-dense-trio-mt",
help=("Create a dense MT for high quality trios."),
action="store_true",
)
dense_trio_mt_args.add_argument(
"--releasable-only",
help=("Only include trios that are releasable."),
action="store_true",
)
dense_trio_mt_args.add_argument(
"--naive-coalesce-partitions",
help=("Number of partitions to coalesce the VDS to."),
type=int,
default=5000,
)

return parser

Expand Down
Loading