From cf6d9f8190363bf63eb0908d798915294de2749d Mon Sep 17 00:00:00 2001 From: klaricch Date: Wed, 4 May 2022 11:31:03 -0400 Subject: [PATCH 1/6] add pc_names param --- gnomad/sample_qc/ancestry.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py index 0c7a2e9c4..f4f8813d5 100644 --- a/gnomad/sample_qc/ancestry.py +++ b/gnomad/sample_qc/ancestry.py @@ -118,6 +118,7 @@ def pc_project( def assign_population_pcs( pop_pca_scores: Union[hl.Table, pd.DataFrame], pc_cols: Union[hl.expr.ArrayExpression, List[str]], + pc_names: List[int], known_col: str = "known_pop", fit: Any = None, # Type should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside seed: int = 42, @@ -149,6 +150,7 @@ def assign_population_pcs( :param pop_pca_scores: Input Hail Table or Pandas Dataframe :param pc_cols: Columns storing the PCs to use + :param pc_names: List fo integer to use for naming the selected PCs (i.e. 
an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3) :param known_col: Column storing the known population labels :param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call) :param seed: Random seed @@ -172,10 +174,8 @@ def assign_population_pcs( # Explode the PC array num_out_cols = min([len(x) for x in pop_pc_pd["pca_scores"].values.tolist()]) - pc_cols = [f"PC{i+1}" for i in range(num_out_cols)] - pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist())[ - list(range(num_out_cols)) - ] + pc_cols = [f"PC{i}" for i in pc_names] + pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist()) else: pop_pc_pd = pop_pca_scores @@ -230,8 +230,14 @@ def assign_population_pcs( if hail_input: pops_ht = hl.Table.from_pandas(pop_pc_pd, key=list(pop_pca_scores.key)) - pops_ht.annotate_globals( - assign_pops_from_pc_params=hl.struct(min_assignment_prob=min_prob) + pops_ht = pops_ht.annotate_globals( + assign_pops_from_pc_params=hl.struct( + min_assignment_prob=min_prob, error_rate=error_rate + ) + ) + pops_ht = pops_ht.annotate( + evaluation_sample=hl.literal(list(evaluate_fit.s)).contains(pops_ht.s), + training_sample=hl.literal(list(train_fit.s)).contains(pops_ht.s), ) return pops_ht, pop_clf else: From d5714339b7ef264cc994f5925839c2d27922899f Mon Sep 17 00:00:00 2001 From: klaricch Date: Wed, 4 May 2022 11:37:07 -0400 Subject: [PATCH 2/6] small edit --- gnomad/sample_qc/ancestry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py index f4f8813d5..d4772cf4b 100644 --- a/gnomad/sample_qc/ancestry.py +++ b/gnomad/sample_qc/ancestry.py @@ -150,7 +150,7 @@ def assign_population_pcs( :param pop_pca_scores: Input Hail Table or Pandas Dataframe :param pc_cols: Columns storing the PCs to use - :param pc_names: List fo integer to use for naming the selected PCs (i.e. 
an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3) + :param pc_names: List of integers to use for naming the selected PCs (i.e. an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3) :param known_col: Column storing the known population labels :param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call) :param seed: Random seed From 7a6819af30ac0d041bf7a620ead4a54d01f6db3f Mon Sep 17 00:00:00 2001 From: klaricch Date: Wed, 4 May 2022 17:23:14 -0400 Subject: [PATCH 3/6] edit pc_cols param --- gnomad/sample_qc/ancestry.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py index d4772cf4b..72cbcbe08 100644 --- a/gnomad/sample_qc/ancestry.py +++ b/gnomad/sample_qc/ancestry.py @@ -117,8 +117,7 @@ def pc_project( def assign_population_pcs( pop_pca_scores: Union[hl.Table, pd.DataFrame], - pc_cols: Union[hl.expr.ArrayExpression, List[str]], - pc_names: List[int], + pc_cols: Union[List[int], List[str]], known_col: str = "known_pop", fit: Any = None, # Type should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside seed: int = 42, @@ -137,7 +136,7 @@ def assign_population_pcs( As input, this function can either take: - A Hail Table (typically the output of `hwe_normalized_pca`). In this case, - - `pc_cols` should be an ArrayExpression of Floats where each element is one of the PCs to use. + - `pc_cols` should be a list of integers where each element is one of the PCs to use. - A Hail Table will be returned as output - A Pandas DataFrame. In this case: - Each PC should be in a separate column and `pc_cols` is the list of all the columns containing the PCs to use. @@ -149,8 +148,7 @@ def assign_population_pcs( can be used to expand this column into multiple `PC` columns. 
:param pop_pca_scores: Input Hail Table or Pandas Dataframe - :param pc_cols: Columns storing the PCs to use - :param pc_names: List of integers to use for naming the selected PCs (i.e. an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3) + :param pc_cols: List of which PCS to use/columns storing the PCs to use (i.e. [1,2,4,5]) :param known_col: Column storing the known population labels :param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call) :param seed: Random seed @@ -165,16 +163,16 @@ def assign_population_pcs( hail_input = isinstance(pop_pca_scores, hl.Table) if hail_input: + pcs_to_pull = [pop_pca_scores.scores[i - 1] for i in pc_cols] if not fit: - pop_pca_scores = pop_pca_scores.select(known_col, pca_scores=pc_cols) + pop_pca_scores = pop_pca_scores.select(known_col, pca_scores=pcs_to_pull) else: - pop_pca_scores = pop_pca_scores.select(pca_scores=pc_cols) + pop_pca_scores = pop_pca_scores.select(pca_scores=pcs_to_pull) pop_pc_pd = pop_pca_scores.to_pandas() # Explode the PC array - num_out_cols = min([len(x) for x in pop_pc_pd["pca_scores"].values.tolist()]) - pc_cols = [f"PC{i}" for i in pc_names] + pc_cols = [f"PC{i}" for i in pc_cols] pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist()) else: @@ -295,4 +293,4 @@ def run_pca_with_relateds( ) related_scores = pc_project(related_mt, pca_loadings) pca_scores = pca_scores.union(related_scores) - return pca_evals, pca_scores, pca_loadings + return pca_evals, pca_scores, pca_loadings \ No newline at end of file From da157dca7105ee1a83adf74b7ac87c4672bf6001 Mon Sep 17 00:00:00 2001 From: klaricch Date: Wed, 4 May 2022 18:20:25 -0400 Subject: [PATCH 4/6] run black --- gnomad/sample_qc/ancestry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py index 72cbcbe08..cad8d2778 100644 --- 
a/gnomad/sample_qc/ancestry.py +++ b/gnomad/sample_qc/ancestry.py @@ -293,4 +293,4 @@ def run_pca_with_relateds( ) related_scores = pc_project(related_mt, pca_loadings) pca_scores = pca_scores.union(related_scores) - return pca_evals, pca_scores, pca_loadings \ No newline at end of file + return pca_evals, pca_scores, pca_loadings From d1ae2cf288a533b15a271a52c3fa60b7f314f499 Mon Sep 17 00:00:00 2001 From: klaricch Date: Tue, 17 May 2022 13:52:25 -0400 Subject: [PATCH 5/6] PR suggestions --- gnomad/sample_qc/ancestry.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py index cad8d2778..729e01aa4 100644 --- a/gnomad/sample_qc/ancestry.py +++ b/gnomad/sample_qc/ancestry.py @@ -148,7 +148,7 @@ def assign_population_pcs( can be used to expand this column into multiple `PC` columns. :param pop_pca_scores: Input Hail Table or Pandas Dataframe - :param pc_cols: List of which PCS to use/columns storing the PCs to use (i.e. [1,2,4,5]) + :param pc_cols: List of which PCs to use/columns storing the PCs to use. Values provided should be 1-based and should be a list of integers when passing in a Hail Table (e.g. [1, 2, 4, 5]) or a list of strings when passing in a Pandas DataFrame (e.g. ["PC1", "PC2", "PC4", "PC5"]). 
:param known_col: Column storing the known population labels :param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call) :param seed: Random seed @@ -163,6 +163,10 @@ def assign_population_pcs( hail_input = isinstance(pop_pca_scores, hl.Table) if hail_input: + if not all(isinstance(n, int) for n in pc_cols): + raise TypeError( + "Using a Hail Table with pc_cols requires all values of pc_cols list to be integers" + ) pcs_to_pull = [pop_pca_scores.scores[i - 1] for i in pc_cols] if not fit: pop_pca_scores = pop_pca_scores.select(known_col, pca_scores=pcs_to_pull) From 8d4124860fa5a2f4481fae6300fcffe4cc07f7c2 Mon Sep 17 00:00:00 2001 From: klaricch Date: Tue, 17 May 2022 13:55:17 -0400 Subject: [PATCH 6/6] add type check --- gnomad/sample_qc/ancestry.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py index 729e01aa4..e7cf08384 100644 --- a/gnomad/sample_qc/ancestry.py +++ b/gnomad/sample_qc/ancestry.py @@ -165,7 +165,7 @@ def assign_population_pcs( if hail_input: if not all(isinstance(n, int) for n in pc_cols): raise TypeError( - "Using a Hail Table with pc_cols requires all values of pc_cols list to be integers" + "Using a Hail Table with pc_cols requires all values of the pc_cols list to be integers" ) pcs_to_pull = [pop_pca_scores.scores[i - 1] for i in pc_cols] if not fit: @@ -180,6 +180,10 @@ def assign_population_pcs( pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist()) else: + if not all(isinstance(n, str) for n in pc_cols): + raise TypeError( + "Using a Pandas DataFrame with pc_cols requires all values of the pc_cols list to be strings" + ) pop_pc_pd = pop_pca_scores # Split training data into subsamples for fitting and evaluating